源于前几天帮组内同学落地一模型,之前Python代码比较慢,结合业务无法落地,所以将核心脚本全部转为Julia,当时在Ubuntu上只启动8核性能提升约10倍。这里仅仅使用计算余弦相似度这个简单的例子来对比,其实还有其它的并行方案,特别采用GPU对于这类矩阵计算提升是非常大的。需要说明的是,这里是通过一台2015老款MBP测试,貌似环境还是有些问题,因为使用Julia多线程时基本没起作用,但前几天在Ubuntu上没出现这个问题,具体原因未知,所以以下时间没有太大意义,主要基于测试代码方便感兴趣的同学可以使用自己的电脑进行测试。
from numba import jit
import numpy as np
import pandas as pd
def cos_sim(cluster, document):
    """Cosine similarity between two 1-D numpy vectors.

    Returns dot(cluster, document) / (||cluster|| * ||document||).
    No guard for zero-norm inputs (yields nan/inf), matching the benchmark's needs.
    """
    numerator = np.dot(cluster, document)
    norm_product = np.linalg.norm(cluster) * np.linalg.norm(document)
    return numerator / norm_product
@jit(nopython=True)
def cos_sim_numba(cluster, document):
    # Numba-compiled (nopython mode) cosine similarity; same math as cos_sim,
    # kept as a separate function so @jit compilation can be benchmarked.
    numerator = np.dot(cluster, document)
    norms = np.linalg.norm(cluster) * np.linalg.norm(document)
    return numerator / norms
# Benchmark fixture: 1,000,000 rows x 100 columns of standard-normal values.
data = np.random.randn(1000000,100)
# IPython %timeit magic (this cell only runs in IPython/Jupyter, not plain Python):
# time the Numba-compiled similarity of row 0 against every other row...
%timeit [cos_sim_numba(data[0],data[i]) for i in range(1, 1000000)]
# ...then the plain NumPy version on the same workload, for comparison.
%timeit [cos_sim(data[0],data[i]) for i in range(1, 1000000)]
# --- Julia benchmark setup ---
using Base.Threads
using Distributed
using Random
# Spawn 4 worker processes. NOTE(review): the loops below use Threads.@threads,
# not pmap/@distributed, so these workers are never actually exercised here.
addprocs(4)
# Load LinearAlgebra (dot, norm) on the master process and all workers.
@everywhere using LinearAlgebra
# Benchmark fixture. NOTE(review): 500 columns here vs 100 in the Python test
# above — the two timings are not directly comparable; confirm intent.
data = randn(1000000, 500);
# Cosine similarity of two real vectors: dot(x, y) / (||x|| * ||y||).
# Generalized from Vector{Float64} to AbstractVector{<:Real} so views
# (e.g. @view data[i, :]) and other real element types dispatch too —
# backward compatible, since Vector{Float64} <: AbstractVector{<:Real}.
# Defined @everywhere so worker processes can call it as well.
@everywhere function cos_sim_julia(x::AbstractVector{<:Real}, y::AbstractVector{<:Real})
    # dot/norm come from LinearAlgebra (loaded @everywhere in the setup).
    return dot(x, y) / (norm(x) * norm(y))
end
# Time the threaded pairwise similarity of every row against row 1.
# Fix: `data[1, :]` is loop-invariant but allocates a fresh copy on every
# iteration — materialize the reference row once outside the loop.
reference = data[1, :]
@time Threads.@threads for i in 2:size(data, 1)
    # Result is discarded; this measures call overhead plus the per-row copy.
    cos_sim_julia(reference, data[i, :])
end
# Baseline: a single BLAS dot product of two rows.
@time dot(data[1, :], data[2, :])
# Hand-rolled dot product, kept for comparison against LinearAlgebra.dot:
# the sum of element-wise products over the zipped pair of iterables.
# (prod on a 2-tuple (xi, yi) is exactly xi * yi.)
function _dot(x, y)
    return sum(prod, zip(x, y))
end
# Time the hand-rolled dot product for comparison with the BLAS call above.
@time _dot(data[1,:], data[2,:])
# Hand-rolled Euclidean norm built on `_dot`, kept for comparison against
# LinearAlgebra.norm. Generalized from Vector{Float64} to
# AbstractVector{<:Real} (backward compatible) so views and other real
# element types dispatch too.
function _norm(x::AbstractVector{<:Real})
    return sqrt(_dot(x, x))
end
# Compare the library norm against the hand-rolled one on a single row.
@time norm(data[1,:])
@time _norm(data[1,:])
# Cosine similarity using the hand-rolled `_dot`/`_norm` kernels instead of
# LinearAlgebra. Generalized from Vector{Float64} to AbstractVector{<:Real}
# (backward compatible).
# NOTE(review): `_dot` and `_norm` are NOT defined with @everywhere, so this
# method would fail on worker processes; it only works here because the
# benchmarks use Threads.@threads, not Distributed. Define the kernels
# @everywhere too if distributed execution is ever intended.
@everywhere function cos_sim_julia2(x::AbstractVector{<:Real}, y::AbstractVector{<:Real})
    return _dot(x, y) / (_norm(x) * _norm(y))
end
# Time the threaded pairwise similarity using the hand-rolled kernels.
# Fix: `data[1, :]` is loop-invariant but allocates a new copy each
# iteration — copy the reference row once before the loop.
ref_row = data[1, :]
@time Threads.@threads for i in 2:size(data, 1)
    # Result is discarded; this measures call overhead plus the per-row copy.
    cos_sim_julia2(ref_row, data[i, :])
end