Reputation: 880
I'm trying to learn parallel computing in Julia by running this code (found [here][1]):
using Distributed
addprocs(2, exeflags="--project=.")

@everywhere begin
    using Distributed
    using StatsBase
    using BenchmarkTools
end

data = rand(1000, 2000)

@everywhere function t2(d1, d2)
    append!(d1, d2)
    d1
end

@btime begin
    res = @distributed (t2) for col = 1:size(data)[2]
        [(myid(), col, StatsBase.mean(data[:, col]))]
    end
end
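As I understand it, `@distributed (t2) for ...` evaluates each iteration on a worker and folds the resulting one-element vectors together with `t2`, so serially the equivalent would be something like this (my own illustration, not from the linked post):

# serial equivalent of the @distributed (t2) reduction, for illustration only
res = reduce(t2, [[(myid(), col, StatsBase.mean(data[:, col]))] for col = 1:size(data)[2]])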
Running the `@btime` block on my laptop with 4 cores and 8 threads (2.21 GHz) gives:
11.836 ms (182 allocations: 78.06 KiB)
But when I try to scale up by adding 2 more workers, the timing doesn't seem to improve:
addprocs(2, exeflags="--project=.")
nworkers() # result 4
@everywhere begin
    using Distributed
    using StatsBase
    using BenchmarkTools
end

data = rand(1000, 2000)

@everywhere function t2(d1, d2)
    append!(d1, d2)
    d1
end

@btime begin
    res = @distributed (t2) for col = 1:size(data)[2]
        [(myid(), col, StatsBase.mean(data[:, col]))]
    end
end
The computing time with 4 workers is:
15.449 ms (340 allocations: 132.34 KiB)
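To check that the work really is being spread out, I reran the loop outside `@btime` and counted how many columns each worker handled:

res = @distributed (t2) for col = 1:size(data)[2]
    [(myid(), col, StatsBase.mean(data[:, col]))]
end
countmap(first.(res))  # with 4 workers: e.g. Dict(2 => 500, 3 => 500, 4 => 500, 5 => 500)

The columns are split evenly, and each column's mean is cheap to compute, so I suspect communication overhead dominates the timing, but I don't know how to confirm that.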
Do you have any idea what I'm doing wrong? Thank you!

[1]: @distributed seems to work, function return is wonky
Upvotes: 3
Views: 390
Reputation: 158
Here is an example that parallelizes with threads over the groups of a GroupedDataFrame:
using DataFrames
using CSV

function main(input_file::String, output_file::String, by_index::Array{Symbol,1})
    data = DataFrame(CSV.File(input_file))
    # make sure the output column exists before the threads write into it
    data[!, :p] = zeros(nrow(data))
    grouped_rows = groupby(data, by_index)
    # each group covers a disjoint set of rows, so the threads don't race
    Threads.@threads for group in collect(SubDataFrame, grouped_rows)
        index_value = group[1, by_index]
        println(index_value)
        # compute slow function for group of rows in dataframe
        output_vector = costly_function(group)
        # copy vector to elements in the dataframe
        group[:, :p] .= output_vector
    end
    CSV.write(output_file, data)
end
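To call it you need a `costly_function` that returns one value per row of its group. A minimal sketch (the function body, the file names, and the :region/:x columns are made up for illustration):

# hypothetical slow computation: one output value per row of the group
costly_function(group) = group.x ./ sum(group.x)

# assumes input.csv has a grouping column :region and a numeric column :x
main("input.csv", "output.csv", [:region])

Note that Threads.@threads only helps if Julia was started with several threads (e.g. julia --threads=4). Unlike the Distributed approach in the question, the threads share memory, so each group can write its results straight into the parent DataFrame.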
Upvotes: 0