Simd

Reputation: 21243

How to get near optimal parallel efficiency for this simple Julia code?

I have the following simple code:

function hamming4(bits1::Integer, bits2::Integer)
    return count_ones(bits1 ⊻ bits2)
end

function random_strings2(n, N)
    mask = UInt128(1) << n - 1
    return [rand(UInt128) & mask for i in 1:N]
end

function find_min(strings, n, N)
    minsofar = fill(n, Threads.nthreads())
    # minsofar = n
    Threads.@threads for i in 1:N 
    # for i in 1:N
        for j in i+1:N
            dist = hamming4(strings[i], strings[j])
            if dist < minsofar[Threads.threadid()]
                minsofar[Threads.threadid()] = dist
            end
        end
    end
    return minimum(minsofar)
    #return minsofar
end
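One possible source of overhead in the loop above (an assumption, not something established in the thread): every thread writes into neighbouring slots of the shared minsofar array, which can cause false sharing between cores. A minimal sketch of a variant that keeps the running minimum in a thread-local variable and writes the array only once per outer iteration (find_min_local is a hypothetical name, not from the original code):

```julia
# Repeated from the question, so this sketch is self-contained.
function hamming4(bits1::Integer, bits2::Integer)
    return count_ones(bits1 ⊻ bits2)
end

function find_min_local(strings, n, N)
    minsofar = fill(n, Threads.nthreads())
    Threads.@threads for i in 1:N
        tid = Threads.threadid()
        # Keep the minimum in a local variable instead of repeatedly
        # writing to the shared array from the inner loop.
        local_min = minsofar[tid]
        for j in i+1:N
            dist = hamming4(strings[i], strings[j])
            if dist < local_min
                local_min = dist
            end
        end
        minsofar[tid] = local_min
    end
    return minimum(minsofar)
end
```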


function ave_min(n, N)
    ITER = 10
    strings = random_strings2(n, N)
    new_min = find_min(strings, n, N)
    avesofar = new_min
    # print("New min ", new_min, ". New ave ", avesofar, "\n")
    total = avesofar
    for i in 1:ITER-1
        strings = random_strings2(n, N)
        new_min = find_min(strings, n, N)
        avesofar = avesofar*(i/(i+1)) + new_min/(i+1)
        print("Iteration ", i, ". New min ", new_min, ". New ave ", round(avesofar; digits=2), "\n")
    end
    return avesofar
end

N = 2^16
n = 99

print("Overall average ", ave_min(n, N), "\n")

When I run it on an AMD 8350 under Linux, the CPU usage is around 430% (instead of close to 800%).

Is it possible to make the parallelisation work more efficiently?

Also, I noticed a very impressive-looking new package called LoopVectorization.jl. As I am computing the Hamming distance in what looks like a vectorizable way, is it possible to speed up the code this way too?

Can the code be vectorized using LoopVectorization.jl?

(I am completely new to Julia)

Upvotes: 2

Views: 162

Answers (1)

Przemyslaw Szufel
Przemyslaw Szufel

Reputation: 42214

The parallelization of your code seems to be correct.

Most likely you are running it in Atom or another IDE. By default Atom uses only half of the cores (more precisely, only the physical cores, not the logical ones).

E.g. running in Atom on my machine:

julia> Threads.nthreads()
4

What you need to do is explicitly set JULIA_NUM_THREADS.

Windows command line (still assuming 8 logical cores)

set JULIA_NUM_THREADS=8

Linux command line

export JULIA_NUM_THREADS=8
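Alternatively, on Julia 1.5 and later the thread count can be passed directly on the command line, and the setting can be verified in one step (a quick sketch, assuming the julia binary is on PATH):

```shell
# Launch Julia with 8 threads and print the active thread count.
# Requires Julia 1.5+ for the --threads flag.
julia --threads 8 -e 'println(Threads.nthreads())'
```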

After doing that, your code uses 100% of all my cores.

EDIT

After discussion: you can get the run time down to around 20% of the single-threaded time by using Distributed instead of Threads, since this avoids memory sharing.

The code will look more or less like this:

using Distributed
addprocs(8)

@everywhere function hamming4(bits1::Integer, bits2::Integer)
    return count_ones(bits1 ⊻ bits2)
end

function random_strings2(n, N)
    mask = UInt128(1) << n - 1
    return [rand(UInt128) & mask for i in 1:N]
end

function find_min(strings, n, N)
    # Each outer iteration computes the minimum distance from strings[i]
    # to all later strings; the (min) reducer combines these partial
    # results across the worker processes.
    return @distributed (min) for i in 1:N-1
        minimum(hamming4(strings[i], strings[j]) for j in i+1:N)
    end
end

### ... the rest of the code remains unchanged
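For readers new to Distributed, a minimal standalone illustration of the @distributed reducer pattern used above (a toy loop of my own, not from the answer): each iteration's last expression becomes that iteration's value, and the reducer folds them together across workers.

```julia
using Distributed
addprocs(2)  # two worker processes for the demo

# The (min) reducer combines the value of each loop body with `min`,
# so the loop returns the global minimum across all workers.
result = @distributed (min) for i in 1:100
    mod(i, 7)  # toy body; mod(7, 7) == 0, so the reduction yields 0
end
println(result)  # prints 0
```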

Upvotes: 4
