Reputation: 55
I have recently started using Julia to speed up some code previously written in Python. I only have prior experience with Python, so this is my first time caring about performance, and I have found some strange behavior when looping over an array of structs. I define a new struct Gaussian, which represents a 2D Gaussian function, and a function intensity() which calculates the amplitude of the function at a given position:
struct Gaussian{T<:Float32}
    x0::T
    y0::T
    A::T
    a::T
    b::T
    c::T
end
function intensity(
    model::Gaussian,
    x::Float32,
    y::Float32
)
    gaussian_value::Float32 = model.A*exp(
        -(
            model.a * (x - model.x0)^2 +
            2 * model.b * (x - model.x0) * (y - model.y0) +
            model.c * (y - model.y0)^2
        )
    )
    return gaussian_value
end
Then, I make an array of 2000 random instances of Gaussian:
function build_array()
    length = 2000
    random_pos = [rand(Float32, (1, 2)) for i in 1:length]
    random_A = rand(Float32, (length, 1))
    random_a = rand(Float32, (length, 1))
    random_b = rand(Float32, (length, 1))
    random_c = rand(Float32, (length, 1))
    gaussians::Array{Gaussian} = []
    for (pos, A, a, b, c) in zip(
        random_pos,
        random_A,
        random_a,
        random_b,
        random_c
    )
        new_gaussian = Gaussian(pos..., A, a, b, c)
        push!(gaussians, new_gaussian)
    end
    return gaussians
end
gaussians = build_array()
When I benchmark a single call to the intensity() function, it takes ~100 ns with 1 allocation (makes sense). I would therefore expect that looping over the array of Gaussians should take 2000 × 100 ns = 200 μs. However, it actually takes about twice as long:
function total_intensity1(gaussian_list::Array{Gaussian})
    total = sum(intensity.(gaussian_list, Float32(0.75), Float32(0.11)))
end
function total_intensity2(gaussian_list::Array{Gaussian})
    total::Float32 = 0.
    for gaussian in gaussian_list
        total += intensity(gaussian, Float32(0.75), Float32(0.11))
    end
    return total
end
@btime sum(intensity.(gaussians, Float32(0.75), Float32(0.11)))
@btime begin
    total::Float32 = 0.
    for gauss in gaussians
        total += intensity(gauss, Float32(0.75), Float32(0.11))
    end
    total
end
@btime total_intensity1(gaussians)
@btime total_intensity2(gaussians)
397.700 μs (16004 allocations: 258.02 KiB)
285.800 μs (8980 allocations: 234.06 KiB)
396.100 μs (16002 allocations: 257.95 KiB)
396.700 μs (16001 allocations: 250.02 KiB)
The number of allocations is also much larger than I would expect, and there is a difference between the second and fourth methods even though the code is pretty much the same. My questions:
EDIT: For reference, I ended up changing my code to the following:
struct Gaussian
    x0::Float32
    y0::Float32
    A::Float32
    a::Float32
    b::Float32
    c::Float32
end
function build_array()
    N = 2000
    random_pos = [rand(Float32, (1, 2)) for i in 1:N]
    random_A = rand(Float32, N)
    random_a = rand(Float32, N)
    random_b = rand(Float32, N)
    random_c = rand(Float32, N)
    gaussians = Gaussian[]
    for (pos, A, a, b, c) in zip(
        random_pos,
        random_A,
        random_a,
        random_b,
        random_c
    )
        new_gaussian = Gaussian(pos..., A, a, b, c)
        push!(gaussians, new_gaussian)
    end
    return gaussians
end
gaussians = build_array()
function intensity(
    model::Gaussian,
    x,
    y
)
    (; x0, y0, A, a, b, c) = model
    A*exp(-(a * (x - x0)^2 + 2 * b * (x - x0) * (y - y0) + c * (y - y0)^2))
end
function total_intensity(gaussian_list::Vector{<:Gaussian})
    total = sum(g -> intensity(g, Float32(0.75), Float32(0.11)), gaussian_list)
end
@btime total_intensity($gaussians)
Which runs much faster:
10.900 μs (0 allocations: 0 bytes)
Thank you to Nils Gudat and DNF for their suggestions!
Upvotes: 1
Views: 364
Reputation: 12654
TLDR version: Vector{Gaussian} should be Vector{Gaussian{Float32}}.
Your struct definition Gaussian{T<:Float32} is somewhat nonsensical. Float32 cannot have any subtypes, so T can only be Float32. Therefore, either remove the restriction, replace it with something else (e.g. Real), or take away the type parameter entirely.
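You can verify this directly in a REPL (a quick sketch; subtypes lives in the InteractiveUtils standard library, which is only auto-loaded in the REPL):

```julia
using InteractiveUtils  # provides subtypes() outside the REPL

isconcretetype(Float32)     # true: concrete types cannot have subtypes
isempty(subtypes(Float32))  # true: so T <: Float32 forces T == Float32
```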
This is bad:
gaussians::Array{Gaussian} = []
It creates a Vector{Any}, which is then converted to a Vector{Gaussian}. Worse, a Vector{Gaussian} is not a Vector{Gaussian{Float32}}. So either remove the type parameter entirely, or make sure to use it:
# good:
gaussians = Vector{Gaussian{Float32}}()
gaussians = Gaussian{Float32}[] # same as above
# bad
gaussians = Vector{Gaussian}()
# very bad, don't use this style, put types on the right hand side when constructing.
gaussians::Vector{Gaussian} = []
The same applies here; this is bad style:
total::Float32 = 0.
Do this instead
total = Float32(0.0)
# or use Float32 literal
total = 0.0f0
# or the generic way
total = zero(Float32)
In dynamic languages, types belong to values, not to variables.
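A quick way to see which literal produces which type (REPL sketch):

```julia
typeof(0.)             # Float64: a bare float literal is always Float64
typeof(0.0f0)          # Float32: the f0 suffix gives a Float32 literal
typeof(zero(Float32))  # Float32: the generic way
```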
BTW, you'll have to modify some of your function definitions:
total_intensity1(gaussian_list::Array{Gaussian})
should be
total_intensity1(gaussian_list::Array{<:Gaussian})
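The reason is that Julia's parametric types are invariant in their parameters. Assuming a parametric Gaussian{T} like the one in the question (with Real instead of Float32), the subtype relations look like this:

```julia
struct Gaussian{T<:Real}
    x0::T; y0::T; A::T; a::T; b::T; c::T
end

Gaussian{Float32} <: Gaussian                    # true
Vector{Gaussian{Float32}} <: Vector{Gaussian}    # false: parameters are invariant
Vector{Gaussian{Float32}} <: Vector{<:Gaussian}  # true: matches any parameter
```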
There's more, but this is a start.
Edit: OK, a few more things:
rand(Float32, (length, 1))
length is a super important function in Base, so it's normally good not to shadow it like this. Also, make vectors instead of matrices:
rand(Float32, (N, 1)) # this is an Nx1 matrix
rand(Float32, N) # this is a length-N vector
push!(gaussians, new_gaussian)
This iteratively resizes the vector over and over. When you know the final size of the vector in advance, as in your case, it is better to pre-allocate:
gaussians = Vector{Gaussian{Float32}}(undef, 2000)
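If you'd rather keep the push! loop, sizehint! is a middle ground: it reserves capacity up front so push! does not repeatedly reallocate (a sketch, using a minimal stand-in for the struct above):

```julia
struct Gaussian{T<:Real}  # minimal stand-in for the struct discussed above
    x0::T; y0::T; A::T; a::T; b::T; c::T
end

gaussians = Gaussian{Float32}[]
sizehint!(gaussians, 2000)  # reserve capacity so push! doesn't keep reallocating
for _ in 1:2000
    push!(gaussians, Gaussian(ntuple(_ -> rand(Float32), 6)...))
end
```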
You can avoid an unnecessary allocation here:
total = sum(intensity.(gaussian_list, Float32(0.75), Float32(0.11)))
like this:
total = sum(g->intensity(g, 0.75f0, 0.11f0), gaussian_list)
Explanation: sum(f.(x))
first creates the array f.(x)
, then sums it, while sum(f, x)
just applies f
to each element before adding it to the sum.
Here's an implementation with benchmarks:
struct Gaussian{T<:Real}
    x0::T
    y0::T
    A::T
    a::T
    b::T
    c::T
end
Gaussian(x::Real...) = Gaussian(promote(x...)...)
function intensity(model::Gaussian, x::Real, y::Real)
    val = model.A * exp(
        -(
            model.a * (x - model.x0)^2 +
            2 * model.b * (x - model.x0) * (y - model.y0) +
            model.c * (y - model.y0)^2
        )
    )
    return val
end
function build_array(N=2000)
    return [Gaussian(ntuple(_->rand(Float32), 6)...) for _ in 1:N]
end
Benchmarks (remember to interpolate variables, and avoid global scope):
julia> gaussians = build_array(2000);
julia> @btime sum(intensity.($gaussians, Float32(0.75), Float32(0.11)))
14.600 μs (1 allocation: 7.94 KiB)
947.5305f0
julia> @btime sum(g->intensity(g, 0.75f0, 0.11f0), $gaussians)
12.600 μs (0 allocations: 0 bytes)
947.5309f0
There's a slight difference in the final sums, since summing a vector directly uses a numerically superior method called pairwise summation.
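You can observe the effect directly by comparing sum (pairwise) against a strict left-to-right foldl (a sketch; the exact discrepancy depends on the random data):

```julia
xs = rand(Float32, 1_000_000)
s_pairwise = sum(xs)    # pairwise summation: roundoff grows roughly like O(log n)
s_naive = foldl(+, xs)  # sequential accumulation: roundoff grows roughly like O(n)
# For Float32 at this size, the two usually agree only in the leading digits.
abs(s_pairwise - s_naive)
```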
Final bonus: Try Tullio.jl, which also uses multithreading. It doesn't make any difference for 2000 elements, but it does for larger arrays (using 12 threads here):
julia> using Tullio, LoopVectorization
julia> gaussians = build_array(200_000);
julia> @btime sum(g->intensity(g, 0.75f0, 0.11f0), $gaussians)
1.228 ms (0 allocations: 0 bytes)
92722.7f0
julia> @btime @tullio s := intensity($gaussians[i], 0.75f0, 0.11f0)
330.100 μs (197 allocations: 11.53 KiB)
92722.79f0
Upvotes: 2
Reputation: 13800
I don't have time to figure this one out in detail unfortunately, but the first thing I'd say is to check whether this is a benchmarking artifact: gaussians is a global variable, which should be interpolated into the benchmark using $.
As for your function, the type annotations are not doing anything for performance here, and they will make your function less composable (e.g. you won't be able to autodiff through it, given that you're restricting everything to Float32).
Here's how I would write it:
function intensity(m, x, y)
    (; x0, y0, A, a, b, c) = m  # destructure the input (names must match the struct's fields)
    A * exp(-(a * (x - x0)^2 + 2b * (x - x0) * (y - y0) + c * (y - y0)^2))
end
With that I'm getting:
231.100 μs (12001 allocations: 195.44 KiB)
231.500 μs (12001 allocations: 195.44 KiB)
229.200 μs (12000 allocations: 187.50 KiB)
229.300 μs (12000 allocations: 187.50 KiB)
which is about 100μs faster than your original version on my machine.
Upvotes: 1