Julia: Avoid memory allocation due to nested function calls in for loop

Question

I've seen multiple questions addressing memory allocation in Julia in general, however none of these examples helped me. I provide a minimal example that shall illustrate my problem. I implemented a finite volume solver that computes the solution of an advection equation. Long story short here the (self contained) code:

function dummyexample()
    nx = 100
    Δx = 1.0/nx
    x = range(Δx/2.0, length=nx, step=Δx) 
    ρ = sin.(2π*x)
    for i=1:floor(1.0/Δx / 0.5)
        shu_osher_step!(ρ)      # This part is executed several times
    end
    println(sum(Δx*abs.(ρ .- sin.(2π*x))))
end

function shu_osher_step!(ρ::AbstractArray)
    ρ₁ = euler_step(ρ)                         # array allocation
    ρ₂ = 3.0/4.0*ρ .+ 1.0/4.0*euler_step(ρ₁)   # array allocation
    ρ .= 1.0/3.0*ρ .+ 2.0/3.0*euler_step(ρ₂)   # array allocation 
end

function euler_step(ρ::AbstractArray)
    return ρ .+ 0.5*rhs(ρ) 
end

function rhs(ρ::AbstractArray)
    ρₗ  = circshift(ρ,+1)                  # array allocation
    ρᵣ  = circshift(ρ,-1)                  # array allocation
    Δρₗ = ρ.-ρₗ                            # array allocation
    Δρᵣ = ρᵣ .-ρ                           # array allocation
    vᵣ  = ρ .+ 1.0/2.0 .* H(Δρₗ,Δρᵣ)       # array allocation
    return  -(vᵣ .- circshift(vᵣ,+1))      # array allocation
end

function H(Δρₗ::AbstractArray,Δρᵣ::AbstractArray)
    σ = Δρₗ ./ Δρᵣ
    σ̃ = max.(abs.(σ),1e-12) .* (2.0 .* (σ .>= 0.0) .- 1.0)
    for i=1:100
        if isnan(σ̃[i])
            σ̃[i] = 1e-12
        end
    end
    return Δρₗ .* (2.0/3.0*(1.0 ./ σ̃) .+ 1.0/3.0)
end

My problem is, that deep down in the call tree the function rhs allocates several arrays in every iteration of the most upper time loop. These arrays are temporary and I do not like the fact that they have to be reallocated every iteration. Here the output from @time:

julia> include("dummyexample.jl");

julia> @time dummyexample()
8.780349744014917e-5       # <- just to check that the error is almost zero
  0.362833 seconds (627.38 k allocations: 39.275 MiB, 1.95% gc time)

Now in the real code, there is actually a struct p passed down the whole calltree that contains attributes which I hardcoded here (basically every of the explicitly stated numbers would be referenced by p.n, etc.) I could probably also pass down preallocated arrays like but that seems to get messy and I would have to change that every time I want to do extra computations. Global arrays are discouraged in the Julia documentation but wouldn't that do the trick here? Are there any other obvious things I am missing? I am considering Julia 1.0.

phipsgabler · Accepted Answer

Passing down preallocated arrays, as you say in the last paragraph, is exactly the right thing in this kind of situation. Additional to that, I would devectorize the code into a manual loop containing a stencil and more indexing math instead of circshift.

Applying both ideas results in the following:

function dummyexample()
    nx = 100
    Δx = 1.0 / nx
    steps = 2 ÷ Δx

    x = range(Δx ÷ 2, length = nx, step = Δx)
    ρ = sin.(2π .* x)

    run!(ρ, steps)
    println(sum(@. Δx * abs(ρ - sin(2π * x))))
end

function run!(ρ, steps)
    ρ₁, ρ₂, v = similar(ρ), similar(ρ), similar(ρ)

    for i = 1:steps
        shu_osher_step!(ρ₁, ρ₂, v, ρ)
    end

    return ρ
end

function shu_osher_step!(ρ₁, ρ₂, v, ρ)
    euler_step!(ρ₁, v, ρ)
    ρ₂ .= 3.0/4.0 .* ρ .+ 1.0/4.0 .* euler_step!(ρ₂, v, ρ₁)
    ρ .= 1.0/3.0 .* ρ .+ 2.0/3.0 .* euler_step!(ρ, v, ρ₂)
end

function euler_step!(ρₒ, v, ρ)
    cycle(i) = mod(i - 1, length(ρ)) + 1

    # two steps of calculating v fused into one -- could be replaced by 
    # an extra loop for v.
    for I in 1:2:size(ρ, 1)
        v[I] = rhs(ρ[cycle(I-1)], ρ[I], ρ[cycle(I+1)])
        v[cycle(I+1)] = rhs(ρ[cycle(I)], ρ[I+1], ρ[cycle(I+2)])
        ρₒ[I] += 0.5 * (v[cycle(I+1)] - v[I])
    end

    return ρₒ
end

function rhs(ρₗ, ρᵢ, ρᵣ)
    Δρₗ = ρᵢ - ρₗ
    Δρᵣ = ρᵣ - ρᵢ
    return ρᵢ + 1/2 * H(Δρₗ, Δρᵣ)
end

function H(Δρₗ, Δρᵣ)
    σ = Δρₗ / Δρᵣ
    σ̃ = max(abs(σ), 1e-12) * (2.0 * (σ >= 0.0) - 1.0)
    isnan(σ̃) && (σ̃ = 1e-12)
    return Δρₗ * (2.0 / 3.0 * (1.0 / σ̃) + 1.0 / 3.0)
end

The above might still contain some logic errors due to my lack of domain knowledge (dummyexample() prints 0.02984422033942575), but you see the pattern. And it benchmarks well:

julia> @benchmark run!($ρ, $steps)
BenchmarkTools.Trial: 
  memory estimate:  699.13 KiB
  allocs estimate:  799
  --------------
  minimum time:     3.024 ms (0.00% GC)
  median time:      3.164 ms (0.00% GC)
  mean time:        3.760 ms (1.69% GC)
  maximum time:     57.105 ms (94.41% GC)
  --------------
  samples:          1327
  evals/sample:     1

Julia: Avoid memory allocation due to nested function calls in for loop

Answers (1)

Related Questions