coolsv
coolsv

Reputation: 781

Merge large number of arrays by common column values in julia

Expanding on a previous question I asked here, suppose we have a large number of arrays (say 500 arrays), like the first three shown below:

5.0 3.5
6.0 3.6
7.0 3.0

5.0 4.5
6.0 4.7
8.0 3.0

5.0 4.0
6.0 3.2
8.0 4.0

and so on, stored in one array, so that we have an array of 500 arrays of the type above. I want to merge the 500 arrays into one array, by common values of the first column, calculating the mean values of the corresponding elements of the second column. The result must be the following array:

5.0 mean of all 5's values
6.0 mean of all 6's values
7.0 mean of all 7's values
8.0 mean of all 8's values

How can I achieve that? Thank you!

Upvotes: 1

Views: 173

Answers (2)

DNF
DNF

Reputation: 12653

Here's a version that is ~6 times faster than the answer from @PicaudVincent (based on his input data), but which doesn't sort the keys, so the rows of the returned matrix are in arbitrary order:

"""
    accumarrays(A::Vector{Matrix{T}}) where {T}

Merge the matrices in `A` by the values found in their first column and return a
two-column matrix: column 1 holds each distinct key, column 2 holds the mean of all
second-column entries that share that key.

The keys are collected in a `Dict`, so the rows of the result are in arbitrary order.
The element type of the result is the type obtained by dividing a `T` by an integer
(e.g. `Float64` for `Int` or `Float64` input). An empty `A` yields a `0×2` matrix.
"""
function accumarrays(A::Vector{Matrix{T}}) where {T}
    # key => (running sum of the second column, number of occurrences)
    d = Dict{T, Tuple{T, Int}}()
    for a in A
        # `axes(a, 1)` replaces `indices(a, 1)`, which was removed in Julia 1.0.
        for i in axes(a, 1)
            ai = a[i, 1]
            d[ai] = get(d, ai, (zero(T), 0)) .+ (a[i, 2], 1)
        end
    end
    # Uninitialized arrays must be requested explicitly with `undef` since Julia 1.0.
    Aout = Matrix{typeof(one(T) / 1)}(undef, length(d), 2)
    for (row, (key, val)) in enumerate(d)
        Aout[row, 1] = key
        Aout[row, 2] = val[1] / val[2]
    end
    return Aout
end

If you need the rows to be sorted, this works, but is just 4-5 times faster:

"""
    accumarrays_(A::Vector{Matrix{T}}) where {T}

Merge the matrices in `A` by the values found in their first column and return a
two-column matrix whose rows are sorted by key: column 1 holds each distinct key,
column 2 holds the mean of all second-column entries that share that key.

The element type of the result is the type obtained by dividing a `T` by an integer
(e.g. `Float64` for `Int` or `Float64` input). An empty `A` yields a `0×2` matrix.
"""
function accumarrays_(A::Vector{Matrix{T}}) where {T}
    # key => (running sum of the second column, number of occurrences)
    d = Dict{T, Tuple{T, Int}}()
    for a in A
        # `axes(a, 1)` replaces `indices(a, 1)`, which was removed in Julia 1.0.
        for i in axes(a, 1)
            ai = a[i, 1]
            d[ai] = get(d, ai, (zero(T), 0)) .+ (a[i, 2], 1)
        end
    end
    # Sorting the keys up front fixes the row order of the output.
    dkeys = sort!(collect(keys(d)))
    # Uninitialized arrays must be requested explicitly with `undef` since Julia 1.0.
    Aout = Matrix{typeof(one(T) / 1)}(undef, length(dkeys), 2)
    for (row, key) in enumerate(dkeys)
        total, n = d[key]
        Aout[row, 1] = key
        Aout[row, 2] = total / n
    end
    return Aout
end

Upvotes: 1

Picaud Vincent
Picaud Vincent

Reputation: 10982

Also back with a slight modification of https://stackoverflow.com/a/50842721/2001017

"""
    aggregate(m::Array{<:Array{<:Number,2},1})

Stack all matrices in `m` vertically, group the rows by the value in the first
column, and return one row per distinct key (sorted ascending) whose remaining
columns hold the mean of that group's values.

The result's element type is the type obtained by dividing an input element by an
integer (e.g. `Float64` for `Int` or `Float64` input), so non-integer means of
integer data can be stored. The input matrices are not modified.

# Throws
- `ArgumentError` if `m` is empty.
"""
function aggregate(m::Array{<:Array{<:Number,2},1})
    isempty(m) && throw(ArgumentError("aggregate requires at least one matrix"))

    # `sortrows` was removed in Julia 1.0; `sortslices(...; dims=1)` sorts the rows
    # lexicographically instead. `reduce(vcat, m)` avoids splatting a large collection.
    stacked = sortslices(reduce(vcat, m); dims=1)

    # Promote to the division result type so that means fit into the output matrix.
    R = typeof(one(eltype(stacked)) / 1)
    result = convert(Matrix{R}, stacked)

    n = size(result, 1)
    if n <= 1
        return result
    end

    vals = 2:size(result, 2)    # value columns (everything except the key column)
    key_idx = 1                 # output row currently being accumulated
    key = result[key_idx, 1]
    count = 1                   # number of input rows merged into `key_idx` so far

    for i in 2:n
        if key == result[i, 1]
            # Same key: accumulate into the current output row.
            result[key_idx, vals] .+= view(result, i, vals)
            count += 1
        else
            # New key: finish the mean of the previous group, then start the next
            # group in the following output row (compacting the matrix in place).
            result[key_idx, vals] ./= count
            count = 1
            key = result[i, 1]
            key_idx += 1
            result[key_idx, 1] = key
            result[key_idx, vals] .= view(result, i, vals)
        end
    end

    # Finish the mean of the last group.
    result[key_idx, vals] ./= count

    return result[1:key_idx, :]
end

Demo:

# Sample input: three 3×2 matrices (column 1 = key, column 2 = value),
# gathered into one vector of matrices.
x = [5.0 3.5; 6.0 3.6; 7.0 3.0]
y = [5.0 4.5; 6.0 4.7; 8.0 3.0]
z = [5.0 4.0; 6.0 3.2; 8.0 4.0]

a = [x, y, z]

julia> a
3-element Array{Array{Float64,2},1}:
 [5.0 3.5; 6.0 3.6; 7.0 3.0]
 [5.0 4.5; 6.0 4.7; 8.0 3.0]
 [5.0 4.0; 6.0 3.2; 8.0 4.0]

julia> aggregate(a)
4×2 Array{Float64,2}:
 5.0  4.0    
 6.0  3.83333
 7.0  3.0    
 8.0  3.5

Upvotes: 2

Related Questions