Tlatwork
Tlatwork

Reputation: 1525

Memory efficient creation of sparse matrix

I have a list of 50000 string vectors, consisting of various combinations of 6000 unique strings.

Goal: I want to transform them in "relative frequencies" (table(x)/length(x)) and store them in a sparse matrix. Low memory consumption is more important than speed. Currently memory is the bottleneck. (Even though source data has about ~50 mb and data in target format ~10mb --> Transformation seems to be inefficient,...)

Generate sample data

dims <- c(50000, 6000)
nms <- paste0("A", 1:dims[2])
lengths <- sample(5:30, dims[1], replace = T)
data <- lapply(lengths, sample, x = nms, replace = T)

Possible attempts:

1) sapply() with simplify to sparse matrix?

library(Matrix)  
sparseRow <- function(stringVec){
  relFreq <- c(table(factor(stringVec, levels = nms)) / length(stringVec))
  Matrix(relFreq, 1, dims[2], sparse = TRUE)
}
sparseRows <- sapply(data[1:5], sparseRow)
sparseMat <- do.call(rbind, sparseRows)

Problem: My bottleneck seems to be the sparseRows as the rows are not directly combined to a sparse matrix. (If i run the code above on the full sample, i get an Error: cannot allocate vector of size 194 Kb Error during wrapup: memory exhausted (limit reached?) - my hardware has 8 GB RAM.)

Obviously there is more memory consumption for creating a list of rows, before combining them instead of filling the sparse matrix directly. --> so using (s/l)apply is not memory friendly in my case?

object.size(sparseRows)
object.size(sparseMat)

2) Dirty workaround(?)

My goal seems to be to create an empty sparse matrix and fill it row wise. Below is a dirty way to do it (which works on my hardware).

indxs <- lapply(data, function(data) sapply(data, function(x) which(x == nms), 
   USE.NAMES = FALSE))
relFreq <- lapply(indxs, function(idx) table(idx)/length(idx))

mm <- Matrix(0, nrow = dims[1], ncol = dims[2])
for(idx in 1:dims[1]){
  mm[idx, as.numeric(names(relFreq[[idx]]))] <- as.numeric(relFreq[[idx]])
}
#sapply(1:dims[1], function(idx) mm[idx, 
#     as.numeric(names(relFreq[[idx]]))] <<- as.numeric(relFreq[[idx]]))

I would like to ask if there is a more elegant/efficient way to achieve that with lowest amount of RAM possible.

Upvotes: 18

Views: 1532

Answers (2)

minem
minem

Reputation: 3650

I would convert to data.table and then do the necessary calculations:

ld <- lengths(data)
D <- data.table(val = unlist(data),
                id = rep(1:length(data), times = ld),
                Ntotal = rep(ld, times = ld))
D <- D[, .N, keyby = .(id, val, Ntotal)]
D[, freq := N/Ntotal]
ii <- data.table(val = nms, ind = seq_along(nms))
D <- ii[D, on = 'val']
sp <- with(D, sparseMatrix(i = id, j = ind, x = freq,
                           dims = c(max(id), length(nms))))

Benchmarks for n = 100

data2 <- data[1:100]
Unit: milliseconds
      expr        min         lq       mean    median        uq        max neval cld
        OP 102.150200 106.235148 113.117848 109.98310 116.79734 142.859832    10  b 
  F. Privé 122.314496 123.804442 149.999595 126.76936 164.97166 233.034447    10   c
     minem   5.617658   5.827209   6.307891   6.10946   6.15137   9.199257    10 a  
 user20650  11.012509  11.752350  13.580099  12.59034  14.31870  21.961725    10 a  

Benchmarks on all data

Lets benchmark 3 of the fastest functions, because rest of them (OP's, user20650_v1 and F.Privé's) would be to slow on all of the data.

user20650_v2 <- function(x) {
  dt2 = data.table(lst = rep(1:length(x), lengths(x)),
                   V1 = unlist(x))
  dt2[, V1 := factor(V1, levels = nms)]
  x3 = xtabs(~ lst + V1, data = dt2, sparse = TRUE)
  x3/rowSums(x3)
}
user20650_v3 <- function(x) {
  x3 = xtabs(~ rep(1:length(x), lengths(x)) + factor(unlist(x), levels = nms),
             sparse = TRUE)
  x3/rowSums(x3)
}
minem <- function(x) {
  ld <- lengths(x)
  D <- data.table(val = unlist(x), id = rep(1:length(x), times = ld),
                  Ntotal = rep(ld, times = ld))
  D <- D[, .N, keyby = .(id, val, Ntotal)]
  D[, freq := N/Ntotal]
  ii <- data.table(val = nms, ind = seq_along(nms))
  D <- ii[D, on = 'val']
  sparseMatrix(i = D$id, j = D$ind, x = D$freq,
               dims = c(max(D$id), length(nms)))
}

Compare the results of minem and user20650_v3:

x1 <- minem(data)
x2 <- user20650_v3(data)
all.equal(x1, x2)
# [1] "Component “Dimnames”: names for current but not for target"             
# [2] "Component “Dimnames”: Component 1: target is NULL, current is character"
# [3] "Component “Dimnames”: Component 2: target is NULL, current is character"
# [4] "names for target but not for current"  

x2 has additional names. remove them:

dimnames(x2) <- names(x2@x) <- NULL
all.equal(x1, x2)
# [1] TRUE # all equal

Timings:

x <- bench::mark(minem(data),
            user20650_v2(data),
            user20650_v3(data),
            iterations = 5, check = F)
as.data.table(x)[, 1:10]

#            expression   min  mean median   max  itr/sec mem_alloc n_gc n_itr total_time
# 1:        minem(data) 324ms 345ms  352ms 371ms 2.896187     141MB    7     5      1.73s
# 2: user20650_v2(data) 604ms 648ms  624ms 759ms 1.544380     222MB   10     5      3.24s
# 3: user20650_v3(data) 587ms 607ms  605ms 633ms 1.646977     209MB   10     5      3.04s

relating memory:

OPdirty <- function(x) {
  indxs <- lapply(x, function(x) sapply(x, function(x) which(x == nms), 
                                        USE.NAMES = FALSE))
  relFreq <- lapply(indxs, function(idx) table(idx)/length(idx))
  dims <- c(length(indxs), length(nms))
  mm <- Matrix(0, nrow = dims[1], ncol = dims[2])
  for (idx in 1:dims[1]) {
    mm[idx, as.numeric(names(relFreq[[idx]]))] <- as.numeric(relFreq[[idx]])
  }
  mm
}


xx <- data[1:1000]
all.equal(OPdirty(xx), minem(xx))
# true
x <- bench::mark(minem(xx),
                 FPrive(xx),
                 OPdirty(xx),
                 iterations = 3, check = T)
as.data.table(x)[, 1:10]
    expression     min    mean  median     max    itr/sec mem_alloc n_gc n_itr total_time
1:   minem(xx) 12.69ms 14.11ms 12.71ms 16.93ms 70.8788647    3.04MB    0     3    42.33ms
2:  FPrive(xx)   1.46s   1.48s   1.47s   1.52s  0.6740317  214.95MB    4     3      4.45s
3: OPdirty(xx)   2.12s   2.14s   2.15s   2.16s  0.4666106  914.91MB    9     3      6.43s

See column mem_alloc...

Upvotes: 13

F. Priv&#233;
F. Priv&#233;

Reputation: 11738

Use a loop to fill a pre-allocated sparse matrix column-wise (and then transpose it):

res <- Matrix(0, dims[2], length(data), sparse = TRUE)
for (i in seq_along(data)) {
  ind.match <- match(data[[i]], nms)
  tab.match <- table(ind.match)
  res[as.integer(names(tab.match)), i] <- as.vector(tab.match) / length(data[[i]])
}
# Verif
stopifnot(identical(t(res), sparseMat))

Benchmark:

data2 <- data[1:50]
microbenchmark::microbenchmark(
  OP = {
    sparseMat <- do.call(rbind, sapply(data2, sparseRow))
  },
  ME = {
    res <- Matrix(0, dims[2], length(data2), sparse = TRUE)
    for (i in seq_along(data2)) {
      ind.match <- match(data2[[i]], nms)
      tab.match <- table(ind.match)
      res[as.integer(names(tab.match)), i] <- as.vector(tab.match) / length(data2[[i]])
    }
    res2 <- t(res)
  }
)
stopifnot(identical(res2, sparseMat))

Unit: milliseconds
 expr      min       lq     mean   median       uq       max neval cld
   OP 56.28020 59.61689 63.24816 61.16986 62.80294 206.18689   100   b
   ME 46.60318 48.27268 49.77190 49.50714 50.92287  55.23727   100  a 

So, it's memory-efficient and not that slow.

Upvotes: 7

Related Questions