On_an_island
On_an_island

Reputation: 509

Speed up row and column wise operation in R

Using the reprex data below, the operation is straightforward:

  1. For row,i and column,j combination in df, subset the corresponding cluster associated with df[i,j]
  2. Pull a sample size of 1 or size >1 for the Value variable in the cluster_data dataframe from cluster id in step 1.
  3. Repeat steps 1 and 2 until i equals nrow(df)

My working for loop code is posted below. The for loop I wrote works fine for smaller datasets but takes quite a while larger datasets. I imagine there's a much better and faster way (performance based) to code this operation for large datasets but the best I've come up with is to loop through each row / column given that my sampling in Step 2 is conditional on the value corresponding to the row / column index from Step 1. I would like someone to suggest a faster approach than my for loop solution. I don't have any requirements on whether it needs to be base R or rely on other libraries. I'm mainly after the fastest achievable solution.

My actual dataset can range anywhere from 500,000 rows to 2 million rows and the number of clusters varies quite a bit but doesn't typically exceed more than 50. It is likely, however, that I may have datasets that have more than 50 clusters.

Sample dataframe:

df <- structure(list(V1 = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 
8, 7, 8, 7, 8, 7, 8, 8, 8, 7, 7, 7, 8, 7, 8, 8, 7, 7, 7, 7, 7, 
7, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 8, 7, 7, 7, 7, 8, 8, 8, 
7, 8, 4, 7, 8, 7, 3, 8, 8, 7, 8, 7, 7, 7, 7, 8, 8, 7, 7, 7, 7, 
7, 8, 7, 8, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 8, 8, 7, 7, 7, 7, 7, 
7, 7, 7), V2 = c(7, 6, 7, 4, 4, 7, 7, 7, 7, 7, 3, 3, 4, 7, 7, 
7, 6, 7, 4, 7, 6, 7, 6, 4, 7, 7, 7, 6, 6, 6, 6, 7, 7, 7, 4, 8, 
7, 7, 7, 6, 7, 7, 8, 6, 6, 7, 3, 7, 6, 7, 7, 6, 6, 7, 7, 7, 7, 
6, 4, 6, 7, 6, 7, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 4, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 4, 7, 7, 6, 7, 7, 6, 4, 7, 7, 7, 7, 6, 
7), V3 = c(7, 7, 7, 4, 7, 7, 7, 7, 7, 4, 7, 7, 4, 4, 4, 4, 7, 
7, 7, 6, 4, 7, 7, 4, 7, 7, 7, 4, 7, 7, 4, 6, 4, 4, 7, 4, 4, 7, 
7, 7, 6, 7, 4, 4, 7, 7, 7, 4, 7, 7, 4, 7, 7, 4, 7, 7, 7, 7, 4, 
7, 7, 7, 4, 7, 7, 7, 7, 7, 7, 7, 4, 4, 7, 7, 7, 7, 4, 4, 7, 4, 
7, 7, 7, 7, 7, 7, 7, 7, 4, 7, 7, 6, 4, 4, 7, 7, 6, 7, 7, 7), 
    V4 = c(4, 4, 4, 4, 4, 4, 3, 4, 4, 3, 4, 4, 6, 4, 6, 4, 6, 
    7, 4, 3, 6, 3, 4, 4, 6, 4, 4, 4, 3, 3, 6, 6, 3, 6, 6, 4, 
    4, 4, 4, 4, 1, 4, 3, 6, 4, 4, 7, 6, 3, 4, 6, 4, 4, 4, 7, 
    6, 7, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 6, 4, 4, 6, 4, 4, 4, 
    4, 4, 6, 6, 4, 6, 6, 4, 6, 7, 6, 4, 4, 6, 6, 4, 6, 6, 4, 
    6, 4, 1, 6, 6, 6, 7), V5 = c(4, 4, 8, 8, 7, 4, 7, 4, 8, 4, 
    4, 8, 8, 8, 4, 4, 4, 6, 4, 4, 7, 7, 7, 8, 7, 4, 4, 8, 7, 
    7, 8, 4, 8, 6, 7, 7, 7, 7, 8, 7, 8, 4, 4, 7, 4, 4, 8, 8, 
    7, 4, 8, 7, 4, 4, 7, 7, 4, 4, 7, 7, 7, 7, 4, 4, 7, 7, 6, 
    7, 8, 6, 4, 4, 8, 6, 7, 8, 8, 7, 7, 8, 8, 7, 7, 4, 6, 4, 
    7, 7, 4, 4, 4, 7, 8, 4, 4, 7, 4, 7, 7, 4), V6 = c(6, 6, 7, 
    6, 6, 4, 4, 3, 4, 4, 3, 7, 4, 4, 6, 6, 6, 3, 4, 4, 4, 4, 
    4, 4, 6, 6, 6, 6, 4, 4, 4, 4, 4, 3, 6, 4, 4, 6, 4, 4, 4, 
    4, 6, 7, 4, 6, 4, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 7, 6, 
    4, 6, 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 7, 4, 6, 
    6, 4, 4, 6, 6, 6, 6, 6, 6, 4, 6, 4, 3, 4, 4, 6, 6, 6, 7, 
    4, 6), V7 = c(9, 5, 9, 5, 5, 8, 9, 5, 9, 8, 8, 9, 4, 5, 8, 
    5, 8, 9, 5, 8, 8, 9, 5, 8, 8, 8, 5, 9, 8, 8, 9, 8, 5, 8, 
    8, 9, 8, 9, 5, 5, 5, 5, 9, 8, 8, 9, 4, 4, 8, 4, 9, 8, 8, 
    5, 9, 8, 9, 8, 8, 8, 9, 8, 4, 5, 5, 9, 8, 5, 9, 8, 5, 5, 
    9, 8, 8, 6, 5, 9, 4, 9, 5, 9, 4, 9, 5, 5, 4, 9, 8, 8, 4, 
    8, 5, 8, 8, 5, 8, 5, 8, 9), V8 = c(9, 9, 9, 9, 9, 6, 4, 9, 
    3, 9, 9, 9, 4, 6, 4, 9, 5, 6, 4, 9, 4, 4, 9, 6, 6, 6, 9, 
    9, 6, 4, 9, 4, 4, 6, 9, 4, 2, 6, 9, 4, 4, 4, 9, 5, 6, 4, 
    2, 9, 4, 9, 4, 9, 2, 9, 9, 9, 2, 9, 4, 9, 9, 9, 6, 4, 9, 
    9, 6, 9, 9, 9, 6, 2, 9, 2, 3, 4, 9, 9, 9, 6, 9, 4, 6, 9, 
    9, 4, 9, 9, 9, 9, 9, 9, 9, 9, 4, 2, 5, 4, 4, 9), V9 = c(5, 
    5, 9, 5, 5, 2, 5, 5, 9, 5, 5, 5, 9, 9, 5, 5, 9, 9, 2, 2, 
    5, 5, 5, 2, 9, 2, 5, 5, 9, 5, 5, 5, 9, 5, 5, 9, 9, 9, 2, 
    5, 5, 9, 5, 9, 5, 5, 5, 9, 9, 5, 9, 9, 9, 5, 9, 2, 9, 5, 
    9, 9, 5, 5, 5, 9, 9, 9, 9, 9, 9, 9, 5, 5, 9, 5, 9, 9, 2, 
    5, 5, 5, 9, 9, 9, 5, 5, 9, 5, 5, 9, 2, 5, 5, 5, 9, 9, 5, 
    9, 9, 5, 5)), row.names = c(NA, 100L), class = "data.frame")

Sample cluster data:

cluster_data <- structure(list(cluster = c(4L, 6L, 2L, 9L, 3L, 3L, 5L, 5L, 3L, 
3L, 7L, 1L, 4L, 4L, 4L, 3L, 7L, 2L, 1L, 6L, 7L, 1L, 1L, 4L, 1L, 
3L, 9L, 4L, 7L, 6L, 2L, 4L, 6L, 3L, 3L, 6L, 7L, 6L, 6L, 3L, 4L, 
1L, 6L, 5L, 7L, 3L, 4L, 2L, 1L, 2L, 6L, 6L, 3L, 6L, 1L, 1L, 4L, 
6L, 3L, 4L, 6L, 2L, 6L, 1L, 8L, 9L, 6L, 2L, 4L, 7L, 3L, 4L, 6L, 
5L, 9L, 1L, 6L, 6L, 6L, 3L, 8L, 7L, 1L, 1L, 3L, 6L, 5L, 3L, 8L, 
4L, 7L, 6L, 4L, 3L, 3L, 1L, 3L, 4L, 4L, 1L), Value = c(30.6, 
24.3, 35.1, 30.3, 28.1, 23.5, 39, 37, 23.5, 23.6, 39.2, 18.1, 
33.6, 32.1, 20.2, 15.7, 66.6, 18.8, 21.7, 17.7, 33.1, 5.4, 3.2, 
44.1, 14.5, 30.7, 21.9, 14.5, 21.9, 22.6, 24.4, 36.4, 18.8, 6.2, 
20.3, 26, 31.9, 13.2, 16.9, 14.2, 38, 10, 29.4, 34.1, 54.3, 22, 
37.2, 14.4, 8.2, 15.5, 25.1, 36.3, 6, 18.3, 14.1, 16.8, 27.6, 
21.7, 9.3, 39.8, 28, 8.4, 7.3, 26, 70.1, 24.9, 26.1, 11.9, 43.1, 
42.1, 5.7, 29.7, 9.4, 50.4, 23.6, 23.3, 14.7, 24.2, 33.6, 20.5, 
48.5, 38.5, 21.6, 11.6, 26.4, 18.5, 38.2, 35.9, 57.3, 21.5, 8.2, 
23.3, 31.6, 27.4, 17.9, 0.5, 6, 26.3, 10.9, 1.6)), row.names = c(NA, 
100L), class = "data.frame")

Working for loop code:

for (i in 1:nrow(df)) {
  for (j in 1:ncol(df)) {
    clusterId <- df[i,j]
    df[i,j] <- sample(cluster_data[cluster_data$cluster==clusterId,"Value"], size = 1)
  }
  print(paste0("Finished iterating over row ",i,"."))
}
df$clusterSum <- rowSums(df)

Upvotes: 1

Views: 250

Answers (1)

det
det

Reputation: 5232

You can speed up by grouping cluster values (no need to filter multiple times for same thing) using split:

cluster_data <- lapply(split(cluster_data, cluster_data$cluster), function(x) x$Value)
df <- t(apply(df, 1, function(x) sapply(x, function(x) sample(cluster_data[[x]], 1))))
df <- as.data.frame(df)
df$row_sum <- rowSums(df)

Another way is to find indices of clusters within df and then sample only once for each cluster:

df <- as.matrix(df)
indices <- split(seq_along(df), df)
cluster_data <- lapply(split(cluster_data, cluster_data$cluster), function(x) x$Value)
for(i in seq_along(indices))
  df[indices[[i]]] <- sample(cluster_data[[names(indices)[[i]]]], length(indices[[i]]), TRUE)
df <- as.data.frame(df)
df$row_sum <- rowSums(df)

Upvotes: 1

Related Questions