Reputation: 509
Using the reprex data below, the operation is straightforward:
row,i
and column,j
combination in df
, subset the corresponding cluster associated with df[i,j]
Value
variable in the cluster_data
dataframe from cluster id in step 1.i
equals nrow(df)
My working for loop code is posted below. The for loop I wrote works fine for smaller datasets but takes quite a while larger datasets. I imagine there's a much better and faster way (performance based) to code this operation for large datasets but the best I've come up with is to loop through each row / column given that my sampling in Step 2 is conditional on the value corresponding to the row / column index from Step 1. I would like someone to suggest a faster approach than my for loop solution. I don't have any requirements on whether it needs to be base R
or rely on other libraries. I'm mainly after the fastest achievable solution.
My actual dataset can range anywhere from 500,000 rows to 2 million rows and the number of clusters varies quite a bit but doesn't typically exceed more than 50. It is likely, however, that I may have datasets that have more than 50 clusters.
Sample dataframe:
df <- structure(list(V1 = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
8, 7, 8, 7, 8, 7, 8, 8, 8, 7, 7, 7, 8, 7, 8, 8, 7, 7, 7, 7, 7,
7, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 8, 7, 7, 7, 7, 8, 8, 8,
7, 8, 4, 7, 8, 7, 3, 8, 8, 7, 8, 7, 7, 7, 7, 8, 8, 7, 7, 7, 7,
7, 8, 7, 8, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 8, 8, 7, 7, 7, 7, 7,
7, 7, 7), V2 = c(7, 6, 7, 4, 4, 7, 7, 7, 7, 7, 3, 3, 4, 7, 7,
7, 6, 7, 4, 7, 6, 7, 6, 4, 7, 7, 7, 6, 6, 6, 6, 7, 7, 7, 4, 8,
7, 7, 7, 6, 7, 7, 8, 6, 6, 7, 3, 7, 6, 7, 7, 6, 6, 7, 7, 7, 7,
6, 4, 6, 7, 6, 7, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 4, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 4, 7, 7, 6, 7, 7, 6, 4, 7, 7, 7, 7, 6,
7), V3 = c(7, 7, 7, 4, 7, 7, 7, 7, 7, 4, 7, 7, 4, 4, 4, 4, 7,
7, 7, 6, 4, 7, 7, 4, 7, 7, 7, 4, 7, 7, 4, 6, 4, 4, 7, 4, 4, 7,
7, 7, 6, 7, 4, 4, 7, 7, 7, 4, 7, 7, 4, 7, 7, 4, 7, 7, 7, 7, 4,
7, 7, 7, 4, 7, 7, 7, 7, 7, 7, 7, 4, 4, 7, 7, 7, 7, 4, 4, 7, 4,
7, 7, 7, 7, 7, 7, 7, 7, 4, 7, 7, 6, 4, 4, 7, 7, 6, 7, 7, 7),
V4 = c(4, 4, 4, 4, 4, 4, 3, 4, 4, 3, 4, 4, 6, 4, 6, 4, 6,
7, 4, 3, 6, 3, 4, 4, 6, 4, 4, 4, 3, 3, 6, 6, 3, 6, 6, 4,
4, 4, 4, 4, 1, 4, 3, 6, 4, 4, 7, 6, 3, 4, 6, 4, 4, 4, 7,
6, 7, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 6, 4, 4, 6, 4, 4, 4,
4, 4, 6, 6, 4, 6, 6, 4, 6, 7, 6, 4, 4, 6, 6, 4, 6, 6, 4,
6, 4, 1, 6, 6, 6, 7), V5 = c(4, 4, 8, 8, 7, 4, 7, 4, 8, 4,
4, 8, 8, 8, 4, 4, 4, 6, 4, 4, 7, 7, 7, 8, 7, 4, 4, 8, 7,
7, 8, 4, 8, 6, 7, 7, 7, 7, 8, 7, 8, 4, 4, 7, 4, 4, 8, 8,
7, 4, 8, 7, 4, 4, 7, 7, 4, 4, 7, 7, 7, 7, 4, 4, 7, 7, 6,
7, 8, 6, 4, 4, 8, 6, 7, 8, 8, 7, 7, 8, 8, 7, 7, 4, 6, 4,
7, 7, 4, 4, 4, 7, 8, 4, 4, 7, 4, 7, 7, 4), V6 = c(6, 6, 7,
6, 6, 4, 4, 3, 4, 4, 3, 7, 4, 4, 6, 6, 6, 3, 4, 4, 4, 4,
4, 4, 6, 6, 6, 6, 4, 4, 4, 4, 4, 3, 6, 4, 4, 6, 4, 4, 4,
4, 6, 7, 4, 6, 4, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 7, 6,
4, 6, 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 7, 4, 6,
6, 4, 4, 6, 6, 6, 6, 6, 6, 4, 6, 4, 3, 4, 4, 6, 6, 6, 7,
4, 6), V7 = c(9, 5, 9, 5, 5, 8, 9, 5, 9, 8, 8, 9, 4, 5, 8,
5, 8, 9, 5, 8, 8, 9, 5, 8, 8, 8, 5, 9, 8, 8, 9, 8, 5, 8,
8, 9, 8, 9, 5, 5, 5, 5, 9, 8, 8, 9, 4, 4, 8, 4, 9, 8, 8,
5, 9, 8, 9, 8, 8, 8, 9, 8, 4, 5, 5, 9, 8, 5, 9, 8, 5, 5,
9, 8, 8, 6, 5, 9, 4, 9, 5, 9, 4, 9, 5, 5, 4, 9, 8, 8, 4,
8, 5, 8, 8, 5, 8, 5, 8, 9), V8 = c(9, 9, 9, 9, 9, 6, 4, 9,
3, 9, 9, 9, 4, 6, 4, 9, 5, 6, 4, 9, 4, 4, 9, 6, 6, 6, 9,
9, 6, 4, 9, 4, 4, 6, 9, 4, 2, 6, 9, 4, 4, 4, 9, 5, 6, 4,
2, 9, 4, 9, 4, 9, 2, 9, 9, 9, 2, 9, 4, 9, 9, 9, 6, 4, 9,
9, 6, 9, 9, 9, 6, 2, 9, 2, 3, 4, 9, 9, 9, 6, 9, 4, 6, 9,
9, 4, 9, 9, 9, 9, 9, 9, 9, 9, 4, 2, 5, 4, 4, 9), V9 = c(5,
5, 9, 5, 5, 2, 5, 5, 9, 5, 5, 5, 9, 9, 5, 5, 9, 9, 2, 2,
5, 5, 5, 2, 9, 2, 5, 5, 9, 5, 5, 5, 9, 5, 5, 9, 9, 9, 2,
5, 5, 9, 5, 9, 5, 5, 5, 9, 9, 5, 9, 9, 9, 5, 9, 2, 9, 5,
9, 9, 5, 5, 5, 9, 9, 9, 9, 9, 9, 9, 5, 5, 9, 5, 9, 9, 2,
5, 5, 5, 9, 9, 9, 5, 5, 9, 5, 5, 9, 2, 5, 5, 5, 9, 9, 5,
9, 9, 5, 5)), row.names = c(NA, 100L), class = "data.frame")
Sample cluster data:
cluster_data <- structure(list(cluster = c(4L, 6L, 2L, 9L, 3L, 3L, 5L, 5L, 3L,
3L, 7L, 1L, 4L, 4L, 4L, 3L, 7L, 2L, 1L, 6L, 7L, 1L, 1L, 4L, 1L,
3L, 9L, 4L, 7L, 6L, 2L, 4L, 6L, 3L, 3L, 6L, 7L, 6L, 6L, 3L, 4L,
1L, 6L, 5L, 7L, 3L, 4L, 2L, 1L, 2L, 6L, 6L, 3L, 6L, 1L, 1L, 4L,
6L, 3L, 4L, 6L, 2L, 6L, 1L, 8L, 9L, 6L, 2L, 4L, 7L, 3L, 4L, 6L,
5L, 9L, 1L, 6L, 6L, 6L, 3L, 8L, 7L, 1L, 1L, 3L, 6L, 5L, 3L, 8L,
4L, 7L, 6L, 4L, 3L, 3L, 1L, 3L, 4L, 4L, 1L), Value = c(30.6,
24.3, 35.1, 30.3, 28.1, 23.5, 39, 37, 23.5, 23.6, 39.2, 18.1,
33.6, 32.1, 20.2, 15.7, 66.6, 18.8, 21.7, 17.7, 33.1, 5.4, 3.2,
44.1, 14.5, 30.7, 21.9, 14.5, 21.9, 22.6, 24.4, 36.4, 18.8, 6.2,
20.3, 26, 31.9, 13.2, 16.9, 14.2, 38, 10, 29.4, 34.1, 54.3, 22,
37.2, 14.4, 8.2, 15.5, 25.1, 36.3, 6, 18.3, 14.1, 16.8, 27.6,
21.7, 9.3, 39.8, 28, 8.4, 7.3, 26, 70.1, 24.9, 26.1, 11.9, 43.1,
42.1, 5.7, 29.7, 9.4, 50.4, 23.6, 23.3, 14.7, 24.2, 33.6, 20.5,
48.5, 38.5, 21.6, 11.6, 26.4, 18.5, 38.2, 35.9, 57.3, 21.5, 8.2,
23.3, 31.6, 27.4, 17.9, 0.5, 6, 26.3, 10.9, 1.6)), row.names = c(NA,
100L), class = "data.frame")
Working for loop code:
for (i in 1:nrow(df)) {
for (j in 1:ncol(df)) {
clusterId <- df[i,j]
df[i,j] <- sample(cluster_data[cluster_data$cluster==clusterId,"Value"], size = 1)
}
print(paste0("Finished iterating over row ",i,"."))
}
df$clusterSum <- rowSums(df)
Upvotes: 1
Views: 250
Reputation: 5232
You can speed up by grouping cluster values (no need to filter multiple times for same thing) using split
:
cluster_data <- lapply(split(cluster_data, cluster_data$cluster), function(x) x$Value)
df <- t(apply(df, 1, function(x) sapply(x, function(x) sample(cluster_data[[x]], 1))))
df <- as.data.frame(df)
df$row_sum <- rowSums(df)
Another way is to find indices of clusters within df
and then sample only once for each cluster:
df <- as.matrix(df)
indices <- split(seq_along(df), df)
cluster_data <- lapply(split(cluster_data, cluster_data$cluster), function(x) x$Value)
for(i in seq_along(indices))
df[indices[[i]]] <- sample(cluster_data[[names(indices)[[i]]]], length(indices[[i]]), TRUE)
df <- as.data.frame(df)
df$row_sum <- rowSums(df)
Upvotes: 1