Reputation: 12703
I would like to eliminate nested loop used to create data in m
inside the function. The missing_entries()
function is trying to identify and return values of col1
missing for groups in col2
. Is there any alternative way to approach this problem in order to improve efficiency?
missing_entries <- function( data, x, y )
{
# find missing entries in x for the group in y
# by comparing other groups in y
require( 'data.table' )
# require class of data to be data.table
stopifnot( "data.table" %in% class(data) )
# the outer loop with i refers to each unique value of column y
# the inner loop with j refers to all unique values of column y
# except the value in i under current iteration
uniq_col2 <- unique( data[, get(y) ] )
m <- lapply( uniq_col2, function(i){
lapply( setdiff( uniq_col2, i ), function( j ) {
z <- setdiff( data[ get(y) == i, get(x)], data[ get(y) == j, get(x)])
if( length(z) > 0 ){
return( data.frame( v1 = z, v2 = j,
stringsAsFactors = FALSE ) )
} else{
return()
}
} )
})
# row bind
m <- lapply( m, function(k) {
if(!is.null(k)) {
rbindlist(l = k)
}})
# collect only not null data
m <- rbindlist( l = m[lapply(m, nrow) > 0] )
colnames( m ) <- c( x, y )
return( m )
}
# testing
test_data <- structure(list(cardnty = c("many", "many", "many", "many", "many", "many",
"many", "many", "many", "many", "many", "many",
"many", "many", "many", "many", "many", "many",
"many", "many", "many", "many", "many", "many"),
col1 = c(2L, 4L, 3L, 13L, 5L, 6L, 7L, 17L, 9L, 4L, 3L,
2L, 8L, 5L, 6L, 7L, 14L, 17L, 19L, 13L,
9L, 12L, 11L, 20L),
N = c(599L, 43L, 111L, 12L, 11L, 5L, 4L,
8L, 2L, 72L, 230L, 617L, 13L, 58L, 19L, 9L,
5L, 3L, 2L, 1L, 11L, 1L, 1L, 1L),
col2 = c("cat", "cat", "cat", "cat", "cat",
"cat", "cat", "cat", "cat", "dog",
"dog", "dog", "dog",
"dog", "dog", "dog",
"dog", "dog", "dog",
"dog", "dog", "dog",
"dog", "dog" )),
.Names = c("cardnty", "col1", "N", "col2"),
row.names = c(NA, -24L),
class = "data.frame")
require('data.table')
setDT(test_data)
results <- missing_entries(data = test_data, x = "col1", y = "col2")
setDF(results)
test_results <- structure(list(col1 = c(8L, 14L, 19L, 12L, 11L, 20L),
col2 = c("cat", "cat", "cat", "cat", "cat", "cat")),
.Names = c("col1", "col2"),
row.names = c(NA, -6L),
class = "data.frame")
identical( results, test_results)
# TRUE
Upvotes: 0
Views: 63
Reputation: 42544
As far as I have understood the question, the OP is seeking for missing combinations of col1
and col2
in test_data
.
We can get all unique combinations of col1
and col2
using data.table
's CJ()
(cross join) or expand.grid()
from base R. Then we can find the missing elements using an anti-join which removes the already existing combinations.
library(data.table)
setDT(test_data)[, CJ(col1 = col1, col2 = col2, unique = TRUE)][
!test_data, on = .(col1, col2)]
col1 col2 1: 8 cat 2: 11 cat 3: 12 cat 4: 14 cat 5: 19 cat 6: 20 cat
Upvotes: 1
Reputation: 581
Here's a suggestion: Use dcast to convert the dataframe to wide, which fills in 0 for missing values, then converting back to long and pulling out the zero's.
library(reshape2)
df <- dcast(test_data,col1 ~ col2 ,fill=0,value.var="col2",fun.aggregate=length)
df2 <- melt(df,id.vars="col1")
results <- df2[which(df2$value==0),c("col1","variable")]
The time comparison suggests this runs a bit quicker.
start_time <- Sys.time()
for (x in c(1:10000)){
results <- missing_entries(data = test_data, x = "col1", y = "col2")
setDF(results)
}
end_time <- Sys.time()
timeA <- end_time-start_time
# Time difference of 1.725317 mins
start_time <- Sys.time()
for (x in c(1:10000)){
df <- dcast(test_data,col1 ~ col2 ,fill=0,value.var="col2",fun.aggregate=length)
df2 <- melt(df,id.vars="col1")
results <- df2[which(df2$value==0),c("col1","variable")]
}
end_time <- Sys.time()
timeB <- end_time-start_time
# Time difference of 1.368845 mins
Upvotes: 1