Sathish
Sathish

Reputation: 12703

find missing entries - improve efficiency by eliminating nested loop

I would like to eliminate nested loop used to create data in m inside the function. The missing_entries() function is trying to identify and return values of col1 missing for groups in col2. Is there any alternative way to approach this problem in order to improve efficiency?

missing_entries <- function( data, x, y )
{
  # find missing entries in x for the group in y
  # by comparing other groups in y
  require( 'data.table' )

  # require class of data to be data.table
  stopifnot( "data.table" %in% class(data) ) 

  # the outer loop with i refers to each unique value of column y
  # the inner loop with j refers to all unique values of column y
  # except the value in i under current iteration
  uniq_col2 <- unique( data[, get(y) ] )
  m <- lapply( uniq_col2, function(i){
    lapply( setdiff( uniq_col2, i ), function( j ) {
      z <- setdiff( data[ get(y) == i, get(x)], data[ get(y) == j, get(x)])
      if( length(z) > 0 ){
        return( data.frame( v1 = z, v2 = j,
                            stringsAsFactors = FALSE ) )
      } else{
        return()
      }
    } )
  })

  # row bind
  m <- lapply( m, function(k) {
    if(!is.null(k)) {
      rbindlist(l = k)
    }})

  # collect only not null data
  m <- rbindlist( l = m[lapply(m, nrow) > 0] )
  colnames( m ) <- c( x, y )

  return( m )
}

# testing
test_data <- structure(list(cardnty = c("many", "many", "many", "many", "many", "many",
                                        "many", "many", "many", "many", "many", "many",
                                        "many", "many", "many", "many", "many", "many",
                                        "many", "many", "many", "many", "many", "many"), 
                            col1 = c(2L, 4L, 3L, 13L, 5L, 6L, 7L, 17L, 9L, 4L, 3L, 
                                           2L, 8L, 5L, 6L, 7L, 14L, 17L, 19L, 13L, 
                                           9L, 12L, 11L, 20L), 
                            N = c(599L, 43L, 111L, 12L, 11L, 5L, 4L, 
                                  8L, 2L, 72L, 230L, 617L, 13L, 58L, 19L, 9L, 
                                  5L, 3L, 2L, 1L, 11L, 1L, 1L, 1L), 
                            col2 = c("cat", "cat", "cat", "cat", "cat", 
                                   "cat", "cat", "cat", "cat", "dog",  
                                   "dog", "dog", "dog", 
                                   "dog", "dog", "dog", 
                                   "dog", "dog", "dog", 
                                   "dog", "dog", "dog",
                                   "dog", "dog" )), 
                       .Names = c("cardnty", "col1", "N", "col2"), 
                       row.names = c(NA, -24L),
                       class = "data.frame")

require('data.table')
setDT(test_data)
results <- missing_entries(data = test_data, x = "col1", y = "col2")
setDF(results)

test_results <- structure(list(col1 = c(8L, 14L, 19L, 12L, 11L, 20L), 
                               col2 = c("cat", "cat", "cat", "cat", "cat", "cat")), 
                          .Names = c("col1", "col2"),
                          row.names = c(NA, -6L), 
                          class = "data.frame")

identical( results, test_results)
# TRUE

Upvotes: 0

Views: 63

Answers (2)

Uwe
Uwe

Reputation: 42544

As far as I have understood the question, the OP is seeking for missing combinations of col1 and col2 in test_data.

We can get all unique combinations of col1 and col2 using data.table's CJ() (cross join) or expand.grid() from base R. Then we can find the missing elements using an anti-join which removes the already existing combinations.

library(data.table)
setDT(test_data)[, CJ(col1 = col1, col2 = col2, unique = TRUE)][
  !test_data, on = .(col1, col2)]
   col1 col2
1:    8  cat
2:   11  cat
3:   12  cat
4:   14  cat
5:   19  cat
6:   20  cat

Upvotes: 1

Jason
Jason

Reputation: 581

Here's a suggestion: Use dcast to convert the dataframe to wide, which fills in 0 for missing values, then converting back to long and pulling out the zero's.

library(reshape2)
df <- dcast(test_data,col1 ~ col2 ,fill=0,value.var="col2",fun.aggregate=length)
df2 <- melt(df,id.vars="col1")
results <- df2[which(df2$value==0),c("col1","variable")]

The time comparison suggests this runs a bit quicker.

start_time <- Sys.time()
for (x in c(1:10000)){
  results <- missing_entries(data = test_data, x = "col1", y = "col2")
  setDF(results)
}
end_time <- Sys.time()
timeA <- end_time-start_time
# Time difference of 1.725317 mins


start_time <- Sys.time()
for (x in c(1:10000)){
df <- dcast(test_data,col1 ~ col2 ,fill=0,value.var="col2",fun.aggregate=length)
df2 <- melt(df,id.vars="col1")
results <- df2[which(df2$value==0),c("col1","variable")]
}
end_time <- Sys.time()
timeB <- end_time-start_time
# Time difference of 1.368845 mins

Upvotes: 1

Related Questions