dmacfour
dmacfour

Reputation: 113

Is there a computationally faster way of doing this data transformation?

I'm really struggling with how to pull this transformation without resorting to loops. My dataframe is in a long format so that each year, grade, and school combination has its own row.

I'm wanting to divide a value for each year + grade + school by the value for the previous year and grade. The code below should do that for me, but with 500k rows, it will take days to complete.

Any ideas on how I could do this faster?

I've tried using dplyr but wasn't getting anywhere with it. Same with standard base R transformation approaches.

for (i in 1:NROW(df)) {
  for (j in 1:NROW(df)) {
    if(df$COUNTY[i] == df$COUNTY[j] & 
       df$YEAR[i] == (df$YEAR[j] + 1) & 
       df$Grade[i] == (df$Grade[j] + 1)){

      df$RATE[i] <- df$value[i] / df$value[j]

    } else{

      next

    }
  }
  if(i %% 10 == 0){print(i)}
}

Data:

structure(list(YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012, 
2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014, 
2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1, 0, 
1, 2, 3, -1, 0, 1, 2, 3), COUNTY = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("001", "002", "003", "004", "005", "006", "007", 
"008", "009", "010", "011", "012", "013", "014", "015", "016", 
"017", "018", "019", "020", "021", "022", "023", "024", "025", 
"026", "027", "028", "029", "030", "031", "032", "033", "034", 
"035", "036", "037", "038", "039", "040", "041", "042", "043", 
"044", "045", "046", "047", "048", "049", "050", "051", "052", 
"053", "054", "055", "056", "057", "058", "059", "060", "061", 
"062", "063", "064", "065", "066", "067", "068", "069", "070", 
"071", "072", "073", "074", "075", "076", "077", "078", "079", 
"080", "081", "082", "083", "084", "085", "086", "087", "088", 
"089", "090", "091", "092", "093", "094", "095", "096", "097", 
"098", "099", "100", "101", "102", "103", "104", "105", "106", 
"107", "108", "109", "110", "111", "112", "113", "114", "115", 
"126", "145", "166", "201", "347", "401", "640", "KCS"), class = "factor"), 
    value = c(178, 212, 208, 208, 242, 199, 230, 227, 208, 208, 
    187, 245, 235, 216, 204, 189, 235, 250, 226, 217)), row.names = c(NA, 
-20L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), .internal.selfref = <pointer: 0x000001d7929a1ef0>, groups = structure(list(
    YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012, 2012, 
    2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014, 
    2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1, 
    0, 1, 2, 3, -1, 0, 1, 2, 3), .rows = list(1L, 2L, 3L, 4L, 
        5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
        17L, 18L, 19L, 20L)), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame")))

Upvotes: 1

Views: 108

Answers (4)

Parfait
Parfait

Reputation: 107767

Simply use the vectorized ifelse off of shifted columns. Below assumes COUNTY is a factor variable (not character):

# SHIFT COLUMNS FORWARD
df$COUNTY_SHIFT <- factor(levels(df$COUNTY)[c(0, df$COUNTY[1:(nrow(df)-1)])]
df$YEAR_SHIFT <- c(NA, df$YEAR[1:(nrow(df)-1)])
df$Grade_SHIFT <- c(NA, df$Grade[1:(nrow(df)-1)])
df$value_SHIFT <- c(NA, df$value[1:(nrow(df)-1)])

# CONDITIONALLY ASSIGN
df$RATE <- ifelse(df$COUNTY == df$COUNTY_SHIFT & 
                  df$YEAR == df$YEAR_SHIFT & 
                  df$Grade == df$Grade_SHIFT,
                  df$value / df$value_SHIFT,
                  NA)

Or all inside a within context:

df <- within(df, {
         # SHIFT COLUMNS FORWARD
         COUNTY_SHIFT <- factor(levels(COUNTY)[c(0, COUNTY[1:(nrow(df)-1)])]
         YEAR_SHIFT <- c(NA, YEAR[1:(nrow(df)-1)])
         Grade_SHIFT <- c(NA, Grade[1:(nrow(df)-1)])
         value_SHIFT <- c(NA, value[1:(nrow(df)-1)])

         # CONDITIONALLY ASSIGN
         RATE <- ifelse(COUNTY == COUNTY_SHIFT & 
                        YEAR == YEAR_SHIFT & 
                        Grade == Grade_SHIFT,
                        value / value_SHIFT,
                        NA)
         # REMOVE HELPER COLUMNS
         rm(COUNTY_SHIFT, YEAR_SHIFT, Grade_SHIFT, value_SHIFT)
})

Alternatively, merge on a shifted data frame:

df$ID <- 1:nrow(df)
shifted_df <- merge(transform(df, ID=ID-1), df[-1,], by="ID", suffixes=c("", "_SHIFT"))

final_df <- within(shifted_df , {   
         # CONDITIONALLY ASSIGN
         RATE <- ifelse(COUNTY == COUNTY_SHIFT & 
                        YEAR == YEAR_SHIFT & 
                        Grade == Grade_SHIFT,
                        value / value_SHIFT,
                        NA)
         # REMOVE HELPER COLUMNS
         rm(COUNTY_SHIFT, YEAR_SHIFT, Grade_SHIFT, value_SHIFT)
})

Upvotes: 1

akrun
akrun

Reputation: 887881

Using data.table

library(data.table)
setDT(df)[order(YEAR),  Ratio := value/shift(value) , .(COUNTY, Grade)]

Upvotes: 2

Andrew Gustar
Andrew Gustar

Reputation: 18435

In dplyr you can just do this, which should be pretty quick...

library(dplyr)
df <- df %>% group_by(COUNTY, Grade) %>%      #for your df above, but replace with SCHOOL or whatever
             arrange(YEAR) %>%                #sort by increasing year
             mutate(Ratio = value/lag(value)) #value for year / value for previous year

df
    YEAR Grade COUNTY value  Ratio
   <dbl> <dbl> <fct>  <dbl>  <dbl>
 1  2011    -1 001      178 NA    
 2  2011     0 001      212 NA    
 3  2011     1 001      208 NA    
 4  2011     2 001      208 NA    
 5  2011     3 001      242 NA    
 6  2012    -1 001      199  1.12 
 7  2012     0 001      230  1.08 
 8  2012     1 001      227  1.09 
 9  2012     2 001      208  1    
10  2012     3 001      208  0.860
11  2013    -1 001      187  0.940
12  2013     0 001      245  1.07 
13  2013     1 001      235  1.04 
14  2013     2 001      216  1.04 
15  2013     3 001      204  0.981
16  2014    -1 001      189  1.01 
17  2014     0 001      235  0.959
18  2014     1 001      250  1.06 
19  2014     2 001      226  1.05 
20  2014     3 001      217  1.06 

Upvotes: 1

Oliver
Oliver

Reputation: 8602

The simplest way to speed this up, is by eliminating the inner for-loop and using vectorized functions instead.

Example:

df$RATE2 <- 0
for(i in seq(nrow(df))){
    indx <- which(df$COUNTY[i] == df$COUNTY & 
                  df$YEAR[i] == (df$YEAR + 1) & 
                  df$Grade[i] == (df$Grade + 1))
    if((n <- length(indx)) > 1)
        stop("Error, rowcount too great!")
    else if(n == 1)
        df$RATE2[i] <- df$value[i] / df$value[indx]
}
all.equal(df$RATE, df$RATE2)
[1] TRUE

Note that & will perform comparison across all elements in the vectors, thus the logical statement will return a TRUE or FALSE for each row in the data frame. For convenience (and usualy without loosing much if any time) i convert it to an index vector using which, and if the length is only 1 (not overwriting 1 element with multiple) i overwrite the value of RATE2 with the appropriate index.

Comparison:

microbenchmark:::microbenchmark(original = {
    df$RATE <- 0
    for (i in 1:NROW(df)) {
        for (j in 1:NROW(df)) {
            if(df$COUNTY[i] == df$COUNTY[j] && 
               df$YEAR[i] == (df$YEAR[j] + 1) && 
               df$Grade[i] == (df$Grade[j] + 1)){
                df$RATE[i] <- df$value[i] / df$value[j]
            }
        }
    }
}, improved = {
    df$RATE2 <- 0
    for(i in seq(nrow(df))){
        indx <- which(df$COUNTY[i] == df$COUNTY & df$YEAR[i] == (df$YEAR + 1) & df$Grade[i] == (df$Grade + 1))
        if((n <- length(indx)) > 1)
            stop("Error, rowcount too great!")
        else if(n == 1)
            df$RATE2[i] <- df$value[i] / df$value[indx]
    }

})
#output:
Unit: milliseconds
     expr       min        lq      mean    median        uq      max neval
 original 15.452877 19.751258 26.944155 23.750028 33.886566 70.93348   100
 improved  1.020224  1.221664  2.121345  1.730265  2.311173 17.56658   100

From the median time we see that we now only use 1.73/23.75 * 100 = 7.3 % of the original time to perform the calculations.

Note that using apply would not speed up this process, it is the usage of vectorized functions that does the job here. Further note that i slightly altered the code for the original function to use && and removed the redundant else part. This speeds up this version of the code slightly.

Upvotes: 1

Related Questions