Reputation: 113
I'm really struggling with how to pull this transformation without resorting to loops. My dataframe is in a long format so that each year, grade, and school combination has its own row.
I'm wanting to divide a value for each year + grade + school by the value for the previous year and grade. The code below should do that for me, but with 500k rows, it will take days to complete.
Any ideas on how I could do this faster?
I've tried using dplyr but wasn't getting anywhere with it. Same with standard base R transformation approaches.
for (i in 1:NROW(df)) {
for (j in 1:NROW(df)) {
if(df$COUNTY[i] == df$COUNTY[j] &
df$YEAR[i] == (df$YEAR[j] + 1) &
df$Grade[i] == (df$Grade[j] + 1)){
df$RATE[i] <- df$value[i] / df$value[j]
} else{
next
}
}
if(i %% 10 == 0){print(i)}
}
Data:
structure(list(YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012,
2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014,
2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1, 0,
1, 2, 3, -1, 0, 1, 2, 3), COUNTY = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("001", "002", "003", "004", "005", "006", "007",
"008", "009", "010", "011", "012", "013", "014", "015", "016",
"017", "018", "019", "020", "021", "022", "023", "024", "025",
"026", "027", "028", "029", "030", "031", "032", "033", "034",
"035", "036", "037", "038", "039", "040", "041", "042", "043",
"044", "045", "046", "047", "048", "049", "050", "051", "052",
"053", "054", "055", "056", "057", "058", "059", "060", "061",
"062", "063", "064", "065", "066", "067", "068", "069", "070",
"071", "072", "073", "074", "075", "076", "077", "078", "079",
"080", "081", "082", "083", "084", "085", "086", "087", "088",
"089", "090", "091", "092", "093", "094", "095", "096", "097",
"098", "099", "100", "101", "102", "103", "104", "105", "106",
"107", "108", "109", "110", "111", "112", "113", "114", "115",
"126", "145", "166", "201", "347", "401", "640", "KCS"), class = "factor"),
value = c(178, 212, 208, 208, 242, 199, 230, 227, 208, 208,
187, 245, 235, 216, 204, 189, 235, 250, 226, 217)), row.names = c(NA,
-20L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), .internal.selfref = <pointer: 0x000001d7929a1ef0>, groups = structure(list(
YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012, 2012,
2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014,
2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1,
0, 1, 2, 3, -1, 0, 1, 2, 3), .rows = list(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame")))
Upvotes: 1
Views: 108
Reputation: 107767
Simply use the vectorized ifelse
off of shifted columns. Below assumes COUNTY is a factor variable (not character):
# SHIFT COLUMNS FORWARD
df$COUNTY_SHIFT <- factor(levels(df$COUNTY)[c(0, df$COUNTY[1:(nrow(df)-1)])]
df$YEAR_SHIFT <- c(NA, df$YEAR[1:(nrow(df)-1)])
df$Grade_SHIFT <- c(NA, df$Grade[1:(nrow(df)-1)])
df$value_SHIFT <- c(NA, df$value[1:(nrow(df)-1)])
# CONDITIONALLY ASSIGN
df$RATE <- ifelse(df$COUNTY == df$COUNTY_SHIFT &
df$YEAR == df$YEAR_SHIFT &
df$Grade == df$Grade_SHIFT,
df$value / df$value_SHIFT,
NA)
Or all inside a within
context:
df <- within(df, {
# SHIFT COLUMNS FORWARD
COUNTY_SHIFT <- factor(levels(COUNTY)[c(0, COUNTY[1:(nrow(df)-1)])]
YEAR_SHIFT <- c(NA, YEAR[1:(nrow(df)-1)])
Grade_SHIFT <- c(NA, Grade[1:(nrow(df)-1)])
value_SHIFT <- c(NA, value[1:(nrow(df)-1)])
# CONDITIONALLY ASSIGN
RATE <- ifelse(COUNTY == COUNTY_SHIFT &
YEAR == YEAR_SHIFT &
Grade == Grade_SHIFT,
value / value_SHIFT,
NA)
# REMOVE HELPER COLUMNS
rm(COUNTY_SHIFT, YEAR_SHIFT, Grade_SHIFT, value_SHIFT)
})
Alternatively, merge on a shifted data frame:
df$ID <- 1:nrow(df)
shifted_df <- merge(transform(df, ID=ID-1), df[-1,], by="ID", suffixes=c("", "_SHIFT"))
final_df <- within(shifted_df , {
# CONDITIONALLY ASSIGN
RATE <- ifelse(COUNTY == COUNTY_SHIFT &
YEAR == YEAR_SHIFT &
Grade == Grade_SHIFT,
value / value_SHIFT,
NA)
# REMOVE HELPER COLUMNS
rm(COUNTY_SHIFT, YEAR_SHIFT, Grade_SHIFT, value_SHIFT)
})
Upvotes: 1
Reputation: 887881
Using data.table
library(data.table)
setDT(df)[order(YEAR), Ratio := value/shift(value) , .(COUNTY, Grade)]
Upvotes: 2
Reputation: 18435
In dplyr
you can just do this, which should be pretty quick...
library(dplyr)
df <- df %>% group_by(COUNTY, Grade) %>% #for your df above, but replace with SCHOOL or whatever
arrange(YEAR) %>% #sort by increasing year
mutate(Ratio = value/lag(value)) #value for year / value for previous year
df
YEAR Grade COUNTY value Ratio
<dbl> <dbl> <fct> <dbl> <dbl>
1 2011 -1 001 178 NA
2 2011 0 001 212 NA
3 2011 1 001 208 NA
4 2011 2 001 208 NA
5 2011 3 001 242 NA
6 2012 -1 001 199 1.12
7 2012 0 001 230 1.08
8 2012 1 001 227 1.09
9 2012 2 001 208 1
10 2012 3 001 208 0.860
11 2013 -1 001 187 0.940
12 2013 0 001 245 1.07
13 2013 1 001 235 1.04
14 2013 2 001 216 1.04
15 2013 3 001 204 0.981
16 2014 -1 001 189 1.01
17 2014 0 001 235 0.959
18 2014 1 001 250 1.06
19 2014 2 001 226 1.05
20 2014 3 001 217 1.06
Upvotes: 1
Reputation: 8602
The simplest way to speed this up, is by eliminating the inner for-loop
and using vectorized functions instead.
Example:
df$RATE2 <- 0
for(i in seq(nrow(df))){
indx <- which(df$COUNTY[i] == df$COUNTY &
df$YEAR[i] == (df$YEAR + 1) &
df$Grade[i] == (df$Grade + 1))
if((n <- length(indx)) > 1)
stop("Error, rowcount too great!")
else if(n == 1)
df$RATE2[i] <- df$value[i] / df$value[indx]
}
all.equal(df$RATE, df$RATE2)
[1] TRUE
Note that &
will perform comparison across all elements in the vectors, thus the logical statement will return a TRUE
or FALSE
for each row in the data frame. For convenience (and usualy without loosing much if any time) i convert it to an index vector using which
, and if the length is only 1 (not overwriting 1 element with multiple) i overwrite the value of RATE2
with the appropriate index.
Comparison:
microbenchmark:::microbenchmark(original = {
df$RATE <- 0
for (i in 1:NROW(df)) {
for (j in 1:NROW(df)) {
if(df$COUNTY[i] == df$COUNTY[j] &&
df$YEAR[i] == (df$YEAR[j] + 1) &&
df$Grade[i] == (df$Grade[j] + 1)){
df$RATE[i] <- df$value[i] / df$value[j]
}
}
}
}, improved = {
df$RATE2 <- 0
for(i in seq(nrow(df))){
indx <- which(df$COUNTY[i] == df$COUNTY & df$YEAR[i] == (df$YEAR + 1) & df$Grade[i] == (df$Grade + 1))
if((n <- length(indx)) > 1)
stop("Error, rowcount too great!")
else if(n == 1)
df$RATE2[i] <- df$value[i] / df$value[indx]
}
})
#output:
Unit: milliseconds
expr min lq mean median uq max neval
original 15.452877 19.751258 26.944155 23.750028 33.886566 70.93348 100
improved 1.020224 1.221664 2.121345 1.730265 2.311173 17.56658 100
From the median time we see that we now only use 1.73/23.75 * 100 = 7.3 %
of the original time to perform the calculations.
Note that using apply
would not speed up this process, it is the usage of vectorized functions that does the job here. Further note that i slightly altered the code for the original function to use &&
and removed the redundant else
part. This speeds up this version of the code slightly.
Upvotes: 1