MSasta
MSasta

Reputation: 86

Finding statistics after grouping in data.table

I had a small question in regards to data.table. Since i'm not so good at it i'm not quite sure how I can do this in data.table.

Basically I have 3 columns and want to group by the first two columns ( key and date ) and then for each key and each date, find the maximum and minimum that occurred in the third column ( fare)

I tried doing this but it gives me an error

flights[, c("max_day", "min_day") := unlist(lapply(gross_fare, findr)), by = c("key", "created_date")]

Error in `[.data.table`(flights, , `:=`(c("max_day", "min_day"), unlist(lapply(gross_fare,  : 
  Supplied 18 items to be assigned to group 1 of size 9 in column 'max_day'. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code.

findr is a function which just finds the max and min i.e.

findr <- function(x) {list(max = max(x), min = min(x)}

I've done what I want to do in dplyr and I'll attach the code for that, but since i have millions of rows, dplyr eats up my ram so data.table would help

test <- flights %>%
  select(key, created_date, gross_fare) %>%
  group_by(key, created_date) %>%
  summarise(
            max_day = max(gross_fare),
            min_day = min(gross_fare),
            diff = max_day - min_day) %>%
  arrange(created_date)

I've put the dput output if anyone wants to use that If anyone can help that'd be great, thank you :)

data.table::setDT(structure(list(key = c("LHE_KHI_LHE+KHI_PA-405_15.0_1", "KHI_ISB_KHI+ISB_PK-370_20.0_0", 
"LHE_KHI_LHE+KHI_PK-307_20.0_0", "ISB_KHI_ISB+KHI_PF-124_20.0_1", 
"LHE_KHI_LHE+KHI_PK-307_20.0_0", "LHE_KHI_LHE+KHI_PA-405_15.0_1", 
"KHI_LHE_KHI+LHE_PK-304_20.0_0", "KHI_ISB_KHI+ISB_PA-204_15.0_1", 
"ISB_KHI_ISB+KHI_PA-207_15.0_1", "KHI_ISB_KHI+ISB_PA-200_20.0_1", 
"KHI_LHE_KHI+LHE_PK-304_40.0_0", "ISB_KHI_ISB+KHI_PA-201_35.0_1", 
"ISB_KHI_ISB+KHI_ER-501_20.0_1", "KHI_LHE_KHI+LHE_PF-145_20.0_2", 
"KHI_ISB_KHI+ISB_PA-204_20.0_1", "LHE_KHI_LHE+KHI_PA-401_0.0_0", 
"ISB_KHI_ISB+KHI_PK-309_40.0_0", "KHI_ISB_KHI+ISB_PF-123_20.0_2", 
"ISB_KHI_ISB+KHI_PA-205_15.0_1", "LHE_KHI_LHE+KHI_PF-142_0.0_0", 
"ISB_KHI_ISB+KHI_PA-223_15.0_1", "ISB_KHI_ISB+KHI_PF-126_20.0_2", 
"ISB_KHI_ISB+KHI_PK-309_20.0_0", "KHI_ISB_KHI+ISB_PF-121_20.0_2", 
"ISB_KHI_ISB+KHI_PK-373_20.0_0", "KHI_LHE_KHI+LHE_PF-145_20.0_2", 
"KHI_LHE_KHI+LHE_PA-402_15.0_1", "LHE_KHI_LHE+KHI_PA-407_20.0_1", 
"KHI_ISB_KHI+ISB_PK-308_40.0_0", "KHI_LHE_KHI+LHE_PF-145_20.0_2", 
"LHE_KHI_LHE+KHI_PF-144_0.0_0", "ISB_KHI_ISB+KHI_PK-369_40.0_0", 
"ISB_KHI_ISB+KHI_PF-124_20.0_2", "KHI_ISB_KHI+ISB_PA-204_15.0_1", 
"KHI_ISB_KHI+ISB_PA-200_15.0_1", "ISB_KHI_ISB+KHI_PF-124_20.0_1", 
"KHI_ISB_KHI+ISB_PK-300_20.0_0", "ISB_KHI_ISB+KHI_PF-122_20.0_2", 
"KHI_ISB_KHI+ISB_PK-368_20.0_0", "KHI_ISB_KHI+ISB_PA-204_15.0_1", 
"ISB_KHI_ISB+KHI_ER-503_20.0_1", "ISB_KHI_ISB+KHI_PA-209_15.0_1", 
"KHI_ISB_KHI+ISB_PK-308_40.0_0", "ISB_KHI_ISB+KHI_PF-124_20.0_1", 
"ISB_KHI_ISB+KHI_PK-301_40.0_0", "KHI_LHE_KHI+LHE_PA-408_35.0_1", 
"LHE_KHI_LHE+KHI_PF-144_20.0_2", "KHI_ISB_KHI+ISB_PF-121_20.0_2", 
"KHI_ISB_KHI+ISB_PA-204_35.0_1", "ISB_KHI_ISB+KHI_PK-309_40.0_0", 
"ISB_KHI_ISB+KHI_PA-223_20.0_1", "KHI_ISB_KHI+ISB_PA-206_35.0_1", 
"LHE_KHI_LHE+KHI_PF-142_32.0_1", "LHE_KHI_LHE+KHI_PF-142_20.0_1", 
"KHI_ISB_KHI+ISB_PF-123_20.0_2", "ISB_KHI_ISB+KHI_PA-209_15.0_1", 
"KHI_ISB_KHI+ISB_PA-204_35.0_1", "ISB_KHI_ISB+KHI_PA-201_20.0_1", 
"KHI_ISB_KHI+ISB_PK-368_20.0_0", "ISB_KHI_ISB+KHI_PA-205_20.0_1", 
"KHI_ISB_KHI+ISB_PF-121_20.0_1", "ISB_KHI_ISB+KHI_PF-124_20.0_1", 
"ISB_KHI_ISB+KHI_PA-205_15.0_1", "KHI_LHE_KHI+LHE_PF-145_20.0_2", 
"KHI_LHE_KHI+LHE_PA-406_35.0_1", "KHI_ISB_KHI+ISB_PK-308_20.0_0", 
"LHE_KHI_LHE+KHI_PA-401_20.0_1", "LHE_KHI_LHE+KHI_PA-401_15.0_1", 
"KHI_ISB_KHI+ISB_PA-204_35.0_1", "KHI_LHE_KHI+LHE_PA-406_35.0_1", 
"KHI_ISB_KHI+ISB_PA-206_35.0_1", "KHI_ISB_KHI+ISB_PF-121_20.0_1", 
"ISB_KHI_ISB+KHI_PA-205_20.0_1", "LHE_KHI_LHE+KHI_PF-142_20.0_1", 
"LHE_KHI_LHE+KHI_PF-146_20.0_2", "LHE_KHI_LHE+KHI_PA-401_35.0_1", 
"ISB_KHI_ISB+KHI_PA-209_15.0_1", "ISB_KHI_ISB+KHI_PK-301_40.0_0", 
"ISB_KHI_ISB+KHI_PA-205_35.0_1", "KHI_LHE_KHI+LHE_PA-406_15.0_1", 
"KHI_ISB_KHI+ISB_PF-123_20.0_1", "ISB_KHI_ISB+KHI_PA-201_35.0_1", 
"KHI_ISB_KHI+ISB_PK-300_40.0_0", "KHI_LHE_KHI+LHE_PA-402_35.0_1", 
"ISB_KHI_ISB+KHI_ER-505_20.0_1", "ISB_KHI_ISB+KHI_PF-122_20.0_2", 
"ISB_KHI_ISB+KHI_PA-207_15.0_1", "KHI_LHE_KHI+LHE_PA-404_35.0_1", 
"KHI_ISB_KHI+ISB_PF-123_20.0_1", "ISB_KHI_ISB+KHI_ER-503_20.0_1", 
"ISB_GIL_ISB+GIL_PK-605_20.0_0", "KHI_ISB_KHI+ISB_PF-123_20.0_1", 
"KHI_ISB_KHI+ISB_PA-200_15.0_1", "ISB_KHI_ISB+KHI_PF-122_20.0_2", 
"KHI_LHE_KHI+LHE_PA-404_35.0_1", "ISB_KHI_ISB+KHI_PF-122_20.0_2", 
"PEW_KHI_PEW+KHI_PF-152_20.0_1", "LHE_KHI_LHE+KHI_PK-303_20.0_0", 
"KHI_ISB_KHI+ISB_PA-222_35.0_1", "ISB_KHI_ISB+KHI_PF-124_20.0_1"
), created_date = c("2021-04-20", "2021-05-27", "2021-02-13", 
"2021-08-14", "2021-08-11", "2021-08-21", "2021-01-26", "2021-08-21", 
"2021-05-24", "2021-09-15", "2021-06-05", "2021-07-19", "2021-09-29", 
"2021-07-02", "2021-08-10", "2021-01-04", "2021-07-15", "2021-07-14", 
"2021-08-13", "2021-01-11", "2021-09-13", "2021-09-20", "2021-05-27", 
"2021-02-20", "2021-08-15", "2021-07-27", "2021-08-26", "2021-09-15", 
"2021-08-02", "2021-06-25", "2021-05-15", "2021-08-26", "2021-07-30",
"2021-06-27", "2021-08-07", "2021-03-19", "2021-03-02", "2021-06-06", 
"2021-08-15", "2021-06-27", "2021-09-19", "2021-07-28", "2021-08-09", 
"2021-08-16", "2021-09-09", "2021-06-04", "2021-08-12", "2021-05-15", 
"2021-07-26", "2021-05-27", "2021-08-12", "2021-08-02", "2021-01-26", 
"2021-04-20", "2021-08-26", "2021-08-26", "2021-03-21", "2021-01-09", 
"2021-04-23", "2021-01-04", "2021-08-13", "2021-06-22", "2021-05-31", 
"2021-08-18", "2021-06-16", "2021-08-14", "2021-08-10", "2021-06-16", 
"2021-04-08", "2021-05-20", "2021-06-22", "2021-04-20", "2021-01-05", 
"2021-02-27", "2021-07-07", "2021-03-26", "2021-08-16", "2021-05-01", 
"2021-07-31", "2021-06-14", "2021-06-16", "2021-03-25", "2021-09-14", 
"2021-06-06", "2021-09-02", "2021-08-06", "2021-07-18", "2021-02-28", 
"2021-04-28", "2021-09-19", "2021-08-25", "2021-06-17", "2021-06-07", 
"2021-06-17", "2021-07-07", "2021-08-23", "2021-07-09", "2021-07-19", 
"2021-07-14", "2021-05-21"), gross_fare = c(7796, 7427, 11504, 
6870, 6580, 14945, 8697, 7524, 7124, 6785, 11858, 7524, 11500, 
9525, 6785, 8739, 8200, 13560, 9045, 7400, 7524, 12500, 7458, 
14000, 6570, 9525, 6220, 10545, 8310, 7900, 7820, 8410, 11285, 
19892, 6810, 9800, 11441, 11900, 6570, 13592, 11500, 8300, 20380, 
8525, 7340, 9707, 7870, 10655, 10545, 11798, 14645, 10545, 8650, 
8650, 7870, 12945, 10799, 10227, 6765, 10227, 20120, 11045, 9403, 
7870, 7124, 6570, 6810, 6531, 8605, 7124, 11072, 7390, 10227, 
13435, 10530, 12280, 18945, 11147, 10545, 6531, 6620, 10799, 
18480, 32702, 5606, 13560, 23895, 8027, 9655, 11500, 11990, 6620, 
9403, 7620, 14645, 19105, 9000, 6440, 12645, 8025)), row.names = c(NA, 
-100L), class = c("data.table", "data.frame")))

Upvotes: 1

Views: 64

Answers (2)

Rui Barradas
Rui Barradas

Reputation: 76402

Since the function findr returns a list, there's no need to complicate things:

findr <- function(x) {list(max = max(x), min = min(x))}

flights[, c("max_day", "min_day") := findr(gross_fare), by = list(key, created_date)][]

To also return the difference between max and min, use

findr2 <- function(x) {
  list(max = max(x), min = min(x), diff = diff(range(x)))
}

flights[, c("max_day", "min_day", "diff_day") := findr2(gross_fare), by = list(key, created_date)][]

Upvotes: 0

lovalery
lovalery

Reputation: 4652

I guess this line of code should do the job:

library(data.table)

flights[,  .(min_day = min(gross_fare), max_day = max(gross_fare), diff = max(gross_fare) - min(gross_fare)), by = .(key, created_date)][]

Upvotes: 2

Related Questions