Reputation: 86
I had a small question in regards to data.table. Since i'm not so good at it i'm not quite sure how I can do this in data.table.
Basically I have 3 columns and want to group by the first two columns ( key and date ) and then for each key and each date, find the maximum and minimum that occurred in the third column ( fare)
I tried doing this but it gives me an error
flights[, c("max_day", "min_day") := unlist(lapply(gross_fare, findr)), by = c("key", "created_date")]
Error in `[.data.table`(flights, , `:=`(c("max_day", "min_day"), unlist(lapply(gross_fare, :
Supplied 18 items to be assigned to group 1 of size 9 in column 'max_day'. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code.
findr is a function which just finds the max and min i.e.
findr <- function(x) {list(max = max(x), min = min(x)}
I've done what I want to do in dplyr and I'll attach the code for that, but since i have millions of rows, dplyr eats up my ram so data.table would help
test <- flights %>%
select(key, created_date, gross_fare) %>%
group_by(key, created_date) %>%
summarise(
max_day = max(gross_fare),
min_day = min(gross_fare),
diff = max_day - min_day) %>%
arrange(created_date)
I've put the dput output if anyone wants to use that If anyone can help that'd be great, thank you :)
data.table::setDT(structure(list(key = c("LHE_KHI_LHE+KHI_PA-405_15.0_1", "KHI_ISB_KHI+ISB_PK-370_20.0_0",
"LHE_KHI_LHE+KHI_PK-307_20.0_0", "ISB_KHI_ISB+KHI_PF-124_20.0_1",
"LHE_KHI_LHE+KHI_PK-307_20.0_0", "LHE_KHI_LHE+KHI_PA-405_15.0_1",
"KHI_LHE_KHI+LHE_PK-304_20.0_0", "KHI_ISB_KHI+ISB_PA-204_15.0_1",
"ISB_KHI_ISB+KHI_PA-207_15.0_1", "KHI_ISB_KHI+ISB_PA-200_20.0_1",
"KHI_LHE_KHI+LHE_PK-304_40.0_0", "ISB_KHI_ISB+KHI_PA-201_35.0_1",
"ISB_KHI_ISB+KHI_ER-501_20.0_1", "KHI_LHE_KHI+LHE_PF-145_20.0_2",
"KHI_ISB_KHI+ISB_PA-204_20.0_1", "LHE_KHI_LHE+KHI_PA-401_0.0_0",
"ISB_KHI_ISB+KHI_PK-309_40.0_0", "KHI_ISB_KHI+ISB_PF-123_20.0_2",
"ISB_KHI_ISB+KHI_PA-205_15.0_1", "LHE_KHI_LHE+KHI_PF-142_0.0_0",
"ISB_KHI_ISB+KHI_PA-223_15.0_1", "ISB_KHI_ISB+KHI_PF-126_20.0_2",
"ISB_KHI_ISB+KHI_PK-309_20.0_0", "KHI_ISB_KHI+ISB_PF-121_20.0_2",
"ISB_KHI_ISB+KHI_PK-373_20.0_0", "KHI_LHE_KHI+LHE_PF-145_20.0_2",
"KHI_LHE_KHI+LHE_PA-402_15.0_1", "LHE_KHI_LHE+KHI_PA-407_20.0_1",
"KHI_ISB_KHI+ISB_PK-308_40.0_0", "KHI_LHE_KHI+LHE_PF-145_20.0_2",
"LHE_KHI_LHE+KHI_PF-144_0.0_0", "ISB_KHI_ISB+KHI_PK-369_40.0_0",
"ISB_KHI_ISB+KHI_PF-124_20.0_2", "KHI_ISB_KHI+ISB_PA-204_15.0_1",
"KHI_ISB_KHI+ISB_PA-200_15.0_1", "ISB_KHI_ISB+KHI_PF-124_20.0_1",
"KHI_ISB_KHI+ISB_PK-300_20.0_0", "ISB_KHI_ISB+KHI_PF-122_20.0_2",
"KHI_ISB_KHI+ISB_PK-368_20.0_0", "KHI_ISB_KHI+ISB_PA-204_15.0_1",
"ISB_KHI_ISB+KHI_ER-503_20.0_1", "ISB_KHI_ISB+KHI_PA-209_15.0_1",
"KHI_ISB_KHI+ISB_PK-308_40.0_0", "ISB_KHI_ISB+KHI_PF-124_20.0_1",
"ISB_KHI_ISB+KHI_PK-301_40.0_0", "KHI_LHE_KHI+LHE_PA-408_35.0_1",
"LHE_KHI_LHE+KHI_PF-144_20.0_2", "KHI_ISB_KHI+ISB_PF-121_20.0_2",
"KHI_ISB_KHI+ISB_PA-204_35.0_1", "ISB_KHI_ISB+KHI_PK-309_40.0_0",
"ISB_KHI_ISB+KHI_PA-223_20.0_1", "KHI_ISB_KHI+ISB_PA-206_35.0_1",
"LHE_KHI_LHE+KHI_PF-142_32.0_1", "LHE_KHI_LHE+KHI_PF-142_20.0_1",
"KHI_ISB_KHI+ISB_PF-123_20.0_2", "ISB_KHI_ISB+KHI_PA-209_15.0_1",
"KHI_ISB_KHI+ISB_PA-204_35.0_1", "ISB_KHI_ISB+KHI_PA-201_20.0_1",
"KHI_ISB_KHI+ISB_PK-368_20.0_0", "ISB_KHI_ISB+KHI_PA-205_20.0_1",
"KHI_ISB_KHI+ISB_PF-121_20.0_1", "ISB_KHI_ISB+KHI_PF-124_20.0_1",
"ISB_KHI_ISB+KHI_PA-205_15.0_1", "KHI_LHE_KHI+LHE_PF-145_20.0_2",
"KHI_LHE_KHI+LHE_PA-406_35.0_1", "KHI_ISB_KHI+ISB_PK-308_20.0_0",
"LHE_KHI_LHE+KHI_PA-401_20.0_1", "LHE_KHI_LHE+KHI_PA-401_15.0_1",
"KHI_ISB_KHI+ISB_PA-204_35.0_1", "KHI_LHE_KHI+LHE_PA-406_35.0_1",
"KHI_ISB_KHI+ISB_PA-206_35.0_1", "KHI_ISB_KHI+ISB_PF-121_20.0_1",
"ISB_KHI_ISB+KHI_PA-205_20.0_1", "LHE_KHI_LHE+KHI_PF-142_20.0_1",
"LHE_KHI_LHE+KHI_PF-146_20.0_2", "LHE_KHI_LHE+KHI_PA-401_35.0_1",
"ISB_KHI_ISB+KHI_PA-209_15.0_1", "ISB_KHI_ISB+KHI_PK-301_40.0_0",
"ISB_KHI_ISB+KHI_PA-205_35.0_1", "KHI_LHE_KHI+LHE_PA-406_15.0_1",
"KHI_ISB_KHI+ISB_PF-123_20.0_1", "ISB_KHI_ISB+KHI_PA-201_35.0_1",
"KHI_ISB_KHI+ISB_PK-300_40.0_0", "KHI_LHE_KHI+LHE_PA-402_35.0_1",
"ISB_KHI_ISB+KHI_ER-505_20.0_1", "ISB_KHI_ISB+KHI_PF-122_20.0_2",
"ISB_KHI_ISB+KHI_PA-207_15.0_1", "KHI_LHE_KHI+LHE_PA-404_35.0_1",
"KHI_ISB_KHI+ISB_PF-123_20.0_1", "ISB_KHI_ISB+KHI_ER-503_20.0_1",
"ISB_GIL_ISB+GIL_PK-605_20.0_0", "KHI_ISB_KHI+ISB_PF-123_20.0_1",
"KHI_ISB_KHI+ISB_PA-200_15.0_1", "ISB_KHI_ISB+KHI_PF-122_20.0_2",
"KHI_LHE_KHI+LHE_PA-404_35.0_1", "ISB_KHI_ISB+KHI_PF-122_20.0_2",
"PEW_KHI_PEW+KHI_PF-152_20.0_1", "LHE_KHI_LHE+KHI_PK-303_20.0_0",
"KHI_ISB_KHI+ISB_PA-222_35.0_1", "ISB_KHI_ISB+KHI_PF-124_20.0_1"
), created_date = c("2021-04-20", "2021-05-27", "2021-02-13",
"2021-08-14", "2021-08-11", "2021-08-21", "2021-01-26", "2021-08-21",
"2021-05-24", "2021-09-15", "2021-06-05", "2021-07-19", "2021-09-29",
"2021-07-02", "2021-08-10", "2021-01-04", "2021-07-15", "2021-07-14",
"2021-08-13", "2021-01-11", "2021-09-13", "2021-09-20", "2021-05-27",
"2021-02-20", "2021-08-15", "2021-07-27", "2021-08-26", "2021-09-15",
"2021-08-02", "2021-06-25", "2021-05-15", "2021-08-26", "2021-07-30",
"2021-06-27", "2021-08-07", "2021-03-19", "2021-03-02", "2021-06-06",
"2021-08-15", "2021-06-27", "2021-09-19", "2021-07-28", "2021-08-09",
"2021-08-16", "2021-09-09", "2021-06-04", "2021-08-12", "2021-05-15",
"2021-07-26", "2021-05-27", "2021-08-12", "2021-08-02", "2021-01-26",
"2021-04-20", "2021-08-26", "2021-08-26", "2021-03-21", "2021-01-09",
"2021-04-23", "2021-01-04", "2021-08-13", "2021-06-22", "2021-05-31",
"2021-08-18", "2021-06-16", "2021-08-14", "2021-08-10", "2021-06-16",
"2021-04-08", "2021-05-20", "2021-06-22", "2021-04-20", "2021-01-05",
"2021-02-27", "2021-07-07", "2021-03-26", "2021-08-16", "2021-05-01",
"2021-07-31", "2021-06-14", "2021-06-16", "2021-03-25", "2021-09-14",
"2021-06-06", "2021-09-02", "2021-08-06", "2021-07-18", "2021-02-28",
"2021-04-28", "2021-09-19", "2021-08-25", "2021-06-17", "2021-06-07",
"2021-06-17", "2021-07-07", "2021-08-23", "2021-07-09", "2021-07-19",
"2021-07-14", "2021-05-21"), gross_fare = c(7796, 7427, 11504,
6870, 6580, 14945, 8697, 7524, 7124, 6785, 11858, 7524, 11500,
9525, 6785, 8739, 8200, 13560, 9045, 7400, 7524, 12500, 7458,
14000, 6570, 9525, 6220, 10545, 8310, 7900, 7820, 8410, 11285,
19892, 6810, 9800, 11441, 11900, 6570, 13592, 11500, 8300, 20380,
8525, 7340, 9707, 7870, 10655, 10545, 11798, 14645, 10545, 8650,
8650, 7870, 12945, 10799, 10227, 6765, 10227, 20120, 11045, 9403,
7870, 7124, 6570, 6810, 6531, 8605, 7124, 11072, 7390, 10227,
13435, 10530, 12280, 18945, 11147, 10545, 6531, 6620, 10799,
18480, 32702, 5606, 13560, 23895, 8027, 9655, 11500, 11990, 6620,
9403, 7620, 14645, 19105, 9000, 6440, 12645, 8025)), row.names = c(NA,
-100L), class = c("data.table", "data.frame")))
Upvotes: 1
Views: 64
Reputation: 76402
Since the function findr
returns a list, there's no need to complicate things:
findr <- function(x) {list(max = max(x), min = min(x))}
flights[, c("max_day", "min_day") := findr(gross_fare), by = list(key, created_date)][]
To also return the difference between max
and min
, use
findr2 <- function(x) {
list(max = max(x), min = min(x), diff = diff(range(x)))
}
flights[, c("max_day", "min_day", "diff_day") := findr2(gross_fare), by = list(key, created_date)][]
Upvotes: 0
Reputation: 4652
I guess this line of code should do the job:
library(data.table)
flights[, .(min_day = min(gross_fare), max_day = max(gross_fare), diff = max(gross_fare) - min(gross_fare)), by = .(key, created_date)][]
Upvotes: 2