Reputation: 400
I have a chronologically ordered data.frame like this one:
d1 <- data.frame(date = as.POSIXct(c("2010-05-21 08:40:30",
"2010-05-21 09:02:06",
"2010-05-21 09:21:00",
"2010-05-21 09:25:00",
"2010-05-21 09:41:53",
"2010-05-21 11:27:34",
"2010-05-21 15:01:29",
"2010-05-21 15:16:01",
"2010-05-21 18:25:14",
"2010-05-21 19:59:37",
"2010-05-21 22:29:50"), format ="%Y-%m-%d %H:%M:%S"),
price = c(5, 5.2, 6, 8, 7, 5, 6, 6, 6, 6.5, 7.4),
value = c(11313,42423,64645,20000,643426,1313313,1313,3535,6476,11313,9875))
How to aggregate the value column by the following rules:
The resulting data.frame should be aggregated:
Resulting data.frame:
53736 row 1+2
64645 row 3
663426 row 4+5
1313313 row 6
4848 row 7+8
6476 row 9
11313 row 10
9875 row 11
time_diff; price_diff
true; true -> aggregate
true; false -> leave
false; true -> leave
false; false -> leave
Thank you!
UPDATE:
An additional example data.frame
d1 <- data.frame(date = as.POSIXct(c("2010-02-09 14:05:45", "2010-02-09 14:05:52",
"2010-02-09 14:37:31", "2010-02-09 14:43:37", "2010-02-09 14:44:15", "2010-02-09 15:10:37",
"2010-02-09 15:10:44", "2010-02-09 15:12:29", "2010-02-09 15:13:48", "2010-02-09 15:21:53",
"2010-02-09 15:33:40", "2010-02-09 15:33:46", "2010-02-09 15:42:26", "2010-02-09 15:42:38",
"2010-02-13 11:06:31", "2010-03-16 15:48:42", "2010-03-19 08:23:01", "2010-03-19 11:29:58",
"2010-03-22 14:28:24", "2010-04-10 11:08:21"), format ="%Y-%m-%d %H:%M:%S"),
value = c(1074, 1075, 1500, 3000, 3000, 2500, 2500, 1000, 1000, 1000,
1000, 1000, 1000, 1000, 6000, 5000, 1000, 5000, 3500, 1000),
price = c(154.1, 154, 128.9, 131.8, 131.7, 131.7, 131.6, 131.7,
131.8, 131.8, 129.2, 129.2, 127.8, 127.7, 120.9, 29.1, 29, 35.6, 69.8, 11.6))
EXPECTED RESULT:
row 1+2
row 3
row 4 to 8
row 9+10
row 11+12
row 13+14
row 15
row 16
row 17
row 18
row 19
row 20
UPDATE 2 For the additional dataset I wrote a For Loop which goes row by row. It is not a elegant solution but it seems to work. And I think that I still have a problem with the last row (I am hard coding it at the beginning of the For Loop).
## init of an empty list
ids_in_current_backet <- list()
## loop row by roe
for (cur_row in seq(1, nrow(d1), 1)) {
# if it is last row, break the for loop
if(cur_row == nrow(d1)){
d1$ids_in_current_backet[[cur_row]] <- list(nrow(d1))
break}
# collect ids in the current bucket
ids_in_current_backet <- c(ids_in_current_backet, cur_row)
# calc of differences
time_diff <- (as.numeric(d1$date[[last(ids_in_current_backet)]] - d1$date[[first(ids_in_current_backet)]], units = 'mins'))
price_diff <- abs(d1$price[[last(ids_in_current_backet)]] - d1$price[[first(ids_in_current_backet)]])
# conditions not met: more than 30 mins time OR price diff more than one
if(time_diff > 30 | price_diff > 1){
ids_in_current_backet <- list()
ids_in_current_backet <- c(ids_in_current_backet, cur_row)
d1$ids_in_current_backet[[cur_row]] <- ids_in_current_backet
}
d1$ids_in_current_backet[[cur_row]] <- ids_in_current_backet
}
### extract the first element from the list as a grouping variable
for (cur_row in seq(1, nrow(d1), 1)) {
d1$grouping[[cur_row]] <- d1$ids_in_current_backet[[cur_row]][[1]]
}
## sumarise value per grouping
d1 %>% group_by(grouping) %>%
summarise(sum_value = sum(value, na.rm = T))
UPDATE 3 additional dataset
d1 <- data.frame(date = as.POSIXct(c("2009-01-18 15:55:54", "2009-01-22 10:24:49", "2009-02-15 11:17:14", "2009-02-22 14:27:03",
"2009-04-19 08:59:42", "2009-05-18 08:36:13", "2009-05-23 11:03:53",
"2009-05-24 12:02:06", "2009-05-24 12:02:22", "2009-05-30 08:35:04",
"2009-05-30 12:17:50", "2009-06-15 09:11:45", "2009-06-18 11:40:19",
"2009-06-18 13:22:06", "2009-06-25 14:09:07", "2009-06-28 09:47:09",
"2009-06-28 09:51:01", "2009-06-28 09:52:53", "2009-06-28 09:54:33",
"2009-06-28 15:21:44", "2009-06-28 15:34:10", "2009-07-02 12:10:56",
"2009-07-27 09:09:20", "2009-08-13 09:58:02"), format ="%Y-%m-%d %H:%M:%S"),
value = c(5000, 3000, 15000, 1000, 1000, 9360, 8000, 4550, 2800, 1000, 2325, 1000, 3000, 1000, 1500, 4000, 10000, 10000, 3500, 10000, 3000, 1000, 6000, 2000),
price = c(169.5, 153.5, 254.8, 245.7, 160.5, 105.8, 115.2, 111.2, 111.3, 164.1, 162.8, 93.5, 126, 124.2, 155, 169.3, 166.5, 168.8, 168.8, 177.5, 174.2, 166.2, 79.5, 119.5))
Upvotes: 1
Views: 87
Reputation: 25225
Posting a possible data.table
approach:
library(data.table)
func <- function(df) {
DT <- setDT(copy(df))[, rn := .I]
chosen <- c()
DT[,
{
#drop those rows that have already been chosen
idx <- setdiff(
DT[rn >= .BY$rn & #must be after current row
date <= .BY$date+30L*60L & #must be within 30mins
abs(price - .BY$price) <= 1, #price diff less than 1
rn],
chosen)
if (.BY$rn %in% idx && all(diff(idx) == 1L)) {
#if there are other rows that should go into in this bucket with this row
#and they are consecutive rows
chosen <- c(chosen, idx)
list(workings=paste(idx, collapse="+"), val=DT[idx, sum(value)])
} else if (.BY$rn %in% idx && !all(diff(idx) == 1L)) {
#if this row has never been used and there are non consecutive rows that
#had wanted to go into this bucket
chosen <- c(chosen, .BY$rn)
list(workings=as.character(.BY$rn), val=DT[.BY$rn, value])
}
},
by=.(rn, date, price)]
}
output for func(d1)
:
rn date price workings val
1: 1 2010-05-21 08:40:30 5.0 1+2 53736
2: 3 2010-05-21 09:21:00 6.0 3 64645
3: 4 2010-05-21 09:25:00 8.0 4+5 663426
4: 6 2010-05-21 11:27:34 5.0 6 1313313
5: 7 2010-05-21 15:01:29 6.0 7+8 4848
6: 9 2010-05-21 18:25:14 6.0 9 6476
7: 10 2010-05-21 19:59:37 6.5 10 11313
8: 11 2010-05-21 22:29:50 7.4 11 9875
output for func(d2)
:
rn date price workings val
1: 1 2010-02-09 14:05:45 154.1 1+2 2149
2: 3 2010-02-09 14:37:31 128.9 3 1500
3: 4 2010-02-09 14:43:37 131.8 4+5+6+7+8 12000
4: 9 2010-02-09 15:13:48 131.8 9+10 2000
5: 11 2010-02-09 15:33:40 129.2 11+12 2000
6: 13 2010-02-09 15:42:26 127.8 13+14 2000
7: 15 2010-02-13 11:06:31 120.9 15 6000
8: 16 2010-03-16 15:48:42 29.1 16 5000
9: 17 2010-03-19 08:23:01 29.0 17 1000
10: 18 2010-03-19 11:29:58 35.6 18 5000
11: 19 2010-03-22 14:28:24 69.8 19 3500
12: 20 2010-04-10 11:08:21 11.6 20 1000
output for func(d3)
:
rn date price workings val
1: 1 2009-01-18 15:55:54 169.5 1 5000
2: 2 2009-01-22 10:24:49 153.5 2 3000
3: 3 2009-02-15 11:17:14 254.8 3 15000
4: 4 2009-02-22 14:27:03 245.7 4 1000
5: 5 2009-04-19 08:59:42 160.5 5 1000
6: 6 2009-05-18 08:36:13 105.8 6 9360
7: 7 2009-05-23 11:03:53 115.2 7 8000
8: 8 2009-05-24 12:02:06 111.2 8+9 7350
9: 10 2009-05-30 08:35:04 164.1 10 1000
10: 11 2009-05-30 12:17:50 162.8 11 2325
11: 12 2009-06-15 09:11:45 93.5 12 1000
12: 13 2009-06-18 11:40:19 126.0 13 3000
13: 14 2009-06-18 13:22:06 124.2 14 1000
14: 15 2009-06-25 14:09:07 155.0 15 1500
15: 16 2009-06-28 09:47:09 169.3 16 4000
16: 17 2009-06-28 09:51:01 166.5 17 10000
17: 18 2009-06-28 09:52:53 168.8 18+19 13500
18: 20 2009-06-28 15:21:44 177.5 20 10000
19: 21 2009-06-28 15:34:10 174.2 21 3000
20: 22 2009-07-02 12:10:56 166.2 22 1000
21: 23 2009-07-27 09:09:20 79.5 23 6000
22: 24 2009-08-13 09:58:02 119.5 24 2000
rn date price workings val
d1 <- data.frame(date = as.POSIXct(c("2010-05-21 08:40:30",
"2010-05-21 09:02:06",
"2010-05-21 09:21:00",
"2010-05-21 09:25:00",
"2010-05-21 09:41:53",
"2010-05-21 11:27:34",
"2010-05-21 15:01:29",
"2010-05-21 15:16:01",
"2010-05-21 18:25:14",
"2010-05-21 19:59:37",
"2010-05-21 22:29:50"), format ="%Y-%m-%d %H:%M:%S"),
price = c(5, 5.2, 6, 8, 7, 5, 6, 6, 6, 6.5, 7.4),
value = c(11313,42423,64645,20000,643426,1313313,1313,3535,6476,11313,9875))
####################################################################################################
d2 <- data.frame(date = as.POSIXct(c("2010-02-09 14:05:45", "2010-02-09 14:05:52",
"2010-02-09 14:37:31", "2010-02-09 14:43:37",
"2010-02-09 14:44:15", "2010-02-09 15:10:37",
"2010-02-09 15:10:44", "2010-02-09 15:12:29",
"2010-02-09 15:13:48", "2010-02-09 15:21:53",
"2010-02-09 15:33:40", "2010-02-09 15:33:46",
"2010-02-09 15:42:26", "2010-02-09 15:42:38",
"2010-02-13 11:06:31", "2010-03-16 15:48:42",
"2010-03-19 08:23:01", "2010-03-19 11:29:58",
"2010-03-22 14:28:24", "2010-04-10 11:08:21"), format ="%Y-%m-%d %H:%M:%S"),
value = c(1074, 1075, 1500, 3000, 3000, 2500, 2500, 1000, 1000, 1000,
1000, 1000, 1000, 1000, 6000, 5000, 1000, 5000, 3500, 1000),
price = c(154.1, 154, 128.9, 131.8, 131.7, 131.7, 131.6, 131.7,
131.8, 131.8, 129.2, 129.2, 127.8, 127.7, 120.9, 29.1, 29, 35.6, 69.8, 11.6))
####################################################################################################
d3 <- data.frame(date = as.POSIXct(c("2009-01-18 15:55:54", "2009-01-22 10:24:49",
"2009-02-15 11:17:14", "2009-02-22 14:27:03",
"2009-04-19 08:59:42", "2009-05-18 08:36:13", "2009-05-23 11:03:53",
"2009-05-24 12:02:06", "2009-05-24 12:02:22", "2009-05-30 08:35:04",
"2009-05-30 12:17:50", "2009-06-15 09:11:45", "2009-06-18 11:40:19",
"2009-06-18 13:22:06", "2009-06-25 14:09:07", "2009-06-28 09:47:09",
"2009-06-28 09:51:01", "2009-06-28 09:52:53", "2009-06-28 09:54:33",
"2009-06-28 15:21:44", "2009-06-28 15:34:10", "2009-07-02 12:10:56",
"2009-07-27 09:09:20", "2009-08-13 09:58:02"), format ="%Y-%m-%d %H:%M:%S"),
value = c(5000, 3000, 15000, 1000, 1000, 9360, 8000, 4550, 2800, 1000, 2325, 1000,
3000, 1000, 1500, 4000, 10000, 10000, 3500, 10000, 3000, 1000, 6000, 2000),
price = c(169.5, 153.5, 254.8, 245.7, 160.5, 105.8, 115.2, 111.2, 111.3, 164.1,
162.8, 93.5, 126, 124.2, 155, 169.3, 166.5, 168.8, 168.8, 177.5, 174.2,
166.2, 79.5, 119.5))
Upvotes: 1
Reputation: 400
I used this not elegant solution:
## init of an empty list
ids_in_current_backet <- list()
## loop row by roe
for (cur_row in seq(1, nrow(d1), 1)) {
# if it is last row, break the for loop
if(cur_row == nrow(d1)){
d1$ids_in_current_backet[[cur_row]] <- list(nrow(d1))
break}
# collect ids in the current bucket
ids_in_current_backet <- c(ids_in_current_backet, cur_row)
# calc of differences
time_diff <- (as.numeric(d1$date[[last(ids_in_current_backet)]] - d1$date[[first(ids_in_current_backet)]], units = 'mins'))
price_diff <- abs(d1$price[[last(ids_in_current_backet)]] - d1$price[[first(ids_in_current_backet)]])
# conditions not met: more than 30 mins time OR price diff more than one
if(time_diff > 30 | price_diff > 1){
ids_in_current_backet <- list()
ids_in_current_backet <- c(ids_in_current_backet, cur_row)
d1$ids_in_current_backet[[cur_row]] <- ids_in_current_backet
}
d1$ids_in_current_backet[[cur_row]] <- ids_in_current_backet
}
### extract the first element from the list as a grouping variable
for (cur_row in seq(1, nrow(d1), 1)) {
d1$grouping[[cur_row]] <- d1$ids_in_current_backet[[cur_row]][[1]]
}
## sumarise value per grouping
d1 %>% group_by(grouping) %>%
summarise(sum_value = sum(value, na.rm = T))
Upvotes: 0