user1766682
user1766682

Reputation: 400

Aggregating over time and value difference

I have a chronologically ordered data.frame like this one:

d1 <- data.frame(date = as.POSIXct(c("2010-05-21 08:40:30",
                                 "2010-05-21 09:02:06",
                                 "2010-05-21 09:21:00",
                                 "2010-05-21 09:25:00",
                                 "2010-05-21 09:41:53",
                                 "2010-05-21 11:27:34",
                                 "2010-05-21 15:01:29",
                                 "2010-05-21 15:16:01",
                                 "2010-05-21 18:25:14",
                                 "2010-05-21 19:59:37",
                                 "2010-05-21 22:29:50"), format ="%Y-%m-%d %H:%M:%S"),
                              price = c(5, 5.2, 6, 8, 7, 5, 6, 6, 6, 6.5, 7.4),
                              value = c(11313,42423,64645,20000,643426,1313313,1313,3535,6476,11313,9875))

How to aggregate the value column by the following rules:

  1. Start with the first row and go row by row
  2. check the next record's timestamp, if it is whithin 30 mins and the price difference is <= 1 USD (both conditions applied to the first record in the current bucket), then
  3. run cumsum(value) over all rows from the first row in the current bucket and until a) out of 30 mins or b) price difference > 1 USD
  4. go to next row and follow 1 to 3
  5. if conditions are violated, start the next bucket

The resulting data.frame should be aggregated:

Resulting data.frame:

53736   row 1+2
64645   row 3
663426  row 4+5
1313313 row 6
4848    row 7+8
6476    row 9
11313   row 10
9875        row 11




time_diff; price_diff
true; true  -> aggregate
true; false -> leave
false; true -> leave
false; false -> leave

Thank you!

UPDATE:

An additional example data.frame

    d1 <- data.frame(date = as.POSIXct(c("2010-02-09 14:05:45", "2010-02-09 14:05:52",
"2010-02-09 14:37:31", "2010-02-09 14:43:37", "2010-02-09 14:44:15", "2010-02-09 15:10:37", 
"2010-02-09 15:10:44", "2010-02-09 15:12:29", "2010-02-09 15:13:48", "2010-02-09 15:21:53", 
"2010-02-09 15:33:40", "2010-02-09 15:33:46", "2010-02-09 15:42:26", "2010-02-09 15:42:38", 
"2010-02-13 11:06:31", "2010-03-16 15:48:42", "2010-03-19 08:23:01", "2010-03-19 11:29:58", 
"2010-03-22 14:28:24", "2010-04-10 11:08:21"), format ="%Y-%m-%d %H:%M:%S"),
value = c(1074, 1075, 1500, 3000, 3000, 2500, 2500, 1000, 1000, 1000, 
1000, 1000, 1000, 1000, 6000, 5000, 1000, 5000, 3500, 1000),
price = c(154.1, 154, 128.9, 131.8, 131.7, 131.7, 131.6, 131.7, 
131.8, 131.8, 129.2, 129.2, 127.8, 127.7, 120.9, 29.1, 29, 35.6, 69.8, 11.6))

EXPECTED RESULT:

row 1+2
row 3
row 4 to 8
row 9+10
row 11+12
row 13+14
row 15
row 16
row 17
row 18
row 19
row 20

UPDATE 2 For the additional dataset I wrote a For Loop which goes row by row. It is not a elegant solution but it seems to work. And I think that I still have a problem with the last row (I am hard coding it at the beginning of the For Loop).

## init of an empty list
ids_in_current_backet <- list()

## loop row by roe
for (cur_row in seq(1, nrow(d1), 1)) {

  # if it is last row, break the for loop
  if(cur_row == nrow(d1)){
    d1$ids_in_current_backet[[cur_row]] <- list(nrow(d1))
    break}
  # collect ids in the current bucket
  ids_in_current_backet <- c(ids_in_current_backet, cur_row)

  # calc of differences
  time_diff <- (as.numeric(d1$date[[last(ids_in_current_backet)]] -  d1$date[[first(ids_in_current_backet)]], units = 'mins'))
  price_diff <- abs(d1$price[[last(ids_in_current_backet)]] - d1$price[[first(ids_in_current_backet)]])

  # conditions not met: more than 30 mins time OR price diff more than one
  if(time_diff > 30 | price_diff > 1){
    ids_in_current_backet <- list()
    ids_in_current_backet <- c(ids_in_current_backet, cur_row)
    d1$ids_in_current_backet[[cur_row]] <- ids_in_current_backet
   } 

  d1$ids_in_current_backet[[cur_row]] <- ids_in_current_backet

}

### extract the first element from the list as a grouping variable

for (cur_row in seq(1, nrow(d1), 1)) {
   d1$grouping[[cur_row]] <- d1$ids_in_current_backet[[cur_row]][[1]]
}

## sumarise value per grouping

d1 %>% group_by(grouping) %>%
  summarise(sum_value = sum(value, na.rm = T))

UPDATE 3 additional dataset

d1 <- data.frame(date = as.POSIXct(c("2009-01-18 15:55:54", "2009-01-22 10:24:49", "2009-02-15 11:17:14", "2009-02-22 14:27:03", 
                                 "2009-04-19 08:59:42", "2009-05-18 08:36:13", "2009-05-23 11:03:53", 
                                 "2009-05-24 12:02:06", "2009-05-24 12:02:22", "2009-05-30 08:35:04", 
                                 "2009-05-30 12:17:50", "2009-06-15 09:11:45", "2009-06-18 11:40:19", 
                                 "2009-06-18 13:22:06", "2009-06-25 14:09:07", "2009-06-28 09:47:09", 
                                 "2009-06-28 09:51:01", "2009-06-28 09:52:53", "2009-06-28 09:54:33", 
                                 "2009-06-28 15:21:44", "2009-06-28 15:34:10", "2009-07-02 12:10:56", 
                                 "2009-07-27 09:09:20", "2009-08-13 09:58:02"), format ="%Y-%m-%d %H:%M:%S"),
             value = c(5000, 3000, 15000, 1000, 1000, 9360, 8000, 4550, 2800, 1000, 2325, 1000, 3000, 1000, 1500, 4000, 10000, 10000, 3500, 10000, 3000, 1000, 6000, 2000),
             price = c(169.5, 153.5, 254.8, 245.7, 160.5, 105.8, 115.2, 111.2, 111.3, 164.1, 162.8, 93.5, 126, 124.2, 155, 169.3, 166.5, 168.8, 168.8, 177.5, 174.2, 166.2, 79.5, 119.5))

Upvotes: 1

Views: 87

Answers (2)

chinsoon12
chinsoon12

Reputation: 25225

Posting a possible data.table approach:

library(data.table)
func <- function(df) {
    DT <- setDT(copy(df))[, rn := .I]
    chosen <- c()
    DT[, 
        {
            #drop those rows that have already been chosen
            idx <- setdiff(
                DT[rn >= .BY$rn &                    #must be after current row
                        date <= .BY$date+30L*60L &   #must be within 30mins
                        abs(price - .BY$price) <= 1, #price diff less than 1
                    rn], 
                chosen)

            if (.BY$rn %in% idx && all(diff(idx) == 1L)) {
                #if there are other rows that should go into in this bucket with this row
                #and they are consecutive rows
                chosen <- c(chosen, idx)
                list(workings=paste(idx, collapse="+"), val=DT[idx, sum(value)])

            } else if (.BY$rn %in% idx && !all(diff(idx) == 1L)) {
                #if this row has never been used and there are non consecutive rows that 
                #had wanted to go into this bucket
                chosen <- c(chosen, .BY$rn)
                list(workings=as.character(.BY$rn), val=DT[.BY$rn, value])
            }
        },
        by=.(rn, date, price)]
}

output for func(d1):

   rn                date price workings     val
1:  1 2010-05-21 08:40:30   5.0      1+2   53736
2:  3 2010-05-21 09:21:00   6.0        3   64645
3:  4 2010-05-21 09:25:00   8.0      4+5  663426
4:  6 2010-05-21 11:27:34   5.0        6 1313313
5:  7 2010-05-21 15:01:29   6.0      7+8    4848
6:  9 2010-05-21 18:25:14   6.0        9    6476
7: 10 2010-05-21 19:59:37   6.5       10   11313
8: 11 2010-05-21 22:29:50   7.4       11    9875

output for func(d2):

    rn                date price  workings   val
 1:  1 2010-02-09 14:05:45 154.1       1+2  2149
 2:  3 2010-02-09 14:37:31 128.9         3  1500
 3:  4 2010-02-09 14:43:37 131.8 4+5+6+7+8 12000
 4:  9 2010-02-09 15:13:48 131.8      9+10  2000
 5: 11 2010-02-09 15:33:40 129.2     11+12  2000
 6: 13 2010-02-09 15:42:26 127.8     13+14  2000
 7: 15 2010-02-13 11:06:31 120.9        15  6000
 8: 16 2010-03-16 15:48:42  29.1        16  5000
 9: 17 2010-03-19 08:23:01  29.0        17  1000
10: 18 2010-03-19 11:29:58  35.6        18  5000
11: 19 2010-03-22 14:28:24  69.8        19  3500
12: 20 2010-04-10 11:08:21  11.6        20  1000

output for func(d3):

    rn                date price workings   val
 1:  1 2009-01-18 15:55:54 169.5        1  5000
 2:  2 2009-01-22 10:24:49 153.5        2  3000
 3:  3 2009-02-15 11:17:14 254.8        3 15000
 4:  4 2009-02-22 14:27:03 245.7        4  1000
 5:  5 2009-04-19 08:59:42 160.5        5  1000
 6:  6 2009-05-18 08:36:13 105.8        6  9360
 7:  7 2009-05-23 11:03:53 115.2        7  8000
 8:  8 2009-05-24 12:02:06 111.2      8+9  7350
 9: 10 2009-05-30 08:35:04 164.1       10  1000
10: 11 2009-05-30 12:17:50 162.8       11  2325
11: 12 2009-06-15 09:11:45  93.5       12  1000
12: 13 2009-06-18 11:40:19 126.0       13  3000
13: 14 2009-06-18 13:22:06 124.2       14  1000
14: 15 2009-06-25 14:09:07 155.0       15  1500
15: 16 2009-06-28 09:47:09 169.3       16  4000
16: 17 2009-06-28 09:51:01 166.5       17 10000
17: 18 2009-06-28 09:52:53 168.8    18+19 13500
18: 20 2009-06-28 15:21:44 177.5       20 10000
19: 21 2009-06-28 15:34:10 174.2       21  3000
20: 22 2009-07-02 12:10:56 166.2       22  1000
21: 23 2009-07-27 09:09:20  79.5       23  6000
22: 24 2009-08-13 09:58:02 119.5       24  2000
    rn                date price workings   val

data:

d1 <- data.frame(date = as.POSIXct(c("2010-05-21 08:40:30",
    "2010-05-21 09:02:06",
    "2010-05-21 09:21:00",
    "2010-05-21 09:25:00",
    "2010-05-21 09:41:53",
    "2010-05-21 11:27:34",
    "2010-05-21 15:01:29",
    "2010-05-21 15:16:01",
    "2010-05-21 18:25:14",
    "2010-05-21 19:59:37",
    "2010-05-21 22:29:50"), format ="%Y-%m-%d %H:%M:%S"),
    price = c(5, 5.2, 6, 8, 7, 5, 6, 6, 6, 6.5, 7.4),
    value = c(11313,42423,64645,20000,643426,1313313,1313,3535,6476,11313,9875))

####################################################################################################

d2 <- data.frame(date = as.POSIXct(c("2010-02-09 14:05:45", "2010-02-09 14:05:52",
    "2010-02-09 14:37:31", "2010-02-09 14:43:37",
    "2010-02-09 14:44:15", "2010-02-09 15:10:37",
    "2010-02-09 15:10:44", "2010-02-09 15:12:29",
    "2010-02-09 15:13:48", "2010-02-09 15:21:53",
    "2010-02-09 15:33:40", "2010-02-09 15:33:46",
    "2010-02-09 15:42:26", "2010-02-09 15:42:38",
    "2010-02-13 11:06:31", "2010-03-16 15:48:42",
    "2010-03-19 08:23:01", "2010-03-19 11:29:58",
    "2010-03-22 14:28:24", "2010-04-10 11:08:21"), format ="%Y-%m-%d %H:%M:%S"),
    value = c(1074, 1075, 1500, 3000, 3000, 2500, 2500, 1000, 1000, 1000,
        1000, 1000, 1000, 1000, 6000, 5000, 1000, 5000, 3500, 1000),
    price = c(154.1, 154, 128.9, 131.8, 131.7, 131.7, 131.6, 131.7,
        131.8, 131.8, 129.2, 129.2, 127.8, 127.7, 120.9, 29.1, 29, 35.6, 69.8, 11.6))



####################################################################################################

d3 <- data.frame(date = as.POSIXct(c("2009-01-18 15:55:54", "2009-01-22 10:24:49",
    "2009-02-15 11:17:14", "2009-02-22 14:27:03",
    "2009-04-19 08:59:42", "2009-05-18 08:36:13", "2009-05-23 11:03:53",
    "2009-05-24 12:02:06", "2009-05-24 12:02:22", "2009-05-30 08:35:04",
    "2009-05-30 12:17:50", "2009-06-15 09:11:45", "2009-06-18 11:40:19",
    "2009-06-18 13:22:06", "2009-06-25 14:09:07", "2009-06-28 09:47:09",
    "2009-06-28 09:51:01", "2009-06-28 09:52:53", "2009-06-28 09:54:33",
    "2009-06-28 15:21:44", "2009-06-28 15:34:10", "2009-07-02 12:10:56",
    "2009-07-27 09:09:20", "2009-08-13 09:58:02"), format ="%Y-%m-%d %H:%M:%S"),
    value = c(5000, 3000, 15000, 1000, 1000, 9360, 8000, 4550, 2800, 1000, 2325, 1000,
        3000, 1000, 1500, 4000, 10000, 10000, 3500, 10000, 3000, 1000, 6000, 2000),
    price = c(169.5, 153.5, 254.8, 245.7, 160.5, 105.8, 115.2, 111.2, 111.3, 164.1,
        162.8, 93.5, 126, 124.2, 155, 169.3, 166.5, 168.8, 168.8, 177.5, 174.2,
        166.2, 79.5, 119.5))

Upvotes: 1

user1766682
user1766682

Reputation: 400

I used this not elegant solution:

    ## init of an empty list
ids_in_current_backet <- list()

## loop row by roe
for (cur_row in seq(1, nrow(d1), 1)) {

  # if it is last row, break the for loop
  if(cur_row == nrow(d1)){
    d1$ids_in_current_backet[[cur_row]] <- list(nrow(d1))
    break}
  # collect ids in the current bucket
  ids_in_current_backet <- c(ids_in_current_backet, cur_row)

  # calc of differences
  time_diff <- (as.numeric(d1$date[[last(ids_in_current_backet)]] -  d1$date[[first(ids_in_current_backet)]], units = 'mins'))
  price_diff <- abs(d1$price[[last(ids_in_current_backet)]] - d1$price[[first(ids_in_current_backet)]])

  # conditions not met: more than 30 mins time OR price diff more than one
  if(time_diff > 30 | price_diff > 1){
    ids_in_current_backet <- list()
    ids_in_current_backet <- c(ids_in_current_backet, cur_row)
    d1$ids_in_current_backet[[cur_row]] <- ids_in_current_backet
   } 

  d1$ids_in_current_backet[[cur_row]] <- ids_in_current_backet

}

### extract the first element from the list as a grouping variable

for (cur_row in seq(1, nrow(d1), 1)) {
   d1$grouping[[cur_row]] <- d1$ids_in_current_backet[[cur_row]][[1]]
}

## sumarise value per grouping

d1 %>% group_by(grouping) %>%
  summarise(sum_value = sum(value, na.rm = T))

Upvotes: 0

Related Questions