Generate ranking by date then transfer to second data frame

Question

I want to generate a column that contains an index based on an event in another column, with the indices ordered by date. For other events, the index is assigned based on the date range. The index should grouped by unit id.

The data consists of units, events, and dates two different data frames. One data frame has an event called "Entire File". When the "Entire file" event occurs, a rank should be incremented. Then the ranking has to be transferred to other rows for that unit and in that date range. If there are events before the first "Entire File" event, those events should have a rank 0.

In the first data frame, which contains "Entire File" events, if the unit number changes, and the first event is not "Entire File", then the beginning events have the last rank of the previous unit number.

Then the ranks have to be transferred to subsequent data frames by unit / date range. This is where I'm stuck

Hope this example makes the question clear.

Generate sample data

set.seed(13)

# Ref http://stackoverflow.com/questions/14720983/efficiently-generate-a-random-sample-of-times-and-dates-between-two-dates

rDates <- function(N, st="2014/01/01", et="2014/10/01") {
  st <- as.POSIXct(as.Date(st))
  et <- as.POSIXct(as.Date(et))
  dt <- as.numeric(difftime(et,st,unit="sec"))
  ev <- sort(runif(N, 0, dt))
  rt <- st + ev
}

nSamples_df1=100
nSamples_df2=75

df1<-data.frame(Event.Name=sample(c("Entire File",paste("Event ",letters[1:5])),nSamples_df1,replace=TRUE,prob=c(3,rep(1,5))),
                unit=sample(1:10,nSamples_df1,replace=TRUE),
                event_time = rDates(nSamples_df1))

df2<-data.frame(event=sample(c(paste("Event ",letters[6:10])),nSamples_df2,replace=TRUE),
                unit=sample(1:10,nSamples_df2,replace=TRUE),
                event_time = rDates(nSamples_df2))

The first step is to order the data and then rank the "Entire File" events in df1.

# Put df1 in order
df1<-with(df1,head(df1[order(unit,event_time),],50))
# Extract and rank the "Entire File" events
entireFileEvents <- df1[df1$Event.Name=="Entire File",
                        c("Event.Name","unit","event_time")]
rankedEntireFileEvents <- transform(entireFileEvents, 
                                    fileEventIndex = ave(xtfrm(event_time), unit, 
                                                     FUN = function(x) rank(x, ties.method = "first")))

Transfer the data to the original data frame. I'm not sure if this is correct, do the ranks end up in the correct location?

df1$fileEventIndex=NA

# Original risky assignment
# df1[df1$Event.Name=="Entire File","fileEventIndex"] <- rankedEntireFileEvents$fileEventIndex
# I'm not sure how to use merge in this case
# df1b <- merge(df1, rankedEntireFileEvents, by=c("Event.Name","unit","event_time"), sort = FALSE)
# Assignment using match, thanks akrun
match_rows <- match(paste(df1$Event.Name, df1$unit, df1$event_time),
      paste(rankedEntireFileEvents$Event.Name, rankedEntireFileEvents$unit, rankedEntireFileEvents$event_time))
df1_match_rows = which(!is.na(match_rows))
refe_match_rows = match_rows[!is.na(match_rows)]
df1[df1_match_rows,"fileEventIndex"] <- rankedEntireFileEvents$fileEventIndex[refe_match_rows]

Fill in the remaining ranks using zoo

library(zoo)
df1<-na.locf(df1, na.rm = FALSE)
df1$fileEventIndex[is.na(df1$fileEventIndex)]=0

Now, I don't know how to transfer the fileEventIndex from rankedEntireFileEvents into the second data frame. For a unit, the fileEventIndexin df2 should have the same value if the date is later.

Here are the current results for df1. The rank is not correct because the unit changed before a "Entire File" occurred, so the first event for unit 2 has rank 4, and should have rank 0.

> with(df1,head(df1[order(unit,event_time),],50))
     Event.Name unit          event_time fileEventIndex
6   Entire File    1 2014-01-09 01:43:24              1
12     Event  a    1 2014-01-23 10:25:59              1
26     Event  c    1 2014-02-26 16:51:07              1
28     Event  b    1 2014-03-04 05:39:57              1
47  Entire File    1 2014-05-05 02:19:16              2
67  Entire File    1 2014-07-01 18:52:56              3
76     Event  a    1 2014-07-21 03:42:14              3
82     Event  b    1 2014-08-07 16:33:33              3
87     Event  a    1 2014-08-22 01:04:39              3
89  Entire File    1 2014-08-30 15:42:21              4
94     Event  a    1 2014-09-07 13:46:25              4
8      Event  e    2 2014-01-12 23:49:24              4   <-- This should be 1
16  Entire File    2 2014-01-27 10:20:28              1
21  Entire File    2 2014-02-11 17:24:22              2
22     Event  c    2 2014-02-21 22:32:28              2

Here is an approach for the 2nd df, but gives incorrect results

df2$fileEventIndex=NA
units <- sort(unique(rankedEntireFileEvents$unit))

for (iu in seq(1,length(units))) {
  uu = units[iu]
  rankSameUnit = rankedEntireFileEvents$unit==uu
  dfSameUnit = df2$unit == uu
  uDates <- rankedEntireFileEvents[rankSameUnit,"event_time"]
  uFileEventIndex <-  rankedEntireFileEvents[rankSameUnit,"fileEventIndex"]
  nDates = length(uDates)
  if (nDates>0) {
    dfBeforeFirstDate = df2$event_time < uDates[1]
    df2_rows = dfSameUnit & dfBeforeFirstDate
    if (any(df2_rows)) {
      df2[df2_rows, "fileEventIndex"] = 0
    }
    for (id in seq(1,nDates-1)) {
      dfAfterCurrentDate = df2$event_time >= uDates[id]
      dfBeforeNextDate = df2$event_time < uDates[id]
      currentRank = uFileEventIndex[id]
      df2_rows = dfSameUnit & dfAfterCurrentDate & dfBeforeNextDate
      if (any(df2_rows)) {
        df2[df2_rows, "fileEventIndex" ] = currentRank
      }
    }
    dfAfterLastDate = df2$event_time >= uDates[nDates]
    df2_rows = dfSameUnit & dfAfterLastDate
    if (any(df2_rows)) {
      df2[df2_rows, "fileEventIndex"] = uFileEventIndex[nDates]  
    }
  }
}

This is the output of df2, should not have NA values

> with(df2,head(df2[order(unit,event_time),],50))
      event unit          event_time fileEventIndex
7  Event  g    1 2014-01-18 05:39:10             NA
25 Event  g    1 2014-03-25 01:56:28             NA
38 Event  g    1 2014-04-29 09:57:39             NA
42 Event  j    1 2014-05-17 05:39:30             NA
43 Event  g    1 2014-05-23 05:07:06             NA
46 Event  g    1 2014-06-03 07:12:13             NA
53 Event  i    1 2014-06-25 21:51:25             NA
54 Event  h    1 2014-06-30 00:41:00             NA
64 Event  f    1 2014-08-05 06:28:56             NA
2  Event  f    2 2014-01-03 03:27:28              0
12 Event  h    2 2014-02-01 08:52:08             NA
27 Event  i    2 2014-03-25 22:36:06             NA
39 Event  f    2 2014-05-02 07:00:18             NA
44 Event  f    2 2014-05-24 09:41:48             NA
47 Event  j    2 2014-06-04 22:45:07             NA
50 Event  g    2 2014-06-08 20:25:46             NA
58 Event  j    2 2014-07-19 05:03:48             NA
67 Event  h    2 2014-08-10 05:00:55             NA
22 Event  h    3 2014-03-15 20:25:16              0

vpipkt · Accepted Answer

A decent way to do this is probably using data.table rolling join on your rankedEntireFileEvents entries to your primary tables.

library(data.table)

dt1<-data.table(df1)
dt2<-data.table(df2)
rankedEntireFileEvents.table <-data.table(rankedEntireFileEvents)

setkey(dt1,unit,event_time)
setkey(dt2,unit,event_time)
setkey(rankedEntireFileEvents.table, unit, event_time)

dt1.ranked <- rankedEntireFileEvents.table[dt1, roll=TRUE]
#some cleaning up to get your desired result
dt1.ranked$Event.Name <-NULL
setnames(dt1.ranked,"i.Event.Name","Event.Name")

#NA's in fileEventIndex indicte they precede fileEventIndex 1 

dt2.ranked <- rankedEntireFileEvents.table[dt2, roll=TRUE]

Now tables dt1.ranked and dt2.ranked are your desired outputs.

Generate ranking by date then transfer to second data frame

Answers (1)

Related Questions