dubois
dubois

Reputation: 211

Getting visit time from action time between two R data frames

I want to figure out what profile.views happened on what user visit. The users are uniquely identified by the pair uid,state. This data is stored in two data frames.

visits = data.frame(id=2001:2004, uid=c(1001,1002,1001,1001), state=c('CA','CA','CA','MA'), ts=c(51,52,53,54))
profile.views = data.frame(id=3001:3004, uid=c(1001,1003,1002,1001), state=c('CA','CA','CA','CA'), ts=c(51,57,59,59))

> visits
    id  uid state ts
1 2001 1001    CA 51
2 2002 1002    CA 52
3 2003 1001    CA 53
4 2004 1001    MA 54

> profile.views
id  uid state ts
1 3001 1001    CA 51
2 3002 1003    CA 57
3 3003 1002    CA 59
4 3004 1001    CA 59

For each profile.view, I want to figure out which visit it came from. This is done by looking back to the most recent visit with a matching uid and state that has a ts less than or equal to the ts on the profile.views row.

Here are the results I would want (in some form):

profile.views[1,] came from visits[1,]

profile.views[2,] did not come from any visit (this could be caused by a data recording error)

profile.views[3,] came from visits[2,]

profile.views[4,] came from visits[3,]

Does anyone know a good way to do this?

Upvotes: 1

Views: 135

Answers (4)

Blue Magister
Blue Magister

Reputation: 13363

Using base R's merge and aggregate:

visits = data.frame(id=2001:2004, uid=c(1001,1002,1001,1001), state=c('CA','CA','CA','MA'), ts=c(51,52,53,54))
profile.views = data.frame(id=3001:3004, uid=c(1001,1003,1002,1001), state=c('CA','CA','CA','CA'), ts=c(51,57,59,59))
##merges data frames based on uid and state
newdf.merged <- merge(visits,profile.views, by=c("uid","state"),all.y=TRUE)
##puts unmatched rows into another dataset
newdf.na <- with(newdf,newdf[is.na(ts.x),])
##filters views that happened after visits (like WHERE)
newdf.filter <- with(newdf,newdf[ts.y >= ts.x,])
##aggregates using the max function, selecting max id and ts
newdf.agg <- aggregate(cbind(id.y,ts.y) ~ uid + state + id.x + ts.x, data = newdf.filter, FUN = max)
##merges aggregated result and na rows
newdf.final <- rbind(newdf.agg,newdf.na)
##optional ordering step
newdf.final <- newdf.final[with(newdf.final,order(uid,state,id.x)),]

Upvotes: 0

Blue Magister
Blue Magister

Reputation: 13363

A faster data.table way, which matches a profile view to a visit ID:

visits = data.frame(id=2001:2004, uid=c(1001,1002,1001,1001), state=c('CA','CA','CA','MA'), ts=c(51,52,53,54))
profile.views = data.frame(id=3001:3004, uid=c(1001,1003,1002,1001), state=c('CA','CA','CA','CA'), ts=c(51,57,59,59))
visits <- data.table(visits)
profile.views <- data.table(profile.views)
setkey(visits,uid,state,ts)
#orders columns so that joins are on first three columns
setcolorder(profile.views,c("uid","state","ts","id"))
##set names to avoid name collision
setnames(profile.views,c("uid","state","view.ts","view.id"))
##rolling join
visits[profile.views,roll=TRUE]
    # uid state ts   id view.id
# 1: 1001    CA 51 2001    3001
# 2: 1003    CA 57   NA    3002
# 3: 1002    CA 59 2002    3003
# 4: 1001    CA 59 2003    3004

Upvotes: 2

Blue Magister
Blue Magister

Reputation: 13363

Using SQL-style syntax with sqldf:

library(sqldf)
sqldf("
SELECT a.id, a.uid, a.state, a.ts, MAX(b.ts) AS visit_ts
FROM \"profile.views\" AS a
LEFT OUTER JOIN visits AS b
ON a.uid = b.uid
AND a.state = b.state
AND a.ts >= b.ts
GROUP BY a.id, a.uid, a.state, a.ts
ORDER BY a.id
")

Upvotes: 2

Blue Magister
Blue Magister

Reputation: 13363

Here's a data.table solution. There are a few things that can probably be done better, but here's a first pass at it.

library(data.table)
visits <- data.table(visits)
profile.views <- data.table(profile.views)
##renames some columns to avoid name collision
##there's probably a better solution to this
setnames(profile.views,c("id","ts"),c("view.id","view.ts"))
setkey(visits,uid,state)
setkey(profile.views,uid,state)
##outer joins visits to profile.views by uid and state
##leaving NA if a row in profile.views has no matches
#visits[profile.views] 
##filters out rows where views happen before visits
#visits[profile.views][view.ts >= ts | is.na(ts)] 
##picks the latest visit timestamp by view
visits[profile.views][view.ts >= ts | is.na(ts), 
  list(visit.ts=max(ts)), 
  by=list(view.id,uid,state,view.ts)][order(view.id)]
#    view.id  uid state view.ts visit.ts
# 1:    3001 1001    CA      51       51
# 2:    3002 1003    CA      57       NA
# 3:    3003 1002    CA      59       52
# 4:    3004 1001    CA      59       53

Upvotes: 1

Related Questions