Reputation:

Specific aggregation in a dataframe

I have a dataframe filled with course IDs, student IDs, week numbers (1 for the first week, 2 for the second, ...), and some information about what each user did in each course on each week. The final two columns of the df are non-NA if an instructor 'intervened' with the student in that course in that week, and NA otherwise. I want to compare each student's behavior before to after the week of their first intervention.

So what I'd like to make is a column, 'HasIntervened', which is FALSE for weeks less than that of the student's first intervention and TRUE for weeks greater than or equal, but I'm having a hell of a time creating that simple column. I'm fairly certain that aggregate is going to be the way to go, but I'm just not thinking about the problem in the right way.

Here is the dput of the first 60 rows (5 students' worth) of the dataframe:

structure(list(UserID = c(4188948L, 4188948L, 4188948L, 4188948L, 
4188948L, 4188948L, 4735684L, 4735684L, 4735684L, 4735684L, 4735684L, 
4735684L, 6292486L, 6292486L, 6292486L, 6292486L, 6292486L, 6292486L, 
6469671L, 6469671L, 6469671L, 6469671L, 6469671L, 6469671L, 6538263L, 
6538263L, 6538263L, 6538263L, 6538263L, 6538263L, 6621258L, 6621258L, 
6621258L, 6621258L, 6621258L, 6621258L, 6891869L, 6891869L, 6891869L, 
6891869L, 6891869L, 6891869L, 6891869L, 6891869L, 6891869L, 6891869L, 
6891869L, 6891869L, 6978155L, 6978155L, 6978155L, 6978155L, 6978155L, 
6978155L, 7195846L, 7195846L, 7195846L, 7195846L, 7195846L, 7195846L
), CourseID = c(6567871L, 6567871L, 6567871L, 6567871L, 6567871L, 
6567871L, 6567168L, 6567168L, 6567168L, 6567168L, 6567168L, 6567168L, 
6567864L, 6567864L, 6567864L, 6567864L, 6567864L, 6567864L, 6567159L, 
6567159L, 6567159L, 6567159L, 6567159L, 6567159L, 6567162L, 6567162L, 
6567162L, 6567162L, 6567162L, 6567162L, 6567853L, 6567853L, 6567853L, 
6567853L, 6567853L, 6567853L, 6567159L, 6567159L, 6567159L, 6567159L, 
6567159L, 6567159L, 6567864L, 6567864L, 6567864L, 6567864L, 6567864L, 
6567864L, 6567873L, 6567873L, 6567873L, 6567873L, 6567873L, 6567873L, 
6567859L, 6567859L, 6567859L, 6567859L, 6567859L, 6567859L), 
WeekInCourse = c(1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 
3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 
4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 
5, 6, 1, 2, 3, 4, 5, 6), WeekPostCount = c(1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 5L, 3L, 4L, 3L, 3L, 0L, 4L, 
0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 2L, 
2L, 0L, 0L, 4L, 0L, 3L, 0L, 3L, 0L, 0L, 0L), WeekLoginCount = c(2L, 
1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 4L, 4L, 1L, 0L, 
0L, 0L, 3L, 3L, 1L, 0L, 0L, 0L, 2L, 1L, 0L, 0L, 0L, 0L, 1L, 
1L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 4L, 1L, 0L, 0L, 
0L, 0L, 3L, 3L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L), 
WeekPointsPercent = c(0, 0, 0, 0, 0, 0, 0, 0.185714285714286, 
0.375, 0.2, 0, 0, 0, 0.85, 0.7, 0.4, 0.7, 0.7, 0, 0.857142857142857, 
0.35, 0, 0, 0.712765957446808, 0, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, 0, 0, 0, 0, 0, 0.5, 0.5, 
0, 0, 0.7, 1, 1, 0.375, 0.723076923076923, 0, 0.738636363636364
), CumulativePointsPercent = c(0, 0, 0, 0, 0, 0, 0, 0.185714285714286, 
0.254545454545455, 0.235294117647059, 0.235294117647059, 
0.10958904109589, 0, 0.85, 0.8, 0.533333333333333, 0.55, 
0.563636363636364, 0, 0.857142857142857, 0.623076923076923, 
0.476470588235294, 0.476470588235294, 0.600558659217877, 
0, 1, 0.0666666666666667, 0.0666666666666667, 0.0461538461538462, 
0.0461538461538462, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0.25, 0.166666666666667, 0.0555555555555556, 0.05, 0.0454545454545455, 
0, 0.5, 0.5, 0.166666666666667, 0.15, 0.2, 1, 1, 0.615384615384615, 
0.669230769230769, 0.621428571428571, 0.666666666666667), 
RiskEstimate = c(0.627717786405816, 0.986868933315635, 0.986687587608184, 
0.993909863003438, 0.997123961252086, 0.995862152216296, 
0.914011371723269, 0.925359536086114, 0.902625588346349, 
0.956922151061089, 0.977244888475535, 0.975006380719003, 
0.215420992232115, 0.174623555825523, 0.241380495376484, 
0.699712463799006, 0.692014530298594, 0.697966901130338, 
0.765071150059092, 0.763071307309743, 0.767261726128078, 
0.835918063362269, 0.854949153314029, 0.805318343915736, 
0.792873572656207, 0.790581615380765, 0.82622599277251, 0.9330287497742, 
0.965763061363497, 0.951226314109191, 0.851355921713566, 
0.991081300877175, 0.989671569185701, 0.995402298000919, 
0.997671718747865, 0.996593366142757, 0.738690043138604, 
0.865412845144037, 0.831369850200541, 0.93845410260835, 0.968400480533385, 
0.9533338828382, 0.624930735381371, 0.981915016747928, 0.985037736895337, 
0.994680902796769, 0.996907588471311, 0.995388109404559, 
0.887995464972052, 0.970620002831325, 0.97136665697772, 0.992618626388727, 
0.99543249839328, 0.992149889176406, 0.923802324633255, 0.984464950934932, 
0.978726967214146, 0.971473084822075, 0.97886220009245, 0.979311013989987
), RiskBin = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L), InterventionID = c(NA, 26L, NA, NA, NA, 
NA, NA, NA, NA, NA, 50L, NA, NA, NA, NA, NA, 73L, NA, NA, 
NA, NA, NA, 56L, NA, NA, NA, NA, 46L, NA, NA, NA, 33L, NA, 
NA, NA, NA, 15L, NA, NA, 43L, 53L, NA, NA, NA, NA, NA, 71L, 
NA, NA, NA, NA, NA, 78L, NA, NA, 36L, NA, NA, 80L, NA), InterventionType = structure(c(NA, 
2L, NA, NA, NA, NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, NA, 
2L, NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, 3L, NA, NA, NA, 
2L, NA, NA, NA, NA, 3L, NA, NA, 3L, 2L, NA, NA, NA, NA, NA, 
2L, NA, NA, NA, NA, NA, 2L, NA, NA, 3L, NA, NA, 3L, NA), .Label = c("", 
"At-Risk Form", "Email", "Other", "Phone"), class = "factor")), .Names = c("UserID", 
"CourseID", "WeekInCourse", "WeekPostCount", "WeekLoginCount", 
"WeekPointsPercent", "CumulativePointsPercent", "RiskEstimate", 
"RiskBin", "InterventionID", "InterventionType"), row.names = c(NA, 
60L), class = "data.frame")

Upvotes: 1

Answers (4)

mnel

Reputation: 115485

A data.table approach for coding elegance and memory efficiency

library(data.table)
# assuming your data is in DF
DT <- as.data.table(DF)
# set the key to ensure that the data is sorted by week within 
# each user / course combination
setkey(DT, UserID, CourseID,  WeekInCourse)
# using cumsum  
DT[,hasIntervened := cumsum(!is.na(InterventionID))>0 ,by =list(CourseID, UserID)]

The data.table syntax avoids the need for with

Upvotes: 1

Maiasaura

Reputation: 32996

This should work:

library(plyr)
ddply(df, .(UserID), function(x) {
     i <- which.min((x$InterventionID))
    if(i>1) {
        x$HasIntervened <- c(rep(FALSE,i-1), rep(TRUE, nrow(x)-i+1))
        } else {
        x$HasIntervened <- TRUE     
        }
     x
    })

Upvotes: 0

IRTFM

Reputation: 263481

courses$HasIntervened <- as.logical( with(courses, ave(InterventionID,
                                                 UserID, CourseID,  # grouping factors
                                           FUN=function(x) cumsum( !is.na(x) ) ) ) )

Upvotes: 1

Ali

Reputation: 9850

Try this one:

foo = your.data
foo$WeekInCourse[is.na(foo$InterventionID)]=Inf
bar = setNames(aggregate(WeekInCourse ~ UserID, foo, min),c("UserID","FirstW"))
foo = merge(foo, bar, by="UserID")
your.data$HasIntervened = your.data$WeekInCourse >= foo$FirstW

Upvotes: 1

Specific aggregation in a dataframe

Answers (4)

Related Questions