Reputation: 7928
I have the following data frame https://www.dropbox.com/s/c02qu7uobvrc8ku/college_Rda
This is a sample of the data: (copy+paste
'able)
educational_history <- structure(list(SCH_COLLEGE_STATUS_1997_09 = structure(c(1L, 1L,
1L, 1L, 5L, 1L, 1L, 5L, 5L, 5L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_1998_09 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_1999_09 = structure(c(3L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2000_09 = structure(c(3L,
3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2001_09 = structure(c(3L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 3L, 1L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2002_09 = structure(c(3L,
3L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2003_09 = structure(c(1L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2004_09 = structure(c(1L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2005_09 = structure(c(1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 3L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2006_09 = structure(c(1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2007_09 = structure(c(1L,
1L, 1L, 1L, 1L, 3L, 1L, 4L, 1L, 1L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2008_09 = structure(c(1L,
1L, 1L, 1L, 1L, 3L, 1L, 4L, 1L, 1L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2009_09 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2010_09 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 5L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2011_09 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L), .Label = c("Not enrolled in college",
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program",
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), PUBID = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10)), .Names = c("SCH_COLLEGE_STATUS_1997_09",
"SCH_COLLEGE_STATUS_1998_09", "SCH_COLLEGE_STATUS_1999_09", "SCH_COLLEGE_STATUS_2000_09",
"SCH_COLLEGE_STATUS_2001_09", "SCH_COLLEGE_STATUS_2002_09", "SCH_COLLEGE_STATUS_2003_09",
"SCH_COLLEGE_STATUS_2004_09", "SCH_COLLEGE_STATUS_2005_09", "SCH_COLLEGE_STATUS_2006_09",
"SCH_COLLEGE_STATUS_2007_09", "SCH_COLLEGE_STATUS_2008_09", "SCH_COLLEGE_STATUS_2009_09",
"SCH_COLLEGE_STATUS_2010_09", "SCH_COLLEGE_STATUS_2011_09", "PUBID"
), row.names = c(NA, 10L), class = "data.frame")
I want to generate a new data frame using that data.
I only need two fields: PUBID and First year enrolled in a 4 year college. The information about the year is inside the name of the column. I tried:
FirstYear4C <- function(ID) {
ndX=which(educational_history$PUBID==ID)
educational_historyNdX=educational_history[ndX,]
year=NA
if (educational_historyNdX$SCH_COLLEGE_STATUS_1997_09=="Enrolled in 4-year college"){
year=1997
return(year)
}
if (educational_historyNdX$SCH_COLLEGE_STATUS_1998_09=="Enrolled in 4-year college"){
year=1998
return(year)
}
if (educational_historyNdX$SCH_COLLEGE_STATUS_1999_09=="Enrolled in 4-year college"){
year=1999
return(year)
}
if (educational_historyNdX$SCH_COLLEGE_STATUS_2000_09=="Enrolled in 4-year college"){
year=2000
return(year)
}
return(NA)
}
FirstYear<-unlist(lapply(X=educational_history$PUBID,FirstYear4C))
FourYearCollege<-data.frame(PUBID=educational_history$PUBID,
FirstYear=FirstYear)
I'm sure there is a better way of coding that function. Having to copy and paste column by column seems very inefficient.
PUBID 1stYear4YC
1 1999
2 2000
...
6 2000
Upvotes: 1
Views: 129
Reputation: 7928
#*************************************
# Function to determine the first year attended
#*************************************
getFirstYear <- function(n){
yearsAttended = which(yearsEnrolled[n,])
if (length(yearsAttended) < 1){
firstYear = NA
}else{
firstYear = min(yearsAttended)
}
return(firstYear)
}
#*************************************
# Main Code
#*************************************
#Set up some constants
numRows = nrow(educational_history)
enrolled = "Enrolled"
college = "COLLEGE"
#Determine which columns have attendance data in them
semesters = grep(college, names(educational_history))
yearsEnrolled = unlist(lapply(semesters, function(x) grepl(enrolled,educational_history[,x],ignore.case=FALSE)))
yearsEnrolled = matrix(yearsEnrolled, nrow=numRows, ncol=15)
#Identify the actual years associated with attendance
semesterHeaders = names(educational_history)[semesters]
years = sub("SCH_COLLEGE_STATUS_","",semesterHeaders)
years = as.numeric(sub("_09", "", years))
#Get IDs
PUBID = educational_history$PUBID
#Get first year attended
ndx = unlist(lapply(1:numRows, getFirstYear))
FirstYear4C = unlist(lapply(1:numRows, function(n) years[ndx[n]]))
#Combine data into dataframe
firstYearData = cbind.data.frame(PUBID, FirstYear4C)
#Complete cases
firstYearData<-firstYearData[complete.cases(firstYearData),]
row.names(firstYearData) <- seq(nrow(firstYearData))
Upvotes: 0
Reputation: 145755
One more answer:
educational_history
require(stringr)
require(plyr)
eh <- melt(educational_history, id.var = "PUBID") ## Long format
eh$enrolled <- str_detect(eh$value, pattern = "^Enrolled in 4")
## Extract year
eh$year <- str_extract(eh$variable, pattern="_[0-9]*_")
eh$year <- as.numeric(str_replace_all(eh$year, pattern="_", replacement= ""))
## Summarize
ddply(eh[eh$enrolled, ], .variables=.(PUBID),
.fun= summarize, FirstYear4YC = min(year))
Upvotes: 0
Reputation: 15458
Assuming, your rownames and PUBID is same as in your sample data
Map(function(x) cbind(year=substr(x,20,26),PUBID=which(df[x]=="Enrolled in 4-year college")),as.list(names(df)[-16]))
[[1]]
year
[1,] "1997_09"
[[2]]
year
[1,] "1998_09"
[[3]]
year PUBID
[1,] "1999_09" "1"
[2,] "1999_09" "8"
[[4]]
year PUBID
[1,] "2000_09" "1"
[2,] "2000_09" "2"
[3,] "2000_09" "6"
[4,] "2000_09" "8"
[5,] "2000_09" "9"
[[5]]
year PUBID
[1,] "2001_09" "1"
[2,] "2001_09" "9"
[[6]]
year PUBID
[1,] "2002_09" "1"
[2,] "2002_09" "2"
[3,] "2002_09" "8"
[4,] "2002_09" "9"
[5,] "2002_09" "10"
[[7]]
year PUBID
[1,] "2003_09" "2"
[2,] "2003_09" "9"
[3,] "2003_09" "10"
[[8]]
year PUBID
[1,] "2004_09" "2"
[2,] "2004_09" "9"
[3,] "2004_09" "10"
[[9]]
year PUBID
[1,] "2005_09" "10"
[[10]]
year
[1,] "2006_09"
[[11]]
year PUBID
[1,] "2007_09" "6"
[[12]]
year PUBID
[1,] "2008_09" "6"
[[13]]
year
[1,] "2009_09"
[[14]]
year
[1,] "2010_09"
[[15]]
year
[1,] "2011_09"
Upvotes: 0
Reputation: 49448
library(data.table)
library(reshape2)
data.table(melt(educational_history, id.var = 'PUBID'))[,
list(first.year = sub('.*_([0-9]+)_[0-9]+$',
'\\1',
variable[value == "Enrolled in 4-year college"][1])),
by = PUBID]
# PUBID first.year
# 1: 1 1999
# 2: 2 2000
# 3: 3 NA
# 4: 4 NA
# 5: 5 NA
# 6: 6 2000
# 7: 7 NA
# 8: 8 1999
# 9: 9 2000
#10: 10 2002
Run in pieces to see how it works. Basic idea is to first convert to long format, and then it's easy to get what you want.
Upvotes: 1