Reputation: 7928

manipulating a data frame to extract information from columns names

I have the following data frame https://www.dropbox.com/s/c02qu7uobvrc8ku/college_Rda

This is a sample of the data: (copy+paste'able)

 educational_history <- structure(list(SCH_COLLEGE_STATUS_1997_09 = structure(c(1L, 1L, 
1L, 1L, 5L, 1L, 1L, 5L, 5L, 5L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_1998_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_1999_09 = structure(c(3L, 
1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2000_09 = structure(c(3L, 
3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2001_09 = structure(c(3L, 
2L, 2L, 1L, 1L, 1L, 1L, 1L, 3L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2002_09 = structure(c(3L, 
3L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2003_09 = structure(c(1L, 
3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2004_09 = structure(c(1L, 
3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2005_09 = structure(c(1L, 
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 3L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2006_09 = structure(c(1L, 
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2007_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 3L, 1L, 4L, 1L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2008_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 3L, 1L, 4L, 1L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2009_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2010_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 5L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2011_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), PUBID = c(1, 
2, 3, 4, 5, 6, 7, 8, 9, 10)), .Names = c("SCH_COLLEGE_STATUS_1997_09", 
"SCH_COLLEGE_STATUS_1998_09", "SCH_COLLEGE_STATUS_1999_09", "SCH_COLLEGE_STATUS_2000_09", 
"SCH_COLLEGE_STATUS_2001_09", "SCH_COLLEGE_STATUS_2002_09", "SCH_COLLEGE_STATUS_2003_09", 
"SCH_COLLEGE_STATUS_2004_09", "SCH_COLLEGE_STATUS_2005_09", "SCH_COLLEGE_STATUS_2006_09", 
"SCH_COLLEGE_STATUS_2007_09", "SCH_COLLEGE_STATUS_2008_09", "SCH_COLLEGE_STATUS_2009_09", 
"SCH_COLLEGE_STATUS_2010_09", "SCH_COLLEGE_STATUS_2011_09", "PUBID"
), row.names = c(NA, 10L), class = "data.frame")

I want to generate a new data frame using that data.

I only need two fields: PUBID and First year enrolled in a 4 year college. The information about the year is inside the name of the column. I tried:

FirstYear4C <- function(ID) {
  ndX=which(educational_history$PUBID==ID)
  educational_historyNdX=educational_history[ndX,]
  year=NA
  if (educational_historyNdX$SCH_COLLEGE_STATUS_1997_09=="Enrolled in 4-year college"){
    year=1997
    return(year)
  } 
  if (educational_historyNdX$SCH_COLLEGE_STATUS_1998_09=="Enrolled in 4-year college"){
    year=1998
    return(year)
  }  
  if (educational_historyNdX$SCH_COLLEGE_STATUS_1999_09=="Enrolled in 4-year college"){
    year=1999
    return(year)
  }  
  if (educational_historyNdX$SCH_COLLEGE_STATUS_2000_09=="Enrolled in 4-year college"){
    year=2000
    return(year)
  }
  return(NA)
}
FirstYear<-unlist(lapply(X=educational_history$PUBID,FirstYear4C))
FourYearCollege<-data.frame(PUBID=educational_history$PUBID,
                            FirstYear=FirstYear)

I'm sure there is a better way of coding that function. Having to copy and paste column by column seems very inefficient.

PUBID    1stYear4YC 
1        1999
2        2000
... 
6        2000

Upvotes: 1

Answers (4)

Ignacio

Reputation: 7928

#*************************************
# Function to determine the first year attended
#*************************************
getFirstYear <- function(n){
  yearsAttended = which(yearsEnrolled[n,])
  if (length(yearsAttended) < 1){
    firstYear = NA
  }else{
    firstYear = min(yearsAttended)
  }
  return(firstYear)
}

#*************************************
# Main Code
#*************************************
#Set up some constants
numRows = nrow(educational_history)
enrolled = "Enrolled"
college = "COLLEGE"

#Determine which columns have attendance data in them
semesters = grep(college, names(educational_history))
yearsEnrolled = unlist(lapply(semesters, function(x) grepl(enrolled,educational_history[,x],ignore.case=FALSE)))
yearsEnrolled = matrix(yearsEnrolled, nrow=numRows, ncol=15)


#Identify the actual years associated with attendance
semesterHeaders = names(educational_history)[semesters]
years = sub("SCH_COLLEGE_STATUS_","",semesterHeaders)
years = as.numeric(sub("_09", "", years))

#Get IDs
PUBID = educational_history$PUBID

#Get first year attended
ndx = unlist(lapply(1:numRows, getFirstYear))
FirstYear4C = unlist(lapply(1:numRows, function(n) years[ndx[n]]))

#Combine data into dataframe
firstYearData = cbind.data.frame(PUBID, FirstYear4C)
#Complete cases
firstYearData<-firstYearData[complete.cases(firstYearData),]
row.names(firstYearData) <- seq(nrow(firstYearData))

Upvotes: 0

Gregor Thomas

Reputation: 145755

One more answer:

educational_history
require(stringr)
require(plyr)

eh <- melt(educational_history, id.var = "PUBID") ## Long format
eh$enrolled <- str_detect(eh$value, pattern = "^Enrolled in 4")

## Extract year
eh$year <- str_extract(eh$variable, pattern="_[0-9]*_") 
eh$year <- as.numeric(str_replace_all(eh$year, pattern="_", replacement= ""))  

## Summarize
ddply(eh[eh$enrolled, ], .variables=.(PUBID),
    .fun= summarize, FirstYear4YC = min(year))

Upvotes: 0

Metrics

Reputation: 15458

Assuming, your rownames and PUBID is same as in your sample data

 Map(function(x) cbind(year=substr(x,20,26),PUBID=which(df[x]=="Enrolled in 4-year college")),as.list(names(df)[-16]))

[[1]]
     year     
[1,] "1997_09"

[[2]]
     year     
[1,] "1998_09"

[[3]]
     year      PUBID
[1,] "1999_09" "1"  
[2,] "1999_09" "8"  

[[4]]
     year      PUBID
[1,] "2000_09" "1"  
[2,] "2000_09" "2"  
[3,] "2000_09" "6"  
[4,] "2000_09" "8"  
[5,] "2000_09" "9"  

[[5]]
     year      PUBID
[1,] "2001_09" "1"  
[2,] "2001_09" "9"  

[[6]]
     year      PUBID
[1,] "2002_09" "1"  
[2,] "2002_09" "2"  
[3,] "2002_09" "8"  
[4,] "2002_09" "9"  
[5,] "2002_09" "10" 

[[7]]
     year      PUBID
[1,] "2003_09" "2"  
[2,] "2003_09" "9"  
[3,] "2003_09" "10" 

[[8]]
     year      PUBID
[1,] "2004_09" "2"  
[2,] "2004_09" "9"  
[3,] "2004_09" "10" 

[[9]]
     year      PUBID
[1,] "2005_09" "10" 

[[10]]
     year     
[1,] "2006_09"

[[11]]
     year      PUBID
[1,] "2007_09" "6"  

[[12]]
     year      PUBID
[1,] "2008_09" "6"  

[[13]]
     year     
[1,] "2009_09"

[[14]]
     year     
[1,] "2010_09"

[[15]]
     year     
[1,] "2011_09"

Upvotes: 0

eddi

Reputation: 49448

library(data.table)
library(reshape2)

data.table(melt(educational_history, id.var = 'PUBID'))[,
    list(first.year = sub('.*_([0-9]+)_[0-9]+$',
                          '\\1',
                          variable[value == "Enrolled in 4-year college"][1])),
    by = PUBID]
#    PUBID first.year
# 1:     1       1999
# 2:     2       2000
# 3:     3         NA
# 4:     4         NA
# 5:     5         NA
# 6:     6       2000
# 7:     7         NA
# 8:     8       1999
# 9:     9       2000
#10:    10       2002

Run in pieces to see how it works. Basic idea is to first convert to long format, and then it's easy to get what you want.

Upvotes: 1

manipulating a data frame to extract information from columns names

Answers (4)

Related Questions