user8959427
user8959427

Reputation: 2067

expanding rows by line and pivoting wider

I have some data which is separated by \n.

# A tibble: 3 x 7
  name...15             title...17           company...18          date_range...19                 location...20         description...21               li_company_url               
  <chr>                 <chr>                <chr>                 <chr>                           <chr>                 <chr>                          <chr>                        
1 "jobPosition1"         NA                   NA                    NA                              NA                    NA                             NA                          
2 "jobPosition1"         NA                   NA                    NA                              NA                    NA                             NA                          
3 "jobPosition1\njobPo~ "CEO\nDirector\nGer~ "European Outsorcing~ "jan 1993 ? nov 1999\njan 2010~ "Germany\nUSA\nMissi~ "Director of systems\nAdmin o~ "\nhttps://www.google.com/\n~

Each row is an observation and inside each row - separated by \n is a new line of information for that observation. I want to expand the rows out such that the expected output looks like:

  jobposition1_title jobposition2_title      jobposition3_title jobposition4_title jobposition5_title jobposition1_company jobposition2_company jobposition3_company
1                 NA                 NA                      NA                 NA                 NA                   NA                   NA                   NA
2                 NA                 NA                      NA                 NA                 NA                   NA                   NA                   NA
3                CEO           Director Member of the directive            Partner                 NA  European Outsorcing               Google                Pepsi
  jobposition4_company jobposition5_company   jobposition1_date jobposition2_date   jobposition3_date   jobposition4_date    jobposition5_date jobposition1_location
1                   NA                   NA                  NA                NA                  NA                  NA                   NA                    NA
2                   NA                   NA                  NA                NA                  NA                  NA                   NA                    NA
3                Yahoo            Microsoft jan 1993 ? nov 1999    jan 2010 ? now jan 2010 ? nov 2016 nov 1999 ? oct 2009 sept 2012 ? dic 2014               Germany

So the jobPosition's are the column names along with the title, company, dates etc. (which are separated by \n.

Data:

data <- structure(list(name...15 = c("jobPosition1", "jobPosition1", 
                             "jobPosition1\njobPosition2\njobPosition3\njobPosition4\njobPosition5"
), title...17 = c(NA, NA, "CEO\nDirector\nGerente \nMember of the directive\nPartner"
), company...18 = c(NA, NA, "European Outsorcing\nGoogle\nPepsi\nYahoo\nMicrosoft"
), date_range...19 = c(NA, NA, "jan 1993 ? nov 1999\njan 2010 ? now\njan 2010 ? nov 2016\nnov 1999 ? oct 2009\nsept 2012 ? dic 2014"
), location...20 = c(NA, NA, "Germany\nUSA\nMissing Data\nMissing Data\nMissing Data"
), description...21 = c(NA, NA, "Director of systems\nAdmin of systems\nParnet in the area of accountancy\ntecnical director"
), li_company_url = c(NA, NA, "\nhttps://www.google.com/\nhttps://www.yahoo.com/\nhttps://www.abcd.com/\nhttps://www.xzy.com/"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))

Expected output data:

data.frame(
  
  # Job titles
  jobposition1_title = c("NA", "NA", "CEO"),
  jobposition2_title = c("NA", "NA", "Director"),
  jobposition3_title = c("NA", "NA", "Member of the directive"),
  jobposition4_title = c("NA", "NA", "Partner"),
  jobposition5_title = c("NA", "NA", "NA"),
  
  # Job companies
  jobposition1_company = c("NA", "NA", "European Outsorcing"),
  jobposition2_company = c("NA", "NA", "Google"),
  jobposition3_company = c("NA", "NA", "Pepsi"),
  jobposition4_company = c("NA", "NA", "Yahoo"),
  jobposition5_company = c("NA", "NA", "Microsoft"),
  
  # Date ranges
  jobposition1_date = c("NA", "NA", "jan 1993 ? nov 1999"),
  jobposition2_date = c("NA", "NA", "jan 2010 ? now"),
  jobposition3_date = c("NA", "NA", "jan 2010 ? nov 2016"),
  jobposition4_date = c("NA", "NA", "nov 1999 ? oct 2009"),
  jobposition5_date = c("NA", "NA", "sept 2012 ? dic 2014"),
  
  # Locations
  jobposition1_location = c("NA", "NA", "Germany"),
  jobposition2_location = c("NA", "NA", "USA"),
  jobposition3_location = c("NA", "NA", "Missing Data"),
  jobposition4_location = c("NA", "NA", "Missing Data"),
  jobposition5_location = c("NA", "NA", "Missing Data"),
  
  # Description
  jobposition1_description = c("NA", "NA", "Director of systmes"),
  jobposition2_description = c("NA", "NA", "Admin of systems"),
  jobposition3_description = c("NA", "NA", "Parnet in the area of accountancy"),
  jobposition4_description = c("NA", "NA", "tecnical director")
  jobposition5_description = c("NA", "NA", "NA") # NOTE: Here in data$description...21 has 4 inputs separated by \n but there are 5 job descriptions...
  # Anything after the last \n with empty space would be an NA.
  # etc.
  
)

Upvotes: 0

Views: 32

Answers (1)

Paul
Paul

Reputation: 9087

Use str_split to get a list of vectors of different lengths.

library(tidyverse)

unnested <- map(data, ~unlist(str_split(., "\n")))
unnested
#> $name...15
#> [1] "jobPosition1" "jobPosition1" "jobPosition1" "jobPosition2"
#> [5] "jobPosition3" "jobPosition4" "jobPosition5"
#> 
#> $title...17
#> [1] NA                        NA                       
#> [3] "CEO"                     "Director"               
#> [5] "Gerente "                "Member of the directive"
#> [7] "Partner"                
#> 
#> $company...18
#> [1] NA                    NA                    "European Outsorcing"
#> [4] "Google"              "Pepsi"               "Yahoo"              
#> [7] "Microsoft"          
#> 
#> $date_range...19
#> [1] NA                     NA                     "jan 1993 ? nov 1999" 
#> [4] "jan 2010 ? now"       "jan 2010 ? nov 2016"  "nov 1999 ? oct 2009" 
#> [7] "sept 2012 ? dic 2014"
#> 
#> $location...20
#> [1] NA             NA             "Germany"      "USA"         
#> [5] "Missing Data" "Missing Data" "Missing Data"
#> 
#> $description...21
#> [1] NA                                  NA                                 
#> [3] "Director of systems"               "Admin of systems"                 
#> [5] "Parnet in the area of accountancy" "tecnical director"                
#> 
#> $li_company_url
#> [1] NA                        NA                       
#> [3] ""                        "https://www.google.com/"
#> [5] "https://www.yahoo.com/"  "https://www.abcd.com/"  
#> [7] "https://www.xzy.com/"   

To combine these into a data frame, they need to be the same length. So find out how long they should be.

max_len <- unnested %>% lengths() %>% max()
max_len
#> [1] 7

Then set the lengths and bind them together.

unnested %>%
  map(`length<-`, max_len) %>%
  bind_cols()
#> # A tibble: 7 x 7
#>   name   title   company   date_range location description   li_company_url 
#>   <chr>  <chr>   <chr>     <chr>      <chr>    <chr>         <chr>          
#> 1 jobPo…  NA     NA        NA         NA       NA             NA            
#> 2 jobPo…  NA     NA        NA         NA       NA             NA            
#> 3 jobPo… "CEO"   European… jan 1993 … Germany  Director of … ""             
#> 4 jobPo… "Direc… Google    jan 2010 … USA      Admin of sys… "https://www.g…
#> 5 jobPo… "Geren… Pepsi     jan 2010 … Missing… Parnet in th… "https://www.y…
#> 6 jobPo… "Membe… Yahoo     nov 1999 … Missing… tecnical dir… "https://www.a…
#> 7 jobPo… "Partn… Microsoft sept 2012… Missing… NA            "https://www.x…

Upvotes: 1

Related Questions