BPipher
BPipher

Reputation: 118

Transforming a list of strings into a data.frame in a tidy manner

I'm wondering if anyone has any tips/tricks on how to handle transforming data, like what is below:

library(tidyverse)
example.list = list(" 1 North Carolina State University at Raleigh         15       9    12    13     22    15    32    19   14    20    12     17    19    20     19     25     283", 
    " 2 Iowa State University                                9      8     5    11     14     4    11    13   14      9   15     28    14      9    18     27     209", 
    " 3 University of Wisconsin-Madison                      5      6    14     9     20    13    15    12   13      9   13     10    13    24     15     17     208", 
    " 4 Stanford University*                               10      12    14     6      9    10     5     9   13      7   13     10     4      9    23       6    160", 
    " 5 Texas A & M University-College Station               6     12    18    10      7     4     5    11   16    18    10      7    15      4     8       8    159", 
    " 9 University of Michigan-Ann Arbor                     8      5     3     3      8     9    12    11    7    11    13      9     8    11     13       9    140", 
    "10 University of California-Los Angeles                 2      2     2     6      9     7     9     8    7    11    11      8     6    12     13     10     123", 
    "19 Rice University                                      3      3     5    11      4     7     7    11    2      6     4     6     3      8     7       7     94")

Into something like the output here:

example.list %>%
    substring(3) %>% 
    str_replace_all("[^[:alnum:]]", " ") %>%
    str_squish() %>% 
    strsplit(split = "(?<=[a-zA-Z])\\s*(?=[0-9])", perl = TRUE) %>% 
    unlist() %>% 
    matrix(ncol = 2, byrow = TRUE) %>% 
    data.frame() %>% 
    separate("X2",into = paste0("X",2:18),sep = " ") 

The general pattern that needs extracted is all characters into its own column up to the first numbers, with all other columns separated by whitespace into other columns.

Things that would be interesting is whether most of this could be done in a single regex pattern or without it at all.

I'm just looking to improve string handling as I haven't worked with it much! The use-case here would be like trying to pull tabled data from a pdf/html into a data.frame.

Edit:

I appreciate all of the advice and different perspectives!

I realized I actually missed a few worthy-of-mention test cases:

example2.list = list(" 2 Iowa State University                                9      8     5    11     14     4    11    13   14      9   15     28    14      9    18     27     209", 
    " 3 University of Wisconsin-Madison                      5      6    14     9     20    13    15    12   13      9   13     10    13    24     15     17     208", 
    " 4 Stanford University*                               10      12    14     6      9    10     5     9   13      7   13     10     4      9    23       6    160", 
    " 5 Texas A & M University-College Station               6     12    18    10      7     4     5    11   16    18    10      7    15      4     8       8    159", 
    " 9 University of Michigan-Ann Arbor                     8      5     3     3      8     9    12    11    7    11    13      9     8    11     13       9    140", 
    "10 University of California-Los Angeles                 2      2     2     6      9     7     9     8    7    11    11      8     6    12     13     10     123", 
    "19 Rice University                                      3      3     5    11      4     7     7    11    2      6     4     6     3      8     7       7     94", 
    "52 Bowling Green State University             0 0 0 0 0 1 5 2 2 2  4 7 3  4  4  3 37", 
    "55 University of New Mexico                   4 2 3 1 3 0 5 3 2 1  1 2 3  2  3  0 35")

It doesn't actually come out as neatly as aligned.

Full dataset, slightly cleaned:

library(pdftools)
library(tidyverse)
data.loc = "https://ww2.amstat.org/misc/StatsPhD2003-MostRecent.pdf"
data.full =
    pdf_text(data.loc) %>%
    read_lines() %>%
    head(-2) %>%
    tail(-3) %>%
    lapply(function(ele) if(ele == "") NULL else ele) %>% 
    compact()

Here was my second attempt:

library(tidyverse)
library(magrittr)
  # Ignores column names
  data.full[-1] %>%
  # Removing excess whitepace
  str_squish() %>%
  # Removes index
  str_remove("^\\s*\\d*\\s*") %>%
  # Split on all whitespace occurring before digits
  str_split("\\s+(?=\\d)") %>%
  # Turn list into a matrix
  unlist() %>%
  matrix(ncol = 18, byrow = TRUE) %>%
  # Handling variables names
  set_colnames(value =
                 data.full[1] %>%
                 str_squish() %>%
                 str_split("\\s+(?=\\d)") %>%
                 unlist) %>%
  as_tibble() %>%
  # Transformating variables into  numeric
  type_convert()

Upvotes: 3

Views: 228

Answers (5)

user10191355
user10191355

Reputation:

If you want the column names, too (which imo makes sense), use data.full with something like this:

library(tidyverse)

data.full %>% 
    str_trim %>% 
    str_replace_all("\\s+(?=\\d)", ";") %>% 
    paste(collapse = "\n") %>% 
    read_csv2()

Output:

# A tibble: 78 x 18
   `Statistics PhD… `2003` `2004` `2005` `2006` `2007` `2008` `2009` `2010` `2011` `2012`
   <chr>             <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 1 North Carolin…     15      9     12     13     22     15     32     19     14     20
 2 2 Iowa State Un…      9      8      5     11     14      4     11     13     14      9
 3 3 University of…      5      6     14      9     20     13     15     12     13      9
 4 4 Stanford Univ…     10     12     14      6      9     10      5      9     13      7
 5 5 Texas A & M U…      6     12     18     10      7      4      5     11     16     18
 6 6 Pennsylvania …      6      9      8      1      8      9     10     10     14      9
 7 7 Ohio State Un…      3      5      3     12     12      7     11     11      6     11
 8 8 Purdue Univer…      6      5     12      8      8      7     10     12      9      9
 9 9 University of…      8      5      3      3      8      9     12     11      7     11
10 10 University o…      2      2      2      6      9      7      9      8      7     11
# … with 68 more rows, and 7 more variables: `2013` <dbl>, `2014` <dbl>, `2015` <dbl>,
#   `2016` <dbl>, `2017` <dbl>, `2018` <dbl>, `2003‐2018` <dbl>

Upvotes: 1

A5C1D2H2I1M1N2O1R2T1
A5C1D2H2I1M1N2O1R2T1

Reputation: 193667

Update

Since the "width" of characters on each page is different in the PDF you've shared, you'll have to process the file page by page. Here's one approach:

library(pdftools)
library(tidyverse)
library(cgwtools)
data.loc = "https://ww2.amstat.org/misc/StatsPhD2003-MostRecent.pdf"
data.full =
  pdf_text(data.loc) %>%
  read_lines() %>%
  head(-2) %>%
  tail(-4)

data.full <- data.full[nzchar(data.full)]
# Split the file according to the number of characters per line
l <- split(data.full, nchar(data.full))

out = 
  bind_rows(lapply(l, function(x) {
    temp <- do.call(rbind, strsplit(x, ""))
    y <- which(colSums(temp == " ") == nrow(temp))
    temp[, y[cumsum(seqle(y)$lengths)]] <- ","
    read_csv(apply(temp, 1, paste, collapse = ""), col_names = FALSE)
  })) %>%
  arrange(X1)

out
## # A tibble: 78 x 19
##       X1 X2         X3    X4    X5    X6    X7    X8    X9   X10   X11   X12   X13   X14   X15
##    <dbl> <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1     1 North …    15     9    12    13    22    15    32    19    14    20    12    17    19
##  2     2 Iowa S…     9     8     5    11    14     4    11    13    14     9    15    28    14
##  3     3 Univer…     5     6    14     9    20    13    15    12    13     9    13    10    13
##  4     4 Stanfo…    10    12    14     6     9    10     5     9    13     7    13    10     4
##  5     5 Texas …     6    12    18    10     7     4     5    11    16    18    10     7    15
##  6     6 Pennsy…     6     9     8     1     8     9    10    10    14     9    12     9    17
##  7     7 Ohio S…     3     5     3    12    12     7    11    11     6    11     8    13    14
##  8     8 Purdue…     6     5    12     8     8     7    10    12     9     9     5    10    14
##  9     9 Univer…     8     5     3     3     8     9    12    11     7    11    13     9     8
## 10    10 Univer…     2     2     2     6     9     7     9     8     7    11    11     8     6
## # … with 68 more rows, and 4 more variables: X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>

Here's another approach, starting with data.full:

library(splitstackshape)
data.full <- data.full[nzchar(data.full)]

out = tibble(data.full) %>%
  extract(data.full, into = c("ind", "uni", "nums"), regex = "([0-9]+)([^0-9]+)(.*)") %>%
  mutate(uni = trimws(uni))%>%
  cSplit("nums", "\\s+", fixed = FALSE)

Original answer..

Since you've already loaded the tidyverse, just use:

read_fwf(unlist(example.list), fwf_empty(unlist(example.list)))
## # A tibble: 8 x 19
##      X1 X2          X3    X4    X5    X6    X7    X8    X9   X10   X11   X12   X13   X14
##   <dbl> <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1 North C…    15     9    12    13    22    15    32    19    14    20    12    17
## 2     2 Iowa St…     9     8     5    11    14     4    11    13    14     9    15    28
## 3     3 Univers…     5     6    14     9    20    13    15    12    13     9    13    10
## 4     4 Stanfor…    10    12    14     6     9    10     5     9    13     7    13    10
## 5     5 Texas A…     6    12    18    10     7     4     5    11    16    18    10     7
## 6     9 Univers…     8     5     3     3     8     9    12    11     7    11    13     9
## 7    10 Univers…     2     2     2     6     9     7     9     8     7    11    11     8
## 8    19 Rice Un…     3     3     5    11     4     7     7    11     2     6     4     6
## # … with 5 more variables: X15 <dbl>, X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>

Upvotes: 3

Ronak Shah
Ronak Shah

Reputation: 389205

Using base R :

do.call(rbind, lapply(strsplit(substring(example.list, 3), "\\s{2,}"), 
                      function(x) as.data.frame(t(x))))


#                                           V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18
#1  North Carolina State University at Raleigh 15  9 12 13 22 15 32 19  14  20  12  17  19  20  19  25 283
#2                       Iowa State University  9  8  5 11 14  4 11 13  14   9  15  28  14   9  18  27 209
#3             University of Wisconsin-Madison  5  6 14  9 20 13 15 12  13   9  13  10  13  24  15  17 208
#4                        Stanford University* 10 12 14  6  9 10  5  9  13   7  13  10   4   9  23   6 160
#5      Texas A & M University-College Station  6 12 18 10  7  4  5 11  16  18  10   7  15   4   8   8 159
#6            University of Michigan-Ann Arbor  8  5  3  3  8  9 12 11   7  11  13   9   8  11  13   9 140
#7        University of California-Los Angeles  2  2  2  6  9  7  9  8   7  11  11   8   6  12  13  10 123
#8                             Rice University  3  3  5 11  4  7  7 11   2   6   4   6   3   8   7   7  94

We could split the string where there are more than 2 whitespace, convert each list into one-row dataframe, and rbind them.

Upvotes: 3

B. Christian Kamgang
B. Christian Kamgang

Reputation: 6529

Here is one approach that you could adopt:

library(magrittr)
library(data.table)


gsub("^...", "", example.list) %>% 
  tstrsplit(" {2,}", type.convert = TRUE, names = TRUE) %>% 
  as.data.frame()

#                                           V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18
# 1 North Carolina State University at Raleigh 15  9 12 13 22 15 32 19  14  20  12  17  19  20  19  25 283
# 2                      Iowa State University  9  8  5 11 14  4 11 13  14   9  15  28  14   9  18  27 209
# 3            University of Wisconsin-Madison  5  6 14  9 20 13 15 12  13   9  13  10  13  24  15  17 208
# 4                       Stanford University* 10 12 14  6  9 10  5  9  13   7  13  10   4   9  23   6 160
# 5     Texas A & M University-College Station  6 12 18 10  7  4  5 11  16  18  10   7  15   4   8   8 159
# 6           University of Michigan-Ann Arbor  8  5  3  3  8  9 12 11   7  11  13   9   8  11  13   9 140
# 7       University of California-Los Angeles  2  2  2  6  9  7  9  8   7  11  11   8   6  12  13  10 123
# 8                            Rice University  3  3  5 11  4  7  7 11   2   6   4   6   3   8   7   7  94

Upvotes: 4

Ian Campbell
Ian Campbell

Reputation: 24868

One approach might be to split on more than 1 space or on the start of the string followed by only spaces or number.

library(magrittr)
library(stringr)
example.list %>% 
  str_split(.,"^( |[0-9])+| {2,}") %>%
  do.call(rbind,.) %>%
  as.data.frame %>%
  select(-V1)
#                                          V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
#1 North Carolina State University at Raleigh 15  9 12 13 22 15 32  19  14  20  12  17  19  20  19  25 283
#2                      Iowa State University  9  8  5 11 14  4 11  13  14   9  15  28  14   9  18  27 209
#3            University of Wisconsin-Madison  5  6 14  9 20 13 15  12  13   9  13  10  13  24  15  17 208
#4                       Stanford University* 10 12 14  6  9 10  5   9  13   7  13  10   4   9  23   6 160
#5     Texas A & M University-College Station  6 12 18 10  7  4  5  11  16  18  10   7  15   4   8   8 159
#6           University of Michigan-Ann Arbor  8  5  3  3  8  9 12  11   7  11  13   9   8  11  13   9 140
#7       University of California-Los Angeles  2  2  2  6  9  7  9   8   7  11  11   8   6  12  13  10 123
#8                            Rice University  3  3  5 11  4  7  7  11   2   6   4   6   3   8   7   7  94

Upvotes: 3

Related Questions