Reputation: 118
I'm wondering if anyone has any tips/tricks on how to handle transforming data, like what is below:
library(tidyverse)
example.list = list(" 1 North Carolina State University at Raleigh 15 9 12 13 22 15 32 19 14 20 12 17 19 20 19 25 283",
" 2 Iowa State University 9 8 5 11 14 4 11 13 14 9 15 28 14 9 18 27 209",
" 3 University of Wisconsin-Madison 5 6 14 9 20 13 15 12 13 9 13 10 13 24 15 17 208",
" 4 Stanford University* 10 12 14 6 9 10 5 9 13 7 13 10 4 9 23 6 160",
" 5 Texas A & M University-College Station 6 12 18 10 7 4 5 11 16 18 10 7 15 4 8 8 159",
" 9 University of Michigan-Ann Arbor 8 5 3 3 8 9 12 11 7 11 13 9 8 11 13 9 140",
"10 University of California-Los Angeles 2 2 2 6 9 7 9 8 7 11 11 8 6 12 13 10 123",
"19 Rice University 3 3 5 11 4 7 7 11 2 6 4 6 3 8 7 7 94")
Into something like the output here:
example.list %>%
substring(3) %>%
str_replace_all("[^[:alnum:]]", " ") %>%
str_squish() %>%
strsplit(split = "(?<=[a-zA-Z])\\s*(?=[0-9])", perl = TRUE) %>%
unlist() %>%
matrix(ncol = 2, byrow = TRUE) %>%
data.frame() %>%
separate("X2",into = paste0("X",2:18),sep = " ")
The general pattern that needs extracted is all characters into its own column up to the first numbers, with all other columns separated by whitespace into other columns.
Things that would be interesting is whether most of this could be done in a single regex pattern or without it at all.
I'm just looking to improve string handling as I haven't worked with it much! The use-case here would be like trying to pull tabled data from a pdf/html into a data.frame.
Edit:
I appreciate all of the advice and different perspectives!
I realized I actually missed a few worthy-of-mention test cases:
example2.list = list(" 2 Iowa State University 9 8 5 11 14 4 11 13 14 9 15 28 14 9 18 27 209",
" 3 University of Wisconsin-Madison 5 6 14 9 20 13 15 12 13 9 13 10 13 24 15 17 208",
" 4 Stanford University* 10 12 14 6 9 10 5 9 13 7 13 10 4 9 23 6 160",
" 5 Texas A & M University-College Station 6 12 18 10 7 4 5 11 16 18 10 7 15 4 8 8 159",
" 9 University of Michigan-Ann Arbor 8 5 3 3 8 9 12 11 7 11 13 9 8 11 13 9 140",
"10 University of California-Los Angeles 2 2 2 6 9 7 9 8 7 11 11 8 6 12 13 10 123",
"19 Rice University 3 3 5 11 4 7 7 11 2 6 4 6 3 8 7 7 94",
"52 Bowling Green State University 0 0 0 0 0 1 5 2 2 2 4 7 3 4 4 3 37",
"55 University of New Mexico 4 2 3 1 3 0 5 3 2 1 1 2 3 2 3 0 35")
It doesn't actually come out as neatly as aligned.
Full dataset, slightly cleaned:
library(pdftools)
library(tidyverse)
data.loc = "https://ww2.amstat.org/misc/StatsPhD2003-MostRecent.pdf"
data.full =
pdf_text(data.loc) %>%
read_lines() %>%
head(-2) %>%
tail(-3) %>%
lapply(function(ele) if(ele == "") NULL else ele) %>%
compact()
Here was my second attempt:
library(tidyverse)
library(magrittr)
# Ignores column names
data.full[-1] %>%
# Removing excess whitepace
str_squish() %>%
# Removes index
str_remove("^\\s*\\d*\\s*") %>%
# Split on all whitespace occurring before digits
str_split("\\s+(?=\\d)") %>%
# Turn list into a matrix
unlist() %>%
matrix(ncol = 18, byrow = TRUE) %>%
# Handling variables names
set_colnames(value =
data.full[1] %>%
str_squish() %>%
str_split("\\s+(?=\\d)") %>%
unlist) %>%
as_tibble() %>%
# Transformating variables into numeric
type_convert()
Upvotes: 3
Views: 228
Reputation:
If you want the column names, too (which imo makes sense), use data.full
with something like this:
library(tidyverse)
data.full %>%
str_trim %>%
str_replace_all("\\s+(?=\\d)", ";") %>%
paste(collapse = "\n") %>%
read_csv2()
Output:
# A tibble: 78 x 18
`Statistics PhD… `2003` `2004` `2005` `2006` `2007` `2008` `2009` `2010` `2011` `2012`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 North Carolin… 15 9 12 13 22 15 32 19 14 20
2 2 Iowa State Un… 9 8 5 11 14 4 11 13 14 9
3 3 University of… 5 6 14 9 20 13 15 12 13 9
4 4 Stanford Univ… 10 12 14 6 9 10 5 9 13 7
5 5 Texas A & M U… 6 12 18 10 7 4 5 11 16 18
6 6 Pennsylvania … 6 9 8 1 8 9 10 10 14 9
7 7 Ohio State Un… 3 5 3 12 12 7 11 11 6 11
8 8 Purdue Univer… 6 5 12 8 8 7 10 12 9 9
9 9 University of… 8 5 3 3 8 9 12 11 7 11
10 10 University o… 2 2 2 6 9 7 9 8 7 11
# … with 68 more rows, and 7 more variables: `2013` <dbl>, `2014` <dbl>, `2015` <dbl>,
# `2016` <dbl>, `2017` <dbl>, `2018` <dbl>, `2003‐2018` <dbl>
Upvotes: 1
Reputation: 193667
Since the "width" of characters on each page is different in the PDF you've shared, you'll have to process the file page by page. Here's one approach:
library(pdftools)
library(tidyverse)
library(cgwtools)
data.loc = "https://ww2.amstat.org/misc/StatsPhD2003-MostRecent.pdf"
data.full =
pdf_text(data.loc) %>%
read_lines() %>%
head(-2) %>%
tail(-4)
data.full <- data.full[nzchar(data.full)]
# Split the file according to the number of characters per line
l <- split(data.full, nchar(data.full))
out =
bind_rows(lapply(l, function(x) {
temp <- do.call(rbind, strsplit(x, ""))
y <- which(colSums(temp == " ") == nrow(temp))
temp[, y[cumsum(seqle(y)$lengths)]] <- ","
read_csv(apply(temp, 1, paste, collapse = ""), col_names = FALSE)
})) %>%
arrange(X1)
out
## # A tibble: 78 x 19
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 North … 15 9 12 13 22 15 32 19 14 20 12 17 19
## 2 2 Iowa S… 9 8 5 11 14 4 11 13 14 9 15 28 14
## 3 3 Univer… 5 6 14 9 20 13 15 12 13 9 13 10 13
## 4 4 Stanfo… 10 12 14 6 9 10 5 9 13 7 13 10 4
## 5 5 Texas … 6 12 18 10 7 4 5 11 16 18 10 7 15
## 6 6 Pennsy… 6 9 8 1 8 9 10 10 14 9 12 9 17
## 7 7 Ohio S… 3 5 3 12 12 7 11 11 6 11 8 13 14
## 8 8 Purdue… 6 5 12 8 8 7 10 12 9 9 5 10 14
## 9 9 Univer… 8 5 3 3 8 9 12 11 7 11 13 9 8
## 10 10 Univer… 2 2 2 6 9 7 9 8 7 11 11 8 6
## # … with 68 more rows, and 4 more variables: X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>
Here's another approach, starting with data.full
:
library(splitstackshape)
data.full <- data.full[nzchar(data.full)]
out = tibble(data.full) %>%
extract(data.full, into = c("ind", "uni", "nums"), regex = "([0-9]+)([^0-9]+)(.*)") %>%
mutate(uni = trimws(uni))%>%
cSplit("nums", "\\s+", fixed = FALSE)
Since you've already loaded the tidyverse, just use:
read_fwf(unlist(example.list), fwf_empty(unlist(example.list)))
## # A tibble: 8 x 19
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 North C… 15 9 12 13 22 15 32 19 14 20 12 17
## 2 2 Iowa St… 9 8 5 11 14 4 11 13 14 9 15 28
## 3 3 Univers… 5 6 14 9 20 13 15 12 13 9 13 10
## 4 4 Stanfor… 10 12 14 6 9 10 5 9 13 7 13 10
## 5 5 Texas A… 6 12 18 10 7 4 5 11 16 18 10 7
## 6 9 Univers… 8 5 3 3 8 9 12 11 7 11 13 9
## 7 10 Univers… 2 2 2 6 9 7 9 8 7 11 11 8
## 8 19 Rice Un… 3 3 5 11 4 7 7 11 2 6 4 6
## # … with 5 more variables: X15 <dbl>, X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>
Upvotes: 3
Reputation: 389205
Using base R :
do.call(rbind, lapply(strsplit(substring(example.list, 3), "\\s{2,}"),
function(x) as.data.frame(t(x))))
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18
#1 North Carolina State University at Raleigh 15 9 12 13 22 15 32 19 14 20 12 17 19 20 19 25 283
#2 Iowa State University 9 8 5 11 14 4 11 13 14 9 15 28 14 9 18 27 209
#3 University of Wisconsin-Madison 5 6 14 9 20 13 15 12 13 9 13 10 13 24 15 17 208
#4 Stanford University* 10 12 14 6 9 10 5 9 13 7 13 10 4 9 23 6 160
#5 Texas A & M University-College Station 6 12 18 10 7 4 5 11 16 18 10 7 15 4 8 8 159
#6 University of Michigan-Ann Arbor 8 5 3 3 8 9 12 11 7 11 13 9 8 11 13 9 140
#7 University of California-Los Angeles 2 2 2 6 9 7 9 8 7 11 11 8 6 12 13 10 123
#8 Rice University 3 3 5 11 4 7 7 11 2 6 4 6 3 8 7 7 94
We could split the string where there are more than 2 whitespace, convert each list into one-row dataframe, and rbind
them.
Upvotes: 3
Reputation: 6529
Here is one approach that you could adopt:
library(magrittr)
library(data.table)
gsub("^...", "", example.list) %>%
tstrsplit(" {2,}", type.convert = TRUE, names = TRUE) %>%
as.data.frame()
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18
# 1 North Carolina State University at Raleigh 15 9 12 13 22 15 32 19 14 20 12 17 19 20 19 25 283
# 2 Iowa State University 9 8 5 11 14 4 11 13 14 9 15 28 14 9 18 27 209
# 3 University of Wisconsin-Madison 5 6 14 9 20 13 15 12 13 9 13 10 13 24 15 17 208
# 4 Stanford University* 10 12 14 6 9 10 5 9 13 7 13 10 4 9 23 6 160
# 5 Texas A & M University-College Station 6 12 18 10 7 4 5 11 16 18 10 7 15 4 8 8 159
# 6 University of Michigan-Ann Arbor 8 5 3 3 8 9 12 11 7 11 13 9 8 11 13 9 140
# 7 University of California-Los Angeles 2 2 2 6 9 7 9 8 7 11 11 8 6 12 13 10 123
# 8 Rice University 3 3 5 11 4 7 7 11 2 6 4 6 3 8 7 7 94
Upvotes: 4
Reputation: 24868
One approach might be to split on more than 1 space or on the start of the string followed by only spaces or number.
library(magrittr)
library(stringr)
example.list %>%
str_split(.,"^( |[0-9])+| {2,}") %>%
do.call(rbind,.) %>%
as.data.frame %>%
select(-V1)
# V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
#1 North Carolina State University at Raleigh 15 9 12 13 22 15 32 19 14 20 12 17 19 20 19 25 283
#2 Iowa State University 9 8 5 11 14 4 11 13 14 9 15 28 14 9 18 27 209
#3 University of Wisconsin-Madison 5 6 14 9 20 13 15 12 13 9 13 10 13 24 15 17 208
#4 Stanford University* 10 12 14 6 9 10 5 9 13 7 13 10 4 9 23 6 160
#5 Texas A & M University-College Station 6 12 18 10 7 4 5 11 16 18 10 7 15 4 8 8 159
#6 University of Michigan-Ann Arbor 8 5 3 3 8 9 12 11 7 11 13 9 8 11 13 9 140
#7 University of California-Los Angeles 2 2 2 6 9 7 9 8 7 11 11 8 6 12 13 10 123
#8 Rice University 3 3 5 11 4 7 7 11 2 6 4 6 3 8 7 7 94
Upvotes: 3