Reputation: 247
I am doing some tricky data cleaning. I have one dataset (first extract below) that is the output from the digitization of pdf tables. Unfortunately columns were not digitized properly. Sometimes, what shall be in column X3 ended up concatenated in column X2 with the last word of column X2...
What I am trying to do is to bring back what should be in column X3 to X3 and collapse the two rows in X2 together.
I have attached an extract of the output I am trying to create.
Any idea about how can I do this?
Thank you!
structure(list(X1 = c(111L, NA, 2L, NA, NA, 121L, NA, NA, 121L,
NA, NA, 141L, NA, NA, 141L, NA), X2 = structure(c(7L, 1L, 8L,
1L, 1L, 9L, 1L, 1L, 6L, 3L, 1L, 5L, 2L, 1L, 10L, 4L), .Label = c("",
"A - BWHITE", "ASMITH", "B - DBURNEY", "Garden Harris", "House M. Aba",
"House M. Bab", "House M. Cac", "Street M. Bak", "Villa Thomas"
), class = "factor"), X3 = structure(c(2L, 1L, 3L, 1L, 1L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "A",
"A - C", "D"), class = "factor")), class = "data.frame", row.names = c(NA,
-16L))
structure(list(X1 = c(111L, NA, 2L, NA, NA, 121L, NA, NA, 121L,
NA, NA, 141L, NA, NA, 141L), X2 = structure(c(4L, 1L, 5L, 1L,
1L, 6L, 1L, 1L, 3L, 1L, 1L, 2L, 1L, 1L, 7L), .Label = c("", "Garden Harris WHITE",
"House M. Aba SMITH", "House M. Bab", "House M. Cac", "Street M. Bak",
"Villa Thomas BURNEY"), class = "factor"), X3 = structure(c(2L,
1L, 4L, 1L, 1L, 6L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 1L, 5L), .Label = c("",
"A", "A - B", "A - C", "B - D", "D"), class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
Follow up question here: Cleaning extract_tables conditional merge rows, systematic extraction
Upvotes: 2
Views: 240
Reputation: 78917
Here is how we could do it:
Credits to MartinGal for the regex "(?<=[A-Z])[A-Z]+") (upvote!)
NA
lead
to move rows up in X3
conditional on NA
else notNA
in X1
str_extract
and regex "(?<=[A-Z])[A-Z]+"
-> combine this info with column X2
with str_c
and finally coalesce
both.str_remove
library(dyplr)
library(stringr)
df %>%
mutate(across(everything(), ~sub("^\\s*$", NA, .)),
X3= ifelse(is.na(X3), lead(X2), X3)) %>%
filter(!is.na(X1)) %>%
mutate(X2 = coalesce(str_c(X2," ", str_extract(X3, "(?<=[A-Z])[A-Z]+")), X2),
X3 = str_remove_all(X3, "(?<=[A-Z])[A-Z]+"))
Output:
X1 X2 X3
1 111 House M. Bab A
2 2 House M. Cac A - C
3 121 Street M. Bak D
4 121 House M. Aba SMITH A
5 141 Garden Harris WHITE A - B
6 141 Villa Thomas BURNEY B - D
Upvotes: 2
Reputation: 16978
You could use tidyverse
:
library(tidyr)
library(stringr)
library(dplyr)
df %>%
filter(X2 != "") %>%
mutate(
extract_name = lead(str_extract(X2, "(?<=[A-Z])[A-Z]+")),
extract_part = lead(str_extract(X2, "[A-Z](\\s-\\s[A-Z]){0,1}(?=[A-Z]+)")),
new_X2 = ifelse(!is.na(extract_name), paste(X2, extract_name), as.character(X2)),
new_X3 = ifelse(X3 != "", as.character(X3), extract_part)
) %>%
drop_na(X1) %>%
select(-extract_name, -extract_part)
which returns
X1 X2 X3 new_X2 new_X3
1 111 House M. Bab A House M. Bab A
2 2 House M. Cac A - C House M. Cac A - C
3 121 Street M. Bak D Street M. Bak D
4 121 House M. Aba House M. Aba SMITH A
5 141 Garden Harris Garden Harris WHITE A - B
6 141 Villa Thomas Villa Thomas BURNEY B - D
Note: I don't think this approach is really stable regarding the regex used. For readability I filtered out some annoying rows containing NA
and empty strings, you should remove those parts if necessary.
Upvotes: 4
Reputation: 5788
This is a yucky one:
# Retype the data and nullify empty values;
# use X1 as a key: intermediateResult => data.frame
intermediateResult <- data.frame(
lapply(
transform(
replace(df, df == "", NA_character_),
X1 = na.omit(X1)[cumsum(!is.na(X1))]
),
as.character
)
)
# Re-structure the data:
# interemdiateResult2 => data.frame
intermediateResult2 <- do.call(
rbind,
Filter(
function(y){
nrow(y) > 0
},
Map(
function(x){
z <- x[!is.na(x$X2),]
if(nrow(z) > 1 & is.na(z$X3[1])){
z$X3[1] <- z$X2[2]
head(z, 1)
}else{
z
}
},
with(
intermediateResult,
split(
intermediateResult,
paste(
X1,
cumsum(
(is.na(X2)
)
),
sep = " - "
)
)
)
)
)
)
# Regex it and hope for the best:
# result => data.frame
result <- data.frame(
transform(
intermediateResult2,
X2 = paste0(
X2,
ifelse(
(nchar(X3) == 1 | grepl("^\\w\\s+-\\s+\\w$", X3)),
"",
ifelse(
!(grepl("^\\w\\s+-\\s+\\w", X3)),
paste0(" ", substr(X3, 2, nchar(X3))),
paste0(" ", gsub("(^\\w\\s+-\\s+\\w)(.*)", "\\2", X3))
)
)
),
X3 = ifelse(
nchar(X3) == 1 | grepl("^\\w\\s+-\\s+\\w$", X3) ,
X3,
ifelse(
!(grepl("^\\w\\s+-\\s+\\w", X3)),
substr(X3, 1, 1),
gsub("(^\\w\\s+-\\s+\\w)(.*)", "\\1", X3)
)
)
),
row.names = NULL
)
Upvotes: 1