Reputation: 55
I have a text file in this rather horrendous HTML format:
A<b>Metabolism</b>
B
B <b>Overview</b>
C 01200 Carbon metabolism [PATH:bpe01200]
D BP3142 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1971 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1519 fba; fructose-1,6-bisphosphate aldolase K01624 FBA; fructose-bisphosphate aldolase, class II [EC:4.1.2.13]
D BP0801 tpiA; triosephosphate isomerase K01803 TPI; triosephosphate isomerase (TIM) [EC:5.3.1.1]
D BP1000 gap; glyceraldehyde-3-phosphate dehydrogenase K00134 GAPDH; glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]
I would like to parse this file into columns in R.
such as:
A,Metabolism
B,
B,Overview
C,01200,Carbon metabolism,Path,bpe01200
D,BP3142,pgi,glucose-6-phosphate isomerase,K01810,GPI,glucose-6-phosphate isomerase,[EC:5.3.1.9]
...
D,BP1000,gap,glyceraldehyde-3-phosphate dehydrogenase,K00134,GAPDH,glyceraldehyde 3-phosphate dehydrogenase,[EC:1.2.1.12]
The problem is that the delimiter changes in each part of the line. It seems to follow this pattern e.g
D BP1971 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
^Tab ^space^Semi colon ^tab ^space^semi colon
I can think of a not-so-smart way to do it: parsing one delimiter at a time. But does anyone have a smarter solution, or know of a tool that can interpret this format nicely?
I would really appreciate some help :)
Thanks
Upvotes: 1
Views: 144
Reputation: 4378
And here is a simpler version that extracts only the detail rows, using the same regex fragments combined into a single match:
# Sample input copied from the question: header lines start with A, B or C,
# detail rows start with D.
text <- "
A<b>Metabolism</b>
B
B <b>Overview</b>
C 01200 Carbon metabolism [PATH:bpe01200]
D BP3142 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1971 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1519 fba; fructose-1,6-bisphosphate aldolase K01624 FBA; fructose-bisphosphate aldolase, class II [EC:4.1.2.13]
D BP0801 tpiA; triosephosphate isomerase K01803 TPI; triosephosphate isomerase (TIM) [EC:5.3.1.1]
D BP1000 gap; glyceraldehyde-3-phosphate dehydrogenase K00134 GAPDH; glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]
"
library(stringr)

# Get the detail items (lines beginning with "D" followed by blanks).
# The pattern is anchored per line (multiline mode) so that a "D " occurring
# in the middle of some other line is not mistaken for a detail row, and so
# that a final line without a trailing newline is still picked up.
# "[ \\t]+" (rather than "\\s+") keeps the separator from running across a
# line break.
detail_pattern <- regex("^D[ \\t]+(.+)$", multiline = TRUE)
details <- str_match_all(text, detail_pattern)[[1]][, 2]
details

# One regex for the whole detail row. Capture groups, in order:
#   1 first token, 2 token before the first ";", 3 text up to the next id,
#   4 id token, 5 token before the second ";", 6 description, 7 last token.
pattern <- "([^\\s]+)\\s([^\\s]+);(.*)\\s([^\\s]+)\\s([^\\s]+);\\s(.*)\\s([^\\s]+)$"

# Column 1 of str_match() is the full match; keep only the capture groups.
# drop = FALSE keeps the result a matrix even when there is a single row.
trimws(str_match(details, pattern)[, -1, drop = FALSE])
#[,1] [,2] [,3] [,4] [,5]
#[1,] "BP3142" "pgi" "glucose-6-phosphate isomerase" "K01810" "GPI"
#[2,] "BP1971" "pgi" "glucose-6-phosphate isomerase" "K01810" "GPI"
#[3,] "BP1519" "fba" "fructose-1,6-bisphosphate aldolase" "K01624" "FBA"
#[4,] "BP0801" "tpiA" "triosephosphate isomerase" "K01803" "TPI"
#[5,] "BP1000" "gap" "glyceraldehyde-3-phosphate dehydrogenase" "K00134" "GAPDH"
# [,6] [,7]
#[1,] "glucose-6-phosphate isomerase" "[EC:5.3.1.9]"
#[2,] "glucose-6-phosphate isomerase" "[EC:5.3.1.9]"
#[3,] "fructose-bisphosphate aldolase, class II" "[EC:4.1.2.13]"
#[4,] "triosephosphate isomerase (TIM)" "[EC:5.3.1.1]"
#[5,] "glyceraldehyde 3-phosphate dehydrogenase" "[EC:1.2.1.12]"
Upvotes: 1
Reputation: 4378
# Sample input copied from the question: header lines start with A, B or C,
# detail rows start with D.
text <- "
A<b>Metabolism</b>
B
B <b>Overview</b>
C 01200 Carbon metabolism [PATH:bpe01200]
D BP3142 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1971 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1519 fba; fructose-1,6-bisphosphate aldolase K01624 FBA; fructose-bisphosphate aldolase, class II [EC:4.1.2.13]
D BP0801 tpiA; triosephosphate isomerase K01803 TPI; triosephosphate isomerase (TIM) [EC:5.3.1.1]
D BP1000 gap; glyceraldehyde-3-phosphate dehydrogenase K00134 GAPDH; glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]
"
library(stringr)

# Get the header item (first line beginning with "C" followed by blanks).
# Patterns are anchored per line (multiline mode) so a "C " or "D " in the
# middle of another line cannot match, and a missing final newline is fine.
header_pattern <- regex("^C[ \\t]+(.+)$", multiline = TRUE)
headers <- str_match(text, header_pattern)[, 2]
# Groups 2:4 of the match are: numeric id, title, bracketed tag.
header_items <- trimws(str_match(headers, "(\\d+)\\s+([^\\[]+)(.+)")[2:4])

# Get the detail items (lines beginning with "D" followed by blanks).
detail_pattern <- regex("^D[ \\t]+(.+)$", multiline = TRUE)
details <- str_match_all(text, detail_pattern)[[1]][, 2]

# Parse each item within detail:
# split on "; " into exactly three pieces per row. str_split_fixed() returns
# a character matrix, so as.data.frame() names the columns V1, V2, V3 and the
# code also works when there are zero detail rows.
items <- as.data.frame(
  str_split_fixed(details, ";\\s", n = 3),
  stringsAsFactors = FALSE
)

# Parse each part using pattern matches.
# V1 looks like "<id> <symbol>":
# capture () beginning of string ^ and all characters not whitespace [^\\s]+
items$V1A <- str_match(items$V1, "(^[^\\s]+)")[, 2]
# capture () end of string $ and a non-whitespace sequence [^\\s]+
items$V1B <- str_match(items$V1, "([^\\s]+)$")[, 2]

# V2 looks like "<description> <id> <symbol>":
# capture () beginning of string excluding two non-whitespace sequences at end $
items$V2A <- str_match(items$V2, "^(.+)\\s[^\\s]+\\s[^\\s]+$")[, 2]
# capture () non-whitespace sequence [^\\s]+ at end of string $
items$V2C <- str_match(items$V2, "([^\\s]+)$")[, 2]
# capture () second to last non-whitespace sequence [^\\s]+ at end of string $
items$V2B <- str_match(items$V2, "([^\\s]+)\\s[^\\s]+$")[, 2]

# V3 looks like "<description> <bracketed tag>":
# capture () beginning of string ^ excluding last non-whitespace sequence
items$V3A <- str_match(items$V3, "^(.+)\\s[^\\s]+$")[, 2]
# capture () non-whitespace sequence at end $
items$V3B <- str_match(items$V3, "([^\\s]+)$")[, 2]

# select & reorder
items <- items[, c("V1A", "V1B", "V2A", "V2B", "V2C", "V3A", "V3B")]
items
# V1A V1B V2A V2B V2C V3A V3B
#1 BP3142 pgi glucose-6-phosphate isomerase K01810 GPI glucose-6-phosphate isomerase [EC:5.3.1.9]
#2 BP1971 pgi glucose-6-phosphate isomerase K01810 GPI glucose-6-phosphate isomerase [EC:5.3.1.9]
#3 BP1519 fba fructose-1,6-bisphosphate aldolase K01624 FBA fructose-bisphosphate aldolase, class II [EC:4.1.2.13]
#4 BP0801 tpiA triosephosphate isomerase K01803 TPI triosephosphate isomerase (TIM) [EC:5.3.1.1]
#5 BP1000 gap glyceraldehyde-3-phosphate dehydrogenase K00134 GAPDH glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]
Upvotes: 1
Reputation: 12819
library(stringr)
library(purrr)

# Sample input with the tab separators written out explicitly as \t.
# The last line deliberately matches none of the patterns below.
file <- "A<b>Metabolism</b>
B
B <b>Overview</b>
C\t01200 Carbon metabolism [PATH:bpe01200]
D\tBP3142 pgi; glucose-6-phosphate isomerase\tK01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D\tBP1971 pgi; glucose-6-phosphate isomerase\tK01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D\tBP1519 fba; fructose-1,6-bisphosphate aldolase\tK01624 FBA; fructose-bisphosphate aldolase, class II [EC:4.1.2.13]
D\tBP0801 tpiA; triosephosphate isomerase\tK01803 TPI; triosephosphate isomerase (TIM) [EC:5.3.1.1]
D\tBP1000 gap; glyceraldehyde-3-phosphate dehydrogenase\tK00134 GAPDH; glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]
This line is to check behavior when parsing fails."
cat(file)

# Split the sample into one element per line. The text connection is closed
# explicitly so it is not left open.
con <- textConnection(file)
data <- readLines(con = con)
close(con)

# Pattern to capture "A<b>Metabolism</b>" for instance
pattern_1 <- "^(\\w+)\\h*<b>\\h*(\\w+)\\h*</b>\\h*$"
# Pattern to capture "B" for instance
pattern_2 <- "^(\\w+)$"
# Pattern to capture "C\t01200 Carbon metabolism [PATH:bpe01200]" for instance
pattern_3 <- "^(\\w+)\\t+(\\w+)\\s+([^\\[\\t;]*)\\h*(\\[[^\\]]*\\])$"
# Pattern to capture "D\tBP3142 pgi; glucose-6-phosphate isomerase\tK01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]" for instance
pattern_4 <- "^(\\w+)\\t+(\\w+)\\s+(\\w+);\\h*([^\\t]*)\\t+(\\w+)\\s+(\\w+);\\h*([^\\[]*)\\h*(\\[[^\\]]*\\])$"
# Some more explanations:
# Parens wrap groups to extract
# "\\w+" matches words
# "\\t+", "\\s+" or ";\\h*" are specific separators of OP's original data
# "([^\\t]*)" matches anything until the next tab separator
# Convoluted patterns such as "(\\[[^\\]]*\\])" extract the bracketed part,
# brackets included
# Note: the greedy description groups placed right before "\\h*" also take
# the blank preceding the bracket, so those captures keep a trailing space;
# wrap the result in trimws() if that is unwanted.

# Named list of the patterns, in the order they are tried.
patterns <- list(
  pattern_1 = pattern_1,
  pattern_2 = pattern_2,
  pattern_3 = pattern_3,
  pattern_4 = pattern_4
)

# A list of the data parsed 4 times, once for each pattern:
# each element is a character matrix whose first column flags whether the
# line matched, followed by the capture groups.
patterns %>%
  map(~ {
    extraction <- str_match(data, .x)
    cbind(match = !is.na(extraction[, 1]), extraction[, -1])
  })

# This is closer to your desired output: a list of [un]parsed rows:
data %>%
  map(function(line) {
    # Find the first pattern that matches. 0 if none does.
    # str_detect() is used so detection and extraction share stringr's regex
    # engine instead of mixing it with base grepl(perl = TRUE).
    pattern_index <- detect_index(patterns, function(p) str_detect(line, p))
    # If failed to parse, return original row as length 1 character vector.
    # Else return parsed row as character vector (full match dropped).
    if (pattern_index == 0L) {
      line
    } else {
      str_match(line, patterns[[pattern_index]])[-1]
    }
  })
Head of output looks like this:
list(c("A", "Metabolism"), "B", c("B", "Overview"), c("C", "01200",
"Carbon metabolism ", "[PATH:bpe01200]"), c("D", "BP3142", "pgi",
"glucose-6-phosphate isomerase", "K01810", "GPI", "glucose-6-phosphate isomerase ",
"[EC:5.3.1.9]"))
Upvotes: 2