Reputation: 129
Here's a substring I have
> substring(reut2.000[4,], regexpr(">",reut2.000[3,]) + 1)
[1] "<D>el-salvador</D><D>usa</D><D>uruguay</D></PLACES>"
And I would like to extract all the characters between <D>
and </D>
.
In this case, the output will be
"el-salvador","usa","uruguay"
So far I have tried
gsub(".*<D>\\s*|</D>.*", "", tmp)
where tmp
is the substring, and it returns "uruguay"
.
How do I modify it so that it returns all the places?
Upvotes: 0
Views: 89
Reputation: 2126
Another option using gsub:
# Swap every <D>, </D> and </PLACES> tag for a space, split on spaces,
# then discard the empty strings left where tags sat next to each other.
temp <- "<D>el-salvador</D><D>usa</D><D>uruguay</D></PLACES>"
temp <- strsplit(gsub("<D>|</D>|</PLACES>", " ", temp), split = " ")[[1]]
temp <- temp[nzchar(temp)]
Upvotes: 0
Reputation: 78832
You have an XML file (the linked file is likely exactly the one you have, too). Note that said link is to an example file from the tmparallel
package and there are many places in that package that have code that works with it.
Work with XML as XML. Do not regex it.
xdf$places
in the following snippet has what you are looking for but since this is likely a file being used in class on text mining, you may eventually need all the other bits extracted into the data frame.
library(xml2)
library(tidyverse)
# Fetch the example Reuters XML file and parse it.
# NOTE: download.file() does not create directories, so "~/Data" must
# already exist.
download.file(
  "https://raw.githubusercontent.com/noahhl/tmparallel/master/pkg/inst/texts/reuters-21578.xml",
  "~/Data/reuters-21578.xml"
)
reut <- read_xml("~/Data/reuters-21578.xml")

# Build a one-row tibble per <REUTERS> node; map_df() row-binds them.
text_df <- xml_find_all(reut, "//REUTERS") %>%
  map_df(~{
    # The node's XML attributes become the first columns of the row.
    # as_tibble() replaces as_data_frame(), which is deprecated in
    # tibble >= 2.0.
    xdf <- xml_attrs(.x) %>%
      as.list() %>%
      as_tibble()

    xdf$date <- xml_find_first(.x, ".//DATE") %>% xml_text(trim = TRUE)

    # Each of these elements can hold several <D> children, so the values
    # are stored as list-columns (one character vector per row).
    #### NOTE THAT THIS FOLLOWING LINE IS THE DATA YOU ASKED FOR IN THE EXAMPLE
    xdf$places <- list(xml_find_all(.x, ".//PLACES/D") %>% xml_text(trim = TRUE))
    xdf$people <- list(xml_find_all(.x, ".//PEOPLE/D") %>% xml_text(trim = TRUE))
    xdf$orgs <- list(xml_find_all(.x, ".//ORGS/D") %>% xml_text(trim = TRUE))
    xdf$exchanges <- list(xml_find_all(.x, ".//EXCHANGES/D") %>% xml_text(trim = TRUE))
    xdf$companies <- list(xml_find_all(.x, ".//COMPANIES/D") %>% xml_text(trim = TRUE))

    # Single-valued text fields: only the first matching node is read.
    xdf$uknown <- xml_find_first(.x, ".//UNKNOWN") %>% xml_text(trim = TRUE)
    xdf$text_title <- xml_find_first(.x, ".//TEXT/TITLE") %>% xml_text(trim = TRUE)
    xdf$text_dateline <- xml_find_first(.x, ".//TEXT/DATELINE") %>% xml_text(trim = TRUE)
    xdf$text_body <- xml_find_first(.x, ".//TEXT/BODY") %>% xml_text(trim = TRUE)

    xdf
  })
Output:
text_df
## # A tibble: 10 x 15
## TOPICS LEWISSPLIT CGISPLIT OLDID NEWID date places people orgs
## <chr> <chr> <chr> <chr> <chr> <chr> <list> <list> <lis>
## 1 YES TRAIN TRAINING… 5544 1 26-FEB-1… <chr [… <chr [… <chr…
## 2 NO TRAIN TRAINING… 5545 2 26-FEB-1… <chr [… <chr [… <chr…
## 3 NO TRAIN TRAINING… 5546 3 26-FEB-1… <chr [… <chr [… <chr…
## 4 NO TRAIN TRAINING… 5547 4 26-FEB-1… <chr [… <chr [… <chr…
## 5 YES TRAIN TRAINING… 5548 5 26-FEB-1… <chr [… <chr [… <chr…
## 6 YES TRAIN TRAINING… 5549 6 26-FEB-1… <chr [… <chr [… <chr…
## 7 NO TRAIN TRAINING… 5550 7 26-FEB-1… <chr [… <chr [… <chr…
## 8 YES TRAIN TRAINING… 5551 8 26-FEB-1… <chr [… <chr [… <chr…
## 9 YES TRAIN TRAINING… 5552 9 26-FEB-1… <chr [… <chr [… <chr…
## 10 YES TRAIN TRAINING… 5553 10 26-FEB-1… <chr [… <chr [… <chr…
## # ... with 6 more variables: exchanges <list>, companies <list>,
## # uknown <chr>, text_title <chr>, text_dateline <chr>, text_body <chr>
glimpse(text_df)
## Observations: 10
## Variables: 15
## $ TOPICS <chr> "YES", "NO", "NO", "NO", "YES", "YES", "NO", "YE...
## $ LEWISSPLIT <chr> "TRAIN", "TRAIN", "TRAIN", "TRAIN", "TRAIN", "TR...
## $ CGISPLIT <chr> "TRAINING-SET", "TRAINING-SET", "TRAINING-SET", ...
## $ OLDID <chr> "5544", "5545", "5546", "5547", "5548", "5549", ...
## $ NEWID <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"
## $ date <chr> "26-FEB-1987 15:01:01.79", "26-FEB-1987 15:02:20...
## $ places <list> [<"el-salvador", "usa", "uruguay">, "usa", "usa...
## $ people <list> [<>, <>, <>, <>, <>, <>, <>, <>, <>, <>]
## $ orgs <list> [<>, <>, <>, <>, <>, <>, <>, <>, <>, <>]
## $ exchanges <list> [<>, <>, <>, <>, <>, <>, <>, <>, <>, <>]
## $ companies <list> [<>, <>, <>, <>, <>, <>, <>, <>, <>, <>]
## $ uknown <chr> "C T\nf0704reute\nu f BC-BAHIA-COCOA-REVIEW 02...
## $ text_title <chr> "BAHIA COCOA REVIEW", "STANDARD OIL <SRD> TO FOR...
## $ text_dateline <chr> "SALVADOR, Feb 26 -", "CLEVELAND, Feb 26 -", "HO...
## $ text_body <chr> "Showers continued throughout the week in\nthe B...
str(head(text_df, 2))
## Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 15 variables:
## $ TOPICS : chr "YES" "NO"
## $ LEWISSPLIT : chr "TRAIN" "TRAIN"
## $ CGISPLIT : chr "TRAINING-SET" "TRAINING-SET"
## $ OLDID : chr "5544" "5545"
## $ NEWID : chr "1" "2"
## $ date : chr "26-FEB-1987 15:01:01.79" "26-FEB-1987 15:02:20.00"
## $ places :List of 2
## ..$ : chr "el-salvador" "usa" "uruguay"
## ..$ : chr "usa"
## $ people :List of 2
## ..$ : chr
## ..$ : chr
## $ orgs :List of 2
## ..$ : chr
## ..$ : chr
## $ exchanges :List of 2
## ..$ : chr
## ..$ : chr
## $ companies :List of 2
## ..$ : chr
## ..$ : chr
## $ uknown : chr "C T\nf0704reute\nu f BC-BAHIA-COCOA-REVIEW 02-26 0105" "F Y\nf0708reute\nd f BC-STANDARD-OIL-<SRD>-TO 02-26 0082"
## $ text_title : chr "BAHIA COCOA REVIEW" "STANDARD OIL <SRD> TO FORM FINANCIAL UNIT"
## $ text_dateline: chr "SALVADOR, Feb 26 -" "CLEVELAND, Feb 26 -"
## $ text_body : chr "Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary an"| __truncated__ "Standard Oil Co and BP North America\nInc said they plan to form a venture to manage the money market\nborrowin"| __truncated__
Upvotes: 4
Reputation: 521794
Here is one option using gregexpr
and regmatches
to capture all matches in your text:
# Lazily match whatever lies between each <D> and the next </D>; the
# lookbehind/lookahead (hence perl = TRUE) keep the tags out of the match.
input <- "<D>el-salvador</D><D>usa</D><D>uruguay</D></PLACES>"
m <- gregexpr(pattern = "(?<=<D>).*?(?=</D>)", text = input, perl = TRUE)
# regmatches() returns one element per input string; take the first.
regmatches(x = input, m = m)[[1L]]
[1] "el-salvador" "usa" "uruguay"
Note that it is generally not advisable to use regex to parse HTML/XML or similar content. One reason for this is that there could be nested tags, causing a simple regex to break.
Upvotes: 1