Reputation: 43
I have the example HTML below and would like to turn it into a data frame like the one shown after it. Thank you very much for any ideas.
<a name="efficiant"></a><h3>Ingredient</h3>
<div id="product_detail"><ul>
<li>X1<ul>
<li>a</li>
<li>b</li>
<li>c</li>
</ul>
</li>
<li>X2<ul>
<li>a</li>
<li>b</li>
</ul>
</li>
<li>X3<ul>
<li>c</li>
<li>b</li>
</ul>
</li>
</ul>
</div>
column A column B
X1 a
X1 b
X1 c
X2 a
X2 b
X3 c
X3 b
Upvotes: 0
Views: 639
Reputation: 50678
I'm sure this can be optimised, but here is an rvest option using some CSS selectors to extract the nested `li` elements from within the `ul`s.
library(rvest)
library(tidyverse)
# Parse the document once and reuse it for both the labels and the values.
doc <- read_html(ss)
# Outer items: every <li> that directly contains a nested <ul>.
parents <- html_nodes(doc, xpath = "//li[ul]")
# Label of each outer item: its own first text node (e.g. "X1"). Reading the
# node directly means labels need not match a hard-coded "X<digit>" pattern.
nms <- parents %>%
  html_node(xpath = "text()") %>%
  html_text(trim = TRUE)
# Values of each outer item: the text of the <li> children of its nested <ul>.
val <- parents %>%
  map(~ html_nodes(.x, css = "ul > li") %>% html_text())
# Long format: one row per (value, label) pair.
stack(setNames(val, nms))
# values ind
#1 a X1
#2 b X1
#3 c X1
#4 a X2
#5 b X2
#6 c X3
#7 b X3
# Sample data: the HTML from the question, stored as a single string.
# `ss` is the input passed to read_html() by the extraction code, so it must
# be defined before that code is run.
ss <- '<a name="efficiant"></a><h3>Ingredient</h3>
<div id="product_detail"><ul>
<li>X1<ul>
<li>a</li>
<li>b</li>
<li>c</li>
</ul>
</li>
<li>X2<ul>
<li>a</li>
<li>b</li>
</ul>
</li>
<li>X3<ul>
<li>c</li>
<li>b</li>
</ul>
</li>
</ul>
</div>
'
Upvotes: 2
Reputation: 4130
If your HTML is in the correct format, you can do it like this:
library(rvest)
# Sample input: each <ul> is one group whose first <li> is the group label
# and whose remaining <li>s are the group's members.
html <- '<a name="efficiant"></a><h3>Ingredient</h3>
<div id="product_detail">
<ul>
<li>X1</li>
<li>a</li>
<li>b</li>
<li>c</li>
</ul>
<ul>
<li>X2</li>
<li>a</li>
<li>b</li>
</ul>
<ul>
<li>X3</li>
<li>c</li>
<li>b</li>
</ul>
</div>
'
page <- read_html(html)
uls <- html_nodes(page, "ul")
# Build one data frame per <ul>. The items are read from the <li> nodes
# themselves rather than by splitting the list's concatenated text on
# whitespace, so multi-word items stay intact and the result does not depend
# on whitespace between tags.
rows <- lapply(uls, function(ul) {
  items <- html_text(html_nodes(ul, xpath = "./li"), trim = TRUE)
  # A list needs a label plus at least one member to contribute any rows.
  if (length(items) < 2L) {
    return(NULL)
  }
  data.frame(A = rep(items[1], length(items) - 1L), B = items[-1])
})
# Bind once instead of growing `df` inside a loop; the leading empty data
# frame keeps `df` a data.frame even when no list contributes rows.
df <- do.call(rbind, c(list(data.frame()), rows))
output
A B
1 X1 a
2 X1 b
3 X1 c
4 X2 a
5 X2 b
6 X3 c
7 X3 b
Upvotes: 1