Reputation: 25
I am trying to scrape a website and build a data frame from it, combining specific information from the main results pages and the sub-pages linked there. I have written code that extracts the data I need from one main page and its linked sub-pages, and it works.
However, I still have to scrape 407 more pages and collect the same data from all of them to complete the task.
Since the URLs are dynamic (only the page number at the end changes), I thought the best approach would be to wrap the process in a function and repeat it page by page up to the last one, but it is not working.
Here's my code.
page <- paste0("https://pubmed.ncbi.nlm.nih.gov/?term=((((((%E2%80%98Food%20Supply%E2%80%99%20(MeSH))%20OR%20%E2%80%98Food%20Storage%E2%80%99%20(MeSH))%20OR%20%E2%80%98Hunger%E2%80%99(MeSH)%20OR%20food%20security%20OR%20food%20insecurity%20OR%20household%20food%20security%20OR%20global%20food%20security)%20OR%20household%20food%20insecurity)))%20AND%20((%E2%80%98Prevalence%E2%80%99%20(MeSH))%20OR%20%E2%80%98Cross-Sectional%20Studies%E2%80%99%20(MeSH)%20OR%20cross-sectional%20study%20OR%20Prevalence%20Studies%20OR%20prevalence%20study%20OR%20Cross-Sectional%20Analyses%20OR%20CrossSectional%20Analysis%20OR%20Cross%20Sectional%20Analysis%20OR%20Cross%20Sectional%20Analyses)&filter=lang.english&filter=lang.portuguese&page=", 2:407)
function(i) {
  pubmed <-
    page %>%
    html_elements(".docsum-content") %>%
    map_dfr(~ tibble(
      title = .x %>%
        html_element(".docsum-title") %>%
        html_text2(),
      authors = .x %>%
        html_element(".full-authors") %>%
        html_text2(),
      PMID = .x %>%
        html_element(".docsum-pmid") %>%
        html_text2(),
      synopsis = .x %>%
        html_element(".full-view-snippet") %>%
        html_text2(),
      link = .x %>%
        html_element(".docsum-title") %>%
        html_attr("href") %>%
        str_c("https://pubmed.ncbi.nlm.nih.gov", .)
    ))
  get_abstract <- function(link) {
    cat("Scraping:", link, "\n")
    link %>%
      read_html() %>%
      html_elements(".abstract-content.selected") %>%
      html_text2()
  }
}
pubmed %>%
  mutate(
    abstract = map_chr(link, get_abstract)
  )
Upvotes: 0
Views: 60
Reputation: 5721
You were close.
The code below wraps the scraping of one results page in a function, maps it over every page, and collects everything into the result data frame:
library(rvest)
library(tidyverse)

# Build one URL per results page; only the page number at the end changes.
page <-
  paste0(
    "https://pubmed.ncbi.nlm.nih.gov/?term=((((((%E2%80%98Food%20Supply%E2%80%99%20(MeSH))%20OR%20%E2%80%98Food%20Storage%E2%80%99%20(MeSH))%20OR%20%E2%80%98Hunger%E2%80%99(MeSH)%20OR%20food%20security%20OR%20food%20insecurity%20OR%20household%20food%20security%20OR%20global%20food%20security)%20OR%20household%20food%20insecurity)))%20AND%20((%E2%80%98Prevalence%E2%80%99%20(MeSH))%20OR%20%E2%80%98Cross-Sectional%20Studies%E2%80%99%20(MeSH)%20OR%20cross-sectional%20study%20OR%20Prevalence%20Studies%20OR%20prevalence%20study%20OR%20Cross-Sectional%20Analyses%20OR%20CrossSectional%20Analysis%20OR%20Cross%20Sectional%20Analysis%20OR%20Cross%20Sectional%20Analyses)&filter=lang.english&filter=lang.portuguese&page=",
    2:407
  )

# Scrape one results page: one row per search result, plus its abstract.
scrap <- function(page) {
  pubmed <-
    read_html(page) %>%
    html_elements(".docsum-content") %>%
    map_dfr(
      ~ tibble(
        title = .x %>%
          html_element(".docsum-title") %>%
          html_text2(),
        authors = .x %>%
          html_element(".full-authors") %>%
          html_text2(),
        PMID = .x %>%
          html_element(".docsum-pmid") %>%
          html_text2(),
        synopsis = .x %>%
          html_element(".full-view-snippet") %>%
          html_text2(),
        link = .x %>%
          html_element(".docsum-title") %>%
          html_attr("href") %>%
          str_c("https://pubmed.ncbi.nlm.nih.gov", .)
      )
    )
  # Follow each result link and pull the abstract text.
  get_abstract <- function(link) {
    cat("Scraping:", link, "\n")
    link %>%
      read_html() %>%
      html_elements(".abstract-content.selected") %>%
      html_text2() %>%
      # collapse to a single string so map_chr() cannot fail on
      # records with no abstract (or a multi-part one)
      paste(collapse = " ")
  }
  pubmed %>%
    mutate(abstract = map_chr(link, get_abstract))
}

# Loop over all pages and row-bind everything into one data frame.
result <- map_dfr(page, scrap)
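Since this run makes a few thousand HTTP requests, you may also want to pause between calls and tolerate the occasional failing page. A minimal sketch using purrr's slowly() and possibly() (both attached by library(tidyverse); the one-second delay is an arbitrary choice, not anything PubMed mandates):

# Wrap scrap so each call waits ~1 s and a failing page yields NULL
# instead of aborting the whole run; map_dfr() simply skips NULLs.
scrap_safely <- slowly(possibly(scrap, otherwise = NULL), rate = rate_delay(1))
result <- map_dfr(page, scrap_safely)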
Upvotes: 1