Reputation: 11
I have only just started using R to scrape web pages, and I am trying to scrape the information for individual projects from the Asian Development Bank's website using this link: https://www.adb.org/projects.
So far, I have managed to scrape the information from the link above and put the results from all 550+ pages into a data frame. My code looks like this:
library(rvest)
library(dplyr)
library(ggmap)
library(leaflet)
library(RColorBrewer)
library(stringr)
url <- read_html("https://www.adb.org/projects")
#project title
pp_title <- url %>%
  html_nodes(".item-title") %>%
  html_text()
table(pp_title)
#project dates
project_dates <- url %>%
  html_nodes(".item-meta") %>%
  html_text()
project_dates <- gsub("\\nStatus:", " ", project_dates)
project_dates <- gsub("\n", " ", project_dates)
project_dates <- gsub("", " ", project_dates)
table(project_dates)
dates <- sapply(strsplit(project_dates, ":"), "[", 2)
#project status
project_status <- sapply(strsplit(project_dates, ":"), "[", 1)
project_status <- gsub("Approval Date", " ", project_status)
project_status <- gsub(" ", "", project_status, fixed = TRUE)
#project number
project_number <- url %>%
  html_nodes(".item-summary") %>%
  html_text()
project_number
#separate project number, country and sector
sector <- sapply(strsplit(project_number, ";"), "[", 3)
sector
table(sector)
country <- sapply(strsplit(project_number, ";"), "[", 2)
table(country)
pp_number <- sapply(strsplit(project_number, ";"), "[", 1)
pp_number
table(pp_number)
#url
pp_url <- url %>%
  html_nodes(".item-title a") %>%
  html_attr("href")
pp_url <- paste0("https://www.adb.org", pp_url)
pp_url
adb_pp <- data.frame(pp_title, dates, project_status, sector, country, pp_number, pp_url)
summary(adb_pp)
write.table(x = adb_pp,
            file = 'adb_pp.csv',
            sep = ",",
            row.names = FALSE)
datalist = list()
for (i in 1:558){
  print(paste("https://www.adb.org/projects?page=", toString(i), sep = ""))
  url <- read_html(paste("https://www.adb.org/projects?page=", toString(i), sep = ""))
  #project title
  pp_title <- url %>%
    html_nodes(".item-title") %>%
    html_text()
  table(pp_title)
  #project dates
  project_dates <- url %>%
    html_nodes(".item-meta") %>%
    html_text()
  project_dates <- gsub("\\nStatus:", " ", project_dates)
  project_dates <- gsub("\n", " ", project_dates)
  project_dates <- gsub("", " ", project_dates)
  table(project_dates)
  dates <- sapply(strsplit(project_dates, ":"), "[", 2)
  #project status
  project_status <- sapply(strsplit(project_dates, ":"), "[", 1)
  project_status <- gsub("Approval Date", " ", project_status)
  project_status <- gsub(" ", "", project_status, fixed = TRUE)
  #project number
  project_number <- url %>%
    html_nodes(".item-summary") %>%
    html_text()
  project_number
  #separate project number, country and sector
  sector <- sapply(strsplit(project_number, ";"), "[", 3)
  sector
  table(sector)
  country <- sapply(strsplit(project_number, ";"), "[", 2)
  table(country)
  pp_number <- sapply(strsplit(project_number, ";"), "[", 1)
  pp_number
  table(pp_number)
  #url
  pp_url <- url %>%
    html_nodes(".item-title a") %>%
    html_attr("href")
  pp_url <- paste0("https://www.adb.org", pp_url)
  pp_url
  adb_pp <- data.frame(pp_title, dates, project_status, sector, country, pp_number, pp_url)
  datalist[[i]] <- adb_pp
  #sleep a second
  Sys.sleep(1)
}
full = do.call(rbind, datalist)
str(full)
View(full)
However, I can't seem to create a loop that goes through the collected links above and scrapes the individual project-level information. I managed to scrape individual projects using RSelenium, but it's probably not the most efficient way.
library(tidyverse)
library(RSelenium)
library(netstat)
library(htmltab)
library(XML)
# start the server
rs_driver_object <- rsDriver(browser = 'chrome',
                             chromever = '100.0.4896.20',
                             verbose = FALSE,
                             port = free_port())
# create a client object
remDr <- rs_driver_object$client
# open a browser
remDr$open()
# navigate to website
remDr$navigate('https://www.adb.org/projects/55313-001/main')
doc <- htmlParse(remDr$getPageSource()[[1]])
table <- readHTMLTable(doc)
I checked multiple posts on this forum, but none of the methods seem to work for me. The links I scraped look like this:
> head(full$pp_url)
[1] "https://www.adb.org/projects/55313-001/main" "https://www.adb.org/projects/53354-003/main" "https://www.adb.org/projects/45007-013/main"
[4] "https://www.adb.org/projects/48186-009/main" "https://www.adb.org/projects/55319-001/main" "https://www.adb.org/projects/51126-005/main"
Upvotes: 1
Views: 415
Reputation: 3173
We can simply use html_table as an alternative to readHTMLTable, in combination with lapply, to loop through the links and extract the tables.
library(tidyverse)
library(rvest)
#vector of links
links = c("https://www.adb.org/projects/55313-001/main", "https://www.adb.org/projects/53354-003/main", "https://www.adb.org/projects/45007-013/main",
          "https://www.adb.org/projects/48186-009/main", "https://www.adb.org/projects/55319-001/main", "https://www.adb.org/projects/51126-005/main")
#first create a function `f1` that reads a page and returns all of its tables
f1 = function(x){
  x %>% read_html() %>% html_table()
}
#loop over the links, skipping errors: possibly() returns NA for any link that fails
df = lapply(links, possibly(f1, NA))
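Each element of df is then either a list of tibbles, one per HTML table found on that project page, or NA if the request failed. As a minimal follow-up sketch (assuming each project page contains at least one table), you could label the results by their source URL and drop the failures before looking at individual tables:

#name the results by their source URL so individual projects are easy to look up
names(df) <- links
#keep only the links that were scraped successfully (possibly() returned NA for failures)
df_ok <- df[!is.na(df)]
#inspect the first table extracted from the first successful link
df_ok[[1]][[1]]

possibly() is used so that a single broken or timed-out link does not abort the whole loop; you can swap the NA default for NULL or an empty list if that is easier to filter downstream.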
Upvotes: 0