truehm2

Reputation: 11

R data scraping from multiple links

I have only just started using R to scrape webpages, and I am trying to scrape the information for individual projects from the Asian Development Bank's website using this link: https://www.adb.org/projects.

So far, I have managed to scrape the information on the link above and put the results from all 550+ pages into a dataframe. My code looks like this:

library(rvest)
library(dplyr)
library(ggmap)
library(leaflet)
library(RColorBrewer)
library(stringr)

url <-read_html("https://www.adb.org/projects")

#project title
pp_title <- url %>% 
  html_nodes(".item-title") %>%
  html_text()
table(pp_title)


#project dates 
project_dates <- url %>% 
  html_nodes(".item-meta") %>%
  html_text()
project_dates <- gsub("\\nStatus:", " ", project_dates)
project_dates <- gsub("\n", " ", project_dates)
table(project_dates)

dates <- sapply(strsplit(project_dates, ":"), "[", 2)

#project status
project_status <- sapply(strsplit(project_dates, ":"), "[", 1)
project_status <-gsub("Approval Date", " ", project_status)
project_status <- gsub(" ", "", project_status, fixed = TRUE)

#project number
project_number <- url %>% 
  html_nodes(".item-summary") %>%
  html_text()
project_number

#separate project number, country and sector 
sector <- sapply(strsplit(project_number, ";"), "[", 3)
sector
table(sector)

country <- sapply(strsplit(project_number, ";"), "[", 2)
table(country)

pp_number <- sapply(strsplit(project_number, ";"), "[", 1)
table(pp_number)

#url 
pp_url <- url %>% 
  html_nodes(".item-title a") %>%
  html_attr("href")
pp_url <- paste0("https://www.adb.org", pp_url)
pp_url

adb_pp <- data.frame(pp_title,dates,project_status, sector, country, pp_number, pp_url)
summary(adb_pp)

write.table(x=adb_pp,
            file='adb_pp.csv',
            sep=",",
            row.names = FALSE)


datalist = list()

for (i in 1:558){
  
  page_url <- paste0("https://www.adb.org/projects?page=", i)
  print(page_url)
  url <- read_html(page_url)

  #project title
  pp_title <- url %>% 
    html_nodes(".item-title") %>%
    html_text()
  table(pp_title)
  
  
  #project dates 
  project_dates <- url %>% 
    html_nodes(".item-meta") %>%
    html_text()
  project_dates <- gsub("\\nStatus:", " ", project_dates)
  project_dates <- gsub("\n", " ", project_dates)
  table(project_dates)
  
  dates <- sapply(strsplit(project_dates, ":"), "[", 2)
  
  #project status
  project_status <- sapply(strsplit(project_dates, ":"), "[", 1)
  project_status <-gsub("Approval Date", " ", project_status)
  project_status <- gsub(" ", "", project_status, fixed = TRUE)
  
  #project number
  project_number <- url %>% 
    html_nodes(".item-summary") %>%
    html_text()
  project_number
  
  #separate project number, country and sector 
  sector <- sapply(strsplit(project_number, ";"), "[", 3)
  sector
  table(sector)
  
  country <- sapply(strsplit(project_number, ";"), "[", 2)
  table(country)
  
  pp_number <- sapply(strsplit(project_number, ";"), "[", 1)
  table(pp_number)
  
  #url 
  pp_url <- url %>% 
    html_nodes(".item-title a") %>%
    html_attr("href")
  pp_url <- paste0("https://www.adb.org", pp_url)
  pp_url
  
  adb_pp <- data.frame(pp_title,dates,project_status, sector, country, pp_number, pp_url)
  
  datalist[[i]] <- adb_pp
  #sleep a second
  Sys.sleep(1)
}

full = do.call(rbind, datalist)
str(full)
View(full)

However, I can't seem to create a loop that will go through the collected links above and scrape the individual project-level information. I managed to scrape individual projects using RSelenium, but it's probably not the most efficient way.

library(tidyverse)
library(RSelenium)
library(netstat)
library(htmltab)
library(XML)


# start the server
rs_driver_object <- rsDriver(browser = 'chrome',
                             chromever = '100.0.4896.20',
                             verbose = FALSE,
                             port = free_port())

# create a client object
remDr <- rs_driver_object$client

# open a browser
remDr$open()

# navigate to website
remDr$navigate('https://www.adb.org/projects/55313-001/main')

doc <- htmlParse(remDr$getPageSource()[[1]])
table <- readHTMLTable(doc)
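
Once the page source has been captured, the browser and Selenium server can be shut down again; a minimal clean-up sketch using the same remDr and rs_driver_object created above:

# close the browser and stop the Selenium server
remDr$close()
rs_driver_object$server$stop()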

I checked multiple posts on this forum, but none of the methods seem to work for me. The links I scraped look like this:

> head(full$pp_url)
[1] "https://www.adb.org/projects/55313-001/main" "https://www.adb.org/projects/53354-003/main" "https://www.adb.org/projects/45007-013/main"
[4] "https://www.adb.org/projects/48186-009/main" "https://www.adb.org/projects/55319-001/main" "https://www.adb.org/projects/51126-005/main"

Upvotes: 1

Views: 415

Answers (1)

Nad Pat

Reputation: 3173

We can simply use html_table as an alternative to readHTMLTable, in combination with lapply, to loop through the links and extract the tables.

library(tidyverse)
library(rvest)

#vector of links
links = c("https://www.adb.org/projects/55313-001/main", "https://www.adb.org/projects/53354-003/main", "https://www.adb.org/projects/45007-013/main",
 "https://www.adb.org/projects/48186-009/main", "https://www.adb.org/projects/55319-001/main", "https://www.adb.org/projects/51126-005/main")

#helper function `f1` that reads a page and extracts all of its tables
f1 = function(x){ 
  x %>% read_html() %>% html_table()
}

#loop over the links; possibly() returns NA for any link that errors
df = lapply(links, possibly(f1, NA))
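
Each element of df is then the list of tables found on one project page (or NA if that link failed). A minimal sketch of inspecting the result, assuming the first link returned at least one table:

#name the results by their source link for easier lookup
names(df) <- links

#all tables scraped from the first project page
df[[1]]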

Upvotes: 0
