Reputation: 25
I am writing a web-scraper in R to extract details about tax authorities. So far I have this code, which works perfectly for Argentina; however, it requires each URL to be typed in manually. The site doesn't have a sitemap or a robots.txt, so is there another way to locate the appropriate URLs automatically? Thanks in advance.
library(RCurl)
library(XML)
library(rvest)

# INFORMATION ----
# Scrape the Argentina entry (name + link) from the worldwide tax-authority
# index, then extract the telephone and e-mail sections of the AFIP contact
# page. Each section runs from its <strong> heading down to the line before
# the next <strong> heading.
url <- "http://oceantax.co.uk/links/tax-authorities-worldwide.html"
pg <- read_html(url)  # rvest::html() is deprecated; read_html() replaces it

# The original assigned `country` twice (XPath, then CSS) and only the second
# result survived; a single CSS selector is sufficient.
country <- pg %>% html_nodes("a[title~=Argentina]")
name <- as.matrix(country %>% html_text())
url <- as.matrix(country %>% html_attr("href"))

arginfo <- readLines("http://www.afip.gob.ar/contacto/")
# Line numbers of every <strong> heading, kept as a plain integer vector so
# the positional lookups below stay simple.
strong <- grep("<strong>", arginfo)

# Telephone section.
# BUG FIX: the original used which(grepl(strongphone, strong)), which treats
# the line number as a regex pattern — e.g. 12 would also match lines 112 or
# 120. match() locates the exact position instead. (Assumes the heading
# occurs exactly once on the page, as the original code did.)
strongphone <- grep("<strong> Centro de Información Telefónica</strong>", arginfo)
rowphone <- match(strongphone, strong)
# Guard: if this is the last heading, read through to end-of-file instead of
# indexing past the end of `strong`.
strongphonend <- if (rowphone < length(strong)) strong[rowphone + 1] - 1 else length(arginfo)
phone <- as.matrix(arginfo[strongphone:strongphonend])

# Email section: same heading-to-next-heading slice.
strongemail <- grep("<strong>Tema:", arginfo)
rowemail <- match(strongemail, strong)
strongemailend <- if (rowemail < length(strong)) strong[rowemail + 1] - 1 else length(arginfo)
email <- as.matrix(arginfo[strongemail:strongemailend])

# Stack everything into one matrix and write it to the "Information" sheet.
info <- rbind(name, url, phone, email)
library(XLConnect)
writeWorksheetToFile(file = "z:/My Documents/Impendulo/Argentina.xlsx",
                     data = info, sheet = "Information")
# TAX DETAILS ----
# Pull the two "Operaciones de Seguro" sections from the Argentine tax-system
# page; each section runs from its <strong> heading to the line before the
# next <strong> heading.
argtax <- readLines("http://www.afip.gob.ar/futCont/otros/sistemaTributarioArgentino")
# Line numbers of all <strong> headings (plain integer vector).
strong <- grep("<strong>", argtax)

# First heading ("...Seguro.</strong>"). fixed = TRUE so the literal dot is
# not treated as the regex "any character" metacharacter, which could match
# unintended headings.
strong1starts <- grep("<strong>Operaciones de Seguro.</strong>", argtax, fixed = TRUE)
# BUG FIX: the original used which(grepl(strong1starts, strong)), which treats
# the line number as a regex — e.g. 12 also matches 112 or 120. match() finds
# the exact position. (Assumes the heading occurs once, as the original did.)
rowst1st <- match(strong1starts, strong)
# Guard against this being the last heading on the page.
strong1ends <- if (rowst1st < length(strong)) strong[rowst1st + 1] - 1 else length(argtax)
data1 <- as.matrix(argtax[strong1starts:strong1ends])

# Second heading (no trailing dot), same slicing logic.
strong2starts <- grep("<strong>Operaciones de Seguro</strong>", argtax, fixed = TRUE)
rowst2st <- match(strong2starts, strong)
strong2ends <- if (rowst2st < length(strong)) strong[rowst2st + 1] - 1 else length(argtax)
data2 <- as.matrix(argtax[strong2starts:strong2ends])

# Combine both sections so they can be exported to Excel in one piece.
data <- rbind(data1, data2)
library(XLConnect)
writeWorksheetToFile(file = "z:/My Documents/Impendulo/Argentina.xlsx",
                     data = data, sheet = "Tax Details")
# PAYMENT DETAILS ----
# Extract three subsections (marked by the "contenidoSubTitle" CSS class)
# from the same AFIP page; each runs from its heading to the line before the
# next "contenidoSubTitle" heading.
argpaym <- readLines("http://www.afip.gob.ar/futCont/otros/sistemaTributarioArgentino")
# Line numbers of every subtitle heading (plain integer vector).
stron <- grep("contenidoSubTitle", argpaym)

# Determination and Collection of Taxes.
stron1start <- grep('"contenidoSubTitle">Determinación y Percepción de Impuestos', argpaym)
# BUG FIX: the original used which(grepl(stron1start, stron)), which treats
# the line number as a regex — e.g. 12 also matches 112 or 120. match() gives
# the exact position. (Assumes each heading occurs once, as the original did.)
rowst1s <- match(stron1start, stron)
# Guard against the heading being the last one on the page.
stron1end <- if (rowst1s < length(stron)) stron[rowst1s + 1] - 1 else length(argpaym)
dat1 <- as.matrix(argpaym[stron1start:stron1end])

# Interest, Illicit Acts and Penalties.
stron2start <- grep('"contenidoSubTitle">Interés, ilícitos y sanciones', argpaym)
rowst2s <- match(stron2start, stron)
stron2end <- if (rowst2s < length(stron)) stron[rowst2s + 1] - 1 else length(argpaym)
dat2 <- as.matrix(argpaym[stron2start:stron2end])

# Prescription (statute of limitations).
stron3start <- grep('"contenidoSubTitle">Prescripción', argpaym)
rowst3s <- match(stron3start, stron)
stron3end <- if (rowst3s < length(stron)) stron[rowst3s + 1] - 1 else length(argpaym)
dat3 <- as.matrix(argpaym[stron3start:stron3end])

# Combine the three sections and export them as one sheet.
dat <- rbind(dat1, dat2, dat3)
library(XLConnect)
writeWorksheetToFile(file = "z:/My Documents/Impendulo/Argentina.xlsx",
                     data = dat, sheet = "Payment Details")
Upvotes: 1
Views: 268
Reputation: 379
# Collect every country link from the worldwide tax-authorities index page.
url <- "http://oceantax.co.uk/links/tax-authorities-worldwide.html"
pg <- read_html(url)  # rvest::html() is deprecated; read_html() replaces it
# contains(@title, '') is true for any <a> that has a title attribute at all,
# so this selects every titled anchor on the page.
countries <- pg %>% html_nodes(xpath = "//a[contains(@title, '')]")
This returns 161 items; items 37:160 are the ones you want, each containing the info (including the link) for one country. Loop through them and process each in turn.
# Indices 37:160 are the per-country anchor nodes; process each in turn.
# FIX: the original body contained `do x`, which is not valid R syntax —
# replaced with a placeholder comment showing where the per-country logic goes.
for (n in 37:160) {
  print(countries[n])
  # TODO: per-country scraping logic, e.g. follow
  # html_attr(countries[n], "href") and extract the required details.
}
Upvotes: 2