Reputation: 25
I am writing a web-scraper in R to extract details about tax authorities. So far I have this code, which works perfectly for Argentina; however, it requires each URL to be typed in manually. The site doesn't have a sitemap or a robots.txt, so is there another way to locate the appropriate URLs automatically? Thanks in advance.
library(RCurl)
library(XML)
library(rvest)

# INFORMATION ----
# Scrape the Argentina entry (name + link) from the worldwide tax-authority
# index, then extract the telephone and e-mail sections of the AFIP contact
# page. Each section runs from its <strong> heading down to the line before
# the next <strong> heading.
url <- "http://oceantax.co.uk/links/tax-authorities-worldwide.html"
pg <- read_html(url)  # rvest::html() is deprecated; read_html() replaces it

# The original assigned `country` twice (XPath, then CSS) and only the second
# result survived; a single CSS selector is sufficient.
country <- pg %>% html_nodes("a[title~=Argentina]")
name <- as.matrix(country %>% html_text())
url <- as.matrix(country %>% html_attr("href"))

arginfo <- readLines("http://www.afip.gob.ar/contacto/")
# Line numbers of every <strong> heading, kept as a plain integer vector so
# the positional lookups below stay simple.
strong <- grep("<strong>", arginfo)

# Telephone section.
# BUG FIX: the original used which(grepl(strongphone, strong)), which treats
# the line number as a regex pattern — e.g. 12 would also match lines 112 or
# 120. match() locates the exact position instead. (Assumes the heading
# occurs exactly once on the page, as the original code did.)
strongphone <- grep("<strong> Centro de Información Telefónica</strong>", arginfo)
rowphone <- match(strongphone, strong)
# Guard: if this is the last heading, read through to end-of-file instead of
# indexing past the end of `strong`.
strongphonend <- if (rowphone < length(strong)) strong[rowphone + 1] - 1 else length(arginfo)
phone <- as.matrix(arginfo[strongphone:strongphonend])

# Email section: same heading-to-next-heading slice.
strongemail <- grep("<strong>Tema:", arginfo)
rowemail <- match(strongemail, strong)
strongemailend <- if (rowemail < length(strong)) strong[rowemail + 1] - 1 else length(arginfo)
email <- as.matrix(arginfo[strongemail:strongemailend])

# Stack everything into one matrix and write it to the "Information" sheet.
info <- rbind(name, url, phone, email)
library(XLConnect)
writeWorksheetToFile(file = "z:/My Documents/Impendulo/Argentina.xlsx",
                     data = info, sheet = "Information")
# TAX DETAILS ----
# Pull the two "Operaciones de Seguro" sections from the Argentine tax-system
# page; each section runs from its <strong> heading to the line before the
# next <strong> heading.
argtax <- readLines("http://www.afip.gob.ar/futCont/otros/sistemaTributarioArgentino")
# Line numbers of all <strong> headings (plain integer vector).
strong <- grep("<strong>", argtax)

# First heading ("...Seguro.</strong>"). fixed = TRUE so the literal dot is
# not treated as the regex "any character" metacharacter, which could match
# unintended headings.
strong1starts <- grep("<strong>Operaciones de Seguro.</strong>", argtax, fixed = TRUE)
# BUG FIX: the original used which(grepl(strong1starts, strong)), which treats
# the line number as a regex — e.g. 12 also matches 112 or 120. match() finds
# the exact position. (Assumes the heading occurs once, as the original did.)
rowst1st <- match(strong1starts, strong)
# Guard against this being the last heading on the page.
strong1ends <- if (rowst1st < length(strong)) strong[rowst1st + 1] - 1 else length(argtax)
data1 <- as.matrix(argtax[strong1starts:strong1ends])

# Second heading (no trailing dot), same slicing logic.
strong2starts <- grep("<strong>Operaciones de Seguro</strong>", argtax, fixed = TRUE)
rowst2st <- match(strong2starts, strong)
strong2ends <- if (rowst2st < length(strong)) strong[rowst2st + 1] - 1 else length(argtax)
data2 <- as.matrix(argtax[strong2starts:strong2ends])

# Combine both sections so they can be exported to Excel in one piece.
data <- rbind(data1, data2)
library(XLConnect)
writeWorksheetToFile(file = "z:/My Documents/Impendulo/Argentina.xlsx",
                     data = data, sheet = "Tax Details")
# PAYMENT DETAILS ----
# Extract three subsections (marked by the "contenidoSubTitle" CSS class)
# from the same AFIP page; each runs from its heading to the line before the
# next "contenidoSubTitle" heading.
argpaym <- readLines("http://www.afip.gob.ar/futCont/otros/sistemaTributarioArgentino")
# Line numbers of every subtitle heading (plain integer vector).
stron <- grep("contenidoSubTitle", argpaym)

# Determination and Collection of Taxes.
stron1start <- grep('"contenidoSubTitle">Determinación y Percepción de Impuestos', argpaym)
# BUG FIX: the original used which(grepl(stron1start, stron)), which treats
# the line number as a regex — e.g. 12 also matches 112 or 120. match() gives
# the exact position. (Assumes each heading occurs once, as the original did.)
rowst1s <- match(stron1start, stron)
# Guard against the heading being the last one on the page.
stron1end <- if (rowst1s < length(stron)) stron[rowst1s + 1] - 1 else length(argpaym)
dat1 <- as.matrix(argpaym[stron1start:stron1end])

# Interest, Illicit Acts and Penalties.
stron2start <- grep('"contenidoSubTitle">Interés, ilícitos y sanciones', argpaym)
rowst2s <- match(stron2start, stron)
stron2end <- if (rowst2s < length(stron)) stron[rowst2s + 1] - 1 else length(argpaym)
dat2 <- as.matrix(argpaym[stron2start:stron2end])

# Prescription (statute of limitations).
stron3start <- grep('"contenidoSubTitle">Prescripción', argpaym)
rowst3s <- match(stron3start, stron)
stron3end <- if (rowst3s < length(stron)) stron[rowst3s + 1] - 1 else length(argpaym)
dat3 <- as.matrix(argpaym[stron3start:stron3end])

# Combine the three sections and export them as one sheet.
dat <- rbind(dat1, dat2, dat3)
library(XLConnect)
writeWorksheetToFile(file = "z:/My Documents/Impendulo/Argentina.xlsx",
                     data = dat, sheet = "Payment Details")
Upvotes: 1
Views: 268
Reputation: 379
# Collect every country link from the worldwide tax-authorities index page.
url <- "http://oceantax.co.uk/links/tax-authorities-worldwide.html"
pg <- read_html(url)  # rvest::html() is deprecated; read_html() replaces it
# contains(@title, '') is true for any <a> that has a title attribute at all,
# so this selects every titled anchor on the page.
countries <- pg %>% html_nodes(xpath = "//a[contains(@title, '')]")
This returns 161 items; items 37:160 are the ones you want, each containing the info (including the link) for one country. Loop through them and process each in turn.
# Indices 37:160 are the per-country anchor nodes; process each in turn.
# FIX: the original body contained `do x`, which is not valid R syntax —
# replaced with a placeholder comment showing where the per-country logic goes.
for (n in 37:160) {
  print(countries[n])
  # TODO: per-country scraping logic, e.g. follow
  # html_attr(countries[n], "href") and extract the required details.
}
Upvotes: 2