Mauricio Romero
Mauricio Romero

Reputation: 77

webscraping information from an online map

I'm trying to webscrape the information from the map in this webpage http://fhi360odk.org/kdesktoplb_2/

The webpage has information on, and the location of, every public school in Liberia. Basically, I want each school's location plus the information that shows up when you click on a particular school.

I have done webscraping with R before, but from the source-code in the webpage I can't figure out where the school location is.

Any help would be greatly appreciated.

Upvotes: 0

Views: 469

Answers (1)

Mauricio Romero
Mauricio Romero

Reputation: 77

I was able to figure it out. The issue is that the actual school data is in a frame that has different source code from the main page!

Below is my code

# Package setup for the scraper.
#
# The workspace is deliberately not cleared with rm(list = ls()): every
# object the script uses is assigned before it is read, so wiping the
# caller's environment gains nothing and can destroy unrelated work.
#
# library() is used instead of require() so that a missing package stops
# the script here, rather than failing later at the first call into it.
# install.packages("RCurl")
# install.packages("XML")
library(RCurl)    # getURL()
library(XML)      # readHTMLTable()
library(stringr)  # NOTE(review): not used in the code visible here
library(foreign)  # write.dta()


# Download the source of the frame that holds the map; the school markers
# are embedded in it as addMarker(...) calls.
sources <- "http://fhi360odk.org/kdesktopLB_2/EarthMapping.php#"
appUrl <- getURL(sources)

# Character offsets of every "addMarker" occurrence, minus the first one
# (presumably the function definition rather than a school - TODO confirm).
Markers <- gregexpr("addMarker", appUrl)[[1]][-1]

# Add a synthetic end offset so the last marker also has an upper bound.
# NOTE(review): 184 looks like an assumed length for the final call -
# confirm against the page source.
Markers <- append(Markers, tail(Markers, 1) + 184)


# Template for the scraped results: a single all-NA row that fixes the 24
# column names and their order. Scraped rows are appended to it positionally,
# and the placeholder row is dropped once scraping is finished.
Escuelas <- data.frame(
  # Location and identification
  Lat = NA, Long = NA, Code = NA, Date = NA, EMIS = NA, Name = NA,
  Status = NA, County = NA, District = NA, Address = NA, Mobile = NA,
  Ownership = NA,
  # Enrolment
  MaleStudents = NA, FemaleStudents = NA, TotalStudents = NA,
  # Classrooms by construction type
  PermanentClassrooms = NA, SemiPermanentClassrooms = NA,
  RoofOnlyClassrooms = NA, TentClassrooms = NA, OpenAirClassrooms = NA,
  OtherClassrooms = NA,
  # Teaching staff
  MaleTeacher = NA, FemaleTeachers = NA, TotalTeachers = NA,
  stringsAsFactors = FALSE
)


# Scrape one row per school marker and append the rows to `Escuelas`.
#
# Reads:  appUrl   - source of the map frame
#         Markers  - start offsets of the addMarker calls plus one end offset
#         Escuelas - one-row template that defines the columns
# Writes: Escuelas - template row removed, one character row per school;
#         also saved to disk as the raw checkpoint.
#
# Each row is assembled positionally as: Lat, Long, Code, then the values
# pulled from the four HTML tables of the school's detail page.

# seq_len() keeps the loop from running when no markers were found; the
# expression 1:(length(Markers) - 1) would iterate over 1, 0, -1 instead.
n_schools <- max(length(Markers) - 1L, 0L)

# Collect rows in a pre-sized list and bind them once after the loop, so the
# data frame is not re-copied on every iteration.
filas <- vector("list", n_schools)

for (i in seq_len(n_schools)) {
  # for (i in 1:10) {
  # Text of the i-th marker call, ending just before the next one starts.
  string <- substr(appUrl, Markers[i], Markers[i + 1] - 2)

  # The first two decimal numbers in the call are taken as latitude and
  # longitude. The optional leading "-" keeps the sign of negative values.
  matches <- gregexpr("-?\\d+\\.\\d+", string)
  Location <- regmatches(string, matches)
  Lat <- as.numeric(Location[[1]][1])
  Long <- as.numeric(Location[[1]][2])

  # School code: use the first "cod=<digits>" match explicitly. regmatches()
  # returns a list, so it must be indexed before the prefix is stripped.
  matches <- gregexpr("cod=\\d+", string)
  Codigo <- regmatches(string, matches)
  codigo_txt <- sub("cod=", "", Codigo[[1]][1], fixed = TRUE)
  Codigo <- as.numeric(codigo_txt)

  # Build the URL from the digit string, not the number, so that a code such
  # as 100000 is never rendered in scientific notation ("1e+05").
  InfoColegio <- getURL(paste0(
    "http://fhi360odk.org/kdesktopLB_2/ident_gis.php?db=liberia_gis_v4&cod=",
    codigo_txt,
    "&nivel=null"
  ))

  # The page is parsed twice because its tables are read both ways below:
  # tables 1 and 3 without a header row (second column is used), tables 2
  # and 4 with a header row (all cells are used).
  Tabla <- readHTMLTable(InfoColegio, header = FALSE)
  Tabla2 <- readHTMLTable(InfoColegio, header = TRUE)

  filas[[i]] <- as.character(c(
    Lat,
    Long,
    Codigo,
    as.character(Tabla[[1]][, 2]),
    as.numeric(as.character(unlist(Tabla2[[2]]))),
    as.numeric(as.character(Tabla[[3]][, 2])),
    as.numeric(as.character(unlist(Tabla2[[4]])))
  ))
}

# One rbind() call: each character vector becomes one row of the template.
Escuelas <- do.call(rbind, c(list(Escuelas), filas))

# Drop the all-NA placeholder row that only existed to define the columns.
Escuelas <- Escuelas[-1, ]

# Checkpoint the raw (all-character) result before any type conversion.
save(Escuelas, file = "C:/Users/Mauricio/Dropbox/Liberia/EMIS2015_RAW.Rdata")

# Convert the scraped columns to their working types.

# Columns that hold numbers: coordinates, identifiers and all counts.
numeric_cols <- c(
  "Lat", "Long", "Code", "EMIS",
  "MaleStudents", "FemaleStudents", "TotalStudents",
  "PermanentClassrooms", "SemiPermanentClassrooms", "RoofOnlyClassrooms",
  "TentClassrooms", "OpenAirClassrooms", "OtherClassrooms",
  "MaleTeacher", "FemaleTeachers", "TotalTeachers"
)
Escuelas[numeric_cols] <- lapply(Escuelas[numeric_cols], as.numeric)

# Dates are parsed as "month day year" separated by spaces.
# NOTE(review): confirm this matches the format the site actually returns;
# values that do not match become NA.
Escuelas[["Date"]] <- as.Date(Escuelas[["Date"]], format = "%m %d %Y")

# Treat empty strings in the descriptive text columns as missing values.
for (text_col in c("Name", "Status", "County", "District",
                   "Address", "Mobile", "Ownership")) {
  is_blank <- Escuelas[[text_col]] == ""
  Escuelas[[text_col]][is_blank] <- NA
}



# Write the cleaned data set in two formats: an R workspace image holding
# the `Escuelas` object, and a Stata .dta file.
save(list = "Escuelas", file = "C:/Users/Mauricio/Dropbox/Liberia/EMIS2015.Rdata")
foreign::write.dta(dataframe = Escuelas, file = "C:/Users/Mauricio/Dropbox/Liberia/EMIS2015.dta")

Upvotes: 1

Related Questions