Reputation: 43
All,
I am trying to scrape some information from the web using rvest by looping through a list of webpages and then combine the results from all pages into a dataframe (each page scrapped adding to a new row).
My full code is :
library(rvest)
library(stringr)
####### Cleaning Functions ######
na.zero <- function (x) {
x[is.na(x)] <-0
return(x)
}
#/
######### Team Scrapping Code ############
TeamScrape <- read_html("http://www.transfermarkt.com/jumplist/startseite/verein/2778")
#// Get Club Name
#ClubName <- TeamScrape %>%
#html_nodes(".spielername-profil") %>%
#html_text()
#// Get All Player URLs
PlayerURLs <- TeamScrape %>%
html_nodes(".spielprofil_tooltip") %>%
html_attr("href")
PlayerURLs <- unique(PlayerURLs)
PlayerURLs <- na.omit(PlayerURLs)
PlayerURLs <- paste0("http://www.transfermarkt.com", PlayerURLs)
PlayerLinks = data.frame(PlayerURLs)
######### Player Scrapping Code ############
lapply(PlayerURLs, FUN=function(URLLink){
PlayerScrape <- read_html(URLLink)
Name <- PlayerScrape %>%
html_nodes(".spielername-profil") %>%
html_text() %>%
as.character()
Name <-ifelse(length(nchar(Name)) !=0 , Name, 0)
Name <- gsub("\t", "", Name)
Name <- gsub("\r", "", Name)
Name <- gsub("\n", "", Name)
DOB <- PlayerScrape %>%
html_nodes(".wsnw > span:nth-child(1)") %>%
html_text() %>%
as.character()
DOB <-ifelse(length(nchar(DOB)) !=0 , DOB, 0)
DOB <- gsub("\t", "", DOB)
DOB <- gsub("\r", "", DOB)
DOB <- gsub("\n", "", DOB)
Club <- PlayerScrape %>%
html_nodes(".vereinprofil_tooltip+ .vereinprofil_tooltip") %>%
html_text() %>%
as.character()
Club <-ifelse(length(nchar(Club)) !=0 , Club, "-")
Nationality <- PlayerScrape %>%
html_nodes("#main .flaggenrahmen+ span") %>%
html_text() %>%
as.character()
Nationality <-ifelse(length(nchar(Nationality)) !=0 , Nationality, "-")
SquadNo <- PlayerScrape %>%
html_nodes(".rueckennummer-profil") %>%
html_text() %>%
as.character()
SquadNo <- gsub("#", "", SquadNo) %>%
as.numeric()
SquadNo <-ifelse(length(nchar(SquadNo)) !=0 , SquadNo, 0)
Age <- PlayerScrape %>%
html_nodes(".profilheader .hide-for-small td") %>%
html_text() %>%
as.numeric()
Age <-ifelse(length(nchar(Age)) !=0 , Age, 0)
Position <- PlayerScrape %>%
html_nodes(".hauptposition-left") %>%
html_text() %>%
as.character()
Position <-ifelse(length(nchar(Position)) !=0 , Position, "-")
Position <- gsub("\t", "", Position)
Position <- gsub("\r", "", Position)
Position <- gsub("\n ", "", Position)
Position <- gsub("Main position:", "", Position)
Position <- str_trim(Position, side = c("both"))
Height <- PlayerScrape %>%
html_nodes(".hide-for-small~ tr+ tr span") %>%
html_text() %>%
as.character()
Height <- gsub(",", "", Height)
Height <- gsub(" m", "", Height)
Height <-ifelse(length(nchar(Height)) !=0 , Height, 0)
Apps <- PlayerScrape %>%
html_nodes(".hide.hide-for-small+ .zentriert") %>%
html_text() %>%
as.numeric()
Apps <- na.zero(Apps)
Apps <-ifelse(length(nchar(Apps)) !=0 , Apps, 0)
Goals <- PlayerScrape %>%
html_nodes("#yw1 tfoot .zentriert:nth-child(4)") %>%
html_text() %>%
as.numeric()
#Goals <- na.zero(Goals)
#if(is.null(Goals)){Goals <-0}
Goals <-ifelse(length(nchar(Goals)) !=0 , Goals, 0)
Assists <- PlayerScrape %>%
html_nodes("tfoot .zentriert:nth-child(5)") %>%
html_text() %>%
as.numeric()
Assists <- na.zero(Assists)
Assists <-ifelse(length(nchar(Assists)) !=0 , Assists, 0)
Minutes <- PlayerScrape %>%
html_nodes("tfoot .zentriert:nth-child(7)") %>%
html_text()
Minutes <-ifelse(length(nchar(Minutes)) !=0 , Minutes, 0)
Value <- PlayerScrape %>%
html_nodes(".marktwert a") %>%
html_text()
Value <-ifelse(length(nchar(Value)) !=0 , Value, "-")
ContractExp <- PlayerScrapprinte %>%
html_nodes(".profilheader tr:nth-child(6) td") %>%
html_text() %>%
as.character()
ContractExp <-ifelse(length(nchar(ContractExp)) !=0 , ContractExp, "-")
ContractExp <- gsub("\t", "", ContractExp)
ContractExp <- gsub("\r", "", ContractExp)
ContractExp <- gsub("\n", "", ContractExp)
PlayerURLLink <- URLLink
ProfileID <- PlayerURLLink
ProfileID <- str_extract_all(ProfileID,"\\(?[0-9,]+\\)?")
Checker = data.frame(Name, Club, Nationality, Position, Height, SquadNo, Value, DOB, Age, Apps, Minutes, Goals, Assists, PlayerURLLink)
})
In the console I get the scraped data but its not combining into a dataframe with all records as new rows. All help and advice very appreciated!
Name Club Nationality Position Height
1 David Jensen FC Nordsjaelland Denmark - 195
SquadNo Value DOB Age Apps Minutes Goals
1 1 500 Th. € Mar 25, 1992 23 18 0 27
Assists
1 5
PlayerURLLink
1 http://www.transfermarkt.com/david-jensen/profil/spieler/78819
[[2]]
Name Club Nationality Position
1 Rúnar Alex Rúnarsson FC Nordsjaelland Iceland -
Height SquadNo Value DOB Age Apps Minutes
1 0 16 100 Th. € Feb 18, 1995 20 1 0
Goals Assists
1 1 0
PlayerURLLink
1 http://www.transfermarkt.com/runar-alex-runarsson/profil/spieler/205657
[[3]]
Name Club Nationality
1 Peter Vindahl Jensen FC Nordsjaelland U19 Denmark
Position Height SquadNo Value DOB Age Apps
1 - 0 0 50 Th. € Feb 16, 1998 17 0
Minutes Goals Assists
1 0 0 0
PlayerURLLink
1 http://www.transfermarkt.com/peter-vindahl-jensen/profil/spieler/395421
[[4]]
Name Club Nationality Position
1 Pascal Gregor FC Nordsjaelland Denmark Centre Back
Height SquadNo Value DOB Age Apps Minutes
1 0 3 700 Th. € Feb 18, 1994 21 17 1.530
Goals Assists
1 NA 0
Upvotes: 0
Views: 953
Reputation: 8333
After fixing the missing variable, your only real issue is that you're not assigning your lapply
to anything, it's just printing to the console.
Using the construct
library(rvest)
library(stringr)
lst_scraped_data <- lapply(PlayerURLs, FUN=function(URLLink){
## --------------------------
## all the function code here
## --------------------------
}
df <- do.call(rbind, lst_scraped_data)
gives you your data.frame
Upvotes: 1