user1593995
user1593995

Reputation: 43

Combining a rvest scraping loop's outcome into a dataframe

All,

I am trying to scrape some information from the web using rvest by looping through a list of webpages and then combining the results from all pages into a single dataframe (each scraped page adding one new row).

My full code is:

library(rvest)
library(stringr)

    ####### Cleaning Functions ######

# Replace every missing value in `x` with 0 and return the modified vector.
na.zero <- function(x) {
  replace(x, is.na(x), 0)
}

#/ 

######### Team Scraping Code ############

# Download and parse the team page that lists the players.
TeamScrape <- read_html(
  "http://www.transfermarkt.com/jumplist/startseite/verein/2778"
)

# Club name lookup (currently disabled):
# ClubName <- html_text(html_nodes(TeamScrape, ".spielername-profil"))

# Read the href of every player-profile link on the team page.
PlayerURLs <- html_attr(
  html_nodes(TeamScrape, ".spielprofil_tooltip"),
  "href"
)

# Keep each link once, drop missing hrefs, and turn the site-relative paths
# into absolute URLs.
PlayerURLs <- paste0(
  "http://www.transfermarkt.com",
  na.omit(unique(PlayerURLs))
)

# The same URLs as a one-column data.frame.
PlayerLinks <- data.frame(PlayerURLs)


######### Player Scraping Code ############

# Scrape every player profile in `PlayerURLs`. The result is a list holding
# one single-row data.frame per URL.
lapply(PlayerURLs, FUN = function(URLLink) {
  # `URLLink` is the absolute URL of one player profile page. Every field
  # falls back to a placeholder (0 or "-") when its CSS selector matches
  # nothing, so each page always yields exactly one row.
  PlayerScrape <- read_html(URLLink)

  # Text content of every node on this page matching the CSS selector `css`.
  node_text <- function(css) {
    PlayerScrape %>%
      html_nodes(css) %>%
      html_text()
  }

  # First element of `x`, or `default` when `x` is empty. Only the first
  # match is kept so that every column of the row has length one.
  first_or <- function(x, default) {
    if (length(x) != 0) x[1] else default
  }

  # Remove tabs, carriage returns and newlines.
  strip_breaks <- function(x) {
    gsub("[\t\r\n]", "", x)
  }

  Name <- strip_breaks(first_or(node_text(".spielername-profil"), 0))

  DOB <- strip_breaks(first_or(node_text(".wsnw > span:nth-child(1)"), 0))

  Club <- first_or(
    node_text(".vereinprofil_tooltip+ .vereinprofil_tooltip"),
    "-"
  )

  Nationality <- first_or(node_text("#main .flaggenrahmen+ span"), "-")

  # The squad number is shown with a leading "#", removed before conversion.
  SquadNo <- as.numeric(gsub("#", "", node_text(".rueckennummer-profil")))
  SquadNo <- first_or(SquadNo, 0)

  Age <- first_or(as.numeric(node_text(".profilheader .hide-for-small td")), 0)

  Position <- first_or(node_text(".hauptposition-left"), "-")
  Position <- gsub("[\t\r]", "", Position)
  Position <- gsub("\n ", "", Position)
  Position <- gsub("Main position:", "", Position)
  Position <- str_trim(Position, side = "both")

  # Strip the decimal comma and the " m" unit suffix from the height text.
  Height <- node_text(".hide-for-small~ tr+ tr span")
  Height <- gsub(" m", "", gsub(",", "", Height))
  Height <- first_or(Height, 0)

  # na.zero() is defined earlier in this script; it turns NA into 0.
  Apps <- as.numeric(node_text(".hide.hide-for-small+ .zentriert"))
  Apps <- first_or(na.zero(Apps), 0)

  # Goals is not passed through na.zero(), so an unparseable value stays NA.
  Goals <- as.numeric(node_text("#yw1 tfoot .zentriert:nth-child(4)"))
  Goals <- first_or(Goals, 0)

  Assists <- as.numeric(node_text("tfoot .zentriert:nth-child(5)"))
  Assists <- first_or(na.zero(Assists), 0)

  Minutes <- first_or(node_text("tfoot .zentriert:nth-child(7)"), 0)

  Value <- first_or(node_text(".marktwert a"), "-")

  # NOTE(review): ContractExp and ProfileID are computed but are not part of
  # the returned row; confirm whether they should become columns.
  ContractExp <- first_or(node_text(".profilheader tr:nth-child(6) td"), "-")
  ContractExp <- strip_breaks(ContractExp)

  PlayerURLLink <- URLLink
  ProfileID <- str_extract_all(PlayerURLLink, "\\(?[0-9,]+\\)?")

  # Column names are taken from the variable names.
  data.frame(
    Name, Club, Nationality, Position, Height, SquadNo, Value, DOB, Age,
    Apps, Minutes, Goals, Assists, PlayerURLLink
  )
})

In the console I get the scraped data, but it's not being combined into a single dataframe with each record as a new row. All help and advice is much appreciated!

              Name             Club Nationality Position Height
1 David Jensen FC Nordsjaelland     Denmark        -    195
  SquadNo     Value          DOB Age Apps Minutes Goals
1       1 500 Th. € Mar 25, 1992  23   18       0    27
  Assists
1       5
                                                   PlayerURLLink
1 http://www.transfermarkt.com/david-jensen/profil/spieler/78819

[[2]]
                  Name             Club Nationality Position
1 Rúnar Alex Rúnarsson FC Nordsjaelland     Iceland        -
  Height SquadNo     Value          DOB Age Apps Minutes
1      0      16 100 Th. € Feb 18, 1995  20    1       0
  Goals Assists
1     1       0
                                                            PlayerURLLink
1 http://www.transfermarkt.com/runar-alex-runarsson/profil/spieler/205657

[[3]]
                  Name                 Club Nationality
1 Peter Vindahl Jensen FC Nordsjaelland U19     Denmark
  Position Height SquadNo    Value          DOB Age Apps
1        -      0       0 50 Th. € Feb 16, 1998  17    0
  Minutes Goals Assists
1       0     0       0
                                                            PlayerURLLink
1 http://www.transfermarkt.com/peter-vindahl-jensen/profil/spieler/395421

[[4]]
           Name             Club Nationality    Position
1 Pascal Gregor FC Nordsjaelland     Denmark Centre Back
  Height SquadNo     Value          DOB Age Apps Minutes
1      0       3 700 Th. € Feb 18, 1994  21   17   1.530
  Goals Assists
1    NA       0

Upvotes: 0

Views: 953

Answers (1)

tospig
tospig

Reputation: 8333

After fixing the missing variable, your only real issue is that you're not assigning the result of your `lapply` to anything; it's just printing to the console.

Using the construct

library(rvest)
library(stringr)

# Keep the list returned by lapply instead of letting it print to the
# console.
lst_scraped_data <- lapply(PlayerURLs, FUN = function(URLLink) {

    ## --------------------------
    ## all the function code here
    ## --------------------------

})

# Stack the list elements row-wise into a single data.frame.
df <- do.call(rbind, lst_scraped_data)

gives you your data.frame

Upvotes: 1

Related Questions