Reputation: 645
I am trying to scrape the website listed below. I tried to do this using rvest
with the code below.
My attempt was to try to replicate the PUT
that I found in Google Chrome for the Download button. I'm not sure what I'm doing wrong. I am getting the error listed in my reprex
.
library(httr)
library(rvest)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Reprex: open a session on the ADP page, fill in the second form found on it,
# and try to submit that form through the session.
url <- "https://nfc.shgn.com/adp/baseball"
pgsession <- session(url)
# html_form() returns every form on the page; the second one is used here.
pgform <- html_form(pgsession)[[2]]
# Set the form's field values; the warnings below show these are hidden fields.
filled_form <- html_form_set(pgform,
team_id = "0", from_date = "2020-10-01", to_date = "2021-02-19", num_teams = "0",
draft_type = "0", sport = "baseball", position = "",
league_teams = "0" )
#> Warning: Setting value of hidden field 'team_id'.
#> Warning: Setting value of hidden field 'from_date'.
#> Warning: Setting value of hidden field 'to_date'.
#> Warning: Setting value of hidden field 'num_teams'.
#> Warning: Setting value of hidden field 'draft_type'.
#> Warning: Setting value of hidden field 'sport'.
#> Warning: Setting value of hidden field 'position'.
#> Warning: Setting value of hidden field 'league_teams'.
# This is the failing call: the submission is rejected because the form
# carries no `action` (see the error message below).
session_submit(x = pgsession, form = filled_form)
#> Error: `form` doesn't contain a `action` attribute
Upvotes: 2
Views: 944
Reputation: 18425
If you just want to scrape that table, you can do it easily with rvest
and purrr
by using the URL that the "Print" button takes you to.
Although you can't use html_table
, it is straightforward to extract the cells as a dataframe using purrr::map_df
:
library(rvest)
library(dplyr)
library(purrr)
library(stringr)
# Column labels applied, in order, to the cells of every table row.
adp_cols <- c(
  "Rank", "Player", "Team", "Position", "ADP", "MinPick",
  "MaxPick", "Diff", "Picks", "Team2", "PickBid"
)

# Convert one <tr> node into a named character vector: take its <td> cells,
# extract their text, trim surrounding whitespace, and label each value.
row_to_cells <- function(row) {
  cells <- html_nodes(row, "td")
  set_names(str_trim(html_text(cells)), adp_cols)
}

# The "Print" button on the ADP page leads to this URL.
print_page <- read_html("https://nfc.shgn.com/adp.data.php")

# One node per table row; map_df() row-binds the per-row vectors into a tibble.
row_nodes <- html_nodes(print_page, "tr")
pgtab <- map_df(row_nodes, row_to_cells)

head(pgtab)
# A tibble: 6 x 11
Rank Player Team Position ADP MinPick MaxPick Diff Picks Team2 PickBid
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 Ronald Acuna Jr. ATL OF 1.69 1 6 "" 332 "" ""
2 2 Fernando Tatis Jr. SD SS 2.57 1 7 "" 332 "" ""
3 3 Mookie Betts LAD OF 3.53 1 9 "" 332 "" ""
4 4 Juan Soto WAS OF 3.98 1 10 "" 332 "" ""
5 5 Mike Trout LAA OF 6.08 1 11 "" 332 "" ""
6 6 Gerrit Cole NYY P 6.50 1 15 "" 332 "" ""
You can also set the form parameters and do this, although you'll have to check whether it makes a difference. Here is one way...
# Alternative: submit the page's own filter form and parse the result.
#
# Written against the rvest >= 1.0.0 API, where html_session(), set_values()
# and submit_form() are deprecated in favour of session(), html_form_set()
# and session_submit().
url <- "https://nfc.shgn.com/adp/baseball"
pgsession <- session(url)
pgform <- html_form(pgsession)[[2]] # second form on the page

# These are hidden fields, so html_form_set() warns when overriding them.
filled_form <- html_form_set(
  pgform,
  team_id = "0", from_date = "2020-10-01", to_date = "2021-02-19",
  num_teams = "0", draft_type = "0", sport = "baseball", position = "",
  league_teams = "0"
)

# The form has no submit target of its own, so session_submit() fails with
# "`form` doesn't contain a `action` attribute" unless one is supplied.
# In rvest >= 1.0.0 the target lives in `$action` (it was `$url` before 1.0).
filled_form$action <- "https://nfc.shgn.com/adp.data.php"

pgsession <- session_submit(pgsession, filled_form, submit = "printerFriendly")

# Same parsing as for the print page: one named character vector per <tr>,
# row-bound into a tibble by map_df().
pgtab <- pgsession %>%
  read_html() %>%
  html_nodes("tr") %>%
  map_df(~ html_nodes(.x, "td") %>%
    html_text() %>%
    str_trim() %>%
    set_names(
      "Rank", "Player", "Team", "Position", "ADP", "MinPick",
      "MaxPick", "Diff", "Picks", "Team2", "PickBid"
    ))
Upvotes: 4
Reputation: 27762
Here is a possible solution using RSelenium
for downloading the .tsv file to a given folder.
After that, easy peasy...
library( RSelenium )
library( rvest )
library( xml2 )
library( data.table )
# Download the ADP table as a .tsv file through a Selenium-driven Firefox,
# then read it with data.table.

# set up download file + location (USERPROFILE is the Windows home directory)
filename <- "ADP.tsv"
download_location <- file.path(Sys.getenv("USERPROFILE"), "Downloads")
download_path <- file.path(download_location, filename)

# create extra capabilities, so the browser (firefox) does not display a
# save-as dialog when downloading the tsv file
eCaps <- makeFirefoxProfile(list(
  "browser.download.dir" = download_location,
  "browser.download.folderList" = 2,
  "browser.helperApps.neverAsk.saveToDisk" = "text/tab-separated-values",
  "browser.download.manager.showWhenStarting" = FALSE
))

# set up driver (using the firefox profile created before), client and server
driver <- rsDriver(
  browser = "firefox", port = 4545L, extraCapabilities = eCaps,
  verbose = FALSE
)
server <- driver$server
browser <- driver$client

# go to url in browser
browser$navigate("https://nfc.shgn.com/adp/baseball")

# Poll for the download button until it appears (site loaded) or 30 seconds
# have passed; pausing between attempts avoids hammering the driver, and the
# deadline prevents an endless loop if the button never shows up.
button_dl <- list()
deadline <- Sys.time() + 30
while (length(button_dl) == 0 && Sys.time() < deadline) {
  button_dl <- browser$findElements(using = "name", "download")
  if (length(button_dl) == 0) {
    Sys.sleep(0.5)
  }
}

if (length(button_dl) > 0) {
  # click the button and give the download time to complete
  button_dl[[1]]$clickElement()
  Sys.sleep(5)
}

# close everything down properly, whether or not the download succeeded
browser$close()
server$stop()

# fail with a clear message instead of a later "object 'DT' not found"
if (length(button_dl) == 0) {
  stop("Download button not found on the page within 30 seconds.",
    call. = FALSE
  )
}
if (!file.exists(download_path)) {
  stop("Downloaded file not found: ", download_path, call. = FALSE)
}

# load the file
DT <- data.table::fread(download_path)
head(DT)
# Rank Player Team Position(s) ADP Min Pick Max Pick Difference # Picks Team Team Pick
# 1: 1 Acuna Jr., Ronald ATL OF 1.68 1 6 NA 323 NA NA
# 2: 2 Tatis Jr., Fernando SD SS 2.58 1 7 NA 323 NA NA
# 3: 3 Betts, Mookie LAD OF 3.50 1 9 NA 323 NA NA
# 4: 4 Soto, Juan WAS OF 3.98 1 10 NA 323 NA NA
# 5: 5 Trout, Mike LAA OF 6.06 1 11 NA 323 NA NA
# 6: 6 Cole, Gerrit NYY P 6.52 1 15 NA 323 NA NA
Upvotes: 3