Reputation: 417
I've written this function for scraping lyrics:
#' Scrape the lyrics and album name of every song listed for an artist on
#' azlyrics.com.
#'
#' @param x Artist identifier as it appears in azlyrics URLs; its first
#'   character selects the index directory of the artist page.
#' @return A data.frame with one row per song found on the artist page and two
#'   character columns: `lyrics2` (markup of the lyrics node with tags
#'   stripped) and `albums2` (album text with tags and punctuation stripped).
#'   Both are `NA` for songs whose page could not be fetched.
songscrape <- function(x) {
  url <- paste0(
    "https://www.azlyrics.com/", substring(x, 1, 1), "/", x, ".html"
  )
  artist <- x

  # Read the artist page and return its song titles as a one-column data frame.
  SongsListScrapper <- function(page_url) {
    titles <- page_url %>%
      read_html() %>%
      html_nodes(xpath = "/html/body/div[2]/div/div[2]/div[4]/div/a") %>%
      html_text()
    data.frame(Songs = as.character(titles), stringsAsFactors = FALSE)
  }
  SongsList <- map_df(url, SongsListScrapper)

  # Song page names are the titles lower-cased with punctuation and spaces
  # removed.
  song_slugs <- gsub("[[:punct:]]", "", as.character(SongsList$Songs))
  song_slugs <- gsub(" ", "", tolower(song_slugs))

  # Strip anything that looks like an HTML tag.
  wipe_html <- function(str_html) {
    gsub("<.*?>", "", str_html)
  }

  # Preallocate with NA so a failed download leaves a missing value instead of
  # the previous song's data.
  n_songs <- length(song_slugs)
  lyrics2 <- rep(NA_character_, n_songs)
  albums2 <- rep(NA_character_, n_songs)

  for (i in seq_along(song_slugs)) {
    for_url_name <- song_slugs[i]
    paste_url <- paste0(
      "https://www.azlyrics.com/lyrics/", artist, "/", for_url_name, ".html"
    )

    # The handler must be an argument of tryCatch() (after the braced
    # expression); written inside the braces it is only a local assignment and
    # an HTTP 404 would abort the whole run.
    scraped <- tryCatch(
      {
        for_html_code <- read_html(paste_url)
        list(
          lyrics = wipe_html(html_node(
            for_html_code,
            xpath = "/html/body/div[2]/div/div[2]/div[5]"
          )),
          album = wipe_html(html_node(
            for_html_code,
            xpath = "/html/body/div[2]/div/div[2]/div[11]/div[1]/b"
          ))
        )
      },
      error = function(e) {
        message("Could not scrape ", paste_url, ": ", conditionMessage(e))
        list(lyrics = NA_character_, album = NA_character_)
      }
    )

    lyrics2[i] <- scraped$lyrics
    albums2[i] <- scraped$album

    show(paste0(for_url_name, " Scrape Complete!", "[", i, "/", n_songs, "]"))
    # Pause between requests to avoid hammering the site.
    Sys.sleep(10)
  }

  songs2 <- data.frame(
    lyrics2 = lyrics2,
    albums2 = albums2,
    stringsAsFactors = FALSE
  )
  songs2$albums2 <- gsub("[[:punct:]]", "", songs2$albums2)
  songs2
}
You will notice that I have used tryCatch()
in the code (also shown below) because I realized that in some edge cases the constructed URL does not exist, which stops the function midway:
# Fetch one song page and pick out the lyrics and album nodes.
tryCatch( {
#open connection to url
for_html_code <-read_html(paste_url)
for_lyrics <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[5]")
for_albums <- html_node(for_html_code, xpath = "/html/body/div[2]/div/div[2]/div[11]/div[1]/b")
# NOTE: this line is inside the braces, so it is an ordinary assignment to a
# local variable named `error`, not the `error =` handler argument of
# tryCatch(). This call therefore registers no handler, and an error raised by
# read_html() above propagates to the caller.
error = function(e){NA}
})
However, I still get this error and the code stops functioning, instead of ignoring the error and moving on:
Error in open.connection(x, "rb") : HTTP error 404.
In addition: Warning message:
In for (i in seq_len(n)) { :
What am I doing wrong and how can this be fixed?
Upvotes: 2
Views: 478
Reputation: 417
As mentioned by user @27ϕ9, tryCatch()
was not used correctly. The error handler has to be passed as a separate argument to tryCatch(), after the closing brace of the expression, not inside it:
# Default both results to NA first: when the request fails, the assignments
# inside the braces never run, and without this the variables would keep the
# values from the previous song (or not exist at all on the first one).
for_lyrics <- NA
for_albums <- NA
tryCatch(
  {
    # open connection to url
    for_html_code <- read_html(paste_url)
    for_lyrics <- html_node(
      for_html_code,
      xpath = "/html/body/div[2]/div/div[2]/div[5]"
    )
    for_albums <- html_node(
      for_html_code,
      xpath = "/html/body/div[2]/div/div[2]/div[11]/div[1]/b"
    )
  },
  # The handler is a separate argument of tryCatch(), placed after the closing
  # brace of the expression; it swallows the error so the loop can continue.
  error = function(e) NA
)
For more information, refer to this answer here.
Upvotes: 3