Reputation: 193
I have looked at a few questions, but none of the answers seem to fit. I am building a web scraper tool as a personal project. I have figured out the loops to get rider data for the Vuelta 2022; however, I need to loop through all the URLs for each stage. For some reason, the URL loop is only taking the last number in the range. My gut feeling is that the formatting is the problem, so I have been playing around with that, but no luck.
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Scrape the top rows of the La Vuelta stage ranking pages and write them to data.csv.
# NOTE(review): indentation was lost in this listing, so the nesting of the two
# loops cannot be read from the text. The question reports that only the last
# stage in the range ends up in the output.
# Stage numbers 1 through 9 are substituted into the ranking URL.
for j in range (1,10):
url = (f"https://www.lavuelta.es/en/rankings/stage-{j}")
page = requests.get(url)
urlt = page.content
# No parser is named here, so BeautifulSoup chooses one itself.
soup = BeautifulSoup(urlt)
# Accumulates one (stage, rider, rank) tuple per table row read below.
rider_rank_list = []
# Table rows 1 through 10, addressed through tr:nth-child({i}).
for i in range (1,11):
# Rider link (td.runner.is-sticky > a) in row i of the ranking table.
results = soup.select_one(f"body > main > div > section.ranking.classements > div > div > div.js-tabs-wrapper.js-tabs-bigwrapper > div > div > div > div > div.js-spinner-wrapper > div > div.sticky-scroll > table > tbody > tr:nth-child({i}) > td.runner.is-sticky > a ")
# First cell (td:nth-child(1)) of row i; stored under the 'rank' column.
rrank = soup.select_one(f"body > main > div > section.ranking.classements > div > div > div.js-tabs-wrapper.js-tabs-bigwrapper > div > div > div > div > div.js-spinner-wrapper > div > div.sticky-scroll > table > tbody > tr:nth-child({i}) > td:nth-child(1)")
# Stage label from the last URL path segment, e.g. "stage-3" -> "Stage 3".
stage = str.replace(str.title(url.rsplit('/', 1)[-1]),'-',' ')
# NOTE(review): select_one returns None when nothing matches, in which case
# the .text accesses on this line raise AttributeError.
rider_rank_list.append((str(stage),str.strip(results.text), str.strip(rrank.text)))
print(rider_rank_list)
# Build the table from the collected tuples and save it without the index column.
df = pd.DataFrame(rider_rank_list, columns=['stage','rider','rank'], index=None)
print(df)
df.to_csv('data.csv', index=False)
Upvotes: 0
Views: 145
Reputation: 3987
Inspired by the other answers, here is a complete, simpler, and probably more readable solution to your question that produces the result in table format:
import pandas as pd
# Build a wide table with one column per stage, each holding the first ten
# entries of that stage's "Rider" column, and save it to data.csv.

# One single-column frame per stage. They are joined side by side once, after
# the loop, instead of re-concatenating the growing table on every iteration.
stage_frames = []

# The author of this answer notes that the stage rankings run from 2 to 18.
for i in range(2, 19):
    url = f"https://www.lavuelta.es/en/rankings/stage-{i}"
    # read_html returns a list of the tables found on the page; use the first.
    df = pd.read_html(url)[0]
    # Keep only the "Rider" column, limited to its first ten rows.
    df = df[["Rider"]].head(10)
    # Label the column with its stage number so the columns stay distinguishable.
    df.columns = [f"Stage - {i} - Rider"]
    stage_frames.append(df)

# axis=1 places the per-stage columns next to each other, aligned on row index.
al = pd.concat(stage_frames, axis=1)
al.to_csv('data.csv', index=False)
Upvotes: 0
Reputation: 46
Fixed the indentation, with small changes:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Scrape the top rows of each La Vuelta stage ranking page into one table of
# (stage, rider, rank) and write it to data.csv.

# Stage numbers to fetch and how many table rows to read from each page.
STAGE_NUMBERS = range(1, 10)
TOP_N = 10

# CSS path to a single row of the ranking table; {row} is the 1-based position
# passed to tr:nth-child(). Shared by the rider and rank lookups below.
ROW_SELECTOR = (
    "body > main > div > section.ranking.classements > div > div > "
    "div.js-tabs-wrapper.js-tabs-bigwrapper > div > div > div > div > "
    "div.js-spinner-wrapper > div > div.sticky-scroll > table > tbody > "
    "tr:nth-child({row})"
)

# Created once, before the stage loop, so rows from every stage accumulate
# instead of being reset on each iteration.
rider_rank_list = []

for j in STAGE_NUMBERS:
    url = f"https://www.lavuelta.es/en/rankings/stage-{j}"
    # A timeout keeps one unresponsive request from hanging the whole run.
    page = requests.get(url, timeout=30)
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(page.content, "html.parser")
    # "stage-3" -> "Stage 3"; identical for every row of this page, so it is
    # computed once per stage rather than once per row.
    stage = url.rsplit("/", 1)[-1].title().replace("-", " ")

    # The row loop is nested inside the stage loop so that every fetched page
    # is processed, not only the last one.
    for i in range(1, TOP_N + 1):
        row = ROW_SELECTOR.format(row=i)
        results = soup.select_one(f"{row} > td.runner.is-sticky > a")
        rrank = soup.select_one(f"{row} > td:nth-child(1)")
        # select_one returns None when nothing matches (for example a page
        # with fewer rows or no ranking table); skip such rows.
        if results is None or rrank is None:
            continue
        rider_rank_list.append((stage, results.text.strip(), rrank.text.strip()))

print(rider_rank_list)
df = pd.DataFrame(rider_rank_list, columns=['stage', 'rider', 'rank'], index=None)
print(df)
df.to_csv('data.csv', index=False)
Upvotes: 2