Reputation: 39
I am new to scraping and new to Python, but I have a script that goes to WhoScored.com, pulls some data for a specific league, and exports it to a .csv file. Here is the code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import pandas as pd
import time
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(executable_path ="C:\Program Files (x86)\Google\Chrome\chromedriver.exe")
# Choose any league, click on Player Statistics, and copy that URL here. E.g. for Premier League data we need the following URL
website_URL ="https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/7811/Stages/17590/PlayerStatistics/England-Premier-League-2019-2020"
driver.get(website_URL)
page = 1
# Check the number of pages of data available for that league. For the Premier League it's 32, so we set max_page to 32
max_page=32
while True:
    try:
        if page > max_page:
            print("Last page reached")
            break
        page += 1
        for i in driver.find_elements_by_xpath("""//*[@id="player-table-statistics-body"]"""):
            p_db = i.get_attribute('innerHTML')
            p_db = '<table>' + p_db + '</table>'
            df = pd.read_html(p_db)[0]
            df.drop(df.columns[1], axis=1)
            df.to_csv('premier_league_ws.csv', mode='a', header=False, index=False)
        driver.find_element_by_link_text("next").click()
        time.sleep(5)
        print("Navigating to Next Page")
    except (TimeoutException, WebDriverException) as e:
        print("Last page reached")
        break
driver.quit()
So the Chrome driver goes to https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/7811/Stages/17590/PlayerStatistics/England-Premier-League-2019-2020 and pulls all the data from the Summary tab, but only for the first page of players.
Can you help me get to the sub-tabs (Defensive/Offensive/Detailed) next to Summary? There is no direct link to them, and I need to pull all that information as well.
The script also stops after the first 10 players and does not go to the next page. How do I fix this?
Thank you!
Upvotes: 0
Views: 121
Reputation: 101
This should fix your problem, but remember that every time you run this script it will accept the cookie banner on the site.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import pandas as pd
import time
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
# Pass the path to chromedriver.exe (as a raw string) and the options to the driver
driver = webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\chromedriver.exe", options=options)
# Choose any league, click on Player Statistics, and copy that URL here. E.g. for Premier League data we need the following URL
website_URL = "https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/7811/Stages/17590/PlayerStatistics/England-Premier-League-2019-2020"
driver.get(website_URL)
page = 1
# Check the number of pages of data available for that league. For the Premier League it's 32, so we set max_page to 32
max_page = 32
# Finding the Cookie button and accepting it
time.sleep(2)
driver.find_element_by_xpath("""//*[@id="qcCmpButtons"]/button[2]""").click()
# Scrolling down 500 px
time.sleep(3)
driver.execute_script("window.scrollTo(0, 500)")
while True:
    try:
        if page > max_page:
            print("Last page reached")
            break
        page += 1
        # Grab the statistics table body and parse it with pandas
        for i in driver.find_elements_by_xpath("""//*[@id="player-table-statistics-body"]"""):
            p_db = i.get_attribute('innerHTML')
            p_db = '<table>' + p_db + '</table>'
            df = pd.read_html(p_db)[0]
            # Assign the result back, otherwise drop() has no effect
            df = df.drop(df.columns[1], axis=1)
            df.to_csv('premier_league_ws.csv', mode='a', header=False, index=False)
        time.sleep(5)
        # Click through to the next page of the table
        driver.find_element_by_link_text("next").click()
        print("Navigating to Next Page")
    except (TimeoutException, WebDriverException) as e:
        print("Last page reached")
        break
driver.quit()
Things I changed or added:
# Pass the path to chromedriver.exe (as a raw string) and the options to the driver
driver = webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\chromedriver.exe", options=options)
# Finding the Cookie button and accepting it
time.sleep(2)
driver.find_element_by_xpath("""//*[@id="qcCmpButtons"]/button[2]""").click()
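If the fixed time.sleep(2) ever turns out to be flaky, a more robust variant is to wait explicitly until the button is clickable. This is only a minimal sketch using the WebDriverWait/expected_conditions imports the script already has; the qcCmpButtons XPath is the one from the code above:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the cookie button to become clickable, then accept it
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="qcCmpButtons"]/button[2]'))
).click()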
# Scrolling down 500 px
time.sleep(3)
driver.execute_script("window.scrollTo(0, 500)")
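As for the Defensive/Offensive/Detailed sub-tabs from the question: they have no URL of their own, but you can usually reach them by clicking the tab elements with Selenium and then rerunning the same table-scraping loop. A minimal sketch, assuming the tabs are rendered as links whose visible text matches the tab names (the locator and the per-tab CSV name are illustrative, not verified against the live page):
# Hypothetical: click each sub-tab, wait for the table to reload,
# then reuse the scraping loop above with a per-tab CSV file
for tab_name in ["Defensive", "Offensive", "Detailed"]:
    driver.find_element_by_link_text(tab_name).click()  # assumes the tabs are <a> elements
    time.sleep(3)  # give the statistics table time to reload
    # ...run the same player-table-statistics-body loop here,
    # writing to e.g. 'premier_league_ws_' + tab_name.lower() + '.csv'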
Upvotes: 1