vriones11
vriones11

Reputation: 13

Scraping stats with Selenium

I would like to scrape the 1st-half result and the minutes of the goals scored in the 2nd half from this page: https://www.flashscore.co.uk/match/dxm5iiWh/#match-summary

Then I tried to scrape the 1st-half stats from this page https://www.flashscore.co.uk/match/dxm5iiWh/#match-statistics;1 with the code below, but I need help fixing the XPaths.

P.S. In the example I included only two stats, but I will apply the same approach to all of them.

Thanks

from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Scrape one match from its summary and statistics tabs and write the result
# to stats.csv as a single row.
stats = []
driver.get('https://www.flashscore.co.uk/match/dxm5iiWh/#match-summary')

# One explicit wait (5 second timeout) reused for every lookup below.
wait = WebDriverWait(driver, 5)

data = wait.until(EC.visibility_of_element_located((By.ID, "utime"))).text
hometeam = wait.until(EC.visibility_of_element_located((By.XPATH, "//*[@id='flashscore']/div[1]/div[1]/div[2]/div/div/a"))).text
awayteam = wait.until(EC.visibility_of_element_located((By.XPATH, "//*[@id='flashscore']/div[1]/div[3]/div[2]/div/div/a"))).text

# Keep the text of every incident row that contains a soccer-ball icon, so the
# goals end up in the CSV instead of only being printed.
goals = []
for incident in driver.find_elements(By.CLASS_NAME, "detailMS__incidentRow"):
    # "icon" and "soccer-ball" are two classes on the same element, so a CSS
    # selector is used rather than the single-class CLASS_NAME locator.
    if incident.find_elements(By.CSS_SELECTOR, ".icon.soccer-ball"):
        print(incident.text)
        # Collapse line breaks so each goal stays on one line inside the CSV cell.
        goals.append(" ".join(incident.text.split()))

try:
    extrainfo = driver.find_element(By.XPATH, "//*[@id='flashscore']/div[1]/div[2]/div[2]").text
except NoSuchElementException:
    extrainfo = " "

try:
    # Immediate lookup first: if the statistics tab is absent this raises
    # NoSuchElementException right away instead of waiting for the timeout.
    driver.find_element(By.XPATH, "//*[@id='a-match-statistics']")
    wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='a-match-statistics']"))).click()
    home_shot = wait.until(EC.visibility_of_element_located((By.XPATH, "//*[@id='tab-statistics-0-statistic']/div[2]/div[1]/div[1]"))).text
    away_shot = wait.until(EC.visibility_of_element_located((By.XPATH, "//*[@id='tab-statistics-0-statistic']/div[2]/div[1]/div[3]"))).text
    home_shotontarget = wait.until(EC.visibility_of_element_located((By.XPATH, "//*[@id='tab-statistics-0-statistic']/div[3]/div[1]/div[1]"))).text
    away_shotontarget = wait.until(EC.visibility_of_element_located((By.XPATH, "//*[@id='tab-statistics-0-statistic']/div[3]/div[1]/div[3]"))).text
except (NoSuchElementException, TimeoutException):
    # WebDriverWait.until() signals a missing element with TimeoutException,
    # so both exception types mean "no stats for this match".
    home_shot = "no stats"
    away_shot = "no stats"
    home_shotontarget = "no stats"
    away_shotontarget = "no stats"

# Runs whether or not the statistics were found: the row is always recorded.
stats.append((data, hometeam, awayteam, "; ".join(goals), extrainfo,
              home_shot, away_shot, home_shotontarget, away_shotontarget))

df = pd.DataFrame(stats, columns=['data', 'hometeam', 'awayteam', 'goals',
                                  'extrainfo', 'hs', 'as', 'hot', 'aot'])
df.to_csv('stats.csv', index=False, encoding='utf-8')

Upvotes: 1

Views: 682

Answers (1)

furas
furas

Reputation: 142641

Code:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
             
import pandas as pd
             

# Rows collected for the final DataFrame.
stats = []

# Start Firefox and open the match summary page.
driver = webdriver.Firefox()
driver.get('https://www.flashscore.co.uk/match/dxm5iiWh/#match-summary')

# Shared explicit wait with a 5 second timeout.
wait = WebDriverWait(driver, 5)


def _visible_text(locator):
    """Wait until the element at *locator* is visible and return its text."""
    element = wait.until(EC.visibility_of_element_located(locator))
    return element.text


# Header values: the "utime" element (presumably the kick-off date/time --
# confirm against the page) and the two team links.
data = _visible_text((By.ID, "utime"))
hometeam = _visible_text((By.XPATH, "//*[@id='flashscore']/div[1]/div[1]/div[2]/div/div/a"))
awayteam = _visible_text((By.XPATH, "//*[@id='flashscore']/div[1]/div[3]/div[2]/div/div/a"))

# ----------------------------------------------------------------------------------------------------------------

# Print one table line per match incident (card, goal, substitution).

def _child_text(row, class_name):
    """Return the stripped text of the first element inside *row* that has
    *class_name*, or '' when the row has no such element."""
    try:
        return row.find_element(By.CLASS_NAME, class_name).text.strip()
    except NoSuchElementException:
        return ''


# The icon column is 15 wide so values such as "substitution-in" stay aligned.
incident_format = ' {:4} | {:4} | {:15} | {:17} | {:17} | {:17} |'

# Every incident row is tagged with either "...incidentRow--home" or
# "...incidentRow--away"; the XPath union selects both kinds.
match_details = driver.find_elements(By.XPATH, '//div[contains(@class,"incidentRow--home")] | //div[contains(@class, "incidentRow--away")]')

print(incident_format.format('h/a', 'time', 'icon', 'participant', 'substitution_in', 'substitution_out'))

for row in match_details:
    home_or_away = 'home' if 'home' in row.get_attribute('class') else 'away'

    time_box = row.find_element(By.CLASS_NAME, 'time-box').text.strip()
    # The class attribute looks like "icon y-card"; drop the generic "icon "
    # prefix so only the incident type remains.
    icon = row.find_element(By.CLASS_NAME, 'icon').get_attribute('class').replace('icon ', '')

    # Not every row has all three name elements, so each one falls back to ''.
    participant_name = _child_text(row, 'participant-name')
    substitution_in_name = _child_text(row, 'substitution-in-name')
    substitution_out_name = _child_text(row, 'substitution-out-name')

    print(incident_format.format(home_or_away, time_box, icon, participant_name, substitution_in_name, substitution_out_name))

# ----------------------------------------------------------------------------------------------------------------

# Read the extra-info text from the match header (the sample run printed
# "Finished"); fall back to a single space when the element is missing.
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
# Selenium releases no longer provide.
try:
    extrainfo = driver.find_element(By.XPATH, "//*[@id='flashscore']/div[1]/div[2]/div[2]").text
except NoSuchElementException:
    extrainfo = " "

print('extrainfo:', extrainfo)

# ----------------------------------------------------------------------------------------------------------------

# go to page with stats
wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='a-match-statistics']"))).click()

# ----------------------------------------------------------------------------------------------------------------

# get stats
# The tab content appears only after the click, so wait for the stat rows
# instead of reading them immediately. If none appear within the timeout,
# the loop below simply prints nothing.
stat_rows_xpath = '//div[@id="tab-statistics-0-statistic"]//div[@class="statTextGroup"]'
try:
    stat_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, stat_rows_xpath)))
except TimeoutException:
    stat_rows = []

for row in stat_rows:
    # Each row holds three <div> cells: home value, stat name, away value.
    columns = [cell.text.strip() for cell in row.find_elements(By.TAG_NAME, 'div')]
    if len(columns) < 3:
        # Skip a malformed row instead of failing with IndexError.
        continue
    print('{:17} | {:>3} | {:>3} |'.format(columns[1], columns[0], columns[2]))

# ----------------------------------------------------------------------------------------------------------------

#stats.append((data, hometeam, awayteam, print(g.text), extrainfo, home_shot, away_shot, home_shotontarget, away_shotontarget))

#df = pd.DataFrame(stats, columns=['data', 'hometeam', 'awayteam', 'goals', 'extrainfo', 'hs', 'as', 'hot', 'aot'])
#df.to_csv('stats.csv', index=False, encoding='utf-8')

Result:

 h/a  | time | icon            | participant       | substitution_in   | substitution_out  |
 home | 59'  | y-card          | Dunk L.           |                   |                   |
 home | 65'  | y-card          | Jahanbakhsh A.    |                   |                   |
 away | 66'  | substitution-in |                   | Lacazette A.      | Martinelli        |
 away | 66'  | soccer-ball     | Lacazette A.      |                   |                   |
 home | 67'  | substitution-in |                   | Maupay N.         | Mac Allister A.   |
 home | 68'  | substitution-in |                   | March S.          | Propper D.        |
 home | 75'  | substitution-in |                   | Trossard L.       | Jahanbakhsh A.    |
 away | 81'  | substitution-in |                   | Ceballos D.       | Saka B.           |
 away | 89'  | substitution-in |                   | Maitland-Niles A. | Smith Rowe E.     |
extrainfo: Finished
Ball possession   | 50% | 50% |
Goal attempts     |  13 |  11 |
Shots on goal     |   2 |   3 |
Shots off goal    |   5 |   6 |
Blocked Shots     |   6 |   2 |
Free kicks        |   4 |  11 |
Corner kicks      |   5 |   4 |
Offsides          |   0 |   0 |
Goalkeeper saves  |   2 |   2 |
Fouls             |  11 |   4 |
Yellow cards      |   2 |   0 |
Total Passes      | 491 | 511 |
Tackles           |  17 |  10 |
Attacks           | 116 | 108 |
Dangerous Attacks |  58 |  34 |

Now you have to figure out how to organize this data in a DataFrame table.
To me, the incidents with their times don't fit into that table; they belong in a separate table in an SQL database that is related to the match details.


EDIT:

Here is code which puts every piece of information in a separate column. For the stats this is OK, but for the incidents (goals, substitutions, yellow cards) it creates a lot of columns, and every match will have a different number of columns, so it will be hard to compare values. How would you compare goals when one match has its goals in columns 1 and 2 but another has them in columns 6 and 9? I would rather keep the incidents in a database - one incident per row - and use a query with WHERE incident = goal.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
             
import pandas as pd
             

# Rows collected for the final DataFrame (one dict per match).
stats = []

# Start Firefox and open the match summary page.
driver = webdriver.Firefox()
driver.get('https://www.flashscore.co.uk/match/dxm5iiWh/#match-summary')

# Column name -> locator for the header values. The order here is the order
# in which the keys are inserted into stats_row.
header_locators = (
    ('data', (By.ID, "utime")),
    ('hometeam', (By.XPATH, "//*[@id='flashscore']/div[1]/div[1]/div[2]/div/div/a")),
    ('awayteam', (By.XPATH, "//*[@id='flashscore']/div[1]/div[3]/div[2]/div/div/a")),
)

# Single output row for this match; later sections add more keys to it.
stats_row = {}
for column, locator in header_locators:
    # Wait up to 5 seconds for the element to become visible, then keep its text.
    element = WebDriverWait(driver, 5).until(EC.visibility_of_element_located(locator))
    stats_row[column] = element.text

# ----------------------------------------------------------------------------------------------------------------

# Print every match incident and store each of its fields in stats_row under
# a numbered key such as "incident - 1 - icon".

def _child_text(row, class_name):
    """Return the stripped text of the first element inside *row* that has
    *class_name*, or '' when the row has no such element."""
    try:
        return row.find_element(By.CLASS_NAME, class_name).text.strip()
    except NoSuchElementException:
        return ''


incident_format = ' {:4} | {:4} | {:15} | {:17} | {:17} | {:17} |'

# Every incident row is tagged with either "...incidentRow--home" or
# "...incidentRow--away"; the XPath union selects both kinds.
match_details = driver.find_elements(By.XPATH, '//div[contains(@class,"incidentRow--home")] | //div[contains(@class, "incidentRow--away")]')

print(incident_format.format('h/a', 'time', 'icon', 'participant', 'substitution_in', 'substitution_out'))
print(incident_format.format('-'*4, '-'*4, '-'*15, '-'*17, '-'*17, '-'*17))

for number, row in enumerate(match_details, 1):
    # The class attribute looks like "icon y-card"; dropping the generic
    # "icon " prefix leaves only the incident type.
    icon_class = row.find_element(By.CLASS_NAME, 'icon').get_attribute('class')

    # Field order matters: it is both the print order and the column order.
    # The three name elements are optional, so they fall back to ''.
    incident = {
        'home_or_away': 'home' if 'home' in row.get_attribute('class') else 'away',
        'time_box': row.find_element(By.CLASS_NAME, 'time-box').text.strip(),
        'icon': icon_class.replace('icon ', ''),
        'participant_name': _child_text(row, 'participant-name'),
        'substitution_in_name': _child_text(row, 'substitution-in-name'),
        'substitution_out_name': _child_text(row, 'substitution-out-name'),
    }

    print(incident_format.format(*incident.values()))

    # Only numbered keys are written. A plain 'home_or_away' key would be
    # overwritten by every incident and carry no per-match meaning.
    for field, value in incident.items():
        stats_row['incident - {} - {}'.format(number, field)] = value

     
# ----------------------------------------------------------------------------------------------------------------

# Store the extra-info text from the match header, or '' when the element is
# missing. find_element(By.XPATH, ...) replaces find_element_by_xpath, which
# newer Selenium releases no longer provide.
try:
    stats_row['extrainfo'] = driver.find_element(By.XPATH, "//*[@id='flashscore']/div[1]/div[2]/div[2]").text
except NoSuchElementException:
    stats_row['extrainfo'] = ""

print('extrainfo:', stats_row['extrainfo'])

# ----------------------------------------------------------------------------------------------------------------

# go to page with stats
wait = WebDriverWait(driver, 5)
wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='a-match-statistics']"))).click()

# ----------------------------------------------------------------------------------------------------------------

# Value columns are 4 wide so the "home"/"away" headers line up with the data.
stat_format = '{:17} | {:>4} | {:>4} |'
print(stat_format.format('name', 'home', 'away'))
print(stat_format.format('-'*17, '-'*4, '-'*4))

# get stats
# The tab content appears only after the click, so wait for the stat rows
# instead of reading them immediately. If none appear within the timeout,
# no stat columns are added.
stat_rows_xpath = '//div[@id="tab-statistics-0-statistic"]//div[@class="statTextGroup"]'
try:
    stat_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, stat_rows_xpath)))
except TimeoutException:
    stat_rows = []

for row in stat_rows:
    # Each row holds three <div> cells: home value, stat name, away value.
    columns = [cell.text.strip() for cell in row.find_elements(By.TAG_NAME, 'div')]
    if len(columns) < 3:
        # Skip a malformed row instead of failing with IndexError.
        continue
    home, name, away = columns[:3]
    stats_row[name + ' - home'] = home
    stats_row[name + ' - away'] = away
    print(stat_format.format(name, home, away))

# ----------------------------------------------------------------------------------------------------------------

#columns = stats_row.keys()
# The match dict becomes one row of the table. Its key insertion order is
# passed explicitly as the column order.
stats.append(stats_row)

column_order = list(stats_row)
df = pd.DataFrame(stats, columns=column_order)

# Write the table without the index column, UTF-8 encoded.
df.to_csv('stats.csv', index=False, encoding='utf-8')

Upvotes: 2

Related Questions