Reputation: 13
I would like to scrape the first-half result and the minutes of the second-half goals from this page: https://www.flashscore.co.uk/match/dxm5iiWh/#match-summary
Then I tried to scrape the first-half stats from this page, https://www.flashscore.co.uk/match/dxm5iiWh/#match-statistics;1, with the code below, but I need help fixing the XPaths.
P.S. The example includes only two stats, but I will apply the same approach to all of them.
Thanks
# Asker's original attempt (indentation was lost when the snippet was pasted).
# It opens the match summary page, reads header fields, prints the goal rows,
# then opens the statistics tab and writes two stats per side to stats.csv.
stats=[]
driver.get('https://www.flashscore.co.uk/match/dxm5iiWh/#match-summary')
# each header field is read after waiting up to 5 s for it to become visible
# "utime" is presumably the match date/time element — TODO confirm
data = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.ID, "utime"))).text
hometeam=WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='flashscore']/div[1]/div[1]/div[2]/div/div/a"))).text
awayteam=WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='flashscore']/div[1]/div[3]/div[2]/div/div/a"))).text
# collect every incident row and print the text of those holding a soccer-ball icon
md = driver.find_elements_by_class_name("detailMS__incidentRow")
for g in md:
if g.find_elements_by_class_name("icon.soccer-ball"):
print(g.text)
# extra info is optional: fall back to a blank string when the element is absent
try:
extrainfo = driver.find_element_by_xpath("//*[@id='flashscore']/div[1]/div[2]/div[2]").text
except NoSuchElementException:
extrainfo = " "
# open the statistics tab and read two stat rows (home value, away value)
try:
driver.find_element_by_xpath("//*[@id='a-match-statistics']")
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//*[@id='a-match-statistics']"))).click()
home_shot=WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='tab-statistics-0-statistic']/div[2]/div[1]/div[1]"))).text
away_shot=WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='tab-statistics-0-statistic']/div[2]/div[1]/div[3]"))).text
home_shotontarget=WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='tab-statistics-0-statistic']/div[3]/div[1]/div[1]"))).text
away_shotontarget=WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='tab-statistics-0-statistic']/div[3]/div[1]/div[3]"))).text
# NOTE(review): WebDriverWait.until() reports a timeout with TimeoutException,
# so this handler is only reached through the plain find_element call above — confirm
except NoSuchElementException:
home_shot = "no stats"
away_shot = "no stats"
home_shotontarget = "no stats"
away_shotontarget = "no stats"
# NOTE(review): print() returns None, so the 'goals' column receives None
# rather than the goal text; `g` is whatever row the loop above ended on
stats.append((data, hometeam, awayteam, print(g.text), extrainfo, home_shot, away_shot, home_shotontarget, away_shotontarget))
df = pd.DataFrame(stats, columns=['data', 'hometeam', 'awayteam', 'goals',
'extrainfo', 'hs', 'as', 'hot', 'aot'])
df.to_csv('stats.csv', index=False, encoding='utf-8')
Upvotes: 1
Views: 682
Reputation: 142641
Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
def _text_or_empty(element, class_name):
    """Return the stripped text of the first descendant with *class_name*.

    Returns an empty string when no such descendant exists, which is the
    normal case for fields that only some incident rows carry.
    """
    try:
        return element.find_element(By.CLASS_NAME, class_name).text.strip()
    except NoSuchElementException:
        return ''


driver = webdriver.Firefox()
stats = []

try:
    # go to page with match summary
    driver.get('https://www.flashscore.co.uk/match/dxm5iiWh/#match-summary')

    # one shared waiter: every lookup below gets the same 5 second budget
    wait = WebDriverWait(driver, 5)

    # ------------------------------------------------------------------
    # match header
    data = wait.until(EC.visibility_of_element_located((By.ID, "utime"))).text
    hometeam = wait.until(EC.visibility_of_element_located(
        (By.XPATH, "//*[@id='flashscore']/div[1]/div[1]/div[2]/div/div/a"))).text
    awayteam = wait.until(EC.visibility_of_element_located(
        (By.XPATH, "//*[@id='flashscore']/div[1]/div[3]/div[2]/div/div/a"))).text

    # ------------------------------------------------------------------
    # incidents: one row per element whose class marks it as home or away
    match_details = driver.find_elements(
        By.XPATH,
        '//div[contains(@class,"incidentRow--home")] | //div[contains(@class, "incidentRow--away")]')

    incident_format = ' {:4} | {:4} | {:5} | {:17} | {:17} | {:17} |'
    print(incident_format.format('h/a', 'time', 'icon', 'participant', 'substitution_in', 'substitution_out'))

    for row in match_details:
        home_or_away = 'home' if 'home' in row.get_attribute('class') else 'away'

        time_box = row.find_element(By.CLASS_NAME, 'time-box').text.strip()
        # the icon's remaining class name identifies the incident type
        icon = row.find_element(By.CLASS_NAME, 'icon').get_attribute('class').replace('icon ', '')

        # these three are optional; which ones exist depends on the incident
        participant_name = _text_or_empty(row, 'participant-name')
        substitution_in_name = _text_or_empty(row, 'substitution-in-name')
        substitution_out_name = _text_or_empty(row, 'substitution-out-name')

        print(incident_format.format(home_or_away, time_box, icon, participant_name,
                                     substitution_in_name, substitution_out_name))

    # ------------------------------------------------------------------
    try:
        extrainfo = driver.find_element(By.XPATH, "//*[@id='flashscore']/div[1]/div[2]/div[2]").text
    except NoSuchElementException:
        extrainfo = " "

    print('extrainfo:', extrainfo)

    # ------------------------------------------------------------------
    # go to page with stats
    stats_xpath = '//div[@id="tab-statistics-0-statistic"]//div[@class="statTextGroup"]'
    try:
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='a-match-statistics']"))).click()
        # the tab content is rendered after the click, so wait for the rows
        # instead of reading them immediately
        stat_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, stats_xpath)))
    except TimeoutException:
        # no statistics tab / no rows for this match: nothing to print
        stat_rows = []

    # ------------------------------------------------------------------
    # get stats: the child divs are read as home value, stat name, away value
    for row in stat_rows:
        columns = [x.text.strip() for x in row.find_elements(By.TAG_NAME, 'div')]
        print('{:17} | {:>3} | {:>3} |'.format(columns[1], columns[0], columns[2]))

    # ------------------------------------------------------------------
    # NOTE: nothing is appended to `stats` here; the values are only printed.
finally:
    # always close the browser, even when a lookup above fails
    driver.quit()
Result:
h/a | time | icon | participant | substitution_in | substitution_out |
home | 59' | y-card | Dunk L. | | |
home | 65' | y-card | Jahanbakhsh A. | | |
away | 66' | substitution-in | | Lacazette A. | Martinelli |
away | 66' | soccer-ball | Lacazette A. | | |
home | 67' | substitution-in | | Maupay N. | Mac Allister A. |
home | 68' | substitution-in | | March S. | Propper D. |
home | 75' | substitution-in | | Trossard L. | Jahanbakhsh A. |
away | 81' | substitution-in | | Ceballos D. | Saka B. |
away | 89' | substitution-in | | Maitland-Niles A. | Smith Rowe E. |
extrainfo: Finished
Ball possession | 50% | 50% |
Goal attempts | 13 | 11 |
Shots on goal | 2 | 3 |
Shots off goal | 5 | 6 |
Blocked Shots | 6 | 2 |
Free kicks | 4 | 11 |
Corner kicks | 5 | 4 |
Offsides | 0 | 0 |
Goalkeeper saves | 2 | 2 |
Fouls | 11 | 4 |
Yellow cards | 2 | 0 |
Total Passes | 491 | 511 |
Tackles | 17 | 10 |
Attacks | 116 | 108 |
Dangerous Attacks | 58 | 34 |
Now you have to figure out how to organize it in a table (`DataFrame`).
In my opinion the incidents and their times don't fit into that table; I would rather keep them in a separate table in an SQL database, related to the match details.
EDIT:
Code which adds every piece of information in a separate column. This works fine for the stats, but for incidents (goals, substitutions, yellow cards) it creates a lot of columns, and every match will have a different number of columns, so it will be hard to compare values. How do you compare goals when one match has its goals in columns 1 and 2 but another has them in columns 6 and 9? I would rather keep them in a database - one incident per row - and use a query with `WHERE incident = goal`.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
def _text_or_empty(element, class_name):
    """Return the stripped text of the first descendant with *class_name*.

    Returns an empty string when no such descendant exists, which is the
    normal case for fields that only some incident rows carry.
    """
    try:
        return element.find_element(By.CLASS_NAME, class_name).text.strip()
    except NoSuchElementException:
        return ''


driver = webdriver.Firefox()
stats = []

# every scraped value for this match ends up as one column of a single CSV row
stats_row = {}

try:
    # go to page with match summary
    driver.get('https://www.flashscore.co.uk/match/dxm5iiWh/#match-summary')

    # one shared waiter: every lookup below gets the same 5 second budget
    wait = WebDriverWait(driver, 5)

    # ------------------------------------------------------------------
    # match header
    stats_row['data'] = wait.until(EC.visibility_of_element_located((By.ID, "utime"))).text
    stats_row['hometeam'] = wait.until(EC.visibility_of_element_located(
        (By.XPATH, "//*[@id='flashscore']/div[1]/div[1]/div[2]/div/div/a"))).text
    stats_row['awayteam'] = wait.until(EC.visibility_of_element_located(
        (By.XPATH, "//*[@id='flashscore']/div[1]/div[3]/div[2]/div/div/a"))).text

    # ------------------------------------------------------------------
    # incidents: one row per element whose class marks it as home or away
    match_details = driver.find_elements(
        By.XPATH,
        '//div[contains(@class,"incidentRow--home")] | //div[contains(@class, "incidentRow--away")]')

    incident_format = ' {:4} | {:4} | {:15} | {:17} | {:17} | {:17} |'
    print(incident_format.format('h/a', 'time', 'icon', 'participant', 'substitution_in', 'substitution_out'))
    print(incident_format.format('-'*4, '-'*4, '-'*15, '-'*17, '-'*17, '-'*17))

    for number, row in enumerate(match_details, 1):
        home_or_away = 'home' if 'home' in row.get_attribute('class') else 'away'

        time_box = row.find_element(By.CLASS_NAME, 'time-box').text.strip()
        # the icon's remaining class name identifies the incident type
        icon = row.find_element(By.CLASS_NAME, 'icon').get_attribute('class').replace('icon ', '')

        # these three are optional; which ones exist depends on the incident
        participant_name = _text_or_empty(row, 'participant-name')
        substitution_in_name = _text_or_empty(row, 'substitution-in-name')
        substitution_out_name = _text_or_empty(row, 'substitution-out-name')

        print(incident_format.format(home_or_away, time_box, icon, participant_name,
                                     substitution_in_name, substitution_out_name))

        # one numbered column group per incident; a match-level 'home_or_away'
        # key is deliberately not written, since it would only ever hold the
        # side of the last incident
        prefix = 'incident - {} - '.format(number)
        stats_row[prefix + 'home_or_away'] = home_or_away
        stats_row[prefix + 'time_box'] = time_box
        stats_row[prefix + 'icon'] = icon
        stats_row[prefix + 'participant_name'] = participant_name
        stats_row[prefix + 'substitution_in_name'] = substitution_in_name
        stats_row[prefix + 'substitution_out_name'] = substitution_out_name

    # ------------------------------------------------------------------
    try:
        stats_row['extrainfo'] = driver.find_element(
            By.XPATH, "//*[@id='flashscore']/div[1]/div[2]/div[2]").text
    except NoSuchElementException:
        stats_row['extrainfo'] = ""

    print('extrainfo:', stats_row['extrainfo'])

    # ------------------------------------------------------------------
    # go to page with stats
    stats_xpath = '//div[@id="tab-statistics-0-statistic"]//div[@class="statTextGroup"]'
    try:
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='a-match-statistics']"))).click()
        # the tab content is rendered after the click, so wait for the rows
        # instead of reading them immediately
        stat_rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, stats_xpath)))
    except TimeoutException:
        # no statistics tab / no rows for this match: no stat columns are added
        stat_rows = []

    # ------------------------------------------------------------------
    print('{:17} | {:>3} | {:>3} |'.format('name', 'home', 'away'))
    print('{:17} | {:>3} | {:>3} |'.format('-'*17, '---', '---'))

    # get stats: the child divs are read as home value, stat name, away value
    for row in stat_rows:
        columns = [x.text.strip() for x in row.find_elements(By.TAG_NAME, 'div')]

        name = columns[1]
        home = columns[0]
        away = columns[2]

        stats_row[name + ' - home'] = home
        stats_row[name + ' - away'] = away

        print('{:17} | {:>3} | {:>3} |'.format(name, home, away))
finally:
    # always close the browser, even when a lookup above fails
    driver.quit()

# ----------------------------------------------------------------------
# written only after a complete scrape; column order follows insertion order
stats.append(stats_row)

df = pd.DataFrame(stats, columns=list(stats_row))
df.to_csv('stats.csv', index=False, encoding='utf-8')
Upvotes: 2