jack3311

Reputation: 27

Web scraping without getting blocked (Selenium)

I am scraping this page https://www.betexplorer.com/soccer/russia/premier-league-2014-2015/results/ but sometimes the browser doesn't load the page or the website can't be reached. How can I fix this problem?

from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()  # assumes a Chrome driver; use whichever driver you already have

home = 'https://www.betexplorer.com/soccer/russia/premier-league-2014-2015/results/'
driver.get(home)

# Scroll to the bottom of the results page so every match link is rendered
driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)

# Collect the detail-page URL of every match
links = driver.find_elements(By.XPATH, "//a[@class='in-match']")
urls = [link.get_attribute('href') for link in links]

# Open each match page and read the date, team names and full-time score
for url in urls:
    driver.get(url)
    sleep(5)
    date = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID, 'match-date'))).text
    hometeam = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '/html/body/div[4]/div[5]/div/div/div[1]/section/ul[2]/li[1]/figure/div/a/img'))).get_attribute("alt")
    awayteam = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '/html/body/div[4]/div[5]/div/div/div[1]/section/ul[2]/li[3]/figure/div/a/img'))).get_attribute("alt")
    ft = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID, 'js-score'))).text
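
For the intermittent load failures themselves, one option (a minimal sketch, assuming the cause is a transient timeout rather than a hard block by the site; get_with_retry is a hypothetical helper, not part of the code above) is to set a page-load timeout and retry driver.get a few times:

from time import sleep
from selenium.common.exceptions import TimeoutException, WebDriverException

def get_with_retry(driver, url, attempts=3):
    # Fail fast instead of hanging forever on a slow load
    driver.set_page_load_timeout(30)
    for attempt in range(attempts):
        try:
            driver.get(url)
            return True
        except (TimeoutException, WebDriverException):
            sleep(5)  # brief pause before trying again
    return False

Inside the loop, driver.get(url) could then become "if not get_with_retry(driver, url): continue", so a match page that still refuses to load is skipped instead of crashing the whole run.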

Upvotes: 1

Views: 214

Answers (1)

Srinath Neela

Reputation: 404

I have seen a similar question about extracting data from https://www.betexplorer.com, but it has been deleted.

The question looked like this:

""Web Scraping a list of elements I would scrape Matches, Date and Result row by row to csv file""

Here is the code:

import requests
from lxml import html
import pandas as pd
from pandas import ExcelWriter

url = 'https://www.betexplorer.com/soccer/russia/premier-league/results/'
site = 'https://www.betexplorer.com'

# Fetch and parse the results page
getr = requests.get(url)
src = html.fromstring(getr.content)

# Match links, full-time scores and dates, row by row
game = src.xpath("//td[@class='h-text-left']//a//@href")
ft = src.xpath("//td[@class='h-text-center']//a//text()")
date = src.xpath("//td[@class='h-text-right h-text-no-wrap']//text()")

games = []
fts = []
dates = []

for gm, ftt, dat in zip(game, ft, date):
    # Follow each match link and read the two team names from the breadcrumb
    getg = requests.get(site + gm)
    srr = html.fromstring(getg.content)
    teams = srr.xpath('//span[@class="list-breadcrumb__item__in"]//text()')
    for team in teams:
        games.append(team)
        fts.append(ftt)
        dates.append(dat)

# Build the DataFrame and write the file once, after the loops,
# instead of rewriting it on every appended row
fullfile = pd.DataFrame({'Games': games, 'Fts': fts, 'Dates': dates})
with ExcelWriter('D:\\yourpath\\games.xlsx') as writer:
    fullfile.to_excel(writer, sheet_name='Sheet1', index=False)
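
Since the question title is about not getting blocked, it may also help to reuse one requests session that sends a browser-like User-Agent and retries transient failures. A minimal sketch on top of the code above (the header value and retry settings are illustrative assumptions, not something betexplorer.com is known to require):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

url = 'https://www.betexplorer.com/soccer/russia/premier-league/results/'

# One shared session: keeps cookies, sends a browser-like User-Agent and
# retries 429/5xx responses with a backoff instead of failing immediately
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})
retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))

getr = session.get(url, timeout=10)

session.get would then replace each requests.get call in the code above.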

Upvotes: 1
