Jerry

Reputation: 77

When scraping a page with Selenium and Beautiful Soup, I get duplicates of some results

I'm scraping a page with Selenium and Beautiful Soup, and when I use a for loop to change the page URL I get duplicates in my results. I have no idea why. This is my code:

import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from chromedriver_py import binary_path # this will get you the path variable
from selenium.webdriver.support.ui import WebDriverWait

teams = []
home_team = []
away_team = []
results_away = []
results_home = []
results_away1 = []
results_home1 = []
list3_away = []
list3_home = []
odds = []
odds_sub = []
odds_sub1 = []
days = ['20160826',
        '20160827',
        '20160901',
        ]
for date in days:
    header = {"user-agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    url = 'https://www.sportsbookreview.com/betting-odds/college-football/totals/1st-half/?date='+date
    page = requests.get(url, headers = header)
    soup = BeautifulSoup(page.content, 'html.parser')

    #### Selenium scraper for odds
    driver = webdriver.Chrome(executable_path=binary_path)
    driver.get(url)
    time.sleep(5)
    elements = WebDriverWait(driver, 5).until(lambda d: d.find_elements_by_xpath('//*[@data-vertical-sbid="238"]'))
    for a in elements:
        odds.append(a.text)
    for a in odds:
        string = a.split('\n', 1)[0]
        mod_string = string[:-4]
        odds_sub.append(mod_string)
    driver.quit()

    ## All teams
    for el in soup.find_all('span', class_='participantBox-3ar9Y'):
        teams.append(el.text)

    ### Away result
    for s in soup.find_all(class_="scoreboard-1TXQV"):
        for a in s.select(".scoreboardColumn-2OtpR > div:nth-of-type(1)")[:2]:
            results_away.append(a.text)
    for a in results_away:
        if a == '-':
            a = a.replace(a, '10000')  # '-' (no score posted) -> placeholder value
        results_away1.append(a)
    results_away1 = [int(i) for i in results_away1]

    for i in range(0, len(results_away1)-1, 2):
        firstnum = results_away1[i]
        secondnum = results_away1[i+1]
        sumnum = firstnum + secondnum
        list3_away.append(sumnum)

    ### Home results
    for s in soup.find_all(class_="scoreboard-1TXQV"):
        for a in s.select(".scoreboardColumn-2OtpR > div:nth-of-type(2)")[:2]:
            results_home.append(a.text)
    for a in results_home:
        if a == '-':
            a = a.replace(a, '10000')  # '-' (no score posted) -> placeholder value
        results_home1.append(a)
    results_home1 = [int(i) for i in results_home1]

    for i in range(0, len(results_home1)-1, 2):
        firstnum = results_home1[i]
        secondnum = results_home1[i+1]
        sumnum = firstnum + secondnum
        list3_home.append(sumnum)
    time.sleep(7)

#### Create lists for away and home teams
for team in teams[::2]:
    away_team.append(team)
for team in teams[1::2]:
    home_team.append(team)

print(home_team)
print('*****')
print(away_team)
print('*****')
print(list3_home)
print('*****')
print(list3_away)
print('*****')
print(odds_sub)
print('*****')

I receive results like this:

['California', 'North Dakota State', '(13) Louisville', 'Wake Forest', 'Central Michigan', 'Cincinnati', 'Connecticut', 'Florida International', '(21) Tennessee', 'North Carolina State', 'Western Kentucky', 'Vanderbilt', '(19) Utah', 'Utah State', 'Minnesota', 'New Mexico', 'Idaho', 'UNLV']
*****
['Hawaii', 'Charleston Southern', 'Charlotte', 'Tulane', 'Presbyterian', 'Tennessee-Martin', 'Maine', 'Indiana', 'Appalachian State', 'William & Mary', 'Rice', 'South Carolina', 'Southern Utah', 'Weber State', 'Oregon State', 'South Dakota', 'Montana State', 'Jackson State']
*****
[34, 34, 34, 3, 34, 34, 3, 34, 3, 56, 7, 14, 6, 7, 10, 3, 28, 30, 10, 17, 21, 17, 35, 20, 42]
*****
[14, 14, 14, 3, 14, 14, 3, 14, 3, 0, 3, 3, 7, 7, 12, 13, 7, 7, 0, 0, 6, 14, 14, 10, 10]
*****
['34', '34', '', '34', '', '34½', '21½', '', '', '', '32', '31', '', '33½', '20', '', '', '28', '', '', '']
*****

For example, list3_home has its first 7 results duplicated, the odds_sub list has 3 duplicates at the beginning, and so on.

Upvotes: 0

Views: 160

Answers (1)

chitown88

Reputation: 28630

OK, so see if this works. There are quite a few parameter ids you can use in the query if you want other data, but this will get you the 1st-half stuff. I think I identified the correct id for Pinnacle.

import requests
import pandas as pd
from datetime import datetime, timezone, timedelta
from calendar import timegm

days = ['20160826',
        '20160827',
        '20160901',
        '20200912',
        ]


homeTeam_list = []
awayTeam_list = []
homeScore_list = []
awayScore_list = []

odds_list = []


url = 'https://www.sportsbookreview.com/ms-odds-v2/odds-v2-service'
header = {"user-agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
   
for date in days:
    print(date)
    date_time_obj = datetime.strptime(date, '%Y%m%d')
    year = date_time_obj.year
    month = date_time_obj.month
    day = date_time_obj.day
    
    # midnight UTC for that date, as a millisecond epoch timestamp
    # (the format the query's startDate parameter expects)
    dt = datetime(year, month, day, 0, 0)
    tz = timezone(timedelta(hours=0))
    epochtime = timegm(dt.replace(tzinfo=tz).utctimetuple()) * 1000

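    # URL-encoded query for the day's events (mtid 398 appears to be the 1st-half totals market)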
    queryStr_participants = 'query=%7B+eventsByDateByLeagueGroup(+es:+[%22in-progress%22,+%22scheduled%22,+%22complete%22,+%22suspended%22,+%22delayed%22,+%22postponed%22,+%22retired%22,+%22canceled%22],+leagueGroups:+[%7B+mtid:+398,+lid:+6,+spid:+4+%7D],+providerAcountOpener:+3,+hoursRange:+25,+showEmptyEvents:+false,+marketTypeLayout:+%22PARTICIPANTS%22,+ic:+false,+startDate:+' + str(int(epochtime)) + ',+timezoneOffset:+-4,+nof:+true,+hl:+true,+sort:+%7Bby:+[%22lid%22,+%22dt%22,+%22des%22],+order:+ASC%7D+)+%7B+events+%7B+eid+lid+spid+des+dt+es+rid+ic+ven+tvs+cit+cou+st+sta+hl+seid+writeingame+plays(pgid:+2,+limitLastSeq:+3,+pgidWhenFinished:+-1)+%7B+eid+sqid+siid+gid+nam+val+tim+%7D+scores+%7B+partid+val+eid+pn+sequence+%7D+participants+%7B+eid+partid+partbeid+psid+ih+rot+tr+sppil+sppic+startingPitcher+%7B+fn+lnam+%7D+source+%7B+...+on+Player+%7B+pid+fn+lnam+%7D+...+on+Team+%7B+tmid+lid+tmblid+nam+nn+sn+abbr+cit+senam+imageurl+%7D+...+on+ParticipantGroup+%7B+partgid+nam+lid+participants+%7B+eid+partid+psid+ih+rot+source+%7B+...+on+Player+%7B+pid+fn+lnam+%7D+...+on+Team+%7B+tmid+lid+nam+nn+sn+abbr+cit+%7D+%7D+%7D+%7D+%7D+%7D+marketTypes+%7B+mtid+spid+nam+des+settings+%7B+sitid+did+alias+format+template+sort+url+%7D+%7D++eventGroup+%7B+egid+nam+%7D+statistics(sgid:+3,+sgidWhenFinished:+4)+%7B+val+eid+nam+partid+pid+typ+siid+sequence+%7D+league+%7B+lid+nam+rid+spid+sn+settings+%7B+alias+rotation+ord+shortnamebreakpoint+matchupline+%7D+%7D+%7D+maxSequences+%7B+events:+eventsMaxSequence+scores:+scoresMaxSequence+currentLines:+linesMaxSequence+statistics:+statisticsMaxSequence+plays:+playsMaxSequence+consensus:+consensusMaxSequence+%7D+%7D+%7D'
    jsonData_participants = requests.get(url, headers = header, params=queryStr_participants).json()



    events = jsonData_participants['data']['eventsByDateByLeagueGroup']['events']
    
    for event in events:
        des = event['des'].split('@')
        if event['es'] != 'complete':
            print(' @ '.join(des), ' - ', event['es'])
            continue
        else:    
            print(' @ '.join(des))
        eventId = str(event['eid'])
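        # per-event odds query: currentLines / openingLines / bestLines / consensus for this eid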
        queryStr_odds = 'query=%7B+currentLines(eid:+['+eventId+'],+mtid:+[398],+marketTypeLayout:+%22PARTICIPANTS%22,+catid:+133)+openingLines(eid:+['+eventId+'],+mtid:+[398],+marketTypeLayout:+%22PARTICIPANTS%22,+paid:+3)+bestLines(catid:+133,+eid:+['+eventId+'],+mtid:+[398])+consensus(eid:+['+eventId+'],+mtid:+[398])+%7B+eid+mtid+boid+partid+sbid+bb+paid+lineid+wag+perc+vol+tvol+sequence+tim+%7D+maxSequences+%7B+events:+eventsMaxSequence+scores:+scoresMaxSequence+currentLines:+linesMaxSequence+statistics:+statisticsMaxSequence+plays:+playsMaxSequence+consensus:+consensusMaxSequence+%7D+%7D'
        jsonData_odds = requests.get(url, headers = header, params=queryStr_odds).json()
        
        # paid 20 appears to be Pinnacle's provider id; default odds to None so
        # a previous event's line can't carry over when Pinnacle didn't post one
        pinnacle_id = 20
        odds = None
        currentLines = jsonData_odds['data']['currentLines']
        for currentLine in currentLines:
            if currentLine['paid'] == pinnacle_id:
                odds = currentLine['adj']
                break
        
        
        # map team ids to names so the per-period scores can be labelled
        teamIds = {}
        for team in event['participants']:
            teamIds[team['source']['tmid']] = team['source']['nam']
        
        
        # flatten the per-period scores and attach team names
        scores = pd.json_normalize(event['scores'])
        scores['team_name'] = scores['partid'].map(teamIds)

        awayTeam = event['participants'][0]['source']['nam']
        homeTeam = event['participants'][1]['source']['nam']

        # pn <= 2 keeps the first two quarters, i.e. the 1st-half score
        awayScore = scores[(scores['team_name'] == awayTeam) & (scores['pn'] <= 2)]['val'].astype(int).sum()
        homeScore = scores[(scores['team_name'] == homeTeam) & (scores['pn'] <= 2)]['val'].astype(int).sum()
    
    
        homeTeam_list.append(homeTeam)
        awayTeam_list.append(awayTeam)
        
        homeScore_list.append(homeScore)
        awayScore_list.append(awayScore)
        
        odds_list.append(odds)

print(homeTeam_list)
print('*****')
print(awayTeam_list)
print('*****')
print(homeScore_list)
print('*****')
print(awayScore_list)
print('*****')
print(odds_list)
print('*****')

Output:

['California', 'Charleston Southern', 'Charlotte', 'Maine', 'Presbyterian', 'Tennessee-Martin', 'Wake Forest', 'Tennessee', 'Florida International', 'North Carolina State', 'Rice', 'Vanderbilt', 'Southern Utah', 'Weber State', 'Montana State', 'Oregon State', 'South Dakota', 'Jackson State']
*****
['Hawaii', 'North Dakota State', 'Louisville', 'Connecticut', 'Central Michigan', 'Cincinnati', 'Tulane', 'Appalachian State', 'Indiana', 'William & Mary', 'Western Kentucky', 'South Carolina', 'Utah', 'Utah State', 'Idaho', 'Minnesota', 'New Mexico', 'UNLV']
*****
[34, 3, 0, 7, 3, 7, 7, 3, 10, 28, 7, 10, 0, 6, 10, 14, 14, 10]
*****
[14, 3, 56, 7, 14, 6, 3, 13, 12, 7, 30, 0, 17, 21, 20, 17, 35, 42]
*****
[34, 34, 35, 35, 35, 35, 35, 31, 32, 32, 32, 21, 21, 21, 21, 28, 28, 28]
*****
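
As an aside, the duplicates in your original code most likely come from re-processing accumulated lists inside the date loop: odds, results_away, and results_home keep growing across dates, but the post-processing loops (for a in odds:, the results_away1/results_home1 rebuilds, and the pair-summing range loops) walk the entire list on every pass, so entries from earlier dates land in odds_sub, list3_away, and list3_home again. A stripped-down sketch of the pattern, using made-up data rather than your actual scrape:

raw = []
processed_buggy = []
processed_fixed = []

for batch in (['a', 'b'], ['c']):   # stand-in for one scrape per date
    raw.extend(batch)

    # buggy: walks ALL of raw on every pass, so 'a' and 'b' get appended twice
    for item in raw:
        processed_buggy.append(item.upper())

    # fixed: only process what this pass actually collected
    for item in batch:
        processed_fixed.append(item.upper())

print(processed_buggy)   # ['A', 'B', 'A', 'B', 'C']
print(processed_fixed)   # ['A', 'B', 'C']

In your version, processing only the elements scraped in the current iteration (or moving the post-processing outside the loop entirely) should make the duplicates go away.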

Upvotes: 3
