Tshaka Eric Lekholoane
Tshaka Eric Lekholoane

Reputation: 105

How to parse a flat HTML structure into a dictionary using BeautifulSoup?

I am trying to parse fixture data from a sports website with a flat HTML structure using BeautifulSoup.

So far, what I have tried parses only the first fixture under each fixture date and not the others on the same date.

The html is:

...

<h3 class="fix_header1">January 2019</h3>
<h4 class="fix_header2">Friday 4th January</h4>
<div class="fix_item">
    <span class="match_col">
        <span class="team">Warriors</span>
    </span>
    <span class="match_time">20:00</span>
    <span class="match_col">
        <span class="team">Knights</span>
    </span>
</div>
<h4 class="fix_header2">Saturday 5th January</h4>
<div class="fix_item">...</div>
<div class="fix_item">...</div>
<div class="fix_item">...</div>
<h4 class="fix_header2">Sunday 6th January</h4>
<div class="fix_item">...</div>
<div class="fix_item">...</div>
<div class="fix_item">...</div>
<div class="fix_item">...</div>
<div class="fix_item">...</div>
<div class="fix_item">...</div>

...

The code:

from bs4 import BeautifulSoup
import requests

# Download the fixtures page (giving up after 5 seconds) and parse the raw
# response bytes with BeautifulSoup's "html.parser" backend.
url = "https://www.dummyurl.com/fixtures"
response = requests.get(url, timeout=5)
content = BeautifulSoup(response.content, "html.parser")

# Accumulates one dict per parsed fixture; filled by process_fixtures().
fixtures = []


def process_fixtures(date, home, time, away):
    """Record a single fixture in the module-level ``fixtures`` list.

    Args:
        date: Text of the date header the fixture belongs to.
        home: Name of the home team.
        time: Kick-off time text.
        away: Name of the away team.

    Returns:
        None. The new record is appended to ``fixtures`` as a side effect.
    """
    record = {"date": date, "home": home, "time": time, "away": away}
    fixtures.append(record)

# Walk the date headers. The loop body runs once per <h4>, so exactly one
# fixture is recorded for each date.
fixtures_dates = content.find_all("h4", class_="fix_header2")
for fixtures_date in fixtures_dates:
    date = fixtures_date.text 
    # find_next() returns only the first matching element after the header,
    # i.e. data from the first fixture under this date; the remaining
    # fixtures on the same date are never visited.
    home = fixtures_date.find_next("span", class_="team").text
    time = fixtures_date.find_next("span", class_="match_time").text.strip()
    # The second "team" span after the header is the away side of that same
    # first fixture.
    away = fixtures_date.find_next("span", class_="team").find_next("span", class_="team").text
    process_fixtures(date, home, time, away)

Output:

[{'date': 'Friday 4th January',
  'home': 'Warriors',
  'time': '20:00',
  'away': 'Knights'},
 {'date': 'Saturday 5th January',
  'home': 'Kings',
  'time': '15:00',
  'away': 'Bulls'},
 {'date': 'Sunday 6th January',
  'home': 'Fishes',
  'time': '19:00',
  'away': 'Lions'}, 

  ...

What I am looking for:

[{'date': 'Friday 4th January',
  'home': 'Warriors',
  'time': '20:00',
  'away': 'Knights'},
 {'date': 'Saturday 5th January',
  'home': 'Kings',
  'time': '15:00',
  'away': 'Bulls'},
 {'date': 'Saturday 5th January',
  'home': 'Cats',
  'time': '16:30',
  'away': 'Dogs'},
 {'date': 'Saturday 5th January',
  'home': 'Empire',
  'time': '19:30',
  'away': 'County State'},

   ...

Upvotes: 1

Views: 297

Answers (3)

Ajax1234
Ajax1234

Reputation: 71451

You can also use itertools.groupby to more robustly group any number of games under the appropriate date header:

# NOTE(review): the snippet does not show its imports. It presumably relies on
# `itertools`, `re`, `requests`, and `soup` bound to bs4's BeautifulSoup -- confirm.
d = soup(requests.get('https://www.skysports.com/premier-league-fixtures').text, 'html.parser')
# Collect the date headers (<h4>) and fixture rows (<div>) inside the fixtures
# body as one flat list.
new_d = d.find('div', {'class':'fixres__body'}).find_all(re.compile('h4|div'), {'class':re.compile('fixres__header2|fixres__item')})
# groupby() splits that list into consecutive runs keyed on "is this an <h4>?",
# giving (True, [headers]) and (False, [rows]) entries.
g = [(a, list(b)) for a, b in itertools.groupby(new_d, key=lambda x:x.name == 'h4')]
# Pair the first header of each even-indexed run with the run that follows it.
# Assumes the list starts with a header and that header/row runs strictly
# alternate -- TODO confirm for pages where a date has no fixtures.
g1 = [[g[i][-1][0], g[i+1][-1]] for i in range(0, len(g), 2)]
# For every row, zip the texts of its matching spans onto home/time/away.
# The pattern is a raw string: '\-' in a plain literal is an invalid escape
# sequence (a SyntaxWarning on recent Python versions). The regex is unchanged.
new_g = [[a, [dict(zip(['home', 'time', 'away'], [i.text for i in j.find_all('span', {'class':re.compile(r'swap\-text__target|matches__date')})])) for j in b]] for a, b in g1]
# Flatten to one dict per fixture, tagging each with its date header text.
final_result = [{'date':a.text, **c} for a, b in new_g for c in b]

Output:

[{'date': 'Friday 9th August', 'home': 'Liverpool', 'time': '\n        20:00    ', 'away': 'Norwich City'}, {'date': 'Saturday 10th August', 'home': 'West Ham United', 'time': '\n        12:30    ', 'away': 'Manchester City'}, {'date': 'Saturday 10th August', 'home': 'Bournemouth', 'time': '\n        15:00    ', 'away': 'Sheffield United'}, {'date': 'Saturday 10th August', 'home': 'Burnley', 'time': '\n        15:00    ', 'away': 'Southampton'}, {'date': 'Saturday 10th August', 'home': 'Crystal Palace', 'time': '\n        15:00    ', 'away': 'Everton'}, {'date': 'Saturday 10th August', 'home': 'Watford', 'time': '\n        15:00    ', 'away': 'Brighton and Hove Albion'}, {'date': 'Saturday 10th August', 'home': 'Tottenham Hotspur', 'time': '\n        17:30    ', 'away': 'Aston Villa'}, {'date': 'Sunday 11th August', 'home': 'Leicester City', 'time': '\n        14:00    ', 'away': 'Wolverhampton Wanderers'}, {'date': 'Sunday 11th August', 'home': 'Newcastle United', 'time': '\n        14:00    ', 'away': 'Arsenal'}, {'date': 'Sunday 11th August', 'home': 'Manchester United', 'time': '\n        16:30    ', 'away': 'Chelsea'}, {'date': 'Saturday 17th August', 'home': 'Arsenal', 'time': '\n        12:30    ', 'away': 'Burnley'}, {'date': 'Saturday 17th August', 'home': 'Aston Villa', 'time': '\n        15:00    ', 'away': 'Bournemouth'}, {'date': 'Saturday 17th August', 'home': 'Brighton and Hove Albion', 'time': '\n        15:00    ', 'away': 'West Ham United'}, {'date': 'Saturday 17th August', 'home': 'Everton', 'time': '\n        15:00    ', 'away': 'Watford'}, {'date': 'Saturday 17th August', 'home': 'Norwich City', 'time': '\n        15:00    ', 'away': 'Newcastle United'}, {'date': 'Saturday 17th August', 'home': 'Southampton', 'time': '\n        15:00    ', 'away': 'Liverpool'}, {'date': 'Saturday 17th August', 'home': 'Manchester City', 'time': '\n        17:30    ', 'away': 'Tottenham Hotspur'}, {'date': 'Sunday 18th August', 'home': 'Sheffield United', 
'time': '\n        14:00    ', 'away': 'Crystal Palace'}, {'date': 'Sunday 18th August', 'home': 'Chelsea', 'time': '\n        16:30    ', 'away': 'Leicester City'}, {'date': 'Monday 19th August', 'home': 'Wolverhampton Wanderers', 'time': '\n        20:00    ', 'away': 'Manchester United'}, {'date': 'Friday 23rd August', 'home': 'Aston Villa', 'time': '\n        20:00    ', 'away': 'Everton'}, {'date': 'Saturday 24th August', 'home': 'Norwich City', 'time': '\n        12:30    ', 'away': 'Chelsea'}, {'date': 'Saturday 24th August', 'home': 'Brighton and Hove Albion', 'time': '\n        15:00    ', 'away': 'Southampton'}, {'date': 'Saturday 24th August', 'home': 'Manchester United', 'time': '\n        15:00    ', 'away': 'Crystal Palace'}, {'date': 'Saturday 24th August', 'home': 'Sheffield United', 'time': '\n        15:00    ', 'away': 'Leicester City'}, {'date': 'Saturday 24th August', 'home': 'Watford', 'time': '\n        15:00    ', 'away': 'West Ham United'}, {'date': 'Saturday 24th August', 'home': 'Wolverhampton Wanderers', 'time': '\n        15:00    ', 'away': 'Burnley'}, {'date': 'Saturday 24th August', 'home': 'Liverpool', 'time': '\n        17:30    ', 'away': 'Arsenal'}, {'date': 'Sunday 25th August', 'home': 'Bournemouth', 'time': '\n        14:00    ', 'away': 'Manchester City'}, {'date': 'Sunday 25th August', 'home': 'Tottenham Hotspur', 'time': '\n        16:30    ', 'away': 'Newcastle United'}, {'date': 'Saturday 31st August', 'home': 'Southampton', 'time': '\n        12:30    ', 'away': 'Manchester United'}, {'date': 'Saturday 31st August', 'home': 'Chelsea', 'time': '\n        15:00    ', 'away': 'Sheffield United'}, {'date': 'Saturday 31st August', 'home': 'Crystal Palace', 'time': '\n        15:00    ', 'away': 'Aston Villa'}, {'date': 'Saturday 31st August', 'home': 'Leicester City', 'time': '\n        15:00    ', 'away': 'Bournemouth'}, {'date': 'Saturday 31st August', 'home': 'Manchester City', 'time': '\n        15:00    ', 'away': 
'Brighton and Hove Albion'}, {'date': 'Saturday 31st August', 'home': 'Newcastle United', 'time': '\n        15:00    ', 'away': 'Watford'}, {'date': 'Saturday 31st August', 'home': 'West Ham United', 'time': '\n        15:00    ', 'away': 'Norwich City'}, {'date': 'Saturday 31st August', 'home': 'Burnley', 'time': '\n        17:30    ', 'away': 'Liverpool'}, {'date': 'Sunday 1st September', 'home': 'Everton', 'time': '\n        14:00    ', 'away': 'Wolverhampton Wanderers'}, {'date': 'Sunday 1st September', 'home': 'Arsenal', 'time': '\n        16:30    ', 'away': 'Tottenham Hotspur'}, {'date': 'Saturday 14th September', 'home': 'Liverpool', 'time': '\n        12:30    ', 'away': 'Newcastle United'}, {'date': 'Saturday 14th September', 'home': 'Brighton and Hove Albion', 'time': '\n        15:00    ', 'away': 'Burnley'}, {'date': 'Saturday 14th September', 'home': 'Manchester United', 'time': '\n        15:00    ', 'away': 'Leicester City'}, {'date': 'Saturday 14th September', 'home': 'Sheffield United', 'time': '\n        15:00    ', 'away': 'Southampton'}, {'date': 'Saturday 14th September', 'home': 'Tottenham Hotspur', 'time': '\n        15:00    ', 'away': 'Crystal Palace'}, {'date': 'Saturday 14th September', 'home': 'Wolverhampton Wanderers', 'time': '\n        15:00    ', 'away': 'Chelsea'}, {'date': 'Saturday 14th September', 'home': 'Norwich City', 'time': '\n        17:30    ', 'away': 'Manchester City'}, {'date': 'Sunday 15th September', 'home': 'Bournemouth', 'time': '\n        14:00    ', 'away': 'Everton'}, {'date': 'Sunday 15th September', 'home': 'Watford', 'time': '\n        16:30    ', 'away': 'Arsenal'}, {'date': 'Monday 16th September', 'home': 'Aston Villa', 'time': '\n        20:00    ', 'away': 'West Ham United'}, {'date': 'Friday 20th September', 'home': 'Southampton', 'time': '\n        20:00    ', 'away': 'Bournemouth'}, {'date': 'Saturday 21st September', 'home': 'Leicester City', 'time': '\n        12:30    ', 'away': 'Tottenham 
Hotspur'}, {'date': 'Saturday 21st September', 'home': 'Burnley', 'time': '\n        15:00    ', 'away': 'Norwich City'}, {'date': 'Saturday 21st September', 'home': 'Crystal Palace', 'time': '\n        15:00    ', 'away': 'Wolverhampton Wanderers'}, {'date': 'Saturday 21st September', 'home': 'Everton', 'time': '\n        15:00    ', 'away': 'Sheffield United'}, {'date': 'Saturday 21st September', 'home': 'Manchester City', 'time': '\n        15:00    ', 'away': 'Watford'}, {'date': 'Saturday 21st September', 'home': 'Newcastle United', 'time': '\n        17:30    ', 'away': 'Brighton and Hove Albion'}, {'date': 'Sunday 22nd September', 'home': 'West Ham United', 'time': '\n        14:00    ', 'away': 'Manchester United'}, {'date': 'Sunday 22nd September', 'home': 'Arsenal', 'time': '\n        16:30    ', 'away': 'Aston Villa'}, {'date': 'Sunday 22nd September', 'home': 'Chelsea', 'time': '\n        16:30    ', 'away': 'Liverpool'}, {'date': 'Saturday 28th September', 'home': 'Sheffield United', 'time': '\n        12:30    ', 'away': 'Liverpool'}, {'date': 'Saturday 28th September', 'home': 'Aston Villa', 'time': '\n        15:00    ', 'away': 'Burnley'}, {'date': 'Saturday 28th September', 'home': 'Bournemouth', 'time': '\n        15:00    ', 'away': 'West Ham United'}, {'date': 'Saturday 28th September', 'home': 'Chelsea', 'time': '\n        15:00    ', 'away': 'Brighton and Hove Albion'}, {'date': 'Saturday 28th September', 'home': 'Crystal Palace', 'time': '\n        15:00    ', 'away': 'Norwich City'}, {'date': 'Saturday 28th September', 'home': 'Tottenham Hotspur', 'time': '\n        15:00    ', 'away': 'Southampton'}, {'date': 'Saturday 28th September', 'home': 'Wolverhampton Wanderers', 'time': '\n        15:00    ', 'away': 'Watford'}, {'date': 'Saturday 28th September', 'home': 'Leicester City', 'time': '\n        17:30    ', 'away': 'Newcastle United'}, {'date': 'Sunday 29th September', 'home': 'Everton', 'time': '\n        16:30    ', 'away': 
'Manchester City'}, {'date': 'Monday 30th September', 'home': 'Manchester United', 'time': '\n        20:00    ', 'away': 'Arsenal'}, {'date': 'Saturday 5th October', 'home': 'Brighton and Hove Albion', 'time': '\n        15:00    ', 'away': 'Tottenham Hotspur'}, {'date': 'Saturday 5th October', 'home': 'Burnley', 'time': '\n        15:00    ', 'away': 'Everton'}, {'date': 'Saturday 5th October', 'home': 'Liverpool', 'time': '\n        15:00    ', 'away': 'Leicester City'}, {'date': 'Saturday 5th October', 'home': 'Manchester City', 'time': '\n        15:00    ', 'away': 'Wolverhampton Wanderers'}, {'date': 'Saturday 5th October', 'home': 'Norwich City', 'time': '\n        15:00    ', 'away': 'Aston Villa'}, {'date': 'Saturday 5th October', 'home': 'Southampton', 'time': '\n        15:00    ', 'away': 'Chelsea'}, {'date': 'Saturday 5th October', 'home': 'Watford', 'time': '\n        15:00    ', 'away': 'Sheffield United'}, {'date': 'Saturday 5th October', 'home': 'West Ham United', 'time': '\n        15:00    ', 'away': 'Crystal Palace'}, {'date': 'Sunday 6th October', 'home': 'Arsenal', 'time': '\n        15:00    ', 'away': 'Bournemouth'}, {'date': 'Sunday 6th October', 'home': 'Newcastle United', 'time': '\n        15:00    ', 'away': 'Manchester United'}, {'date': 'Saturday 19th October', 'home': 'Aston Villa', 'time': '\n        15:00    ', 'away': 'Brighton and Hove Albion'}, {'date': 'Saturday 19th October', 'home': 'Bournemouth', 'time': '\n        15:00    ', 'away': 'Norwich City'}, {'date': 'Saturday 19th October', 'home': 'Chelsea', 'time': '\n        15:00    ', 'away': 'Newcastle United'}, {'date': 'Saturday 19th October', 'home': 'Crystal Palace', 'time': '\n        15:00    ', 'away': 'Manchester City'}, {'date': 'Saturday 19th October', 'home': 'Everton', 'time': '\n        15:00    ', 'away': 'West Ham United'}, {'date': 'Saturday 19th October', 'home': 'Leicester City', 'time': '\n        15:00    ', 'away': 'Burnley'}, {'date': 'Saturday 19th 
October', 'home': 'Manchester United', 'time': '\n        15:00    ', 'away': 'Liverpool'}, {'date': 'Saturday 19th October', 'home': 'Sheffield United', 'time': '\n        15:00    ', 'away': 'Arsenal'}, {'date': 'Saturday 19th October', 'home': 'Tottenham Hotspur', 'time': '\n        15:00    ', 'away': 'Watford'}, {'date': 'Saturday 19th October', 'home': 'Wolverhampton Wanderers', 'time': '\n        15:00    ', 'away': 'Southampton'}, {'date': 'Saturday 26th October', 'home': 'Arsenal', 'time': '\n        15:00    ', 'away': 'Crystal Palace'}, {'date': 'Saturday 26th October', 'home': 'Brighton and Hove Albion', 'time': '\n        15:00    ', 'away': 'Everton'}, {'date': 'Saturday 26th October', 'home': 'Burnley', 'time': '\n        15:00    ', 'away': 'Chelsea'}, {'date': 'Saturday 26th October', 'home': 'Liverpool', 'time': '\n        15:00    ', 'away': 'Tottenham Hotspur'}, {'date': 'Saturday 26th October', 'home': 'Manchester City', 'time': '\n        15:00    ', 'away': 'Aston Villa'}, {'date': 'Saturday 26th October', 'home': 'Newcastle United', 'time': '\n        15:00    ', 'away': 'Wolverhampton Wanderers'}, {'date': 'Saturday 26th October', 'home': 'Norwich City', 'time': '\n        15:00    ', 'away': 'Manchester United'}, {'date': 'Saturday 26th October', 'home': 'Southampton', 'time': '\n        15:00    ', 'away': 'Leicester City'}, {'date': 'Saturday 26th October', 'home': 'Watford', 'time': '\n        15:00    ', 'away': 'Bournemouth'}, {'date': 'Saturday 26th October', 'home': 'West Ham United', 'time': '\n        15:00    ', 'away': 'Sheffield United'}, {'date': 'Saturday 2nd November', 'home': 'Arsenal', 'time': '\n        15:00    ', 'away': 'Wolverhampton Wanderers'}, {'date': 'Saturday 2nd November', 'home': 'Aston Villa', 'time': '\n        15:00    ', 'away': 'Liverpool'}, {'date': 'Saturday 2nd November', 'home': 'Bournemouth', 'time': '\n        15:00    ', 'away': 'Manchester United'}, {'date': 'Saturday 2nd November', 'home': 
'Brighton and Hove Albion', 'time': '\n        15:00    ', 'away': 'Norwich City'}, {'date': 'Saturday 2nd November', 'home': 'Crystal Palace', 'time': '\n        15:00    ', 'away': 'Leicester City'}, {'date': 'Saturday 2nd November', 'home': 'Everton', 'time': '\n        15:00    ', 'away': 'Tottenham Hotspur'}, {'date': 'Saturday 2nd November', 'home': 'Manchester City', 'time': '\n        15:00    ', 'away': 'Southampton'}, {'date': 'Saturday 2nd November', 'home': 'Sheffield United', 'time': '\n        15:00    ', 'away': 'Burnley'}, {'date': 'Saturday 2nd November', 'home': 'Watford', 'time': '\n        15:00    ', 'away': 'Chelsea'}, {'date': 'Saturday 2nd November', 'home': 'West Ham United', 'time': '\n        15:00    ', 'away': 'Newcastle United'}, {'date': 'Saturday 9th November', 'home': 'Burnley', 'time': '\n        15:00    ', 'away': 'West Ham United'}, {'date': 'Saturday 9th November', 'home': 'Chelsea', 'time': '\n        15:00    ', 'away': 'Crystal Palace'}, {'date': 'Saturday 9th November', 'home': 'Liverpool', 'time': '\n        15:00    ', 'away': 'Manchester City'}, {'date': 'Saturday 9th November', 'home': 'Newcastle United', 'time': '\n        15:00    ', 'away': 'Bournemouth'}, {'date': 'Saturday 9th November', 'home': 'Norwich City', 'time': '\n        15:00    ', 'away': 'Watford'}, {'date': 'Saturday 9th November', 'home': 'Southampton', 'time': '\n        15:00    ', 'away': 'Everton'}, {'date': 'Saturday 9th November', 'home': 'Tottenham Hotspur', 'time': '\n        15:00    ', 'away': 'Sheffield United'}, {'date': 'Saturday 9th November', 'home': 'Wolverhampton Wanderers', 'time': '\n        15:00    ', 'away': 'Aston Villa'}, {'date': 'Sunday 10th November', 'home': 'Leicester City', 'time': '\n        15:00    ', 'away': 'Arsenal'}, {'date': 'Sunday 10th November', 'home': 'Manchester United', 'time': '\n        15:00    ', 'away': 'Brighton and Hove Albion'}, {'date': 'Saturday 23rd November', 'home': 'Arsenal', 'time': '\n       
 15:00    ', 'away': 'Southampton'}, {'date': 'Saturday 23rd November', 'home': 'Aston Villa', 'time': '\n        15:00    ', 'away': 'Newcastle United'}, {'date': 'Saturday 23rd November', 'home': 'Bournemouth', 'time': '\n        15:00    ', 'away': 'Wolverhampton Wanderers'}, {'date': 'Saturday 23rd November', 'home': 'Brighton and Hove Albion', 'time': '\n        15:00    ', 'away': 'Leicester City'}, {'date': 'Saturday 23rd November', 'home': 'Crystal Palace', 'time': '\n        15:00    ', 'away': 'Liverpool'}, {'date': 'Saturday 23rd November', 'home': 'Everton', 'time': '\n        15:00    ', 'away': 'Norwich City'}, {'date': 'Saturday 23rd November', 'home': 'Manchester City', 'time': '\n        15:00    ', 'away': 'Chelsea'}, {'date': 'Saturday 23rd November', 'home': 'Sheffield United', 'time': '\n        15:00    ', 'away': 'Manchester United'}, {'date': 'Saturday 23rd November', 'home': 'Watford', 'time': '\n        15:00    ', 'away': 'Burnley'}, {'date': 'Saturday 23rd November', 'home': 'West Ham United', 'time': '\n        15:00    ', 'away': 'Tottenham Hotspur'}, {'date': 'Saturday 30th November', 'home': 'Burnley', 'time': '\n        15:00    ', 'away': 'Crystal Palace'}, {'date': 'Saturday 30th November', 'home': 'Chelsea', 'time': '\n        15:00    ', 'away': 'West Ham United'}, {'date': 'Saturday 30th November', 'home': 'Leicester City', 'time': '\n        15:00    ', 'away': 'Everton'}, {'date': 'Saturday 30th November', 'home': 'Liverpool', 'time': '\n        15:00    ', 'away': 'Brighton and Hove Albion'}, {'date': 'Saturday 30th November', 'home': 'Newcastle United', 'time': '\n        15:00    ', 'away': 'Manchester City'}, {'date': 'Saturday 30th November', 'home': 'Southampton', 'time': '\n        15:00    ', 'away': 'Watford'}, {'date': 'Saturday 30th November', 'home': 'Tottenham Hotspur', 'time': '\n        15:00    ', 'away': 'Bournemouth'}, {'date': 'Saturday 30th November', 'home': 'Wolverhampton Wanderers', 'time': '\n        
15:00    ', 'away': 'Sheffield United'}, {'date': 'Sunday 1st December', 'home': 'Manchester United', 'time': '\n        15:00    ', 'away': 'Aston Villa'}, {'date': 'Sunday 1st December', 'home': 'Norwich City', 'time': '\n        15:00    ', 'away': 'Arsenal'}, {'date': 'Tuesday 3rd December', 'home': 'Arsenal', 'time': '\n        19:45    ', 'away': 'Brighton and Hove Albion'}, {'date': 'Tuesday 3rd December', 'home': 'Burnley', 'time': '\n        19:45    ', 'away': 'Manchester City'}, {'date': 'Tuesday 3rd December', 'home': 'Leicester City', 'time': '\n        19:45    ', 'away': 'Watford'}, {'date': 'Tuesday 3rd December', 'home': 'Sheffield United', 'time': '\n        19:45    ', 'away': 'Newcastle United'}, {'date': 'Tuesday 3rd December', 'home': 'Wolverhampton Wanderers', 'time': '\n        19:45    ', 'away': 'West Ham United'}, {'date': 'Tuesday 3rd December', 'home': 'Manchester United', 'time': '\n        20:00    ', 'away': 'Tottenham Hotspur'}, {'date': 'Wednesday 4th December', 'home': 'Chelsea', 'time': '\n        19:45    ', 'away': 'Aston Villa'}, {'date': 'Wednesday 4th December', 'home': 'Southampton', 'time': '\n        19:45    ', 'away': 'Norwich City'}, {'date': 'Wednesday 4th December', 'home': 'Crystal Palace', 'time': '\n        20:00    ', 'away': 'Bournemouth'}, {'date': 'Wednesday 4th December', 'home': 'Liverpool', 'time': '\n        20:00    ', 'away': 'Everton'}, {'date': 'Saturday 7th December', 'home': 'Aston Villa', 'time': '\n        15:00    ', 'away': 'Leicester City'}, {'date': 'Saturday 7th December', 'home': 'Bournemouth', 'time': '\n        15:00    ', 'away': 'Liverpool'}, {'date': 'Saturday 7th December', 'home': 'Brighton and Hove Albion', 'time': '\n        15:00    ', 'away': 'Wolverhampton Wanderers'}, {'date': 'Saturday 7th December', 'home': 'Everton', 'time': '\n        15:00    ', 'away': 'Chelsea'}, {'date': 'Saturday 7th December', 'home': 'Manchester City', 'time': '\n        15:00    ', 'away': 'Manchester 
United'}, {'date': 'Saturday 7th December', 'home': 'Newcastle United', 'time': '\n        15:00    ', 'away': 'Southampton'}, {'date': 'Saturday 7th December', 'home': 'Norwich City', 'time': '\n        15:00    ', 'away': 'Sheffield United'}, {'date': 'Saturday 7th December', 'home': 'Tottenham Hotspur', 'time': '\n        15:00    ', 'away': 'Burnley'}, {'date': 'Saturday 7th December', 'home': 'Watford', 'time': '\n        15:00    ', 'away': 'Crystal Palace'}, {'date': 'Saturday 7th December', 'home': 'West Ham United', 'time': '\n        15:00    ', 'away': 'Arsenal'}, {'date': 'Saturday 14th December', 'home': 'Arsenal', 'time': '\n        15:00    ', 'away': 'Manchester City'}, {'date': 'Saturday 14th December', 'home': 'Burnley', 'time': '\n        15:00    ', 'away': 'Newcastle United'}, {'date': 'Saturday 14th December', 'home': 'Chelsea', 'time': '\n        15:00    ', 'away': 'Bournemouth'}, {'date': 'Saturday 14th December', 'home': 'Crystal Palace', 'time': '\n        15:00    ', 'away': 'Brighton and Hove Albion'}, {'date': 'Saturday 14th December', 'home': 'Leicester City', 'time': '\n        15:00    ', 'away': 'Norwich City'}, {'date': 'Saturday 14th December', 'home': 'Liverpool', 'time': '\n        15:00    ', 'away': 'Watford'}, {'date': 'Saturday 14th December', 'home': 'Manchester United', 'time': '\n        15:00    ', 'away': 'Everton'}, {'date': 'Saturday 14th December', 'home': 'Sheffield United', 'time': '\n        15:00    ', 'away': 'Aston Villa'}, {'date': 'Saturday 14th December', 'home': 'Southampton', 'time': '\n        15:00    ', 'away': 'West Ham United'}, {'date': 'Saturday 14th December', 'home': 'Wolverhampton Wanderers', 'time': '\n        15:00    ', 'away': 'Tottenham Hotspur'}, {'date': 'Saturday 21st December', 'home': 'Aston Villa', 'time': '\n        15:00    ', 'away': 'Southampton'}, {'date': 'Saturday 21st December', 'home': 'Bournemouth', 'time': '\n        15:00    ', 'away': 'Burnley'}, {'date': 'Saturday 21st 
December', 'home': 'Brighton and Hove Albion', 'time': '\n        15:00    ', 'away': 'Sheffield United'}, {'date': 'Saturday 21st December', 'home': 'Everton', 'time': '\n        15:00    ', 'away': 'Arsenal'}, {'date': 'Saturday 21st December', 'home': 'Manchester City', 'time': '\n        15:00    ', 'away': 'Leicester City'}, {'date': 'Saturday 21st December', 'home': 'Newcastle United', 'time': '\n        15:00    ', 'away': 'Crystal Palace'}, {'date': 'Saturday 21st December', 'home': 'Norwich City', 'time': '\n        15:00    ', 'away': 'Wolverhampton Wanderers'}, {'date': 'Saturday 21st December', 'home': 'Tottenham Hotspur', 'time': '\n        15:00    ', 'away': 'Chelsea'}, {'date': 'Saturday 21st December', 'home': 'Watford', 'time': '\n        15:00    ', 'away': 'Manchester United'}, {'date': 'Saturday 21st December', 'home': 'West Ham United', 'time': '\n        15:00    ', 'away': 'Liverpool'}, {'date': 'Thursday 26th December', 'home': 'Aston Villa', 'time': '\n        15:00    ', 'away': 'Norwich City'}, {'date': 'Thursday 26th December', 'home': 'Bournemouth', 'time': '\n        15:00    ', 'away': 'Arsenal'}, {'date': 'Thursday 26th December', 'home': 'Chelsea', 'time': '\n        15:00    ', 'away': 'Southampton'}, {'date': 'Thursday 26th December', 'home': 'Crystal Palace', 'time': '\n        15:00    ', 'away': 'West Ham United'}, {'date': 'Thursday 26th December', 'home': 'Everton', 'time': '\n        15:00    ', 'away': 'Burnley'}, {'date': 'Thursday 26th December', 'home': 'Leicester City', 'time': '\n        15:00    ', 'away': 'Liverpool'}, {'date': 'Thursday 26th December', 'home': 'Manchester United', 'time': '\n        15:00    ', 'away': 'Newcastle United'}, {'date': 'Thursday 26th December', 'home': 'Sheffield United', 'time': '\n        15:00    ', 'away': 'Watford'}, {'date': 'Thursday 26th December', 'home': 'Tottenham Hotspur', 'time': '\n        15:00    ', 'away': 'Brighton and Hove Albion'}, {'date': 'Thursday 26th December', 
'home': 'Wolverhampton Wanderers', 'time': '\n        15:00    ', 'away': 'Manchester City'}, {'date': 'Saturday 28th December', 'home': 'Arsenal', 'time': '\n        15:00    ', 'away': 'Chelsea'}, {'date': 'Saturday 28th December', 'home': 'Brighton and Hove Albion', 'time': '\n        15:00    ', 'away': 'Bournemouth'}, {'date': 'Saturday 28th December', 'home': 'Burnley', 'time': '\n        15:00    ', 'away': 'Manchester United'}, {'date': 'Saturday 28th December', 'home': 'Liverpool', 'time': '\n        15:00    ', 'away': 'Wolverhampton Wanderers'}, {'date': 'Saturday 28th December', 'home': 'Manchester City', 'time': '\n        15:00    ', 'away': 'Sheffield United'}, {'date': 'Saturday 28th December', 'home': 'Newcastle United', 'time': '\n        15:00    ', 'away': 'Everton'}, {'date': 'Saturday 28th December', 'home': 'Norwich City', 'time': '\n        15:00    ', 'away': 'Tottenham Hotspur'}, {'date': 'Saturday 28th December', 'home': 'Southampton', 'time': '\n        15:00    ', 'away': 'Crystal Palace'}, {'date': 'Saturday 28th December', 'home': 'Watford', 'time': '\n        15:00    ', 'away': 'Aston Villa'}, {'date': 'Saturday 28th December', 'home': 'West Ham United', 'time': '\n        15:00    ', 'away': 'Leicester City'}]

Upvotes: 0

Andrej Kesely
Andrej Kesely

Reputation: 195438

You can use CSS selectors and the built-in zip() function:

from bs4 import BeautifulSoup
import requests

from pprint import pprint

url = 'https://www.skysports.com/premier-league-fixtures'
soup = BeautifulSoup(requests.get(url).text, 'lxml')

# Three parallel result lists, one entry per matched element: home side,
# away side and kick-off time of the fixture rows that follow a date header.
home_cells = soup.select('.fixres__header2 ~ .fixres__item .matches__participant--side1')
away_cells = soup.select('.fixres__header2 ~ .fixres__item .matches__participant--side2')
time_cells = soup.select('.fixres__header2 ~ .fixres__item .matches__date')

# zip() walks the three lists in lockstep (stopping at the shortest); the date
# is read from the closest <h4> that precedes the home-side element.
out = [
    {
        'away': away_cell.get_text(strip=True),
        'date': home_cell.find_previous('h4').get_text(strip=True),
        'home': home_cell.get_text(strip=True),
        'time': time_cell.get_text(strip=True),
    }
    for home_cell, away_cell, time_cell in zip(home_cells, away_cells, time_cells)
]

pprint(out)

Prints:

[{'away': 'Norwich City',
  'date': 'Friday 9th August',
  'home': 'Liverpool',
  'time': '20:00'},
 {'away': 'Manchester City',
  'date': 'Saturday 10th August',
  'home': 'West Ham United',
  'time': '12:30'},
 {'away': 'Sheffield United',
  'date': 'Saturday 10th August',
  'home': 'Bournemouth',
  'time': '15:00'},
 {'away': 'Southampton',
  'date': 'Saturday 10th August',
  'home': 'Burnley',
  'time': '15:00'},

...etc.

Upvotes: 1

abdusco
abdusco

Reputation: 11091

The trick is to start with a date header and loop over the siblings that contain fixture info until you hit another header. Whatever you collect between two date headers belongs to the earlier of the two.

Try this:

from bs4 import BeautifulSoup, Tag
import requests
from pprint import pprint

def make_soup(url: str, timeout: float = 10.0) -> BeautifulSoup:
    """Download *url* and return its body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The response text parsed with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # A browser-like user-agent is sent instead of the library default
    # (presumably because the site rejects the default one -- confirm).
    request_headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'}
    res = requests.get(url, headers=request_headers, timeout=timeout)
    res.raise_for_status()
    return BeautifulSoup(res.text, 'html.parser')

def extract_fixtures(soup: BeautifulSoup) -> list:
    """Build one dict per fixture from a flat header/row page layout.

    Starting from each ``.fixres__header2`` date header, the following
    siblings are scanned and every ``fixres__item`` row is attributed to that
    header, until another date header or any other non-fixture tag is reached.

    Args:
        soup: Parsed fixtures page.

    Returns:
        A list of dicts with the keys ``date``, ``home``, ``away`` and
        ``time``, in the order the fixtures were encountered.
    """
    date_headers = soup.select('.fixres__header2')
    collected = []
    for header in date_headers:
        day = header.text.strip()
        for sibling in header.next_siblings:
            # Reaching another date header ends the current date's fixtures.
            if sibling in date_headers:
                break
            # Anything that is not a Tag (e.g. text between elements) is
            # skipped without ending the scan.
            if isinstance(sibling, Tag):
                # Any other kind of element also ends the current group.
                if 'fixres__item' not in sibling.get('class', []):
                    break
                collected.append({
                    'date': day,
                    'home': sibling.select_one('.matches__participant--side1').text.strip(),
                    'away': sibling.select_one('.matches__participant--side2').text.strip(),
                    'time': sibling.select_one('.matches__date').text.strip(),
                })
    return collected


# Fetch the fixtures page, extract one dict per fixture and pretty-print them.
url = 'https://www.skysports.com/premier-league-fixtures'
soup = make_soup(url)
fix = extract_fixtures(soup)

pprint(fix)

output:

[{'away': 'Norwich City',
  'date': 'Friday 9th August',
  'home': 'Liverpool',
  'time': '20:00'},
 {'away': 'Manchester City',
  'date': 'Saturday 10th August',
  'home': 'West Ham United',
  'time': '12:30'},
 {'away': 'Sheffield United',
  'date': 'Saturday 10th August',
  'home': 'Bournemouth',
  'time': '15:00'},
 {'away': 'Southampton',
  'date': 'Saturday 10th August',
  'home': 'Burnley',
  'time': '15:00'},
 {'away': 'Everton',
  'date': 'Saturday 10th August',
  'home': 'Crystal Palace',
  'time': '15:00'},
 {'away': 'Brighton and Hove Albion',
  'date': 'Saturday 10th August',
  'home': 'Watford',
  'time': '15:00'},
...
...

Upvotes: 1

Related Questions