Raffel Ravionaldo
Raffel Ravionaldo

Reputation: 40

How to make a dictionary from 2 lists from web scraping

I want to make a DataFrame by web scraping this page: https://www.airlinequality.com/airline-reviews/british-airways.

The values I have are the passengers' reviews and the ratings each passenger gave, but I don't know how to turn them into a DataFrame.

this is my code :

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape paginated review pages and collect review texts plus two parallel
# flat lists: `category` (row labels) and `rating` (the matching values).
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 5 #10
page_size = 1 #100

# Accumulators shared across all pages.
# NOTE: aircraft, seat_type, route and recommended are declared here but are
# never filled anywhere in this snippet.
reviews = []
aircraft = []
seat_type = []
route = []
recommended = []
rating = []
category = []

for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Build the URL of page i, sorted by post date, with page_size reviews per page
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page (the HTTP status is not checked)
    response = requests.get(url)

    # Parse content
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    # Text of every div.text_content element goes into `reviews`
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())
        
    # One div.review-stats block is processed per iteration
    for para2 in parsed_content.find_all("div", {"class" : "review-stats"}):
        # Text values (td.review-value cells) are appended first
        for para3 in para2.find_all('td',{'class' : 'review-value'}):
            rating.append(para3.get_text())
        # Take the most recently appended value back off the list so it can be
        # re-appended after the star counts (presumably the "Recommended"
        # cell -- confirm against the page markup).
        # NOTE(review): if this block had no td.review-value cells, rating[-1]
        # is a value left over from an earlier block, or raises IndexError
        # when `rating` is still empty.
        recomend = rating[-1]
        rating = rating[:-1]
        # Star rows: the value stored is the number of span.star.fill elements
        for para4 in para2.find_all('td',{'class' : 'review-rating-stars stars'}):
            para5 = len(para4.find_all('span', {'class' : 'star fill'}))
            rating.append(para5)
        rating.append(recomend)
        #print(rating)
        # Row labels (td.review-rating-header) are appended to `category`
        for para6 in para2.find_all('td',{'class' : 'review-rating-header'}):
            category.append(para6.get_text())
        #print(category)
        
    print(f"   ---> {len(reviews)} total reviews")

output i get : enter image description here enter image description here enter image description here

Put simply, this is what I am asking:

First loop iteration: category is [a, b, c, d, e] and rating is [1, 2, 3, 4, 5].

Second loop iteration: [a, c, e, o, p, q] is appended to category and [9, 8, 7, 6, 5, 4] is appended to rating.

so the final data :

category = [a, b, c, d, e, a, c, e, o, p, q]

rating = [1, 2, 3, 4, 5, 9, 8, 7, 6, 5, 4]

output that I want:

enter image description here

Upvotes: -1

Views: 149

Answers (2)

Driftr95
Driftr95

Reputation: 4720

Since each review's rating categories start with either "Type of Traveller" or "Aircraft" followed by "Type of Traveller", you could split them up into a list of dictionaries with

# Pair each category label with the rating scraped at the same position.
cr = list(zip(category, rating))

# A review's block starts at its 'Type Of Traveller' row, or one position
# earlier when an 'Aircraft' row sits directly before it.
si = [
    pos - 1 if pos > 0 and cr[pos - 1][0] == 'Aircraft' else pos
    for pos, pair in enumerate(cr)
    if pair[0] == 'Type Of Traveller'
]

# Slice between consecutive start positions (the last slice runs to the end
# of the list) and turn each slice of pairs into one dictionary.
splitCr = [dict(cr[lo:hi]) for lo, hi in zip(si, [*si[1:], len(cr)])]

output



However, it would be better to build a single list of dictionaries as we scrape [rather than to try to zip and split lists that are not guaranteed to have consistent lengths or contents]

# Scrape paginated review pages into `revList`, a list with one dictionary per
# entry (one per review, plus a single '[Average]' entry placed first).
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 3 # 5 # 10
page_size = 5 # 1 # 100

revList = []

# CSS selectors used for the '[Average]' entry (a `div.review-info` element).
avgSelRef = {
    'rating10': '.rating-10 span[itemprop="ratingValue"]',
    'header': 'div.info:has(h1[itemprop="name"])',
    'subheader': '.review-count',
    'reviewBody': '.skytrax-rating-mob img.skytrax-rating[alt]'
}

# Selector for the body of a single review; its id is "anchor" + review id.
rbSel = '.body[id^="anchor"]'

# CSS selectors used for each individual review (an `article` element).
revSelRef = {
    'rating10': '.rating-10 span[itemprop="ratingValue"]',
    'header': f'{rbSel} h2.text_header',
    'subheader': f'{rbSel} h3.text_sub_header',
    'reviewBody': f'{rbSel} div[itemprop="reviewBody"]'
}

avgAdded = False  # the '[Average]' entry is collected only once
for i in range(1, pages + 1):
    print(f"Scraping page {i} of {pages} ", end="")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page
    response = requests.get(url)
    if response.status_code != 200:
        # Report the status and skip this page. raise_for_status() must not be
        # used to build the message: it raises for 4xx/5xx responses (so the
        # message would never be printed) and returns None otherwise.
        print(f' -- !ERROR: "{response.status_code} {response.reason}" getting {url}')
        continue
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')

    avSoups = parsed_content.select('div.review-info')
    rvSoups = parsed_content.select(f'article[itemprop="review"]:has({rbSel})')
    # Process the average block together with the reviews, but only until it
    # has been added once.
    if avSoups and not avgAdded: rvSoups += avSoups
    for r in rvSoups:
        # Reviews are selected as <article>, the average block as <div>.
        isAvg = r.name == 'div'
        if isAvg:
            rDets = {'reviewId': '[Average]'}
            selRef = avgSelRef.items()
            avgAdded = True
        else:
            # Review id = the body's id attribute without its "anchor" prefix.
            revId = r.select_one(rbSel).get('id').replace('anchor', '', 1)
            selRef = revSelRef.items()
            rDets = {'reviewId': revId}

        for k, s in selRef:
            rdt = r.select_one(s)
            if rdt is None: continue  # field missing for this entry
            if 'img' in s and s.endswith('[alt]'):
                # Image-based field: the value is the image's alt text.
                rDets[k] = rdt.get('alt')
            else:
                # Collapse all runs of whitespace to single spaces.
                rDets[k] = ' '.join(rdt.get_text(' ').split())

        # Rating-table rows: a header cell directly followed by either a star
        # cell or a plain-text value cell.
        rhSel = 'td.review-rating-header'
        rRows = r.select(f'tr:has({rhSel} + td:is(.stars, .review-value))')
        for rr in rRows:
            k = rr.select_one(rhSel).get_text(' ').strip()
            k = k.replace(' For ', ' for ').replace(' & ', ' + ') # bit of cleanup
            if k.endswith('Staff Service'): k = 'Staff Service' # bit of cleanup
            if rr.select('td.stars'):
                # Star rows: store the number of filled stars.
                rDets[f'[stars] {k}'] = len(rr.select('td.stars span.star.fill'))
            else:
                rDets[k] = rr.select_one('td.review-value').get_text().strip()

        # Keep the average entry first; reviews follow in scrape order.
        if isAvg:
            revList.insert(0, rDets)
        else:
            revList.append(rDets)
    print(' - ', len(rvSoups), 'reviews --->', len(revList), 'total reviews')

op2


You could also view just the star ratings: op3

Upvotes: 1

Kris
Kris

Reputation: 8873

You have the column values, just build the DataFrame

Eg.,

from pandas import DataFrame

# Column labels and the matching values for a single record.
category = ["Aircraft", 'Type of Traveller', 'Seat Type']
rating = ['A320', 'Solo', 'Business Class']

# Map every label to its value; the two lists are matched by position.
data_dict = {label: value for label, value in zip(category, rating)}

# Build a one-row DataFrame: `columns` fixes the column order and
# `index=[0]` gives the single row the label 0.
df = DataFrame.from_records(
    data_dict,
    index=[0],
    columns=category,
)

print(df)

Looks like this. enter image description here

Upvotes: 0

Related Questions