Reputation: 85
The code below extracts the Arbeitsatmosphäre and Stadt data from reviews on the website below. But the extraction is index-based, so if we want to extract Image (rating_tags[12]) instead of Arbeitsatmosphäre, we get an IndexError, because some reviews contain only 2 or 3 rating items.
I would like to update this code to produce the output below. If a review has no Image rating, use 0 or n/a.
Arbeitsatmosphare | Stadt | Image |
1. 4.00 | Berlin | 4.00 |
2. 5.00 | Frankfurt | 3.00 |
3. 3.00 | Munich | 3.00 |
4. 5.00 | Berlin | 2.00 |
5. 4.00 | Berlin | 5.00 |
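To see why the fixed index fails, here is a minimal made-up example: a review with only two rating items would raise IndexError on ratings[12], while a guarded lookup falls back to a default.

```python
ratings = ['4,00', 'Berlin']  # made-up review with only two items

# ratings[12] would raise IndexError here; guard the access instead
image = ratings[12] if len(ratings) > 12 else 'N/A'
print(image)
```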
My code is below:
import requests
from bs4 import BeautifulSoup
import pandas as pd

arbeit = []
stadt = []

with requests.Session() as session:
    session.headers = {
        'x-requested-with': 'XMLHttpRequest'
    }
    page = 1
    while True:
        print(f"Processing page {page}..")
        url = f'https://www.kununu.com/de/volkswagenconsulting/kommentare/{page}'
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article')
        print("Number of articles: " + str(len(articles)))
        for article in articles:
            rating_tags = article.find_all('span', {'class': 'rating-badge'})
            arbeit.append(rating_tags[0].text.strip())
            detail_div = article.find_all('div', {'class': 'review-details'})[0]
            nodes = detail_div.find_all('li')
            stadt_node = nodes[1]
            stadt_node_div = stadt_node.find_all('div')
            stadt_name = stadt_node_div[1].text.strip()
            stadt.append(stadt_name)
        page += 1
        pagination = soup.find_all('div', {'class': 'paginationControl'})
        if not pagination:
            break

df = pd.DataFrame({'Arbeitsatmosphäre': arbeit, 'Stadt': stadt})
print(df)
Upvotes: 0
Views: 39
Reputation: 28565
You can use try/except: look the "Image" label up by name rather than by index, and append a placeholder when it is absent.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

arbeit = []
stadt = []
image = []

with requests.Session() as session:
    session.headers = {
        'x-requested-with': 'XMLHttpRequest'
    }
    page = 1
    while True:
        print(f"Processing page {page}..")
        url = f'https://www.kununu.com/de/volkswagenconsulting/kommentare/{page}'
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article')
        print("Number of articles: " + str(len(articles)))
        for article in articles:
            rating_tags = article.find_all('span', {'class': 'rating-badge'})
            arbeit.append(rating_tags[0].text.strip())
            try:
                # find the "Image" label, then read the rating in the next span;
                # find() returns None when the label is absent, so the chained
                # call raises AttributeError and we fall back to 'N/A'
                imageText = article.find('span', text=re.compile(r'Image')).find_next('span').text.strip()
                image.append(imageText)
            except AttributeError:
                image.append('N/A')
            detail_div = article.find_all('div', {'class': 'review-details'})[0]
            nodes = detail_div.find_all('li')
            stadt_node = nodes[1]
            stadt_node_div = stadt_node.find_all('div')
            stadt_name = stadt_node_div[1].text.strip()
            stadt.append(stadt_name)
        page += 1
        pagination = soup.find_all('div', {'class': 'paginationControl'})
        if not pagination:
            break

df = pd.DataFrame({'Arbeitsatmosphäre': arbeit, 'Stadt': stadt, 'Image': image})
print(df)
Output:
Processing page 1..
Number of articles: 10
Processing page 2..
Number of articles: 10
Processing page 3..
Number of articles: 10
Processing page 4..
Number of articles: 4
Arbeitsatmosphäre Stadt Image
0 5,00 Wolfsburg 4,00
1 5,00 Wolfsburg 4,00
2 5,00 Wolfsburg 5,00
3 5,00 Wolfsburg 4,00
4 2,00 Wolfsburg 2,00
5 5,00 Wolfsburg 5,00
6 5,00 Wolfsburg 5,00
7 5,00 Wolfsburg 4,00
8 5,00 Wolfsburg 4,00
9 5,00 Wolfsburg 5,00
10 5,00 Wolfsburg 4,00
11 5,00 Wolfsburg 5,00
12 5,00 Wolfsburg 5,00
13 4,00 Wolfsburg 4,00
14 4,00 Wolfsburg 4,00
15 4,00 Wolfsburg 4,00
16 5,00 Wolfsburg 5,00
17 3,00 Wolfsburg 5,00
18 5,00 Wolfsburg 4,00
19 5,00 Wolfsburg 5,00
20 5,00 Wolfsburg 4,00
21 4,00 Wolfsburg 2,00
22 5,00 Wolfsburg 5,00
23 4,00 Wolfsburg N/A
24 4,00 Wolfsburg 4,00
25 4,00 Wolfsburg 4,50
26 5,00 Wolfsburg 5,00
27 2,33 Wolfsburg 2,00
28 5,00 Wolfsburg 5,00
29 2,00 Wolfsburg 1,00
30 4,00 Wolfsburg 3,00
31 5,00 Wolfsburg 5,00
32 5,00 Wolfsburg 4,00
33 4,00 Wolfsburg 4,00
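An alternative to try/except, sketched here on made-up data (the reviews list below is hypothetical, standing in for what the scraper would parse): collect each review's ratings into a dict keyed by label. A DataFrame built from a list of dicts leaves missing keys as NaN, and fillna turns them into the placeholder, so no category ever depends on its position in the list.

```python
import pandas as pd

# Hypothetical parsed reviews, one dict per article (label -> rating text);
# a real scraper would build these pairs from each article's spans.
reviews = [
    {'Arbeitsatmosphäre': '4,00', 'Stadt': 'Berlin', 'Image': '4,00'},
    {'Arbeitsatmosphäre': '5,00', 'Stadt': 'Frankfurt'},  # no Image rating
    {'Arbeitsatmosphäre': '3,00', 'Stadt': 'Munich', 'Image': '3,00'},
]

# missing keys become NaN; fillna replaces them with the placeholder
df = pd.DataFrame(reviews).fillna('N/A')
print(df)
```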
Upvotes: 2