Scrape the employee ratings from Indeed in Python

Question

I am new to web scraping and I need to scrape the employee ratings and reviews from Indeed, but my code does not work. Could you please tell me what is wrong with my code? Thanks so much for your help.

from bs4 import BeautifulSoup
import pandas as pd
import requests

# Scrape EY employee reviews from Indeed, one results page per request, and
# collect them into `df` with columns review_title / review / author / rating.

# The request headers are identical for every page, so build them once.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

# Rows are accumulated in a plain list and turned into a DataFrame once at
# the end: DataFrame.append was removed in pandas 2.0, and appending row by
# row copies the whole frame on every call.
rows = []


def _text_or_none(tag):
    """Return the text of a BeautifulSoup tag, or None if the tag was not found."""
    return tag.text if tag is not None else None


# `start` is the page offset in the URL; this loop steps it by 20 per request.
for i in range(0, 140, 20):
    url = f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}'
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'lxml')
    results = soup.find("div", {"id": 'cmp-container'})
    if results is None:
        # find() returns None when the container is absent; skip the page
        # instead of crashing on results.find_all.
        continue
    elems = results.find_all(class_='cmp-Review-container')
    # NOTE(review): if `elems` is always empty, these class names do not
    # match the markup actually served -- inspect page.content to confirm.
    for elem in elems:
        title = elem.find(attrs={'class': 'cmp-Review-title'})
        review = elem.find('div', {'class': 'cmp-Review-text'})
        author = elem.find(attrs={'class': 'cmp-Review-author'})
        rating = elem.find(attrs={'class': 'cmp-ReviewRating-text'})
        # A missing sub-element becomes None rather than raising
        # AttributeError on `.text`.
        rows.append({
            'review_title': _text_or_none(title),
            'review': _text_or_none(review),
            'author': _text_or_none(author),
            'rating': _text_or_none(rating),
        })

# Passing `columns` keeps the header present even when no rows were scraped.
df = pd.DataFrame(rows, columns=['review_title', 'review', 'author', 'rating'])

It only returns the header.

After taking Parikh's suggestion, the code now returns the employee reviews, but it does not show the employee status (former or current employee). How can I improve my code to include the employee status?

# Load the Modules
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
import pandas as pd

# Use Big Tech as the samples to scrape the employee reviews on 12/20/2021

# Meta(Facebook), 
# Scrape Meta employee reviews from Indeed into `df_meta`
# (columns: title, author, review, rating). A field that cannot be
# extracted from a review is stored as NaN instead of aborting the scrape.
lst = []

# The request headers are identical for every page, so build them once.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

# `start` is the page offset in the URL; this loop steps it by 20 per request.
for i in range(0, 460, 20):
    print(i)  # progress indicator
    url = f'https://www.indeed.com/cmp/Meta-dd1502f2/reviews?start={i}'
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            # find() returned None: the review has no <h2>.
            title = np.nan
        try:
            # The second "-"-separated piece of the author line is kept.
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            # AttributeError: span missing; IndexError: no "-" in the text.
            author = np.nan
        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        try:
            # The rating is the first word of the button's aria-label.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, TypeError, KeyError):
            # AttributeError: rating div missing; TypeError: no <button>
            # (subscripting None); KeyError: button has no aria-label.
            rating = np.nan
        lst.append([title, author, review, rating])

df_meta = pd.DataFrame(data=lst, columns=['title', 'author', 'review', 'rating'])
df_meta

The output is shown below and I also want to have the employee status. Thanks so much for your help.

Thanks again for your kind help and time. My last question: I tried to scrape the pros and cons, but it only returns NaN. How should I revise the code?

import numpy as np
# Scrape Airbnb employee reviews from Indeed into `lst`; each row is
# [title, author, status, pros, cons, review, rating]. A field that cannot
# be extracted from a review is stored as NaN.
lst = []

# The request headers are identical for every page, so build them once.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

# `start` is the page offset in the URL; this loop steps it by 20 per request.
for i in range(0, 240, 20):
    print(i)  # progress indicator
    url = f'https://www.indeed.com/cmp/Airbnb/reviews?start={i}'
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:

        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan

        # The author line is split on "-": piece 0 is stored as the status,
        # piece 1 as the author. Look the span up once and split once.
        try:
            author_parts = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")
        except AttributeError:
            # Author span missing from this review.
            author_parts = []
        status = author_parts[0] if author_parts else np.nan
        # Guard the index so a line without "-" yields NaN, not IndexError.
        author = author_parts[1] if len(author_parts) > 1 else np.nan

        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan

        # find() returns a single Tag (or None), not a list. Indexing it with
        # [0] looks up an HTML attribute named 0 and raises KeyError, which
        # the old bare `except:` hid -- so pros/cons were always NaN.
        # NOTE(review): class names are kept as given; confirm them against
        # the page markup if these still come back NaN.
        try:
            pros = data.find('div', class_='cmp-review-pro-text').get_text(strip=True)
        except AttributeError:
            pros = np.nan
        try:
            cons = data.find('div', class_='cmp-review-con-text').get_text(strip=True)
        except AttributeError:
            cons = np.nan

        try:
            # The rating is the first word of the button's aria-label.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, TypeError, KeyError):
            # AttributeError: rating div missing; TypeError: no <button>
            # (subscripting None); KeyError: button has no aria-label.
            rating = np.nan

        lst.append([title, author, status, pros, cons, review, rating])

Bhavya Parikh · Accepted Answer

First, print out main_data to get an overview of which tags contain the data, and then extract each field accordingly. I have also added try/except blocks so that a missing field becomes NaN instead of raising an error.

import numpy as np
# Scrape EY employee reviews from Indeed into `lst`; each row is
# [title, author, status, review, rating]. A field that cannot be
# extracted from a review is stored as NaN.
lst = []

# The request headers are identical for every page, so build them once.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

# `start` is the page offset in the URL; this loop steps it by 20 per request.
for i in range(0, 140, 20):
    print(i)  # progress indicator
    url = f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}'
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'lxml')
    # Search the freshly parsed page (`soup`); the name `results` is never
    # defined in this snippet.
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan

        # The author line is split on "-": piece 0 is stored as the status,
        # piece 1 as the author. Look the span up once and split once.
        try:
            author_parts = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")
        except AttributeError:
            # Author span missing from this review.
            author_parts = []
        status = author_parts[0] if author_parts else np.nan
        # Guard the index so a line without "-" yields NaN, not IndexError.
        author = author_parts[1] if len(author_parts) > 1 else np.nan

        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        try:
            # The rating is the first word of the button's aria-label.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, TypeError, KeyError):
            # AttributeError: rating div missing; TypeError: no <button>
            # (subscripting None); KeyError: button has no aria-label.
            rating = np.nan
        lst.append([title, author, status, review, rating])

Now use lst as data inside DataFrame

import pandas as pd
# Turn the scraped rows into a table; each inner list of `lst` supplies the
# values for these five columns, in this order.
df = pd.DataFrame(
    lst,
    columns=[
        'title',
        'author',
        'status',
        'review',
        'rating',
    ],
)
df

Output:

              title            author              status    review rating
0   good exerccise  Provincia di Milano, Lombardia  Senior Manager(Former Employee) working here can be challenging but helps buil...   3.0

Scrape the employee ratings from Indeed in Python

Answers (1)

Related Questions