Rahul Nair
Rahul Nair

Reputation: 1

How can I scrape the Steam ID, review content, and profile URL from all reviews of a Steam game into an Excel file using Python?

#The problem: the script either collects only the first 11 reviews (when `while n < 500` is used) or collects nothing at all (when `while True:` is used). The goal is to save the Steam ID, review content, and profile URL of every review of the game to an Excel file.

from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.common.keys import Keys
import re
from time import sleep
from datetime import datetime
from openpyxl import Workbook

game_id = 1097150
# Build the review-feed URL from the game id so the two stay in sync.
url = f'https://steamcommunity.com/app/{game_id}/positivereviews/?p=1&browsefilter=trendweek&filterLanguage=english'

options = EdgeOptions()
options.use_chromium = True
driver = Edge(options=options)
driver.get(url)

# Continuously scroll the infinitely-loading review feed and scrape each
# newly loaded card. Stops when three consecutive scrolls produce no
# movement (end of feed).

last_position = driver.execute_script("return window.pageYOffset;")
reviews = []        # (steam_id, profile_url, review_content) tuples, in order seen
review_ids = set()  # steam_ids already scraped, to avoid duplicate rows
end_of_feed = False

while not end_of_feed:
    cards = driver.find_elements_by_class_name('apphub_Card')
    # Only the most recently appended cards can be new; earlier ones
    # were handled on a previous pass (and are deduplicated anyway).
    for card in cards[-20:]:
        profile_url = card.find_element_by_xpath('.//div[@class="apphub_friend_block"]/div/a[2]').get_attribute('href')
        steam_id = profile_url.split('/')[-2]
        # Old cards stay in the DOM, so skip anything already collected.
        # NOTE(review): assumes one review per steam_id on this feed — confirm.
        if steam_id in review_ids:
            continue
        review_ids.add(steam_id)
        date_posted = card.find_element_by_xpath('.//div[@class="apphub_CardTextContent"]/div').text
        review_content = card.find_element_by_xpath('.//div[@class="apphub_CardTextContent"]').text.replace(date_posted, '').strip()
        reviews.append((steam_id, profile_url, review_content))

    # Try to scroll further. If the scroll position is unchanged after
    # three attempts, there is nothing left to load.
    attempt_count = 0
    while attempt_count < 3:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(0.5)  # give the page time to append new cards
        curr_position = driver.execute_script("return window.pageYOffset;")
        if curr_position == last_position:
            attempt_count += 1
        else:
            # The page grew; remember where we are for the next pass.
            last_position = curr_position
            break
    else:
        # Three scrolls with no movement: end of the review feed.
        end_of_feed = True

driver.close()

#to save the results

# Write the scraped reviews to an .xlsx file named after the game id and
# today's date, with a header row first.
workbook = Workbook()
sheet = workbook.worksheets[0]

sheet.append(['SteamId', 'ProfileURL', 'ReviewText'])
for review_row in reviews:
    sheet.append(review_row)

date_stamp = datetime.today().strftime('%Y%m%d')
output_name = f'Steam_Reviews_{game_id}_{date_stamp}.xlsx'
workbook.save(output_name)
workbook.close()

Upvotes: 0

Views: 764

Answers (1)

Arundeep Chohan
Arundeep Chohan

Reputation: 9969

Here's how to scroll down indefinitely — or, in your case, until 500 review elements have loaded.

# Keep scrolling until at least 500 review cards are present in the DOM,
# or until a scroll no longer moves the page (i.e. the feed is exhausted).
while True:
    cards = driver.find_elements_by_class_name('apphub_Card')
    if len(cards) >= 500:
        break
    position_before = driver.execute_script("return window.pageYOffset;")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)
    position_after = driver.execute_script("return window.pageYOffset;")
    if position_before == position_after:
        break

# Extract the fields of interest from at most the first 500 cards.
for card in cards[:500]:
    profile_url = card.find_element_by_xpath('.//div[@class="apphub_friend_block"]/div/a[2]').get_attribute('href')
    steam_id = profile_url.split('/')[-2]
    date_posted = card.find_element_by_xpath('.//div[@class="apphub_CardTextContent"]/div').text
    review_content = card.find_element_by_xpath('.//div[@class="apphub_CardTextContent"]').text.replace(date_posted, '').strip()
    reviews.append((steam_id, profile_url, review_content))

Upvotes: 1

Related Questions