I am trying to get this web scraper, which uses the Chrome WebDriver, working properly. It currently keeps breaking at line 74:
soup = BeautifulSoup(HTML, "html.parser")
with the error:
AttributeError: 'str' object has no attribute 'text'
How can I fix this? I'm not sure why it keeps breaking at that point.
import urllib2, sys
from BeautifulSoup import BeautifulSoup
from datetime import datetime
import requests
from lxml import html
import traceback
import csv
import time
import json
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
username = "user" # your email here
password = "pass" # your password here
pages = 10
companyName = "Apple"
companyURL = "https://www.glassdoor.com/Reviews/Apple-US-Reviews-EI_IE7438.0,6_IL.7,9_IN1.htm?filter.defaultEmploymentStatuses=false&filter.defaultLocation=false"
def obj_dict(obj):
    return obj.__dict__
#enddef

def json_export(data):
    jsonFile = open(companyName + ".json", "w")
    jsonFile.write(json.dumps(data, indent=4, separators=(',', ': '), default=obj_dict))
    jsonFile.close()
#enddef

def init_driver():
    driver = webdriver.Chrome(r'C:\Python27\chromedriver.exe')  # raw string keeps the backslashes literal
    driver.wait = WebDriverWait(driver, 10)
    return driver
#enddef

def login(driver, username, password):
    driver.get("http://www.glassdoor.com/profile/login_input.htm")
    try:
        user_field = driver.wait.until(EC.presence_of_element_located(
            (By.NAME, "username")))
        pw_field = driver.find_element_by_class_name("signin-password")
        login_button = driver.find_element_by_id("signInBtn")
        user_field.send_keys(username)
        user_field.send_keys(Keys.TAB)
        time.sleep(1)
        pw_field.send_keys(password)
        time.sleep(1)
        login_button.click()
    except TimeoutException:
        print("TimeoutException! Username/password field or login button not found on glassdoor.com")
#enddef
###
def get_data(driver, URL, startPage, endPage, data, refresh):
    if (startPage > endPage):
        return data
    #endif
    print "\nPage " + str(startPage) + " of " + str(endPage)
    currentURL = URL + "_IP" + str(startPage) + ".htm"
    time.sleep(2)
    if (refresh):
        driver.get(currentURL)
        print "Getting " + currentURL
    #endif
    time.sleep(2)
    HTML = driver.page_source
    soup = BeautifulSoup(HTML, "html.parser")
    # pull the page's summary line out of the parsed HTML, if present
    summary_box = soup.find('span', attrs={'class': 'summary '})
    if (summary_box):
        summary = summary_box.text.strip()
        print summary
    #endif
    reviews = soup.find_all("li", { "class" : ["empReview", "padVert"] })
    if (reviews):
        data = parse_reviews_HTML(reviews, data)
        print "Page " + str(startPage) + " scraped."
        if (startPage % 10 == 0):
            print "\nTaking a breather for a few seconds ..."
            time.sleep(10)
        #endif
        # capture the recursive result so later pages aren't dropped
        data = get_data(driver, URL, startPage + 1, endPage, data, True)
    else:
        print "Waiting ... page still loading or CAPTCHA input required"
        time.sleep(3)
        data = get_data(driver, URL, startPage, endPage, data, False)
    #endif
    return data
#enddef
if __name__ == "__main__":
    driver = init_driver()
    time.sleep(3)
    print "Logging into Glassdoor account ..."
    login(driver, username, password)
    time.sleep(5)
    print "\nStarting data scraping ..."
    data = get_data(driver, companyURL[:-4], 1, pages, [], True)  # strip the trailing ".htm"
    print "\nExporting data to " + companyName + ".json"
    json_export(data)
    driver.quit()
#endif
You are probably using BeautifulSoup version 3 (I tried with it and the problem occurred). In version 3 the second positional argument is not a parser name but a SoupStrainer filter, and the library tries to read its .text attribute, which is exactly why a plain string like "html.parser" raises that AttributeError. Even if that's not the case, try removing the "html.parser" argument, making it just:
soup = BeautifulSoup(HTML)
I hope it'll work.
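If you would rather keep an explicit parser argument, the other way out is to switch the import to BeautifulSoup 4 (the bs4 package), whose constructor does take a parser name as its second argument. A minimal sketch, assuming bs4 is installed (pip install beautifulsoup4) and using a stand-in string for driver.page_source:

from bs4 import BeautifulSoup  # bs4, not the old BeautifulSoup module

HTML = "<ul><li class='empReview padVert'>...</li></ul>"  # stand-in for driver.page_source
soup = BeautifulSoup(HTML, "html.parser")  # in bs4 the second argument is the parser name
reviews = soup.find_all("li", { "class" : ["empReview", "padVert"] })
print len(reviews)  # prints 1 for the stand-in markup

Note that find_all itself only exists in version 4 (version 3 spells it findAll), so the soup.find_all(...) call in your script assumes version 4 anyway. If you are unsure which version you are importing, both modules expose a version string:

import BeautifulSoup
print BeautifulSoup.__version__  # e.g. 3.2.1

or, for the 4.x package:

import bs4
print bs4.__version__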