rback

Reputation: 1

AttributeError - Scraping a page in Python using WebDriver...?

I am trying to perfect this web scraper that uses Chrome WebDriver to scrape pages. It currently keeps breaking at line 74:

soup = BeautifulSoup(HTML, "html.parser")

with the error:

AttributeError: 'str' object has no attribute 'text'.

How can I fix this? I'm not sure why it keeps breaking at that point.

    import urllib2, sys
    from BeautifulSoup import BeautifulSoup
    from datetime import datetime
    import requests
    from lxml import html
    import traceback
    import csv
    import time
    import json

    import selenium
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.keys import Keys

    username = "user" # your email here
    password = "pass" # your password here

    pages = 10
    companyName = "Apple"
    companyURL = "https://www.glassdoor.com/Reviews/Apple-US-Reviews-EI_IE7438.0,6_IL.7,9_IN1.htm?filter.defaultEmploymentStatuses=false&filter.defaultLocation=false"

    def obj_dict(obj):
        return obj.__dict__
    #enddef

    def json_export(data):
      jsonFile = open(companyName + ".json", "w")
      jsonFile.write(json.dumps(data, indent=4, separators=(',', ': '), default=obj_dict))
      jsonFile.close()
    #enddef

    def init_driver():
        driver = webdriver.Chrome('C:\Python27\chromedriver.exe')
        driver.wait = WebDriverWait(driver, 10)
        return driver
    #enddef

    def login(driver, username, password):
        driver.get("http://www.glassdoor.com/profile/login_input.htm")
        try:
            user_field = driver.wait.until(EC.presence_of_element_located(
                (By.NAME, "username")))
            pw_field = driver.find_element_by_class_name("signin-password")
            login_button = driver.find_element_by_id("signInBtn")
            user_field.send_keys(username)
            user_field.send_keys(Keys.TAB)
            time.sleep(1)
            pw_field.send_keys(password)
            time.sleep(1)
            login_button.click()
        except TimeoutException:
            print("TimeoutException! Username/password field or login button not found on glassdoor.com")
    #enddef

    ###
    def get_data(driver, URL, startPage, endPage, data, refresh):
      if (startPage > endPage):
        return data
      #endif
      print "\nPage " + str(startPage) + " of " + str(endPage)
      currentURL = URL + "_IP" + str(startPage) + ".htm"
      time.sleep(2)
      #endif
      if (refresh):
        driver.get(currentURL)
        print "Getting " + currentURL
      #endif
      time.sleep(2)
      HTML = driver.page_source
      soup = BeautifulSoup(HTML, "html.parser")
      reviews = soup.find_all("li", { "class" : ["empReview", "padVert"] })
      if (reviews):
        data = parse_reviews_HTML(reviews, data)
        print "Page " + str(startPage) + " scraped."
        if (startPage % 10 == 0):
          print "\nTaking a breather for a few seconds ..."
          time.sleep(10)
        #endif
        get_data(driver, URL, startPage + 1, endPage, data, True)
      else:
        print "Waiting ... page still loading or CAPTCHA input required"
        time.sleep(3)
        get_data(driver, URL, startPage, endPage, data, False)
      #endif
      return data
    #enddef

    if __name__ == "__main__":
      driver = init_driver()
      time.sleep(3)
      print "Logging into Glassdoor account ..."
      login(driver, username, password)
      time.sleep(5)
      print "\nStarting data scraping ..."
      data = get_data(driver, companyURL[:-4], 1, pages, [], True)
      print "\nExporting data to " + Apple + ".json"
      json_export(data)
      driver.quit()
    #endif


    summary_box = soup.find('span', attrs={'class': 'summary '})

    summary = summary_box.text.strip()
    print summary

Upvotes: 0

Views: 205

Answers (1)

v100ev

Reputation: 186

You are probably using BeautifulSoup version 3 (I tried it and the same error occurred). In version 3 the second positional argument is parseOnlyThese, which is expected to be a SoupStrainer rather than a parser name, so passing the string "html.parser" there is what produces 'str' object has no attribute 'text'. Even if that's not the case, try removing the "html.parser" argument and making it just:

soup = BeautifulSoup(HTML)

I hope it'll work.
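For what it's worth, below is a rough sketch of the two call signatures, depending on which library ends up being imported. It isn't tested against your full script, and the sample markup only stands in for driver.page_source:

    # Minimal sketch: choose the call signature that matches the installed library.
    # The sample markup below only stands in for driver.page_source.
    HTML = "<li class='empReview'><span class='summary'>Great place</span></li>"

    try:
        # BeautifulSoup 4 (the bs4 package): the second argument names a parser,
        # so BeautifulSoup(HTML, "html.parser") is fine here.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(HTML, "html.parser")
    except ImportError:
        # BeautifulSoup 3 (the old BeautifulSoup module): the second positional
        # argument is parseOnlyThese (a SoupStrainer), not a parser name, so
        # pass the markup on its own.
        from BeautifulSoup import BeautifulSoup
        soup = BeautifulSoup(HTML)

    print soup.find("span", {"class": "summary"})

If you can install bs4 (pip install beautifulsoup4) and switch the import to from bs4 import BeautifulSoup, you can keep the "html.parser" argument as you have it now.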

Upvotes: 1
