SIM

Reputation: 22440

Facing issues with my Twitter scraper written using Python and Selenium

I've written a script in Python to parse the name, tweet count, following and follower counts of the accounts available in the view all section of my Twitter profile page. It is currently doing its job. However, I find two problems with this scraper:

  1. Every page it parses documents from stays open, so browser windows pile up on the taskbar.
  2. The scraper has a clumsy look.

Here is what I've written:

from selenium import webdriver
import time

def twitter_data():

    driver = webdriver.Chrome()
    driver.get('https://twitter.com/?lang=en')

    driver.find_element_by_xpath('//input[@id="signin-email"]').send_keys('username')
    driver.find_element_by_xpath('//input[@id="signin-password"]').send_keys('password')
    driver.find_element_by_xpath('//button[@type="submit"]').click()
    driver.implicitly_wait(15)

    #Clicking the viewall link
    driver.find_element_by_xpath("//small[@class='view-all']//a[contains(@class,'js-view-all-link')]").click()
    time.sleep(10)

    for links in driver.find_elements_by_xpath("//div[@class='stream-item-header']//a[contains(@class,'js-user-profile-link')]"):
        # going on to each profile listed under the view all section
        processing_files(links.get_attribute("href"))
def processing_files(item_link):

    driver = webdriver.Chrome()
    driver.get(item_link)
    # getting information of each profile holder
    for prof in driver.find_elements_by_xpath("//div[@class='route-profile']"):
        name = prof.find_elements_by_xpath(".//h1[@class='ProfileHeaderCard-name']//a[contains(@class,'ProfileHeaderCard-nameLink')]")[0]
        tweet = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[0]
        following = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[1]
        follower = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[2]
        print(name.text, tweet.text, following.text, follower.text)

twitter_data()

I've used both implicitly_wait and time.sleep in my scraper because, where I found it necessary to make the bot wait a bit longer, I used the latter. Thanks in advance for taking a look into it.
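
I've also read that an explicit wait can replace a fixed time.sleep, since it polls for a condition and continues as soon as the element is ready. Here is a rough sketch of what I think it would look like for the view all click, reusing my own XPath (I haven't verified this against the live page):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://twitter.com/?lang=en')

# waits up to 15 seconds, but proceeds the moment the link is clickable,
# instead of always sleeping a fixed 10 seconds
wait = WebDriverWait(driver, 15)
view_all = wait.until(EC.element_to_be_clickable(
    (By.XPATH, "//small[@class='view-all']//a[contains(@class,'js-view-all-link')]")))
view_all.click()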

Upvotes: 1

Views: 398

Answers (1)

Murthi

Reputation: 5347

You can use driver.quit() to close the pages as given below; this will reduce the number of windows piling up on the taskbar. Just make sure quit() is called after the loop that uses the driver, not inside it, otherwise the session will already be closed on the next iteration.

from selenium import webdriver
import time

def twitter_data():

    driver = webdriver.Chrome()
    driver.get('https://twitter.com/?lang=en')

    driver.find_element_by_xpath('//input[@id="signin-email"]').send_keys('username')
    driver.find_element_by_xpath('//input[@id="signin-password"]').send_keys('password')
    driver.find_element_by_xpath('//button[@type="submit"]').click()
    driver.implicitly_wait(15)

    #Clicking the viewall link
    driver.find_element_by_xpath("//small[@class='view-all']//a[contains(@class,'js-view-all-link')]").click()
    time.sleep(10)

    for links in driver.find_elements_by_xpath("//div[@class='stream-item-header']//a[contains(@class,'js-user-profile-link')]"):
        # going on to each profile listed under the view all section
        processing_files(links.get_attribute("href"))

    driver.quit()
def processing_files(item_link):

    driver1 = webdriver.Chrome()
    driver1.get(item_link)
    # getting information of each profile holder
    for prof in driver1.find_elements_by_xpath("//div[@class='route-profile']"):
        name = prof.find_elements_by_xpath(".//h1[@class='ProfileHeaderCard-name']//a[contains(@class,'ProfileHeaderCard-nameLink')]")[0]
        tweet = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[0]
        following = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[1]
        follower = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[2]
        print(name.text, tweet.text, following.text, follower.text)
    driver1.quit()  # quit after the loop, so the session stays alive while iterating

twitter_data()
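
As for the clumsy look, one option would be to reuse a single driver for every profile instead of launching a new Chrome per link: collect the hrefs first, then navigate the same window. A rough sketch along those lines, using the same locators as yours (untested):

from selenium import webdriver
import time

def twitter_data():
    driver = webdriver.Chrome()
    try:
        driver.get('https://twitter.com/?lang=en')
        driver.find_element_by_xpath('//input[@id="signin-email"]').send_keys('username')
        driver.find_element_by_xpath('//input[@id="signin-password"]').send_keys('password')
        driver.find_element_by_xpath('//button[@type="submit"]').click()
        driver.implicitly_wait(15)
        driver.find_element_by_xpath("//small[@class='view-all']//a[contains(@class,'js-view-all-link')]").click()
        time.sleep(10)

        # grab the hrefs up front, so navigating away cannot stale the elements
        links = [a.get_attribute("href") for a in driver.find_elements_by_xpath(
            "//div[@class='stream-item-header']//a[contains(@class,'js-user-profile-link')]")]
        for link in links:
            processing_files(driver, link)
    finally:
        driver.quit()  # one quit closes the only window that was ever opened

def processing_files(driver, item_link):
    # reuses the caller's driver instead of starting a new Chrome per profile
    driver.get(item_link)
    for prof in driver.find_elements_by_xpath("//div[@class='route-profile']"):
        name = prof.find_elements_by_xpath(".//h1[@class='ProfileHeaderCard-name']//a[contains(@class,'ProfileHeaderCard-nameLink')]")[0]
        tweet = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[0]
        following = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[1]
        follower = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[2]
        print(name.text, tweet.text, following.text, follower.text)

twitter_data()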

Upvotes: 1
