How to scrape data from website which is populated using js?

Question

I am trying to scrape post data (likes, shares, images, etc.) from sharechat.com, but I cannot find the image URLs of the posts using Selenium; I suspect the site uses JavaScript to populate them.

I have tried playing around with Selenium to find the outermost HTML (the displayed HTML), and I got all the other post information such as the number of likes, shares, comments, etc., but I am unable to store the image because I cannot find its URL.

I am doing this for social network research for sentiment analysis and recommendation trends, so I expect to scrape the post data along with tags and number of likes, shares, etc. I am only failing at scraping out tags and URL of images.

Here is the geckodriver file you will need to run it.
Here's my code:

import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Scrape per-post statistics from the ShareChat trending feed into a text file.
serviceurl = 'https://sharechat.com/trending/Hindi'

files = "dataset_link_1.txt"

url = serviceurl

SCROLL_PAUSE_TIME = 0.5  # seconds to wait after each scroll for new posts

# Absolute XPath of the i-th post card; each field path below extends it.
CARD_XPATH = "/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div"

# (label written to the file, XPath suffix below the card), in output order.
FIELDS = (
    ("Total No of views:", "/a/div[3]/div[1]"),
    ("Title:", "/a/div[1]/div[1]/span"),
    ("Writer's Bio:", "/div[1]/a/div[2]/div/div[2]"),
    ("Writer's Name:", "/div[1]/a/div[2]/div/div[1]/strong"),
    ("Total Comments:", "/div[2]/div/button[2]/div/span"),
    ("Whatsapp Share:", "/div[2]/div/button[1]/div/span"),
)

# Mode 'w' creates the file when it is missing, so no separate existence
# check is needed (the `file(...)` builtin exists only in Python 2).
with open(files, 'w') as enter:
    # Raw string: the backslashes of the Windows path are kept literally.
    driver = webdriver.Firefox(executable_path=r'D:\CHIT CHAT\Scrapper\geckodriver')
    try:
        driver.maximize_window()  # For maximizing window
        driver.get(url)
        driver.implicitly_wait(3)  # element lookups retry for up to 3 seconds
        # Poll until the browser reports the document as fully loaded; the
        # short sleep keeps this from spinning a CPU core.
        while driver.execute_script("return document.readyState") != 'complete':
            time.sleep(0.1)

        for i in range(1, 20):
            # Get scroll height
            last_height = driver.execute_script("return document.body.scrollHeight")

            while True:
                # Scroll down to bottom
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Wait to load page
                time.sleep(SCROLL_PAUSE_TIME)
                # Stop once scrolling no longer makes the page taller.
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

                # NOTE(review): this extraction sits inside the scroll loop, so
                # it runs once per successful scroll and always for card i.
                card = CARD_XPATH % (i)
                for label, suffix in FIELDS:
                    # NOTE(review): .encode() returns bytes; under Python 3 the
                    # %s below writes their b'...' repr. Drop the encode and
                    # open the file with encoding='utf-8' when running there.
                    var = driver.find_element_by_xpath(card + suffix).text.encode('utf-8')
                    print(var)
                    enter.write("%s\n%s\n" % (label, var))

                print()
                # driver.save_screenshot("captcha_%s.png"%(i))

            driver.back()
    finally:
        # Always close the browser, even when an element lookup raises.
        driver.quit()

supputuri · Accepted Answer

Here is the refactored code. I added the tags and images logic at the end.

import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Scrape every post card of the ShareChat trending feed (statistics, tags and
# image links) into a text file.
serviceurl = 'https://sharechat.com/trending/Hindi'

files = "dataset_link_1.txt"

url = serviceurl

SCROLL_PAUSE_TIME = 0.5  # seconds to wait after each scroll for new posts

# XPath matching every post card; wrapped in (...)[n] to address the n-th one.
FEED_CARDS_XPATH = "//section[@class='post-batch']//div[contains(@class,'feedCard')]"

# (label written to the file, XPath suffix below the card, attribute to read).
# An attribute of None means "use the element's visible text".
FIELDS = (
    ("Total No of views:", "//div[contains(@class,'lhcaption')]/div[1]", None),
    ("Title:", "//span[contains(@class,'darkText')]", None),
    ("Writer's Bio:", "//div[contains(@class,'Pstart')]//div[contains(@class,'darkTextSecondary')]", None),
    ("Writer Name:", "//strong", None),
    ("Number of comments:", "//button[@aria-label='Click to comment']//span", None),
    ("Whatsapp Share:", "//button[@aria-label='Click to share']//span", None),
    ("Tags:", "//div[contains(@class,'primaryDark')]", None),
    ("Owner Image link:", "//img", 'src'),
    ("post image link:", "//a[@class='D(b)']", 'href'),
)

# Mode 'w' creates (or truncates) the output file; the context manager closes
# it even if scraping fails part-way.
with open(files, 'w') as enter:
    driver = webdriver.Firefox(executable_path=r'D:\CHIT CHAT\Scrapper\geckodriver')
    try:
        driver.maximize_window()  # For maximizing window
        driver.get(url)
        driver.implicitly_wait(3)  # element lookups retry for up to 3 seconds
        # Poll until the browser reports the document as fully loaded; the
        # short sleep keeps this from spinning a CPU core.
        while driver.execute_script("return document.readyState") != 'complete':
            time.sleep(0.1)

        # NOTE(review): each of the 19 rounds scrolls to the bottom and then
        # scrapes every card currently in the DOM, so cards that stay in the
        # DOM are written once per round. Left as-is because the page may
        # drop older cards while scrolling - confirm before de-duplicating.
        for i in range(1, 20):
            # Get scroll height
            last_height = driver.execute_script("return document.body.scrollHeight")

            while True:
                # Scroll down to bottom
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Wait to load page
                time.sleep(SCROLL_PAUSE_TIME)
                # Stop once scrolling no longer makes the page taller.
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            # Count the cards, then address each by its 1-based XPath position
            # so every lookup is a fresh query against the live page.
            feedCards = driver.find_elements_by_xpath(FEED_CARDS_XPATH)
            for ifeedCard in range(len(feedCards)):
                card = "(" + FEED_CARDS_XPATH + ")[" + str(ifeedCard + 1) + "]"
                for label, suffix, attribute in FIELDS:
                    element = driver.find_element_by_xpath(card + suffix)
                    if attribute is None:
                        # NOTE(review): .encode() returns bytes; under Python 3
                        # the %s below writes their b'...' repr. Drop the
                        # encode and open the file with encoding='utf-8' when
                        # running there.
                        value = element.text.encode('utf-8')
                    else:
                        value = element.get_attribute(attribute)
                    print(value)
                    enter.write("%s\n%s\n" % (label, value))
    finally:
        # Always close the browser, even when an element lookup raises.
        driver.quit()

If you are trying to download the file to a different folder, use the code below.

# Build a Firefox profile carrying the download-related preferences below.
profile = webdriver.FirefoxProfile()

# (preference name, value) pairs, applied in this order.
download_prefs = (
    ("browser.download.folderList", 2),
    ("browser.download.manager.showWhenStarting", False),
    ("browser.download.dir", 'Here goes your folder where you want to download'),
    ("browser.helperApps.neverAsk.saveToDisk", "application/x-gzip"),
)
for pref_name, pref_value in download_prefs:
    profile.set_preference(pref_name, pref_value)

Once the file has been downloaded, use the line below to rename it to the desired name.

# Rename the downloaded file. `download_file_name` and `desired_name` are
# placeholders that this snippet does not define; supply your own values.
os.rename(download_file_name,desired_name) # each name may include a path, not just a bare file name.

How to scrape data from website which is populated using js?

Answers (2)

Related Questions