Newbee
Newbee

Reputation: 55

How to scrape data from website which is populated using js?

I am trying to scrape post data ( likes, shares, image, etc ) from sharechat.com but the problem is I cannot find image URL of posts using Selenium as I suspect it uses Javascript to populate it.

I have tried playing around with Selenium to find the outermost HTML (displayed HTML) and I got all the other post information like the number of likes, shares, comments, etc., but I am unable to get or store the image because I cannot find its URL.

I am doing this for social network research for sentiment analysis and recommendation trends, so I expect to scrape the post data along with tags and number of likes, shares, etc. I am only failing at scraping out tags and URL of images.

Here is geckodriver file you will need to run.
Here's my code:

import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Page to scrape: ShareChat's trending feed for Hindi.
serviceurl = 'https://sharechat.com/trending/Hindi'

# Text file that receives the scraped fields.
files = "dataset_link_1.txt"
# NOTE(review): file() is a Python 2 builtin; on Python 3 this branch raises
# NameError when the file does not exist yet. The open(files, 'w') call below
# creates the file anyway.
if not os.path.exists(files):
    file(files, 'w').close()
enter = open(files,'w');

url = serviceurl
# NOTE(review): the path is not a raw string; it is only intact because \C and
# \S are not recognised escape sequences.
driver = webdriver.Firefox(executable_path='D:\CHIT CHAT\Scrapper\geckodriver');
driver.maximize_window() # maximise the browser window
driver.get(url);
driver.implicitly_wait(3) # implicit wait of 3 seconds for element lookups
# Poll until the browser reports the document as fully loaded.
while driver.execute_script("return document.readyState") != 'complete':
    pass;

# i is substituted into the XPaths below as the 1-based index of the post's div.
for i in range(1,20):

    SCROLL_PAUSE_TIME = 0.5

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


        # NOTE(review): everything from here down to print() is indented inside
        # the `while True` loop, so it only runs on passes where the page height
        # grew, and it runs again for the same i on every such pass.
        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[3]/div[1]"%(i)).text.encode('utf-8')
        print(var) #No of watches
        enter.write("Total No of views:\n%s\n" %(var));

        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[1]/div[1]/span"%(i)).text.encode('utf-8')
        print(var) #Title
        enter.write("Title:\n%s\n" %(var));

        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[1]/a/div[2]/div/div[2]"%(i)).text.encode('utf-8')
        print(var) #owner bio
        enter.write("Writer's Bio:\n%s\n" %(var));

        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[1]/a/div[2]/div/div[1]/strong"%(i)).text.encode('utf-8')
        print(var) #owner's name
        enter.write("Writer's Name:\n%s\n" %(var));

        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[2]/div/span"%(i)).text.encode('utf-8')
        print(var) #comments
        enter.write("Total Comments:\n%s\n" %(var));

        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[1]/div/span"%(i)).text.encode('utf-8')
        print(var) #whatsapp
        enter.write("Whatsapp Share:\n%s\n" %(var));

        print()
        # driver.save_screenshot("captcha_%s.png"%(i))

    # NOTE(review): navigates the browser back one history entry at the end of
    # every outer iteration — confirm this is intended, since the next
    # iteration scrolls and scrapes whatever page is then showing.
    driver.back()

driver.quit()
enter.close()

Upvotes: 0

Views: 293

Answers (2)

supputuri
supputuri

Reputation: 14135

Here is the refactored code. I added the tags and images logic at the end.

import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Scrape every post card on ShareChat's Hindi trending feed and write each
# post's fields (counts, author, tags, image links) to a UTF-8 text file.
# Written for Python 3 and the Selenium 3 find_element(s)_by_xpath API.
serviceurl = 'https://sharechat.com/trending/Hindi'

files = "dataset_link_1.txt"

# Seconds to let lazily loaded posts arrive after each scroll.
SCROLL_PAUSE_TIME = 0.5
# How many times the "scroll until the page stops growing" pass is repeated;
# repeating it gives slow batches a second chance to load.
SCROLL_ROUNDS = 19

# Matches every post card in the feed.
FEED_CARD_XPATH = "//section[@class='post-batch']//div[contains(@class,'feedCard')]"

# (label written to the file, XPath relative to one post card, attribute).
# An attribute of None means "use the element's visible text".
CARD_FIELDS = [
    ("Total No of views", ".//div[contains(@class,'lhcaption')]/div[1]", None),
    ("Title", ".//span[contains(@class,'darkText')]", None),
    ("Writer's Bio",
     ".//div[contains(@class,'Pstart')]//div[contains(@class,'darkTextSecondary')]",
     None),
    ("Writer Name", ".//strong", None),
    ("Number of comments", ".//button[@aria-label='Click to comment']//span", None),
    ("Whatsapp Share", ".//button[@aria-label='Click to share']//span", None),
    ("Tags", ".//div[contains(@class,'primaryDark')]", None),
    ("Owner Image link", ".//img", "src"),
    ("post image link", ".//a[@class='D(b)']", "href"),
]

url = serviceurl
driver = webdriver.Firefox(executable_path=r'D:\CHIT CHAT\Scrapper\geckodriver')
try:
    driver.maximize_window()
    driver.get(url)
    # Element lookups wait up to 3 seconds for a match before failing.
    driver.implicitly_wait(3)
    # Poll (without spinning the CPU) until the document is fully loaded.
    while driver.execute_script("return document.readyState") != 'complete':
        time.sleep(0.1)

    # Load the whole feed first: keep scrolling to the bottom until the page
    # height stops changing.
    for _ in range(SCROLL_ROUNDS):
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    # Scrape once, after scrolling, so each post is written a single time.
    # The file is opened as UTF-8 and receives str values; formatting encoded
    # bytes with %s would write their b'...' repr on Python 3.
    with open(files, 'w', encoding='utf-8') as enter:
        for card in driver.find_elements_by_xpath(FEED_CARD_XPATH):
            for label, xpath, attribute in CARD_FIELDS:
                # Searching from the card element limits the lookup to this post.
                element = card.find_element_by_xpath(xpath)
                if attribute is None:
                    value = element.text
                else:
                    value = element.get_attribute(attribute)
                print(value)
                enter.write("%s:\n%s\n" % (label, value))
finally:
    # Always close the browser, even when a lookup raises.
    driver.quit()

If you are trying to download the file to a different folder, use the code below.

# Firefox preferences applied, in this order, to the profile used for downloads.
download_preferences = (
    ("browser.download.folderList", 2),
    ("browser.download.manager.showWhenStarting", False),
    ("browser.download.dir", 'Here goes your folder where you want to download'),
    ("browser.helperApps.neverAsk.saveToDisk", "application/x-gzip"),
)

profile = webdriver.FirefoxProfile()
for preference_name, preference_value in download_preferences:
    profile.set_preference(preference_name, preference_value)

Once you have downloaded the file, use the line below to rename it to the desired name.

# Rename the downloaded file to the name you want.
os.rename(download_file_name,desired_name) # either argument may be a file name that includes its path

Upvotes: 2

Jortega
Jortega

Reputation: 3790

I changed the web driver path and the range variable. If you create a folder C:\Py, my code below will output a text file called PageSource_StackOverflowQ2.txt with the image src paths.

I was having a lot of issues with binary characters in the HTML, so there is probably a much better way to do this, but hopefully this will help get you where you are trying to go.

If the image path contains these 9 characters in a row, my code will break: (" title=")

import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Scrape the fields of the first post on ShareChat's Hindi trending feed, then
# pull every image src out of the page source and write the links to
# C:\Py\PageSource_StackOverflowQ2.txt.
# Written for Python 3 and the Selenium 3 find_element_by_xpath API.
serviceurl = 'https://sharechat.com/trending/Hindi'

# Seconds to let lazily loaded posts arrive after each scroll.
SCROLL_PAUSE_TIME = 0.5

RAW_SOURCE_PATH = 'C:\\Py\\PageSource_StackOverflowQ.txt'     # page source as UTF-8 bytes
ASCII_SOURCE_PATH = 'C:\\Py\\PageSource_StackOverflowQ1.txt'  # same, non-ASCII bytes dropped
IMAGE_LINKS_PATH = 'C:\\Py\\PageSource_StackOverflowQ2.txt'   # one image src per line

IMG_PREFIX = 'img src'
# The src value is taken to end where the title attribute starts.
TITLE_MARKER = '" title="'
# Offset of the src value in a fragment that begins with: img src="
SRC_VALUE_START = 9

# XPath of post number %s in the feed, and the per-field paths below it.
POST_XPATH = "/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]"
POST_FIELDS = [
    ("Total No of views", "/div/div/a/div[3]/div[1]"),
    ("Title", "/div/div/a/div[1]/div[1]/span"),
    ("Writer's Bio", "/div/div/div[1]/a/div[2]/div/div[2]"),
    ("Writer's Name", "/div/div/div[1]/a/div[2]/div/div[1]/strong"),
    ("Total Comments", "/div/div/div[2]/div/button[2]/div/span"),
    ("Whatsapp Share", "/div/div/div[2]/div/button[1]/div/span"),
]


def _image_sources(line):
    """Yield (marker_offset, src) for each image tag found in one source line.

    The line is split on "><" into tag fragments. For every fragment that
    mentions ``img src``, each occurrence of ``" title="`` yields the text
    between the opening ``img src="`` and that occurrence.
    """
    for fragment in line.split("><"):
        if IMG_PREFIX not in fragment:
            continue
        offset = fragment.find(TITLE_MARKER)
        while offset != -1:
            yield offset, fragment[SRC_VALUE_START:offset]
            offset = fragment.find(TITLE_MARKER, offset + 1)


url = serviceurl
driver = webdriver.Firefox(executable_path=r'C:\\Py\\geckodriver.exe')
try:
    driver.maximize_window()
    driver.get(url)
    # Element lookups wait up to 3 seconds for a match before failing.
    driver.implicitly_wait(3)
    # Poll (without spinning the CPU) until the document is fully loaded.
    while driver.execute_script("return document.readyState") != 'complete':
        time.sleep(0.1)

    # The file is opened as UTF-8 and receives str values; formatting encoded
    # bytes with %s would write their b'...' repr on Python 3.
    with open('C:\\Py\\dataset_link_1.txt', 'w', encoding='utf-8') as enter:
        # Narrowed from range(1, 20) to a single post.
        for i in range(1, 2):
            # Scroll to the bottom until the page height stops changing.
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(SCROLL_PAUSE_TIME)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            # Scrape after scrolling has finished, so the fields are read
            # exactly once per post whether or not the page grew.
            post_xpath = POST_XPATH % i
            for label, field_xpath in POST_FIELDS:
                value = driver.find_element_by_xpath(post_xpath + field_xpath).text
                print(value)
                enter.write("%s:\n%s\n" % (label, value))

            print()

    # Dump the page source once. The raw file is overwritten rather than
    # appended to, so pages from earlier runs are not parsed again.
    page_bytes = driver.page_source.encode("utf-8")
    with open(RAW_SOURCE_PATH, 'wb') as raw_file:
        raw_file.write(page_bytes)

    # Drop every non-ASCII byte so the source can be handled as plain text.
    with open(RAW_SOURCE_PATH, 'rb') as raw_file, open(ASCII_SOURCE_PATH, 'w') as ascii_file:
        for raw_line in raw_file:
            ascii_file.write(raw_line.decode('ascii', errors='ignore'))

    with open(ASCII_SOURCE_PATH) as ascii_file, open(IMAGE_LINKS_PATH, 'w') as links_file:
        for source_line in ascii_file:
            for marker_offset, image_src in _image_sources(source_line):
                links_file.write(image_src)
                links_file.write('\n')
                print(marker_offset)
                print(image_src)
finally:
    # Always close the browser, even when a lookup or a file operation raises.
    driver.quit()

Upvotes: 0

Related Questions