Reputation: 55
I am trying to scrape post data (likes, shares, image, etc.) from sharechat.com, but I cannot find the image URL of the posts using Selenium; I suspect the site uses JavaScript to populate it.
I have tried inspecting the rendered HTML with Selenium and got all the other post information (number of likes, shares, comments, etc.), but I am unable to save the image because I cannot find its URL.
I am doing this for social network research (sentiment analysis and recommendation trends), so I expect to scrape the post data along with the tags and the number of likes, shares, etc. I am only failing to scrape the tags and the image URLs.
You will need the geckodriver executable to run this.
Here's my code:
import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
serviceurl = 'https://sharechat.com/trending/Hindi'
files = "dataset_link_1.txt"
if not os.path.exists(files):
    open(files, 'w').close()
enter = open(files, 'w')
url = serviceurl
driver = webdriver.Firefox(executable_path=r'D:\CHIT CHAT\Scrapper\geckodriver')
driver.maximize_window()  # maximize the browser window
driver.get(url)
driver.implicitly_wait(3)  # implicit wait of 3 seconds
while driver.execute_script("return document.readyState") != 'complete':
    pass
for i in range(1, 20):
    SCROLL_PAUSE_TIME = 0.5
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[3]/div[1]" % (i)).text.encode('utf-8')
    print(var)  # number of views
    enter.write("Total No of views:\n%s\n" % (var))
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[1]/div[1]/span" % (i)).text.encode('utf-8')
    print(var)  # title
    enter.write("Title:\n%s\n" % (var))
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[1]/a/div[2]/div/div[2]" % (i)).text.encode('utf-8')
    print(var)  # owner's bio
    enter.write("Writer's Bio:\n%s\n" % (var))
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[1]/a/div[2]/div/div[1]/strong" % (i)).text.encode('utf-8')
    print(var)  # owner's name
    enter.write("Writer's Name:\n%s\n" % (var))
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[2]/div/span" % (i)).text.encode('utf-8')
    print(var)  # comments
    enter.write("Total Comments:\n%s\n" % (var))
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[1]/div/span" % (i)).text.encode('utf-8')
    print(var)  # WhatsApp shares
    enter.write("Whatsapp Share:\n%s\n" % (var))
    print()
    # driver.save_screenshot("captcha_%s.png" % (i))
    driver.back()
driver.quit()
enter.close()
Upvotes: 0
Views: 293
Reputation: 14135
Here is the refactored code. I added the tags and image logic at the end.
import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
serviceurl = 'https://sharechat.com/trending/Hindi'
files = "dataset_link_1.txt"
# if not os.path.exists(files):
# file(files, 'w').close()
enter = open(files,'w');
url = serviceurl
driver = webdriver.Firefox(executable_path=r'D:\CHIT CHAT\Scrapper\geckodriver');
driver.maximize_window()  # maximize the browser window
driver.get(url)
driver.implicitly_wait(3)  # implicit wait of 3 seconds
while driver.execute_script("return document.readyState") != 'complete':
    pass
for i in range(1, 20):
    SCROLL_PAUSE_TIME = 0.5
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
# get the number of feed cards
feedCards = driver.find_elements_by_xpath("//section[@class='post-batch']//div[contains(@class,'feedCard')]")
for ifeedCard in range(len(feedCards)):
    # get number of views
    watches = driver.find_element_by_xpath("(//section[@class='post-batch']//div[contains(@class,'feedCard')])[" + str(ifeedCard+1) + "]//div[contains(@class,'lhcaption')]/div[1]").text.encode('utf-8')
    print(watches)
    enter.write("Total No of views:\n%s\n" % (watches))
    # get title
    title = driver.find_element_by_xpath("(//section[@class='post-batch']//div[contains(@class,'feedCard')])[" + str(ifeedCard+1) + "]//span[contains(@class,'darkText')]").text.encode('utf-8')
    print(title)
    enter.write("Title:\n%s\n" % (title))
    # get owner bio
    writerBio = driver.find_element_by_xpath("(//section[@class='post-batch']//div[contains(@class,'feedCard')])[" + str(ifeedCard+1) + "]//div[contains(@class,'Pstart')]//div[contains(@class,'darkTextSecondary')]").text.encode('utf-8')
    print(writerBio)
    enter.write("Writer's Bio:\n%s\n" % (writerBio))
    # get owner name
    writerName = driver.find_element_by_xpath("(//section[@class='post-batch']//div[contains(@class,'feedCard')])[" + str(ifeedCard+1) + "]//strong").text.encode('utf-8')
    print(writerName)
    enter.write("Writer Name:\n%s\n" % (writerName))
    # get number of comments
    comment = driver.find_element_by_xpath("(//section[@class='post-batch']//div[contains(@class,'feedCard')])[" + str(ifeedCard+1) + "]//button[@aria-label='Click to comment']//span").text.encode('utf-8')
    print(comment)
    enter.write("Number of comments:\n%s\n" % (comment))
    # get WhatsApp share count
    whatsApp = driver.find_element_by_xpath("(//section[@class='post-batch']//div[contains(@class,'feedCard')])[" + str(ifeedCard+1) + "]//button[@aria-label='Click to share']//span").text.encode('utf-8')
    print(whatsApp)
    enter.write("Whatsapp Share:\n%s\n" % (whatsApp))
    # get tags
    tags = driver.find_element_by_xpath("(//section[@class='post-batch']//div[contains(@class,'feedCard')])[" + str(ifeedCard+1) + "]//div[contains(@class,'primaryDark')]").text.encode('utf-8')
    print(tags)
    enter.write("Tags:\n%s\n" % (tags))
    # get owner image
    image = driver.find_element_by_xpath("(//section[@class='post-batch']//div[contains(@class,'feedCard')])[" + str(ifeedCard+1) + "]//img").get_attribute('src')
    print(image)
    enter.write("Owner Image link:\n%s\n" % (image))
    # get post image
    postImage = driver.find_element_by_xpath("(//section[@class='post-batch']//div[contains(@class,'feedCard')])[" + str(ifeedCard+1) + "]//a[@class='D(b)']").get_attribute('href')
    print(postImage)
    enter.write("post image link:\n%s\n" % (postImage))
driver.quit()
enter.close()
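If you also want to save the post image itself once you have its URL, a minimal sketch along the lines below should work inside the loop above (the output file name pattern is my own placeholder, not part of the original code):

import urllib.request
# save the image that postImage points to; adjust the target path/name as needed
urllib.request.urlretrieve(postImage, "post_%s.jpg" % (ifeedCard + 1))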
If you want the browser to download files to a different folder, use the profile settings below.
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.download.manager.showWhenStarting", False)
profile.set_preference("browser.download.dir", 'Here goes your folder where you want to download')
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/x-gzip")
Once the file is downloaded, use the line below to rename it to the desired name.
os.rename(download_file_name,desired_name) # you can pass the file name with path.
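Because the download finishes asynchronously, you may need to wait for the file before renaming it. A rough sketch, under the assumption that Firefox keeps a temporary .part file next to the download while it is in progress:

# wait until the download exists and Firefox's temporary .part file is gone
while not os.path.exists(download_file_name) or os.path.exists(download_file_name + '.part'):
    time.sleep(1)
os.rename(download_file_name, desired_name)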
Upvotes: 2
Reputation: 3790
I changed the web driver path and the range variable. If you create a folder C:\Py, my code below will output a text file called PageSource_StackOverflowQ2.txt with the image src paths.
I was having a lot of issues with binary characters in the HTML, so there is probably a much better way to do this, but hopefully this will help get you where you are trying to go.
Note that if an image path itself contains the 9-character sequence " title=", my code will break.
import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
serviceurl = 'https://sharechat.com/trending/Hindi'
# files = "dataset_link_1.txt"
enter = open('C:\\Py\\dataset_link_1.txt','w+')
# if not os.path.exists(files):
# file(files, 'w').close()
# enter = open(files,'w');
url = serviceurl
# driver = webdriver.Firefox(executable_path='D:\CHIT CHAT\Scrapper\geckodriver');
driver = webdriver.Firefox(executable_path=r'C:\Py\geckodriver.exe')
driver.maximize_window()  # maximize the browser window
driver.get(url)
driver.implicitly_wait(3)  # implicit wait of 3 seconds
while driver.execute_script("return document.readyState") != 'complete':
    pass
# for i in range(1, 20):
for i in range(1, 2):
    SCROLL_PAUSE_TIME = 0.5
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[3]/div[1]" % (i)).text.encode('utf-8')
    print(var)  # number of views
    enter.write("Total No of views:\n%s\n" % (var))
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[1]/div[1]/span" % (i)).text.encode('utf-8')
    print(var)  # title
    enter.write("Title:\n%s\n" % (var))
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[1]/a/div[2]/div/div[2]" % (i)).text.encode('utf-8')
    print(var)  # owner's bio
    enter.write("Writer's Bio:\n%s\n" % (var))
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[1]/a/div[2]/div/div[1]/strong" % (i)).text.encode('utf-8')
    print(var)  # owner's name
    enter.write("Writer's Name:\n%s\n" % (var))
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[2]/div/span" % (i)).text.encode('utf-8')
    print(var)  # comments
    enter.write("Total Comments:\n%s\n" % (var))
    var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[1]/div/span" % (i)).text.encode('utf-8')
    print(var)  # WhatsApp shares
    enter.write("Whatsapp Share:\n%s\n" % (var))
    # dump the rendered page source to a file
    PageSource1 = [driver.page_source]
    PageSource1 = PageSource1[0].encode("utf-8")
    file = open('C:\\Py\\PageSource_StackOverflowQ.txt', 'ab')
    file.write(PageSource1)
    file.close()
    # strip non-ASCII characters into a second file
    open('C:\\Py\\PageSource_StackOverflowQ1.txt', 'w').close()
    with open('C:\\Py\\PageSource_StackOverflowQ.txt', "rb") as outfile, open('C:\\Py\\PageSource_StackOverflowQ1.txt', "a") as f1:
        for line in outfile:
            uline = line.decode('ascii', errors='ignore')
            f1.write(uline)
    # scan the cleaned page source for img src="..." entries
    with open('C:\\Py\\PageSource_StackOverflowQ1.txt') as f, open('C:\\Py\\PageSource_StackOverflowQ2.txt', "w") as f1:
        data = f.readlines()
        for j in range(len(data)):
            line = data[j]
            if ("img src" in line):
                q = line.split("><")
                for k in q:
                    if ("img src" in k):
                        h = 0
                        while h < len(k):
                            l = h + 9
                            if k[h:l] == '" title="':
                                # everything between 'img src="' and '" title="' is the image URL
                                f1.write(k[9:h])
                                f1.write('\n')
                                print(h)
                                print(k[9:h])
                            h = h + 1
    print()
    # driver.save_screenshot("captcha_%s.png" % (i))
    driver.back()
driver.quit()
enter.close()
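As noted above, there is probably a cleaner way to do the extraction. For example, a regular expression over driver.page_source should pull the same src values without writing intermediate files; a rough sketch (run it before driver.quit(), while the page is still loaded):

import re
# collect every src="..." inside an <img> tag from the rendered page
image_urls = re.findall(r'<img[^>]+src="([^"]+)"', driver.page_source)
for src in image_urls:
    print(src)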
Upvotes: 0