Reputation: 63
I am trying to scrape usernames and comments from multiple Instagram posts, using the code below.
from selenium.webdriver.common.by import By
from selenium import webdriver
import time
import sys
import pandas as pd
from pandas import ExcelWriter
import os.path
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
# Instagram post URLs to scrape. Every entry carries its own trailing comma:
# adjacent string literals without one are silently concatenated into a
# single, invalid URL.
url = [
    'https://www.instagram.com/p/CRLe53_hmMH',
    'https://www.instagram.com/p/CRX7VL1sL54/?utm_medium=share_sheet',
    'https://www.instagram.com/p/CRVB7ykM7-R/?utm_medium=share_sheet',
    'https://www.instagram.com/p/CRQ9Bq5M6ce/?utm_medium=share_sheet',
    'https://www.instagram.com/p/CRQT1BJMmSi/?utm_medium=share_sheet',
    'https://www.instagram.com/p/CM8T3HgMQG0/?utm_medium=copy_link',
    'https://www.instagram.com/p/COrn5fYs78O/?utm_medium=share_sheet',
]

# Parallel accumulators: user_comments[i] is the text scraped for user_names[i].
user_names = []
user_comments = []

driver = webdriver.Chrome('E:/chromedriver')
try:
    # Load the first post and fill in the login form the waits below look for.
    driver.get(url[0])
    time.sleep(3)
    username = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
    password = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))
    username.clear()
    username.send_keys('myuname')
    password.clear()
    password.send_keys('mypassword')
    # click() returns None, so its result is not bound to a name.
    WebDriverWait(driver, 2).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()
    time.sleep(4)
    # Two "Not Now" buttons are dismissed one after the other.
    for _ in range(2):
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.XPATH, '//button[contains(text(), "Not Now")]'))).click()

    for post_url in url:
        try:
            driver.get(post_url)
            time.sleep(3)
            load_more_comment = driver.find_element_by_class_name(
                'glyphsSpriteCircle_add__outline__24__grey_9')
            print("Found {}".format(str(load_more_comment)))
            # Expand the comment list, clicking "load more" at most 10 times.
            clicks = 0
            while load_more_comment.is_displayed() and clicks < 10:
                load_more_comment.click()
                time.sleep(1.5)
                load_more_comment = driver.find_element_by_class_name(
                    'glyphsSpriteCircle_add__outline__24__grey_9')
                print("Found {}".format(str(load_more_comment)))
                clicks += 1
        except Exception as e:
            # Best effort: any failure while expanding comments (presumably the
            # "load more" button no longer being present) is reported, and
            # scraping continues with whatever is currently loaded.
            print(e)

        # Collect this post's rows separately so that dropping its first entry
        # can never remove a row that belongs to an earlier post.
        post_names = []
        post_comments = []
        for c in driver.find_elements_by_class_name('gElp9 '):
            container = c.find_element_by_class_name('C4VMK')
            name = container.find_element_by_class_name('_6lAjh ').text
            # NOTE(review): this takes the FIRST <span> under the container;
            # it is the lookup the question below is asking about.
            content = container.find_element_by_tag_name('span').text
            content = content.replace('\n', ' ').strip()
            post_names.append(name)
            post_comments.append(content)
            print(content)

        # Skip the first entry of each post (presumably the post's own caption
        # rather than a reader comment -- TODO confirm). Slicing is a no-op on
        # an empty list, unlike pop(0), which raises IndexError.
        user_names.extend(post_names[1:])
        user_comments.extend(post_comments[1:])
finally:
    # Always end the browser session, even when a step above raised.
    driver.quit()

df = pd.DataFrame(list(zip(user_names, user_comments)),
                  columns=['Name', 'Comments'])
# Uncomment to also write the table to a spreadsheet:
# df.to_excel('ujicoba_gabung_IG_6.xlsx')
print(df)
But somehow, instead of returning the username and the comment, both user_names and user_comments contain usernames. Where did I make a mistake?
Here are my outputs:
I think my problem is in the for loop, where I select the container by the class C4VMK. But when I inspected the element on Instagram, the class name was the same.
Upvotes: 0
Views: 841
Reputation: 115
Your container is correct. However, when you search for a span by tag name like this:
content = container.find_element_by_tag_name('span').text
Selenium will find the first span that is under the container, which in this case is the username span with the class 'Jv7Aj mArmR MqpiF '.
What you are looking for is the other span that I highlighted in the image, which is a direct child of the container with an empty class.
You can select it like this:
# The leading "./" keeps the XPath relative to `container` (a bare "/" is
# resolved from the document root); `.text` returns the span's text as a string.
content = container.find_element_by_xpath("./span[@class='']").text
Upvotes: 0
Reputation: 607
There are two span elements inside the C4VMK container.
The first is nested as h3 -> first div -> span (the username),
and the second is the one you want.
To get the second span, which holds the comment, replace your line with the one below; it takes the second matching element.
content = container.find_elements_by_tag_name('span')[1].text
Upvotes: 1