Reputation: 510
The goal is to scrape all YouTube channel links from the comment section of a YouTube video. The current code only gets the usernames, not the channel links, and it doesn't look at the users inside replies. I don't understand how to do this or why my XPaths are wrong.
Code:
from selenium import webdriver
import time
# Open the video page in a Chrome session and give it time to render.
driver = webdriver.Chrome()
driver.get('https://www.youtube.com/watch?v=_p2NvO6KrBs')
time.sleep(5)

# Scrolling: comments are loaded lazily, so scroll down a few times first.
for _ in range(4):
    # scroll 1000 px further down from the current position
    driver.execute_script('window.scrollTo(0,(window.pageYOffset+1000))')
    # waiting for the page to load
    time.sleep(1.5)

# replies
# NOTE: find_element_* returns only the FIRST match, so only one
# "more replies" toggle is clicked here.
replies = driver.find_element_by_xpath('//*[@id="more-replies"]')
time.sleep(1)
replies.click()

comment_div = driver.find_element_by_xpath('//*[@id="contents"]')
# NOTE: an XPath starting with '//' is evaluated from the document root even
# when called on an element; './/*[@id="author-text"]' would restrict the
# search to descendants of comment_div.
comments = comment_div.find_elements_by_xpath('//*[@id="author-text"]')
for comment in comments:
    # .text is the element's visible text (the user name, per the question),
    # not the link target.
    print(comment.text)
Upvotes: 1
Views: 809
Reputation: 356
You need to get the href attribute if you want the channel url:
# `comments` is the list of author-text elements collected in the question's code.
for comment in comments:
    # The channel URL is the link target (href attribute), not the visible text.
    print(comment.get_attribute('href'))
If you want the channels of each reply (of each comment) too then you can try the following. I've added comments on some lines for context...
# Print the channel URL of every top-level comment and of each of its replies.
# Relies on `driver` and `time` from the question's script.
main_comments = driver.find_elements_by_css_selector('#contents #comment')  # get all the comments

for mc in main_comments:
    main_comment_channel = mc.find_element_by_id('author-text').get_attribute('href')
    print('The commenters channel is: ' + main_comment_channel)  # print the channel of the main comment

    # '..' steps up to the comment's parent element, then the replies section
    # is looked up among that parent's descendants.
    replies = mc.find_element_by_xpath('..//*[@id="replies"]')  # get the replies section of the above comment

    if replies.text.startswith('View'):  # check if there are any replies
        replies.find_element_by_css_selector('a').click()  # if so, open the replies
        time.sleep(3)  # fixed wait for the replies to load (an explicit wait would be more robust)

        for reply in replies.find_elements_by_id('author-text'):
            reply_channel = reply.get_attribute('href')
            print('Reply channel: ' + reply_channel)  # print the channel of each reply
Full solution including writing to .txt file
# Write the channel URL of every comment author and reply author to output.txt.
# Relies on `driver` and `time` from the question's script.
# The `with` block closes the file even if a Selenium call raises part-way through.
with open("output.txt", "w+") as file:
    driver.get('https://www.youtube.com/watch?v=_p2NvO6KrBs')
    time.sleep(5)

    # new scrolling: keep going for as long as a continuation spinner element
    # is found on the page (find_elements_* returns an empty list when none is).
    while driver.find_elements_by_css_selector('#sections>#continuations #spinner'):
        # scroll 1000 px further down from the current position
        driver.execute_script('window.scrollTo(0,(window.pageYOffset+1000))')
        # waiting for the page to load
        time.sleep(1.5)

    main_comments = driver.find_elements_by_css_selector('#contents #comment')  # get all the comments

    for mc in main_comments:
        main_comment_channel = mc.find_element_by_id('author-text').get_attribute('href')
        file.write('The commenters channel is: ' + main_comment_channel + '\n')  # write the channel of the main comment to file

        # '..' steps up to the comment's parent element, then the replies
        # section is looked up among that parent's descendants.
        replies = mc.find_element_by_xpath('..//*[@id="replies"]')  # get the replies section of the above comment

        if replies.text.startswith('View'):  # check if there are any replies
            view_replies_link = replies.find_element_by_css_selector('a')
            driver.execute_script("arguments[0].scrollIntoView();", view_replies_link)  # bring "view replies" into view
            driver.execute_script('window.scrollTo(0,(window.pageYOffset-150))')  # cater for the youtube header
            view_replies_link.click()  # open the replies
            time.sleep(3)  # fixed wait for the replies to load (an explicit wait would be more robust)

            for reply in replies.find_elements_by_id('author-text'):
                reply_channel = reply.get_attribute('href')
                file.write('Reply channel: ' + reply_channel + '\n')  # write the channel of each reply to file
Upvotes: 2