request
request

Reputation: 510

Use Selenium to click all youtube comment 'reply' buttons and get channel links

The goal is to scrape all YouTube channel links from a YouTube video's comment section. The current code only gets the usernames, not the channel links, and it doesn't look at the users inside replies. I don't understand how to do this or why my XPaths are wrong.

Code:

from selenium import webdriver
import time

# Start a Chrome session and open the target video page.
driver=webdriver.Chrome()

driver.get('https://www.youtube.com/watch?v=_p2NvO6KrBs')
# Fixed pause after navigation, before any scrolling starts.
time.sleep(5)

#Scrolling
# Four scroll steps of 1000 px each (4000 px in total).
for i in range(4):
    #scroll 1000 px
    driver.execute_script('window.scrollTo(0,(window.pageYOffset+1000))')
    #waiting for the page to load
    time.sleep(1.5) 


#replies
# NOTE(review): find_element_by_xpath (singular) returns a single element, so
# only one "more-replies" element is clicked here, not one per comment.
replies = driver.find_element_by_xpath('//*[@id="more-replies"]')
time.sleep(1)
replies.click()


# NOTE(review): an XPath that starts with '//' is evaluated against the whole
# document even when called on comment_div -- presumably not what was intended;
# a relative search would start with './/'.
comment_div=driver.find_element_by_xpath('//*[@id="contents"]')
comments=comment_div.find_elements_by_xpath('//*[@id="author-text"]')
for comment in comments:
    # .text prints the element's text (the username), not its link target.
    print(comment.text)

Upvotes: 1

Views: 809

Answers (1)

Prab G
Prab G

Reputation: 356

You need to get the href attribute if you want the channel url:

# Print the link target of each author element instead of its text.
for author_link in comments:
    channel_url = author_link.get_attribute('href')
    print(channel_url)

If you want the channels of each reply (of each comment) too then you can try the following. I've added comments on some lines for context...

# Gather every top-level comment element currently on the page.
comment_nodes = driver.find_elements_by_css_selector('#contents #comment')

for comment_node in comment_nodes:
    # The href of the author link is printed as the commenter's channel.
    author_link = comment_node.find_element_by_id('author-text')
    author_channel = author_link.get_attribute('href')
    print('The commenters channel is: ' + author_channel)

    # The replies container is looked up relative to the comment's parent.
    reply_section = comment_node.find_element_by_xpath('..//*[@id="replies"]')
    if not reply_section.text.startswith('View'):
        # No "View ..." text, so this comment is treated as having no replies.
        continue

    # Expand the replies, then pause for a fixed time while they load
    # (an explicit wait would be a better strategy than a fixed sleep).
    reply_section.find_element_by_css_selector('a').click()
    time.sleep(3)

    # Print the channel of each author found inside the expanded replies.
    for reply_author in reply_section.find_elements_by_id('author-text'):
        print('Reply channel: ' + reply_author.get_attribute('href'))

Full solution including writing to .txt file

# Scrape the channel link of every commenter (and of every reply author) on
# the video page and write one line per link to output.txt.
#
# The file is managed by a `with` block so it is closed even if one of the
# Selenium calls raises part-way through the scrape.
with open("output.txt", "w+") as file:
    driver.get('https://www.youtube.com/watch?v=_p2NvO6KrBs')
    time.sleep(5)

    # Keep scrolling for as long as an element matching the spinner selector
    # is still present (find_elements returns an empty list once it is gone).
    while driver.find_elements_by_css_selector('#sections>#continuations #spinner'):
        # scroll 1000 px
        driver.execute_script('window.scrollTo(0,(window.pageYOffset+1000))')
        # waiting for the page to load
        time.sleep(1.5)

    # get all the comments
    main_comments = driver.find_elements_by_css_selector('#contents #comment')

    for mc in main_comments:
        main_comment_channel = mc.find_element_by_id('author-text').get_attribute('href')
        # get_attribute returns None when the element has no href; skip the
        # entry instead of failing on `str + None`.
        if main_comment_channel:
            file.write('The commenters channel is: ' + main_comment_channel + '\n')

        # get the replies section of the above comment
        replies = mc.find_element_by_xpath('..//*[@id="replies"]')
        if not replies.text.startswith('View'):
            # No "View ..." text, so there are no replies to expand.
            continue

        view_replies_link = replies.find_element_by_css_selector('a')
        # Bring the link into view, then scroll back up 150 px so it is not
        # hidden underneath the page header before clicking it.
        driver.execute_script("arguments[0].scrollIntoView();", view_replies_link)
        driver.execute_script('window.scrollTo(0,(window.pageYOffset-150))')
        view_replies_link.click()
        # Fixed wait for the replies to load (an explicit wait would be a
        # better strategy here).
        time.sleep(3)

        for reply_author in replies.find_elements_by_id('author-text'):
            reply_channel = reply_author.get_attribute('href')
            if reply_channel:
                file.write('Reply channel: ' + reply_channel + '\n')

Upvotes: 2

Related Questions