MarkWP
MarkWP

Reputation: 177

How To Extract Duration and Thumbnail Url From YouTube Search Results Using Python and Selenium

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import csv

# Chrome start-up switches: the first three address SSL/certificate issues,
# the rest adjust logging, notifications and headless mode.
options = webdriver.ChromeOptions()
for _chrome_switch in (
    '--ignore-certificate-errors',
    '--ignore-ssl-errors',
    '--ignore-certificate-errors-spki-list',
    'log-level=3',
    '--disable-notifications',
    '--headless',  # Drop this entry to view browser actions
):
    options.add_argument(_chrome_switch)

# Search phrase sent to YouTube and recorded with every scraped row.
keyword = 'selenium python tutorial for beginners'

def get_video_results():
    """Scrape the title of every video in the YouTube results for ``keyword``.

    Starts Chrome with the module-level ``options``, searches YouTube for the
    module-level ``keyword`` wrapped in double quotes, clicks the consent
    button, scrolls until the ``#message`` element is displayed, and then
    reads the title text of each result.  The collected rows are printed and
    written to ``Extracted_info.csv``.  The browser is always closed, even
    when scraping fails part-way.

    Raises:
        selenium.common.exceptions.TimeoutException: if the consent button
            does not become clickable within 40 seconds.
    """
    from urllib.parse import quote_plus

    youtube_data = []

    driver = webdriver.Chrome(options=options)
    try:
        # URL-encode the keyword (spaces become '+') and wrap it in %22 (a
        # double quote) on both sides.
        driver.get('https://www.youtube.com/results?search_query=' + '%22' + quote_plus(keyword) + '%22')

        # Auto Consent Youtube
        consent_button_xpath = '/html/body/ytd-app/ytd-consent-bump-v2-lightbox/tp-yt-paper-dialog/div[2]/div[2]/div[5]/div[2]/ytd-button-renderer[2]/a' #full xpath captured
        # The wait returns the clickable element itself, so no second lookup is needed.
        consent = WebDriverWait(driver, 40).until(EC.element_to_be_clickable((By.XPATH, consent_button_xpath)))
        consent.click()

        scroll_to_bottom = (
            "var scrollingElement = (document.scrollingElement || document.body);"
            "scrollingElement.scrollTop = scrollingElement.scrollHeight;"
        )
        # Keep scrolling so more results load; stop once '#message' is visible
        # (presumably YouTube's end-of-results notice -- TODO confirm).
        while True:
            end_result = driver.find_element(By.CSS_SELECTOR, '#message').is_displayed()
            driver.execute_script(scroll_to_bottom)

            if end_result:
                break

        for result in driver.find_elements(By.CSS_SELECTOR, '.text-wrapper.style-scope.ytd-video-renderer'):
            title = result.find_element(By.CSS_SELECTOR, '.title-and-badge.style-scope.ytd-video-renderer').text
            # TODO: add 'duration' and 'thumbnail' once their selectors are known.
            youtube_data.append({
                'keyword': keyword,
                'title': title,
            })
    finally:
        # Always release the browser, even if a lookup above raised.
        driver.quit()

    # Show the last scraped row, guarding against an empty result set.
    if youtube_data:
        print(youtube_data[-1])

    df = pd.DataFrame(youtube_data)
    print('lists:\n', df)
    df.to_csv('Extracted_info.csv', index=False)

# Script entry point: run the scraper when this file is executed.
get_video_results()

I have the following sample Url:

https://www.youtube.com/results?search_query=%22selenium+python+tutorial+for+beginners%22

For this URL, the script above manages to extract the title of each video found.

I'm struggling to figure out how to extract the video duration and thumbnail url. I've tried using various class and ID combinations without success.

Any help with solving this problem would be much appreciated.

Upvotes: 0

Views: 889

Answers (1)

cruisepandey
cruisepandey

Reputation: 29362

You need to do the following:

  1. Scroll to each element.
  2. Look for the `a` tag, which has the `href` attribute.
  3. Look for the descendant web element whose `aria-label` attribute holds the duration.

Code :

# Print the link and the duration label of every video thumbnail on the
# search results page.
# NOTE: `driver_path` is not defined in this snippet; it must be set by the
# reader before running (presumably the chromedriver location -- confirm).
driver = webdriver.Chrome(driver_path)
driver.maximize_window()
driver.implicitly_wait(30)
wait = WebDriverWait(driver, 30)

try:
    driver.get("https://www.youtube.com/results?search_query=%22selenium+python+tutorial+for+beginners%22")

    thumbnail_xpath = "//ytd-thumbnail[@class='style-scope ytd-video-renderer']"
    thumbnail_count = len(driver.find_elements(By.XPATH, thumbnail_xpath))

    # XPath positions are 1-based. Each thumbnail is looked up again by
    # position instead of reusing the elements from the initial query.
    for position in range(1, thumbnail_count + 1):
        element = driver.find_element(By.XPATH, f"({thumbnail_xpath})[{position}]")
        # Scroll the thumbnail into view before reading it.
        driver.execute_script("arguments[0].scrollIntoView(true);", element)
        # href of the link wrapped by the thumbnail.
        thumbnail_href = element.find_element(By.XPATH, ".//a").get_attribute('href')
        print(thumbnail_href)
        # The time-status overlay carries the duration in its aria-label.
        # Named `duration` rather than `time` to avoid shadowing the stdlib module.
        duration = element.find_element(By.XPATH, ".//descendant::span[contains(@class,'ytd-thumbnail-overlay-time-status-renderer')]").get_attribute('aria-label')
        print(duration)
finally:
    # Close the browser even if a lookup fails part-way through.
    driver.quit()

Imports :

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

Output :

https://www.youtube.com/watch?v=xejR0FugnjY
13 minutes, 37 seconds
https://www.youtube.com/watch?v=bMo9QvRTJ0Y
56 minutes, 36 seconds
https://www.youtube.com/watch?v=zogbqJGBVvQ
9 hours, 20 minutes, 45 seconds
https://www.youtube.com/watch?v=wVPwG7o4Gbg
1 hour, 47 minutes, 35 seconds
https://www.youtube.com/watch?v=xlyxH6bcS9Y
47 minutes, 54 seconds
https://www.youtube.com/watch?v=MXuKl963oIM
5 minutes, 7 seconds
https://www.youtube.com/watch?v=879YnSl80b0
14 minutes, 23 seconds
https://www.youtube.com/watch?v=mcX_dIkBf3U
32 minutes, 15 seconds
https://www.youtube.com/watch?v=GqsgrJyuoMw
26 minutes, 1 second
https://www.youtube.com/watch?v=SGFDoLfdRb4
16 minutes, 22 seconds
https://www.youtube.com/watch?v=o3tYiyE_OXE
8 hours, 22 minutes, 3 seconds
https://www.youtube.com/watch?v=FRn5J31eAMw
11 hours, 37 minutes, 39 seconds
https://www.youtube.com/watch?v=f7LEWxX4AVI
11 minutes, 2 seconds
https://www.youtube.com/watch?v=VBBz9b7Fcuo
7 minutes, 39 seconds
https://www.youtube.com/watch?v=JSxmN1kMvm4
36 minutes, 37 seconds
https://www.youtube.com/watch?v=ATigYVyCoAE
7 minutes, 56 seconds
https://www.youtube.com/watch?v=XEtHoiBLw4g
8 minutes, 11 seconds
https://www.youtube.com/watch?v=pguUQjJWMSQ
11 minutes, 12 seconds
https://www.youtube.com/watch?v=Q0YD8KYoN8c
12 minutes, 53 seconds
https://www.youtube.com/watch?v=bKI16wx_FVU
9 minutes, 45 seconds
https://www.youtube.com/watch?v=28zdhLPZ1Zk
15 minutes
https://www.youtube.com/watch?v=Iw63sO2h4Zg
4 minutes, 8 seconds

Upvotes: 1

Related Questions