Reputation: 99
How I can get the playlist urls stored like
here: https://www.youtube.com/watch?v=VpTRlS7EO6E&list=RDOIhVs0FQ8xc&index=5 with bs4?
Using
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1')
page = r.text
soup=bs(page,'html.parser')
#print(soup)
res=soup.find_all('ytd-playlist-panel-video-renderer')
print(res)
doesn't return anything. Even printing the soup itself doesn't contain the link I'am looking for (like href="/watch?v=puNOG62lf-Y&list=RDOIhVs0FQ8xc&index=2")
Upvotes: 1
Views: 7888
Reputation: 155
The easiest solution is:
from pytube import Playlist
URL_PLAYLIST = "https://www.youtube.com/playlist?list=YOUR-LINK"
# Retrieve URLs of videos from playlist
playlist = Playlist(URL_PLAYLIST)
print('Number Of Videos In playlist: %s' % len(playlist.video_urls))
print(playlist)
Upvotes: 6
Reputation: 101
This works for me:
from selenium import webdriver # pip install selenium
import time
# make sure you download chrome driver from https://chromedriver.chromium.org/downloads and put it in folder 'driver'
driver = webdriver.Chrome('driver\chromedriver.exe')
driver.get('https://www.youtube.com/playlist?list=PLxvodScTx2RtAOoajGSu6ad4p8P8uXKQk') # put here your link
# scroll page down
old_position = 0
new_position = None
position_script = """return (window.pageYOffset !== undefined) ?
window.pageYOffset : (document.documentElement ||
document.body.parentNode || document.body);"""
while new_position != old_position:
old_position = driver.execute_script(position_script)
time.sleep(1)
driver.execute_script(
"""var scrollingElement = (document.scrollingElement ||
document.body);scrollingElement.scrollTop =
scrollingElement.scrollHeight;""")
new_position = driver.execute_script(position_script)
source_page = driver.page_source
driver.quit()
# extract the url's and name's
counter = 1
element_to_find = 'amp;index={}" ar'
video_index = source_page.find(element_to_find.format(counter)) #'amp;index=1" ar'
while video_index != -1:
title_element = ''
count_name = video_index
while title_element != 'title="':
title_element = source_page[count_name: count_name + 7]
count_name += 1
count_name += 6
start_title_position = count_name
end_title = ''
while end_title != '>':
end_title = source_page[count_name] # exit loop if end_title == '>'
count_name += 1
name = source_page[start_title_position:count_name - 2] # extract the name of the video
name = name.replace('"','"')
video_id = source_page[video_index - 56: video_index - 45] # extract video id
print(str(counter)
+ '. link: ' + 'https://www.youtube.com/watch?v=' + video_id +
', name: ' + name)
counter += 1
video_index = source_page.find(element_to_find.format(counter)) # continue the next video
Upvotes: 0
Reputation: 99
Thank you! Here some dirty code working for me:
#---------------------------------
# import modules
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
import re
#---------------------------------
#
from webdriver_manager.firefox import GeckoDriverManager
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
#---------------------------------
# get links from url
def get_links(driver, sleep_time):
# open driver window
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get(url)
# wait some seconds
time.sleep(sleep_time)
# get information from url
soup = bs(driver.page_source,'html.parser')
res = soup.find_all('ytd-playlist-panel-video-renderer')
# check if there is information
if len(res) > 0:
main_url = 'https://www.youtube.com/watch?v='
urls = re.findall('watch.*list', str(res))
links = [main_url + str(a[8:-9]) for a in urls[::2]]
# if there is no information return false
else:
links = False
return links
#---------------------------------
# set sleep timer
sleep_time = 10
# call function to get links
links = get_links(driver, sleep_time)
Upvotes: 1
Reputation: 13339
It is a javascript rendered page. You have to use selenium.
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
url = 'https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get(url)
time.sleep(2)
soup=bs(driver.page_source,'html.parser')
res=soup.find_all('ytd-playlist-panel-video-renderer')
print(res)
Install the required package using pip install webdriver-manager
Upvotes: 2