Reputation: 1813
I'm trying to scrape a YouTube playlists URL using Node / puppeteer. It was working, but now I'm getting ERR_TOO_MANY_REDIRECTS error. I can still access the page using chrome from my desktop.
I've tried using the chromium browser and chrome browsers. I've also tried using the puppeteer-extra stealth plugin and the random-useragent.
This is how my code stand at the moment:
const browser = await puppeteer.launch({
stealth: true,
headless: false // true,
executablePath: "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
args: [
'--disable-notifications', '--disable-features=site-per-process'
],
defaultViewport: null
});
const page = await browser.newPage()
await page.setUserAgent(random_useragent.getRandom());
await page.goto(<playlist-url, {
waitUntil: 'networkidle2',
timeout: 0
})
await page.waitForSelector('button[aria-label="Agree to the use of cookies and other data for the purposes described"')
It at the page.goto it bombs. And it happens even if I try going to https://www.youtube.com.
Any suggestions what I should try next. I tried a proxy server but couldn't get it to work. I suspect I need a proxy to actually route through.
Upvotes: 1
Views: 1785
Reputation: 474
You can get playlists (and Mixes) links from YouTube like in the code example below (also check full code the online IDE):
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const searchString = "java course";
const requestParams = {
baseURL: `https://www.youtube.com`,
encodedQuery: encodeURI(searchString), // what we want to search for in URI encoding
};
async function fillPlaylistsDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
const mixes = Array.from(document.querySelectorAll("#contents > ytd-radio-renderer")).map((el) => ({
title: el.querySelector("a > h3 > #video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
videos: Array.from(el.querySelectorAll("ytd-child-video-renderer a")).map((el) => ({
title: el.querySelector("#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.getAttribute("href")}`,
length: el.querySelector("#length")?.textContent.trim(),
})),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
}));
const playlists = Array.from(document.querySelectorAll("#contents > ytd-playlist-renderer")).map((el) => ({
title: el.querySelector("a > h3 > #video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
channel: {
name: el.querySelector("#channel-name a")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("#channel-name a")?.getAttribute("href")}`,
},
videoCount: el.querySelector("yt-formatted-string.ytd-thumbnail-overlay-side-panel-renderer")?.textContent.trim(),
videos: Array.from(el.querySelectorAll("ytd-child-video-renderer a")).map((el) => ({
title: el.querySelector("#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.getAttribute("href")}`,
length: el.querySelector("#length")?.textContent.trim(),
})),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
}));
return [...mixes, ...playlists];
}, requestParams);
return dataFromPage;
}
async function getYoutubeSearchResults() {
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
const URL = `${requestParams.baseURL}/results?search_query=${requestParams.encodedQuery}`;
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
await page.waitForSelector("#contents > ytd-video-renderer");
const playlists = await fillPlaylistsDataFromPage(page);
await browser.close();
return playlists;
}
getYoutubeSearchResults().then(console.log);
📌Note: to get thumbnail you need to scroll playlist into view (using .scrollIntoView()
method).
Output:
[
{
"title":"Java Complete Course | Placement Series",
"link":"https://www.youtube.com/watch?v=yRpLlJmRo2w&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"channel":{
"name":"Apna College",
"link":"https://www.youtube.com/c/ApnaCollegeOfficial"
},
"videoCount":"35",
"videos":[
{
"title":"Introduction to Java Language | Lecture 1 | Complete Placement Course",
"link":"https://www.youtube.com/watch?v=yRpLlJmRo2w&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"length":"18:46"
},
{
"title":"Variables in Java | Input Output | Complete Placement Course | Lecture 2",
"link":"https://www.youtube.com/watch?v=LusTv0RlnSU&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"length":"42:36"
}
],
"thumbnail":null
},
{
"title":"Java Tutorials For Beginners In Hindi",
"link":"https://www.youtube.com/watch?v=ntLJmHOJ0ME&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
"channel":{
"name":"CodeWithHarry",
"link":"https://www.youtube.com/c/CodeWithHarry"
},
"videoCount":"113",
"videos":[
{
"title":"Introduction to Java + Installing Java JDK and IntelliJ IDEA for Java",
"link":"https://www.youtube.com/watch?v=ntLJmHOJ0ME&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
"length":"19:00"
},
{
"title":"Basic Structure of a Java Program: Understanding our First Java Hello World Program",
"link":"https://www.youtube.com/watch?v=zIdg7hkqNE0&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
"length":"14:09"
}
],
"thumbnail":null
}
]
You can read more about scraping YouTube playlists from blog post Web scraping YouTube secondary search results with Nodejs.
Upvotes: 1
Reputation: 16856
If all you need is playlist IDs for a given channel, it's possible to query a feed at:
https://youtube.com/feeds/videos.xml?channel_id=<Channel ID>
To get IDs of videos you can query a feed at:
https://youtube.com/feeds/videos.xml?playlist_id=PLAYLIST_ID
Upvotes: 2