Strontium_99
Strontium_99

Reputation: 1813

Node puppeteer scraping YouTube and encountering redirected you too many times

I'm trying to scrape a YouTube playlists URL using Node / puppeteer. It was working, but now I'm getting ERR_TOO_MANY_REDIRECTS error. I can still access the page using chrome from my desktop.

I've tried using the chromium browser and chrome browsers. I've also tried using the puppeteer-extra stealth plugin and the random-useragent.

This is how my code stands at the moment:

// Placeholder: substitute the playlist URL that should be scraped.
const playlistUrl = "<playlist-url>";

// Launch the locally installed Chrome with a visible window.
const browser = await puppeteer.launch({
  // NOTE(review): `stealth` is not among the documented puppeteer.launch options; with
  // puppeteer-extra, stealth is normally enabled via puppeteer.use(StealthPlugin()) — confirm.
  stealth: true,
  headless: false, // switch to true to run without a visible window
  executablePath: "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
  args: ["--disable-notifications", "--disable-features=site-per-process"],
  defaultViewport: null,
});
const page = await browser.newPage();
// Use a randomly chosen user-agent string for this page.
await page.setUserAgent(random_useragent.getRandom());
await page.goto(playlistUrl, {
  waitUntil: "networkidle2",
  timeout: 0, // per the Puppeteer docs, 0 disables the navigation timeout
});

// Wait for the cookie-consent button to appear before continuing.
await page.waitForSelector(
  'button[aria-label="Agree to the use of cookies and other data for the purposes described"]'
);

It bombs at the page.goto call, and it happens even if I try going to https://www.youtube.com.

Any suggestions on what I should try next? I tried a proxy server but couldn't get it to work. I suspect I need a proxy to actually route through.

Upvotes: 1

Views: 1785

Answers (2)

Mikhail Zub
Mikhail Zub

Reputation: 474

You can get playlist (and Mix) links from YouTube as in the code example below (also check the full code in the online IDE):

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

// Free-text query to run against YouTube search.
const searchString = "java course";

// Pieces used to build the search URL and to turn relative hrefs into absolute links.
const requestParams = {
  baseURL: `https://www.youtube.com`,
  // encodeURIComponent (not encodeURI) because this value is placed inside a query string:
  // encodeURI leaves reserved characters such as "&", "=", "+" and "#" unescaped, which
  // would truncate or corrupt searches that contain them.
  encodedQuery: encodeURIComponent(searchString),
};


/**
 * Collects the Mix ("ytd-radio-renderer") and playlist ("ytd-playlist-renderer") entries
 * that are direct children of "#contents" on the page's current document.
 *
 * Text fields are `undefined` when their element is absent. Links are `null` when the
 * anchor or its `href` is absent, instead of a base URL with "undefined"/"null" appended.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)`, e.g. a Puppeteer page.
 * @returns {Promise<Array<object>>} Mix entries followed by playlist entries.
 */
async function fillPlaylistsDataFromPage(page) {
  // The callback is handed to page.evaluate, so it must be self-contained: every helper
  // it needs is declared inside it and the base URL arrives through the explicit argument.
  return page.evaluate(({ baseURL }) => {
    // Trimmed text of the first match under `root`, or undefined when nothing matches.
    const trimmedText = (root, selector) => root.querySelector(selector)?.textContent.trim();

    // Absolute URL for an anchor's relative href, or null when either is missing.
    const absoluteLink = (anchor) => {
      const href = anchor?.getAttribute("href");
      return href ? `${baseURL}${href}` : null;
    };

    const thumbnailOf = (root) => root.querySelector("a#thumbnail #img")?.getAttribute("src");

    // Preview videos listed inside a Mix or playlist tile.
    const videosOf = (root) =>
      Array.from(root.querySelectorAll("ytd-child-video-renderer a"), (anchor) => ({
        title: trimmedText(anchor, "#video-title"),
        link: absoluteLink(anchor),
        length: trimmedText(anchor, "#length"),
      }));

    const mixes = Array.from(document.querySelectorAll("#contents > ytd-radio-renderer"), (mix) => ({
      title: trimmedText(mix, "a > h3 > #video-title"),
      link: absoluteLink(mix.querySelector("a#thumbnail")),
      videos: videosOf(mix),
      thumbnail: thumbnailOf(mix),
    }));

    const playlists = Array.from(
      document.querySelectorAll("#contents > ytd-playlist-renderer"),
      (playlist) => ({
        title: trimmedText(playlist, "a > h3 > #video-title"),
        link: absoluteLink(playlist.querySelector("a#thumbnail")),
        channel: {
          name: trimmedText(playlist, "#channel-name a"),
          link: absoluteLink(playlist.querySelector("#channel-name a")),
        },
        videoCount: trimmedText(
          playlist,
          "yt-formatted-string.ytd-thumbnail-overlay-side-panel-renderer"
        ),
        videos: videosOf(playlist),
        thumbnail: thumbnailOf(playlist),
      })
    );

    return [...mixes, ...playlists];
  }, requestParams);
}


/**
 * Opens a headed browser, loads the YouTube search-results page built from
 * `requestParams`, and scrapes it with `fillPlaylistsDataFromPage`.
 *
 * @returns {Promise<Array<object>>} Whatever `fillPlaylistsDataFromPage` resolves to.
 * @throws Rejects when launching, navigating, waiting for results, or scraping fails;
 *   the browser is closed before the rejection reaches the caller.
 */
async function getYoutubeSearchResults() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const page = await browser.newPage();
    // Named searchUrl rather than URL so the global URL constructor is not shadowed.
    const searchUrl = `${requestParams.baseURL}/results?search_query=${requestParams.encodedQuery}`;
    page.setDefaultNavigationTimeout(60000); // milliseconds
    await page.goto(searchUrl);
    // Scrape only once at least one video result has been rendered.
    await page.waitForSelector("#contents > ytd-video-renderer");
    return await fillPlaylistsDataFromPage(page);
  } finally {
    // Runs on success and on failure, so a failed scrape does not leave a browser running.
    await browser.close();
  }
}

// Print the scraped entries; report a failure instead of leaving the rejection unhandled.
getYoutubeSearchResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

📌Note: to get the thumbnail, you need to scroll the playlist into view (using the .scrollIntoView() method).

Output:

[
   {
      "title":"Java Complete Course | Placement Series",
      "link":"https://www.youtube.com/watch?v=yRpLlJmRo2w&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
      "channel":{
         "name":"Apna College",
         "link":"https://www.youtube.com/c/ApnaCollegeOfficial"
      },
      "videoCount":"35",
      "videos":[
         {
            "title":"Introduction to Java Language | Lecture 1 | Complete Placement Course",
            "link":"https://www.youtube.com/watch?v=yRpLlJmRo2w&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
            "length":"18:46"
         },
         {
            "title":"Variables in Java | Input Output | Complete Placement Course | Lecture 2",
            "link":"https://www.youtube.com/watch?v=LusTv0RlnSU&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
            "length":"42:36"
         }
      ],
      "thumbnail":null
   },
   {
      "title":"Java Tutorials For Beginners In Hindi",
      "link":"https://www.youtube.com/watch?v=ntLJmHOJ0ME&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
      "channel":{
         "name":"CodeWithHarry",
         "link":"https://www.youtube.com/c/CodeWithHarry"
      },
      "videoCount":"113",
      "videos":[
         {
            "title":"Introduction to Java + Installing Java JDK and IntelliJ IDEA for Java",
            "link":"https://www.youtube.com/watch?v=ntLJmHOJ0ME&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
            "length":"19:00"
         },
         {
            "title":"Basic Structure of a Java Program: Understanding our First Java Hello World Program",
            "link":"https://www.youtube.com/watch?v=zIdg7hkqNE0&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
            "length":"14:09"
         }
      ],
      "thumbnail":null
   }
]

You can read more about scraping YouTube playlists in the blog post "Web scraping YouTube secondary search results with Nodejs".

Upvotes: 1

Vaviloff
Vaviloff

Reputation: 16856

If all you need is playlist IDs for a given channel, it's possible to query a feed at:

https://youtube.com/feeds/videos.xml?channel_id=<Channel ID>

To get IDs of videos you can query a feed at:

https://youtube.com/feeds/videos.xml?playlist_id=<Playlist ID>

Upvotes: 2

Related Questions