Mian Muhammad

Reputation: 516

How to scrape all hrefs using cheerio or puppeteer?

I have a scenario where I need to get all the social media links from a website. If all the social links are on the home page, it is easy to fetch every social media link. Below are my code samples:

Using cheeriojs

const cheerio = require('cheerio')
const axios = require('axios')
const https = require('https');

const agent = new https.Agent({
    rejectUnauthorized: false
});
// process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';

const getLinks = async (url) => {

    try {
        let body = await axios.get(url, { httpsAgent: agent })
        let hrefs = []
        let $ = cheerio.load(body.data)
        let links = $('a')

        links.each((i, link) => {
            hrefs.push($(link).attr('href'))
        })
        return hrefs
    } catch (error) {
        return error
    }

}

const getSocialLinks = async (socialLinks, url) => {
    try {
        let hrefs = await getLinks(url)

        let handles = []
        hrefs.forEach(href => {
            if (href) {
                for (const link of socialLinks) {
                    if (href.includes(link)) {
                        handles.push({ platform: link, handle: href })
                        break
                    }
                }
            }
        })

        console.log(handles);
    } catch (error) {
        console.log(error)
    }
}

getSocialLinks(['facebook', 'twitter', 'instagram', 'youtube', 'linkedin'], 'https://synavos.com')

It works just fine if all the social media links are on the home page, but I am not able to figure out what to do if the social media links are on some other page of the given website.

Below is the same code sample using puppeteer:

const puppeteer = require('puppeteer')

const getHrefsAttributes = async (website) => {
    try {
        const browser = await puppeteer.launch({
            headless: true,
            ignoreHTTPSErrors: true
        });
        const [page] = await browser.pages();

        await page.goto(website, { waitUntil: 'networkidle2', timeout: 3000000 });

        const hrefs = await page.evaluate(() => Array.from(document.querySelectorAll('a[href]'), a => a.getAttribute('href')));

        await browser.close();
        return hrefs
    } catch (err) {
        console.error(err);
    }
}

const getSocialLinks = async (url, socialLinks) => {
    let hrefs = await getHrefsAttributes(url)
    // add array for social links which you want to fetch
    let handles = []
    hrefs.forEach(href => {
        for (const link of socialLinks) {
            if (href.includes(link)) {
                handles.push({ platform: link, handle: href })
                break
            }
        }
    })

    console.log(handles);
}

getSocialLinks('https://synavos.com/', ['facebook', 'twitter', 'instagram', 'youtube', 'linkedin'])

For example, this URL https://netsoltech.com/ doesn't have its social media links on the home page.

Upvotes: 2

Views: 1137

Answers (1)

ggorlen

Reputation: 56885

You can use a stack/recursion (depth-first) or queue (breadth-first) and run a search up to a certain depth, keeping a set of visited URLs to avoid loops.

A nice design might use a generator so you can potentially keep searching indefinitely, until you hit a certain depth or find a certain number of results, etc. This gives the caller more flexibility at the expense of added verbosity.

You may wish to tweak the href check or URL to ensure you're always on the base site and aren't pulling links from some other domain, but the basic idea is the same.
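
For example, one rough way to keep the crawl on the original site is to resolve each href against the page it came from and compare hostnames (a sketch only; the sameSite helper is illustrative, and treating "same hostname" as "same site" is an assumption):

// Sketch: keep an href only if it resolves to the same hostname as the base site
const sameSite = (href, pageUrl, baseHostname) => {
  try {
    return new URL(href, pageUrl).hostname === baseHostname;
  } catch {
    return false; // ignore hrefs that aren't valid URLs, e.g. "javascript:void(0)"
  }
};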

Note also that this is purely sequential and therefore slow. You may want to parallelize requests using an asynchronous task queue.

const axios = require("axios");
const cheerio = require("cheerio");

const headers = {
  "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
};

async function *findLinksRecursively(baseUrl, maxDepth) {
  // URLs we've already fetched, so pages aren't crawled twice and loops are avoided
  const visited = new Set();

  // Breadth-first search: the queue holds pages to visit along with their depth
  for (const queue = [{url: baseUrl, depth: 0}]; queue.length;) {
    const {url, depth} = queue.shift();

    if (depth < maxDepth && !visited.has(url)) {
      visited.add(url);
      const {data} = await axios.get(url, {headers})
        .catch(() => ({data: ""})) // treat failed requests as empty pages
      ;
      const $ = cheerio.load(data);
      // collect hrefs, skipping pure fragment links, and make relative URLs absolute
      const links = [...$('a[href]:not(a[href^="#"])')
        .map((i, e) => e.attribs["href"])
      ].map(e => e.startsWith("http") ? e : baseUrl + e);
      queue.push(...links.map(e => ({url: e, depth: depth + 1})));
      yield links;
    }
  }
}

(async () => {
  const baseUrl = "https://netsoltech.com";
  const socialKeywords = [
    "facebook", "twitter", "instagram", "youtube", "linkedin"
  ];
  const links = new Set();

  // Drive the generator manually; a for await...of loop would also work here
  for (const gen = findLinksRecursively(baseUrl, 2);;) {
    const {value, done} = await gen.next();

    if (done) break;

    value.forEach(e => links.add(e));
  }
  
  const socialLinks = [...links]
    .filter(e => socialKeywords.find(x => e.includes(x)))
  ;
  console.log(socialLinks);
})();
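
As a rough illustration of the parallelization note above, one option is to fetch every URL discovered at a given depth concurrently with Promise.all (a sketch only; fetchPage is a hypothetical helper wrapping the axios + cheerio steps, not part of the code above):

// Sketch: fetch all URLs at one depth level concurrently.
// fetchPage(url) is assumed to resolve to that page's array of hrefs.
const crawlLevel = async (urls, fetchPage) => {
  const results = await Promise.all(
    urls.map(url => fetchPage(url).catch(() => []))
  );
  return results.flat(); // all hrefs found at this depth, ready to enqueue
};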

The concept is pretty much the same in Puppeteer, only you're using page.goto instead of Axios and page.$$eval (or page.evaluate, as in the question) instead of Cheerio.
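
As a rough sketch of what that extraction step might look like (getHrefs is an illustrative helper, and it assumes a page obtained from puppeteer.launch() as in the question's code):

const getHrefs = async (page, url) => {
  try {
    await page.goto(url, { waitUntil: "domcontentloaded" });
    // a.href is already resolved to an absolute URL by the browser
    return await page.$$eval('a[href]:not([href^="#"])', els => els.map(a => a.href));
  } catch {
    return []; // treat failed navigations as pages with no links
  }
};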

Upvotes: 1
