kayvera

Reputation: 51

How to recursively crawl a page, scrape links, follow links, then scrape again and export in Node.js?

So far I am able to get my code to export the links of the first page, but I am having trouble inserting the links from the subsequent pages. I only want the crawl to go one level deep: follow each link once, scrape that page's links, insert them into my objects array, and then export everything to a JSON file. Can someone please tell me what my function is missing?

const cheerio = require('cheerio');
const fs = require('fs');
const chalk = require('chalk');
const axios = require('axios');

const START_URL = 'https://www.reddit.com/r/Frontend/';
const outputFile = 'data.json';
var results = {};
results.sites = [];

var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
var baseUrl = url.protocol + '//' + url.hostname;
var MAX_PAGES_TO_VISIT = 1;

pagesToVisit.push(START_URL);

const crawl = () => {
  if (numPagesVisited >= MAX_PAGES_TO_VISIT) {
    console.log('Reached max limit of number of pages to visit.');
    return;
  }
  var nextPage = pagesToVisit.pop();
  if (nextPage in pagesVisited) {
    // We've already visited this page, so repeat the crawl
    crawl();
  } else {
    // New page we haven't visited
    collectURLS(nextPage, crawl);
  }
};

const collectURLS = async (url, callback) => {
  pagesVisited[url] = true;
  numPagesVisited++;

  const { data } = await axios.get(START_URL);
  const $ = cheerio.load(data);

  if (results === null) {
    $("a[href^='http']").each((i, elem) => {
      const link = $(elem).attr('href');
      pagesToVisit.push(baseUrl + $(this).attr('href'));
      if (results === null) {
        var obj = {
          url: link,
          links: [],
        };
        results.sites.push(obj);
        callback();
      }
    });
  } else if (results !== null) {
    $("a[href^='http']").each((i, elem) => {
      const link = $(elem).attr('href');
      results.sites.links.push(link);
    });
  }
  exportResults();
};

//Export results object to json file
const exportResults = () => {
  fs.writeFile(outputFile, JSON.stringify(results, null, 4), function (err) {
    if (err) throw err;
    console.log('complete');
  });
  console.log(
    chalk.black.bgWhite(
      `\n ${chalk.underline.bold(
        results.length
      )} Results exported successfully to ${chalk.underline.bold(outputFile)}\n`
    )
  );
};

crawl();

I want my output to look like this:

{
  "sites": [
    {
      "url1": "holds this page's link, scraped from the original URL",
      "links": [where this URL's scraped links will go]
    },
    {
      "url2": "holds this page's link, scraped from the original URL",
      "links": [where this URL's scraped links will go]
    },
    ...
  ]
}

Upvotes: 1

Views: 1140

Answers (1)

pguardiario

Reputation: 54984

I would start with this because it's much simpler to follow:

const crawl = async () => {
  var nextPage
  while (nextPage = pagesToVisit.pop()) {
    await collectURLS(nextPage)
  }
  exportResults()
}
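
To go with that loop, collectURLS could look roughly like the untested sketch below. It reuses the globals from your question (results, pagesVisited, pagesToVisit, START_URL) and assumes each entry in results.sites should be matched back to its page by url — adjust the lookup if your real structure differs:

const collectURLS = async (pageUrl) => {
  if (pagesVisited[pageUrl]) return
  pagesVisited[pageUrl] = true

  // Fetch the page you were given, not START_URL every time
  const { data } = await axios.get(pageUrl)
  const $ = cheerio.load(data)

  if (pageUrl === START_URL) {
    // First pass: create one entry per link and queue that link for a second pass
    $("a[href^='http']").each((i, elem) => {
      const link = $(elem).attr('href')
      results.sites.push({ url: link, links: [] })
      pagesToVisit.push(link)
    })
  } else {
    // Second pass: add this page's links to the entry created for it above.
    // Nothing new is queued here, so the crawl stops after one level.
    const site = results.sites.find(s => s.url === pageUrl)
    if (!site) return
    $("a[href^='http']").each((i, elem) => {
      site.links.push($(elem).attr('href'))
    })
  }
}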

Also, make sure not to use this in arrow functions unless you understand how it differs from normal functions.
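
For example, in your .each callback $(this) does not refer to the current element, because an arrow function takes this from the enclosing scope. Cheerio only binds this to the current element when you pass a normal function, so either use a normal function or stick with the elem argument:

// Normal function: cheerio binds `this` to the current element
$("a[href^='http']").each(function (i, elem) {
  console.log($(this).attr('href')) // works
})

// Arrow function: `this` comes from the outer scope, so use `elem` instead
$("a[href^='http']").each((i, elem) => {
  console.log($(elem).attr('href'))
})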

Upvotes: 1
