Reputation: 51
So far I am able to get my code to export the links of the first page, but I'm having trouble inserting the subsequent pages' links. I just want the search to go one level deep: follow each link once, scrape that page's links, insert them into my objects array, and then export everything to a JSON file. Can someone tell me what my function is missing?
const cheerio = require('cheerio');
const fs = require('fs');
const chalk = require('chalk');
const axios = require('axios');

const START_URL = 'https://www.reddit.com/r/Frontend/';
const outputFile = 'data.json';

var results = {};
results.sites = [];
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
var baseUrl = url.protocol + '//' + url.hostname;
var MAX_PAGES_TO_VISIT = 1;

pagesToVisit.push(START_URL);

const crawl = () => {
  if (numPagesVisited >= MAX_PAGES_TO_VISIT) {
    console.log('Reached max limit of number of pages to visit.');
    return;
  }
  var nextPage = pagesToVisit.pop();
  if (nextPage in pagesVisited) {
    // We've already visited this page, so repeat the crawl
    crawl();
  } else {
    // New page we haven't visited
    collectURLS(nextPage, crawl);
  }
};

const collectURLS = async (url, callback) => {
  pagesVisited[url] = true;
  numPagesVisited++;
  const { data } = await axios.get(START_URL);
  const $ = cheerio.load(data);
  if (results === null) {
    $("a[href^='http']").each((i, elem) => {
      const link = $(elem).attr('href');
      pagesToVisit.push(baseUrl + $(this).attr('href'));
      if (results === null) {
        var obj = {
          url: link,
          links: [],
        };
        results.sites.push(obj);
        callback();
      }
    });
  } else if (results !== null) {
    $("a[href^='http']").each((i, elem) => {
      const link = $(elem).attr('href');
      results.sites.links.push(link);
    });
  }
  exportResults();
};

// Export results object to json file
const exportResults = () => {
  fs.writeFile(outputFile, JSON.stringify(results, null, 4), function (err) {
    if (err) throw err;
    console.log('complete');
  });
  console.log(
    chalk.black.bgWhite(
      `\n ${chalk.underline.bold(
        results.length
      )} Results exported successfully to ${chalk.underline.bold(outputFile)}\n`
    )
  );
};

crawl();
I want my output to look like this:
{
  "sites": [
    {
      "url1": "holds this page's link and was scraped from the original URL",
      "links": ["where this URL's scraped links will go"]
    },
    {
      "url2": "holds this page's link and was scraped from the original URL",
      "links": ["where this URL's scraped links will go"]
    },
    ...
  ]
}
Upvotes: 1
Views: 1140
Reputation: 54984
I would start with this because it's much simpler to follow:
const crawl = async () => {
  var nextPage
  while (nextPage = pagesToVisit.pop()) {
    await collectURLS(nextPage)
  }
  exportResults()
}
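To go with that loop, here's a minimal, untested sketch of a collectURLS that produces the output shape you described, assuming the same globals (results, pagesVisited, pagesToVisit) and the axios/cheerio setup from your question. The key fixes: fetch the url argument rather than START_URL, branch on results.sites.length instead of results === null (results is never null, since you initialize it up front), and push each link's href directly (your selector only matches absolute URLs, so no baseUrl prefix is needed):

const collectURLS = async (url) => {
  pagesVisited[url] = true;
  const { data } = await axios.get(url); // fetch the page we were handed, not START_URL
  const $ = cheerio.load(data);

  if (results.sites.length === 0) {
    // First page: create a { url, links } entry per link and queue that link
    $("a[href^='http']").each((i, elem) => {
      const link = $(elem).attr('href');
      results.sites.push({ url: link, links: [] });
      pagesToVisit.push(link);
    });
  } else {
    // Depth-1 page: store this page's links on the entry created for it earlier
    const site = results.sites.find((s) => s.url === url);
    $("a[href^='http']").each((i, elem) => {
      site.links.push($(elem).attr('href'));
    });
  }
};

Because the depth-1 branch never queues anything new, the while loop drains pagesToVisit after one level, which gives you the "one deep" behavior you asked for without the MAX_PAGES_TO_VISIT counter.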
Also, make sure not to use this in arrow functions unless you understand how they differ from normal functions.
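In cheerio's .each, this is only bound to the current element inside a regular function (for jQuery compatibility); in your arrow callback, $(this) does not refer to the anchor tag, so use the elem parameter instead:

$("a[href^='http']").each((i, elem) => {
  const link = $(elem).attr('href'); // works: elem is the current <a> element
  // $(this).attr('href')            // does not: `this` comes from the enclosing scope
});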
Upvotes: 1