Reputation: 154
I've created a script using the request and cheerio libraries to scrape the links of different provinces from this webpage, then use those URLs to parse the links of different offices from here. Finally, it uses those office links to scrape the title from here.
When I run the script, I can see that it does its job as expected until it gets stuck somewhere in its execution. When it gets stuck, it doesn't throw any error.
Here are the steps in image what the script is following:
Here is what I've tried with:
const request = require('request');
const cheerio = require('cheerio');
// Start page: the first URL the script requests, from which the province links are scraped.
const link = 'https://www.egyptcodebase.com/en/p/all';
// Prefix prepended to every scraped href to build a full URL.
const base_link = 'https://www.egyptcodebase.com/en/';
/**
 * Download one listing page and collect the link found in each table row.
 *
 * @param {string} link - URL of the listing page to download.
 * @returns {Promise<string[]>} Resolves with `base_link` + href for every row
 *   of `.table tbody` that contains an anchor. Rejects with the request error
 *   or with any error thrown while parsing.
 */
let getLinks = (link) => {
  return new Promise((resolve, reject) => {
    request(link, (error, response, html) => {
      // Check the transport error first: when the request fails `html` is
      // undefined, and parsing it would hide the real error.
      if (error) return reject(error);
      try {
        const $ = cheerio.load(html);
        const items = [];
        $('.table tbody tr').each((_index, row) => {
          const href = $(row).find('a[href]').attr('href');
          // Rows without an anchor would otherwise yield ".../undefined".
          if (href) items.push(`${base_link}${href}`);
        });
        return resolve(items);
      } catch (e) {
        return reject(e);
      }
    });
  });
};
/**
 * Download every page in `links` and collect the row links found on each one.
 *
 * @param {string[]} links - URLs of the pages to download (the province pages
 *   in this script).
 * @returns {Promise<string[][]>} Resolves with one array of full URLs per
 *   input URL, in input order. Rejects with the first request or parsing
 *   error.
 */
let getData = (links) => {
  const promises = links.map((nurl) => new Promise((resolve, reject) => {
    request(nurl, (error, response, html) => {
      // Bail out before parsing: `html` is undefined when the request failed.
      if (error) return reject(error);
      try {
        const $ = cheerio.load(html);
        // Each page gets its own array. A single accumulator shared by all
        // requests made every promise resolve to the same array, so flattening
        // the combined result repeated every link once per page.
        const pageLinks = [];
        $('.table tbody tr').each((_index, row) => {
          const href = $(row).find('a[href]').attr('href');
          if (href) pageLinks.push(`${base_link}${href}`);
        });
        return resolve(pageLinks);
      } catch (e) {
        return reject(e);
      }
    });
  }));
  return Promise.all(promises);
};
/**
 * Download every page in `links`, log and return the text of its first
 * `.home-title > h2` element.
 *
 * @param {string[]} links - URLs of the pages to download.
 * @returns {Promise<string[]>} Resolves with the titles in input order.
 *   Rejects with the first request error, or with an error thrown while
 *   reading the title.
 */
let FetchData = (links) => {
  // Adapts one callback-style request into a promise for that page's title.
  const fetchTitle = (pageUrl) => {
    return new Promise((resolve, reject) => {
      request(pageUrl, (error, response, html) => {
        if (error) {
          return reject(error);
        }
        const $ = cheerio.load(html);
        try {
          const heading = $(".home-title > h2").eq(0).text();
          console.log({ title: heading, itemLink: pageUrl });
          resolve(heading);
        } catch (e) {
          reject(e);
        }
      });
    });
  };

  // All requests are started at once; Promise.all keeps the input order.
  const pending = links.map((pageUrl) => fetchTitle(pageUrl));
  return Promise.all(pending);
};
// Entry point: start page -> per-page link lists -> flattened links -> titles,
// each title printed on its own line. Any rejection is reported via console.error.
(async function main() {
  const firstLevelLinks = await getLinks(link);
  const linkGroups = await getData(firstLevelLinks);
  const allLinks = linkGroups.flat(1);
  const titles = await FetchData(allLinks);
  titles.forEach((title) => {
    console.log(title);
  });
})().catch(console.error);
How can I make the script finish its execution process?
PS: Although the script appears to be big, the functions used in it are almost identical to each other except for the selectors.
Upvotes: 2
Views: 422
Reputation: 380
Ok, so on testing this code, I ran across two problems right off the bat:
resultSecond, containing the data from getData(), returned an Array-like Object, not an Array, so I wasn't able to use the flat(). So I created a function toArray that converts Objects to Arrays and added another variable after resultSecond called resultThird and used this function on resultSecond, turning it to an array.
flat() did not exist in the Array prototype, so I had to add it manually.
After resolving those issues, I was able to run your code, and experienced the hang you were talking about.
An ECONNRESET error occurred, and then proceeded to make probably a couple thousand requests before hanging. An ECONNRESET usually results from not handling asynchronous network errors or the server you're requesting decides to kill the connection. Not sure how the request module would handle such an event, but it seems like the module could potentially not be handling the network errors or terminated connections properly.
The issue was that you were making 15,000 requests to this site's API, so the API probably had a rate limiter, saw the number of requests and terminated most of them, but allowed a couple thousand to go through. Since you're not handling the terminated connections -- most likely due to the request module swallowing those errors -- it's "hanging" there with the node process not exiting.
So I batched the requests into intervals of 300 using the async module and it worked like a charm. No terminated connections because I didn't reach the rate limit. You could probably up the interval limit higher than 300.
However, I would suggest not using the request module and using another HTTP module like axios, which most likely handles these issues. You should consider using async when you're doing a ton of asynchronous requests. It has so many helpful methods. Lmk if you need more explanation of what the async module is doing here, but I'd advise reading the documentation first: https://caolan.github.io/async/v3/docs.html#mapLimit
const request = require('request');
const cheerio = require('cheerio');
const _async = require('async');
// Start page: the first URL the script requests.
const link = 'https://www.egyptcodebase.com/en/p/all';
// Prefix prepended to every scraped href to build a full URL.
const base_link = 'https://www.egyptcodebase.com/en/';
/**
 * Copy the own enumerable property values of `obj` into a new array.
 *
 * @param {?Object} obj - Array, array-like or plain object to read from.
 * @returns {Array} The values in property order; an empty array when `obj`
 *   is null or undefined.
 */
const toArray = (obj) => {
  // Object.values throws on null/undefined, where the old loop yielded [].
  if (obj == null) return [];
  // An unguarded for...in also walks inherited enumerable properties, so
  // anything added to a prototype would leak into the result.
  return Object.values(obj);
};
/**
 * Fallback for Array.prototype.flat, called with the array as `this`.
 *
 * @param {number} [depth=1] - How many levels of nested arrays to flatten.
 *   Values below 1 (and NaN) flatten nothing; Infinity flattens fully.
 * @returns {Array} A new array; the receiver is not modified.
 */
function flatPolyfill(depth = 1) {
  const levels = Math.trunc(Number(depth)) || 0;
  return this.reduce((acc, item) => {
    if (levels >= 1 && Array.isArray(item)) {
      // concat() spreads the returned array, which is this level's flattening;
      // the recursive call handles the remaining levels.
      return acc.concat(flatPolyfill.call(item, levels - 1));
    }
    acc.push(item);
    return acc;
  }, []);
}

// Install only when the runtime has no native flat(). Overwriting the native
// method unconditionally replaced it with a non-configurable, non-writable one.
if (typeof Array.prototype.flat !== 'function') {
  Object.defineProperty(Array.prototype, 'flat', {
    value: flatPolyfill,
    writable: true,
    configurable: true,
  });
}
/**
 * Download one listing page and collect the link found in each table row.
 *
 * @param {string} link - URL of the listing page to download.
 * @returns {Promise<string[]>} Resolves with `base_link` + href for every row
 *   of `.table tbody` that contains an anchor. Rejects with the request error
 *   or with any error thrown while parsing.
 */
let getLinks = (link) => {
  return new Promise((resolve, reject) => {
    request(link, (error, response, html) => {
      // Check the transport error first: when the request fails `html` is
      // undefined, and parsing it would hide the real error.
      if (error) return reject(error);
      try {
        const $ = cheerio.load(html);
        const items = [];
        $('.table tbody tr').each((_index, row) => {
          const href = $(row).find('a[href]').attr('href');
          // Rows without an anchor would otherwise yield ".../undefined".
          if (href) items.push(`${base_link}${href}`);
        });
        return resolve(items);
      } catch (e) {
        return reject(e);
      }
    });
  });
};
/**
 * Download every page in `links` and collect the row links found on each one.
 *
 * @param {string[]} links - URLs of the pages to download.
 * @returns {Promise<string[][]>} Resolves with one array of full URLs per
 *   input URL, in input order. Rejects with the first request or parsing
 *   error.
 */
let getData = (links) => {
  const promises = links.map((nurl) => new Promise((resolve, reject) => {
    request(nurl, (error, response, html) => {
      // Bail out before parsing: `html` is undefined when the request failed.
      if (error) return reject(error);
      try {
        const $ = cheerio.load(html);
        // Each page gets its own array. With one accumulator shared by all
        // requests, every promise resolved to the same array and flattening
        // the combined result repeated every link once per page, which
        // multiplied the number of follow-up requests.
        const pageLinks = [];
        $('.table tbody tr').each((_index, row) => {
          const href = $(row).find('a[href]').attr('href');
          if (href) pageLinks.push(`${base_link}${href}`);
        });
        return resolve(pageLinks);
      } catch (e) {
        return reject(e);
      }
    });
  }));
  return Promise.all(promises);
};
/**
 * Download every page in `links` with bounded concurrency, log and return the
 * text of its first `.home-title > h2` element.
 *
 * @param {string[]} links - URLs of the pages to download.
 * @param {number} [limit=300] - Maximum number of requests in flight at once,
 *   passed straight to `_async.mapLimit`.
 * @returns {Promise<string[]>} Resolves with the titles collected by
 *   `mapLimit`. Rejects with the first request or parsing error.
 */
let FetchData = (links, limit = 300) => {
  return new Promise((resolve, reject) => {
    const fetchTitle = (pageUrl, done) => {
      request(pageUrl, (error, response, html) => {
        // Return after reporting the error. Falling through would parse an
        // undefined body and invoke `done` a second time.
        if (error) return done(error);
        let title;
        try {
          const $ = cheerio.load(html);
          title = $('.home-title > h2').eq(0).text();
        } catch (e) {
          return done(e);
        }
        console.log({ title, itemLink: pageUrl });
        // Called outside the try block so that an exception thrown by `done`
        // itself cannot re-enter `done` through the catch.
        return done(null, title);
      });
    };
    _async.mapLimit(links, limit, fetchTitle, (err, results) => {
      if (err) return reject(err);
      return resolve(results);
    });
  });
};
// Entry point: start page -> per-page link lists -> flattened links -> titles,
// each title printed with a "title: " prefix. Any rejection is logged.
(async function main() {
  const firstLevelLinks = await getLinks(link);
  const linkGroups = await getData(firstLevelLinks);
  const linkGroupsArray = toArray(linkGroups);
  const allLinks = linkGroupsArray.flat(1);
  const titles = await FetchData(allLinks);
  titles.forEach((title) => {
    console.log("title: ", title);
  });
})().catch((err) => {
  console.log(err);
});
// Process-level safety net: log any uncaught exception or unhandled promise
// rejection to the console.
const logUnhandled = (err) => {
  console.log(err);
};
process.on('uncaughtException', logUnhandled);
process.on('unhandledRejection', logUnhandled);
Upvotes: 1