Reputation: 1245
Here's what I am doing in the code:
I am reading a text file with around 3,500 links, then reading each link, filtering the ones I want, and making a request to get the status code, link, and page title (using cheerio). After looping through roughly the 100th or 200th link I get "connect ETIMEDOUT 40...:443". The links look good. What's going on here? Is the web server kicking me out because it thinks this is a DDoS? I am doing this for a company I work for, and that is obviously not the intention. If any of you want to test with a large number of links, I used https://hackertarget.com/extract-links/ to get the links and then put them in a text file.
Here is my code
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var axios = require('axios');
const fs = require('fs');
const readline = require('readline');
var main = [];
var linkdata = [];
const rl = readline.createInterface({
    input: fs.createReadStream('C:/Users/Jay/Documents/Javascript/crawl/links.txt'),
    crlfDelay: Infinity
});
rl.on('line', (link) => {
    if (link.startsWith('https://www.example.com')) {
        var encodeLink = encodeURI(link)
        request(encodeLink, function (error, response, body) {
            console.log("Link: ", encodeLink)
            if (error) {
                console.log("Error:Request " + error);
                return; // no response to inspect when the request itself failed
            }
            // Parse the document body once so the title is available in every status branch
            var $ = cheerio.load(body);
            // Check status code (200 is HTTP OK)
            if (response.statusCode === 200) {
                var Status_200 = {
                    "status Code": response.statusCode,
                    "Page title:": $('title').text(),
                    "Original Link": encodeLink,
                }
                main.push(Status_200)
            }
            if (response.statusCode === 302 || response.statusCode === 404 || response.statusCode === 500) {
                var Status_Errors = {
                    "status Code": response.statusCode,
                    "Page title:": $('title').text(),
                    "Original Link": encodeLink,
                }
                main.push(Status_Errors)
            }
            //console.log(JSON.stringify(main))
            fs.writeFile("C:/Users/Jay/Documents/Javascript/crawl/output.json", JSON.stringify(main), (err) => {
                if (err) console.log(err);
                console.log("Successfully Written to File.");
            });
        })
    }
});
Upvotes: 0
Views: 2861
Reputation: 48
Put a try/catch around it since you are using async, to see if that helps with the memory error you are getting; it's probably good practice anyway.
try {
    const body = response.data;
    if (response.status === 200) {
        // do your thing
    }
    if (response.status === 302 || response.status === 404 || response.status === 500) {
        // Parse the document body
        // do your thing
    }
    fs.writeFile("C:/Users/T440/Documents/crawl/output.json", JSON.stringify(main), (err) => {
        if (err) console.log(err);
        console.log("Successfully Written to File.");
    });
} catch (error) {
    // catch the errors and push a record for the failed link
    main.push(Status_ErrorsCatch)
}
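For example, inside the loop it could look roughly like this. This is only a sketch that reuses the question's `line`, `main`, and cheerio setup, and it assumes axios's default behaviour of rejecting on non-2xx responses:

const encodeLink = encodeURI(line);
try {
    const response = await axios.get(encodeLink); // rejects on network errors and non-2xx statuses by default
    const $ = cheerio.load(response.data);
    main.push({
        "status Code": response.status,
        "Page title:": $('title').text(),
        "Original Link": encodeLink,
    });
} catch (error) {
    // ETIMEDOUT and HTTP error statuses both land here
    const Status_ErrorsCatch = {
        "status Code": error.response ? error.response.status : "N/A",
        "Error": error.message,
        "Original Link": encodeLink,
    };
    main.push(Status_ErrorsCatch);
}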
Upvotes: 1
Reputation: 1245
With some suggestions from the comments, I slowed the process down by switching to the readline async-iterator structure, and moved to axios since it is more promise-friendly.
Here is a sample of how I fixed the ETIMEDOUT 'ip address' issue. I am having a memory issue now, but I think the original problem is solved.
async function processLineByLine() {
    const rl = readline.createInterface({
        input: fs.createReadStream('C:/Users/T440/Documents/crawl/links.txt'),
        crlfDelay: Infinity
    });
    for await (const line of rl) {
        if (line.startsWith('https://www.example.com')) {
            var encodeLink = encodeURI(line);
            const response = await axios.get(encodeLink).catch((err) => {
                console.log("Error:Request " + err);
            });
            // ...status handling continues as in the original code
        }
    }
}
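For completeness, here is a fuller sketch of how the sequential flow could look end to end. The sleep helper and the 100 ms pause between requests are assumptions added to stay gentle on the server (not something the server documents), and validateStatus is set so 404/500 responses are recorded instead of thrown:

const fs = require('fs');
const readline = require('readline');
const axios = require('axios');
const cheerio = require('cheerio');

const main = [];

// small helper to pause between requests (the pause itself is optional)
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

async function processLineByLine() {
    const rl = readline.createInterface({
        input: fs.createReadStream('C:/Users/T440/Documents/crawl/links.txt'),
        crlfDelay: Infinity
    });

    for await (const line of rl) {
        if (!line.startsWith('https://www.example.com')) continue;

        const encodeLink = encodeURI(line);
        try {
            // validateStatus: () => true returns 404/500 responses instead of throwing
            const response = await axios.get(encodeLink, { validateStatus: () => true });
            const $ = cheerio.load(response.data);
            main.push({
                "status Code": response.status,
                "Page title:": $('title').text(),
                "Original Link": encodeLink,
            });
        } catch (err) {
            // network-level failures such as ETIMEDOUT end up here
            main.push({ "status Code": "N/A", "Error": err.message, "Original Link": encodeLink });
        }

        await sleep(100); // assumed 100 ms pause so requests are not fired back to back
    }

    // write once after the loop instead of on every link
    fs.writeFile("C:/Users/T440/Documents/crawl/output.json", JSON.stringify(main), (err) => {
        if (err) console.log(err);
        console.log("Successfully Written to File.");
    });
}

processLineByLine();

Writing the JSON file once after the loop, rather than on every link, should also help with the memory issue, since the growing main array is no longer re-serialized a few thousand times.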
Upvotes: 0