Reputation: 2172
I am new to Node.js. I wrote the scraper below, but the result it produces is wrong: not all entries are written, and incomplete, broken data ends up in the file, even though each individual record looks fine when logged to the console.
The original file is complex, so I have put together a sample from the relevant parts of the code to show my logic. Please tell me what I am doing wrong.
var request = require('request');
var cheerio = require('cheerio');
var url = 'http://example.com/index.html';
// Fetch the page at `url`, pull one record out of every `.listing` element,
// and hand each record to fs.writeFile targeting data.txt.
request(url, function(err, resp, body) {
if (err)
throw err;
// Parse the response body with cheerio. `$` is assigned without var/let/const here.
$ = cheerio.load(body);
// Category and sub-category are read once from the page and reused for every record.
var categoryname = $('#mcat span').html();
var subcategoryname = $('span.arrow').html();
$('.listing').each(function() {
var companyname = $(this).find('.company-name > span').html();
// compwebsite is extracted but is not included in `data` below.
var compwebsite = $(this).find('.company-link > a').html();
var phonelumber = "+91-" + $(this).find('span[itemprop="telephone"]').html();
// One comma-separated record; no line terminator is appended.
var data = categoryname + ", " + subcategoryname + ", " + companyname + ", " + phonelumber;
var fs = require('fs');
// NOTE(review): writeFile is issued once per listing against the same path, and
// each call is asynchronous. Per the Node.js docs writeFile replaces an existing
// file, so presumably the records overwrite one another rather than accumulate -
// confirm; this looks like the cause of the missing/broken output.
fs.writeFile("data.txt", data, function(err) {
if(err) {
console.log("Error: "+err);
} else {
console.log("Success!");
}
});
});
});
Upvotes: 0
Views: 608
Reputation: 2175
I think you could also do it more simply: just call the writeData function after the each loop. Because cheerio's each() is synchronous, every row has been collected by the time it returns, so there will be no problem.
// Scrape every `.listing` on the page into one comma-separated line each,
// then write all lines to disk with a single writeData() call.
request(url, function(err, resp, body) {
    if (err) {
        throw err;
    }
    // Declared locally so the cheerio handle does not leak as an implicit global
    // (an undeclared assignment would also throw in strict mode).
    var $ = cheerio.load(body);
    // Page-level values shared by every row.
    var categoryname = $('#mcat span').html();
    var subcategoryname = $('span.arrow').html();
    var rows = [];
    // cheerio's each() runs synchronously, so `rows` is complete once it returns.
    $('.listing').each(function() {
        var listing = $(this);
        var companyname = listing.find('.company-name > span').html();
        var phonelumber = "+91-" + listing.find('span[itemprop="telephone"]').html();
        rows.push(categoryname + ", " + subcategoryname + ", " + companyname + ", " + phonelumber + "\r\n");
    });
    // Each row already ends in "\r\n", so join with an empty separator.
    writeData(rows.join(''));
});
// Write the collected text to data.txt and log the outcome of the write.
// `data` is the full file content to store.
function writeData(data) {
    var fs = require('fs');
    // Completion handler: report success, otherwise log the error.
    var onWritten = function(err) {
        if (!err) {
            console.log("Success!");
            return;
        }
        console.log("Error: " + err);
    };
    fs.writeFile("data.txt", data, onWritten);
}
Upvotes: 0
Reputation: 1360
`.each`
is called synchronously, hence it is blocking. But `fs.writeFile`
is called asynchronously, so it can cause your data to be shuffled, but there is no way it is going to be incomplete.
Solutions:
Use Callback
// Scrape every `.listing` on the page, accumulate one comma-separated line per
// listing, and write the accumulated text once all listings have been visited.
request(url, function(err, resp, body) {
    if (err) {
        throw err;
    }
    // Declared locally so the cheerio handle does not leak as an implicit global.
    var $ = cheerio.load(body);
    // Page-level values shared by every row.
    var categoryname = $('#mcat span').html();
    var subcategoryname = $('span.arrow').html();
    var data = '';
    $('.listing').each(function() {
        var listing = $(this);
        var companyname = listing.find('.company-name > span').html();
        var phonelumber = "+91-" + listing.find('span[itemprop="telephone"]').html();
        data += categoryname + ", " + subcategoryname + ", " + companyname + ", " + phonelumber + "\r\n";
    });
    // each() is synchronous, so the loop has finished by this point. Writing here
    // instead of on the "last" iteration (count == len) means writeData() also
    // runs when the page has no .listing elements, and no counter is needed.
    writeData(data);
});
// Store `data` in data.txt; once the write finishes, log either the error
// or a success message.
function writeData(data) {
    var fs = require('fs');
    fs.writeFile("data.txt", data, function(err) {
        var message = err ? "Error: " + err : "Success!";
        console.log(message);
    });
}
Upvotes: 1