Reputation: 8783
I am trying to build a simple web app that scrapes a website using Node.js and the two npm modules request and cheerio.
I manage to do it with the following code:
// Fetches `url` and logs the `src` attribute of every <img> element on
// the page. The results are only available asynchronously, inside the
// request callback.
var printURL = function (url) {
    // The original wrapped the callback in a self-invoking function
    // ((function(){ return function(...){...}; })()) — that wrapper is a
    // no-op, so the callback is passed directly here.
    request(url, function (err, resp, body) {
        if (err)
            throw err;
        // Declare `$` locally; the original assigned an implicit global,
        // which leaks state between requests.
        var $ = cheerio.load(body);
        $('img').each(function () {
            console.log($(this).attr('src'));
        });
    });
};
It works fine for printing the URLs of the pictures on the website, but what I am really trying to do here is create a list of URLs that I can use outside of the function. I tried it this way, but it returns an empty list:
var urlList = [];

// Fetches `url` and appends each image's `src` to the shared `urlList`.
// Because `request` is asynchronous, `urlList` is only guaranteed to be
// populated once the optional `done` callback fires — reading it
// synchronously right after calling printURL finds it still empty,
// which is the bug described in the question.
//
// @param {string}   url  - page to scrape
// @param {function} [done] - optional callback invoked with the
//                            completed `urlList` (backward-compatible
//                            addition; existing callers are unaffected)
var printURL = function (url, done) {
    // The original's self-invoking wrapper around the callback was a
    // no-op and has been removed.
    request(url, function (err, resp, body) {
        if (err)
            throw err;
        var $ = cheerio.load(body); // local, not an implicit global
        // cheerio's .each() runs synchronously, so the list is complete
        // as soon as this loop finishes.
        $('img').each(function () {
            urlList.push($(this).attr('src'));
        });
        if (done)
            done(urlList);
    });
};
How can I fix this? Many thanks
Upvotes: 2
Views: 953
Reputation: 63139
You need to wait until all callbacks are done.
var urlList = [];

// Fetches `url`, collects every image's `src` into `urlList`, and
// reports the finished list once all entries have been gathered.
var printURL = function (url) {
    // The original's self-invoking wrapper around the callback was a
    // no-op and has been removed.
    request(url, function (err, resp, body) {
        if (err)
            throw err;
        var $ = cheerio.load(body); // local, not an implicit global
        var images = $('img');
        var counter = images.length;
        // Edge case the original missed: on a page with no <img> tags,
        // counter starts at 0, the .each body never runs, and the
        // "all done" branch was never reached.
        if (counter === 0) {
            console.log(urlList);
            return;
        }
        images.each(function () {
            urlList.push($(this).attr('src'));
            counter--;
            if (counter === 0) { // strict equality instead of ==
                // now we have all images!!
                console.log(urlList);
            }
        });
    });
};
This is part of the asynchronous nature of Node.js. If things get more complicated, I would recommend using a flow-control library like async.
Upvotes: 3