MrYanDao
MrYanDao

Reputation: 1263

Make asynchronous calls synchronous

I used to build web apps in PHP, hence my habit of doing things synchronously.

I'm currently trying to build a web scraper. The way it works is

  1. Grab list of proxies
  2. Check if proxies are working
  3. Scrape web content using proxies.

However, I've realized that most of the calls are asynchronous, and I'm having a hard time understanding the async module in Node.js.

This is the main method.

var proxyChecker = require('proxy-checker');
var request = require('request');
var forEach = require('async-foreach').forEach;
var async = require('async');

// Shared state filled in by parseProxiesList (proxiesJar) and checker (goodProxies).
var proxiesJar = [];
var goodProxies = [];
var proxyCount = 0;

// Entry point: load the proxy list, then hand every entry to `checker`.
parseProxiesList(function(error) {
    if (error) {
        // Without a proxy list there is nothing to check, so stop here
        // instead of running async.each over an empty jar.
        console.log('Could not load proxies list: ' + error);
        return;
    }
    // async.each starts all checks in parallel; its final callback only
    // receives an error argument (there is no aggregated result).
    async.each(proxiesJar, checker, function(err) {
        console.log('Result:' + err);
    });
});

Getting proxy list

/**
 * Downloads the proxy list and appends every `host:port` entry found in it
 * to the shared `proxiesJar` array.
 *
 * @param {function} callback - Invoked exactly once: with the request error,
 *   with an Error when the response status is not 200, or with `(null, 1)`
 *   once the body has been parsed.
 */
function parseProxiesList(callback) {
    // Dotted-quad host followed by a port. The capture groups give host and
    // port directly, so text around the match on the same line is ignored.
    var proxyPattern = /([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})\:([0-9]{1,5})/;

    console.log("parseProxiesList");
    request('http://hidden.com', function (error, response, body) {
        if (error) {
            console.log("Error [1]");
            return callback(error);
        }
        console.log("Got proxies list");

        if (response.statusCode !== 200) {
            // Always report back; otherwise the caller waits forever.
            return callback(new Error('Unexpected status code: ' + response.statusCode));
        }

        // Local variables: the implicit globals used before throw in strict mode.
        var lines = body.split(/\r?\n/);
        for (var i = 0; i < lines.length; i++) {
            var match = proxyPattern.exec(lines[i]);
            if (match) {
                proxiesJar.push(match[1] + ":" + match[2]);
            }
        }
        callback(null, 1);
    });
}

After getting the proxy list, it checks if the proxy is working.

/**
 * Iteratee for async.each / async.eachSeries: checks a single proxy and
 * records it in the shared `goodProxies` array when it works.
 *
 * @param {string} proxy - Proxy address in `host:port` form.
 * @param {function} callback - Completion callback. Called with
 *   `(null, "host:port")` for a working proxy and with no error for a dead
 *   one, so that one bad proxy does not abort the whole iteration.
 */
var checker = function(proxy, callback) {
    var parts = proxy.split(':');
    var host = parts[0];
    var port = parts[1];
    console.log('[Checking] ' + host + ':' + port);

    var options = {url: 'http://google.com', regex: /Google/};
    // The result callback gets its own host/port arguments; they are named
    // differently here so they do not shadow the outer variables.
    proxyChecker.checkProxy(host, port, options, function(checkedHost, checkedPort, ok, statusCode, err) {
        var address = checkedHost + ":" + checkedPort;

        if (!ok) {
            console.log("Proxy don't work: " + address);
            // A dead proxy is an expected outcome, not a failure of the run.
            // Passing `err` on would make async call the final callback at
            // the first dead proxy (and stop eachSeries completely).
            return callback();
        }

        console.log("Working proxy: " + address);
        goodProxies.push(address);
        return callback(null, address);
    });
};

The logs, however, turn out to be

[Checking] 1.1.1.1:80
[Checking] 2.2.2.2:80
.
.
.
Working proxy: 1.1.1.1:80
Working proxy: 2.2.2.2:80

instead of

[Checking] 1.1.1.1:80
Working proxy: 1.1.1.1:80

[Checking] 2.2.2.2:80
Working proxy: 2.2.2.2:80

Upvotes: 0

Views: 368

Answers (2)

V31
V31

Reputation: 7666

You can try promises: create one promise to get the proxy list, and then execute another promise to check each proxy.

You can find more on promises over here

Example:

/**
 * Promise-based variant: downloads the proxy list, appends every `host:port`
 * entry to the shared `proxiesJar` array and resolves with that array.
 *
 * @returns {Promise} Q promise that resolves with `proxiesJar`, or rejects
 *   with the request error or with an Error when the status is not 200.
 */
function parseProxiesList() {
    var deferred = Q.defer();
    // Dotted-quad host followed by a port. The capture groups give host and
    // port directly, so text around the match on the same line is ignored.
    var proxyPattern = /([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})\:([0-9]{1,5})/;

    console.log("parseProxiesList");
    request('http://hidden.com', function (error, response, body) {
        if (error) {
            console.log("Error [1]");
            // Return so the success path below does not run after a rejection.
            return deferred.reject(error);
        }
        console.log("Got proxies list");

        if (response.statusCode !== 200) {
            // Settle the promise on every path; otherwise callers wait forever.
            return deferred.reject(new Error('Unexpected status code: ' + response.statusCode));
        }

        var lines = body.split(/\r?\n/);
        for (var i = 0; i < lines.length; i++) {
            var match = proxyPattern.exec(lines[i]);
            if (match) {
                proxiesJar.push(match[1] + ":" + match[2]);
            }
        }
        deferred.resolve(proxiesJar);
    });

    // The promise must be returned from the function itself, not from inside
    // the request callback, or the caller receives undefined.
    return deferred.promise;
}

This way you have created a promise that fetches the proxy list. Similarly, you can create one for checking a proxy.

Upvotes: 0

Peter Chung
Peter Chung

Reputation: 457

async.each executes iterators for each item in parallel.

Use async.eachSeries to run the calls sequentially: each iterator starts only after the previous one has called its callback.

Upvotes: 1

Related Questions