gnidoc
gnidoc

Reputation: 47

Asynchronous request for web crawler

I have an array of URLs. From each one I want to scrape an HTML table and save it in another array, in the same order as the original array.

I assume that, because of Node's asynchronous nature, it isn't working as I expect: the results come back in a different order every time.

I googled a lot and tried different things like using a custom async-forEach-function or request-promise instead of request, but nothing worked.

// `request` is required here but not referenced in the rest of this snippet.
const request = require('request');
const rp = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');

// verbs: one entry per line of verbs.txt; conjugations: scraped table text, one entry per response.
let verbs = [];
let conjugations = [];
// Read the whole file synchronously and split it on LF or CRLF line endings.
fs.readFileSync('verbs.txt', 'utf-8').split(/\r?\n/).forEach
(function(line){
  verbs.push(line);
});

// Start one request per verb. forEach does not wait for the promises created
// in its callback, so every request is in flight at once and they may
// complete in any order.
verbs.forEach((verb) => {
    const URI = encodeURI("https://ru.wiktionary.org/wiki/" + verb);


    var options = {
        uri: URI,
        // Pass the response body through cheerio so the promise resolves with
        // the loaded document (used as `$` below) instead of the raw body.
        transform: function (body) {
            return cheerio.load(body);
        }


    };

    rp(options).then(function ($) {
        // First <table> among the siblings that follow the parent of span#Русский.mw-headline.
        let table = $('span#Русский.mw-headline').parent().nextAll('table').first();
        // push() appends in completion order, not in the order of `verbs`.
        conjugations.push(table.text());
        // Always prints element 0, i.e. whichever response was stored first.
        console.log(conjugations[0]);

    })
    .catch(function (err) {
        // NOTE(review): the error is discarded here, so a failed request leaves no trace.
    });


})




        
  
    

Upvotes: 0

Views: 400

Answers (1)

kockburn
kockburn

Reputation: 17616

Use Promise.all if the order is important.

The Promise.all() method returns a single Promise that resolves when all of the promises passed as an iterable have resolved or when the iterable contains no promises. It rejects with the reason of the first promise that rejects.

Example of keeping things in order:

const verbs = ["hello", "world", "example"];

// Shared countdown: every verb gets a delay 500 ms shorter than the previous
// one (1500, 1000, 500), so the promises settle in reverse order.
let timeout = 2000;

/** Returns a promise that resolves with `verb` after `delayMs` milliseconds. */
const resolveLater = (verb, delayMs) =>
  new Promise((resolve) => {
    setTimeout(() => {
      resolve(verb);
    }, delayMs);
  });

const promises = verbs.map((verb) => {
  timeout -= 500;
  return resolveLater(verb, timeout);
});

// Promise.all reports results in the order of `promises`, not in the order
// in which the individual promises settled.
Promise.all(promises).then((dataArray) => {
  console.log(dataArray);
});

Solution with your code.

// Start every request up front. map() yields one promise per verb, in the
// same order as `verbs`.
const promises = verbs.map((verb) => {
  const URI = encodeURI(`https://ru.wiktionary.org/wiki/${verb}`);
  const options = {
    uri: URI,
    // Pass the response body through cheerio so each promise resolves with
    // the loaded document (used as `$` below) instead of the raw body.
    transform: (body) => cheerio.load(body),
  };

  return rp(options);
});

// Promise.all resolves with the results in the order of `promises`, no matter
// which request finished first, so pushing here keeps `conjugations` aligned
// with `verbs`.
Promise.all(promises)
  .then((dataArray) => {
    dataArray.forEach(($) => {
      const table = $('span#Русский.mw-headline').parent().nextAll('table').first();
      const conjugation = table.text();
      conjugations.push(conjugation);
      // Log the entry that was just added rather than re-printing the first one.
      console.log(conjugation);
    });
  })
  .catch((err) => {
    // Promise.all rejects as soon as any request fails, and none of the
    // results are processed. Report the failure instead of hiding it.
    console.error('Failed to fetch conjugations:', err);
  });

Downside: if one request fails, Promise.all rejects immediately and none of the results are processed.

Alternatively, you can use the index of each verb to write every result directly into its own slot. (Promise.all is used here only to determine when everything is done; that step can be omitted.)

const verbs = ["hello", "world", "example"];

const conjugations = [];
// Shared countdown: delays are 2000, 1500 and 1000 ms, so the last verb
// finishes first.
let timeout = 2000;

/**
 * After `delayMs` milliseconds, writes `value` into `conjugations[slot]` and
 * then resolves (with no value).
 */
const storeLater = (delayMs, slot, value) =>
  new Promise((resolve) => {
    setTimeout(() => {
      conjugations[slot] = value;
      resolve();
    }, delayMs);
  });

const promises = verbs.map((verb, index) => {
  const pending = storeLater(timeout, index, verb);
  timeout -= 500;
  return pending;
});

// Each result was written to its own index, so the array is in `verbs` order
// even though the timers fired in reverse.
Promise.all(promises).then(() => {
  console.log(conjugations);
});

Example with your code.

// `request` is not referenced directly in this snippet; the require is kept
// as in the original.
const request = require('request');
const rp = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');

// One verb per line of verbs.txt (LF or CRLF line endings).
const verbs = fs.readFileSync('verbs.txt', 'utf-8').split(/\r?\n/);
// conjugations[i] holds the table text scraped for verbs[i].
const conjugations = [];

// All requests are started at once and may finish in any order. Each result
// is written to the index of its verb, so `conjugations` stays aligned with
// `verbs` regardless of completion order.
verbs.forEach((verb, index) => {
  const URI = encodeURI(`https://ru.wiktionary.org/wiki/${verb}`);
  const options = {
    uri: URI,
    // Pass the response body through cheerio so the promise resolves with
    // the loaded document (used as `$` below) instead of the raw body.
    transform: (body) => cheerio.load(body),
  };

  rp(options)
    .then(($) => {
      const table = $('span#Русский.mw-headline').parent().nextAll('table').first();
      conjugations[index] = table.text();
      console.log(conjugations[index]);
    })
    .catch((err) => {
      // A failed request leaves conjugations[index] unset; report it rather
      // than silently dropping the error.
      console.error(`Failed to fetch conjugation for "${verb}":`, err);
    });
});

Upvotes: 2

Related Questions