Aaron

Reputation: 217

NodeJS Express web scraping header issues

I am scraping a fan site for character info to display in my web app, but I am running into "Can't set headers after they are sent." I am trying to use promises during my requests, but I think I may have a fundamental misunderstanding of what my code is actually doing.

The end goal is to scrape roughly 100 pages for data by looping through an array of boss names, store that data in an array, then eventually export it to use later. Currently I am able to store the data in an array, but I am still getting the error even though my code executes and scrapes the data.

server.js

var express = require('express');
var cheerio = require('cheerio');
var app = express();
var rp = require('request-promise');
var fsp = require('fs-promise');

app.get('/', function(req, res){

  urls = [
    'fansite/boss1', 'fansite/boss2'
  ];

  var bosses = [];

  function parse(html) {

    var $ = cheerio.load(html);

    $('.page-header__title').filter(function () {
      var data = $(this);
      name = data.text();
      bosses.push(name);
    })
    console.log(bosses);
    return bosses;
  }

  urls.forEach(function (url) {
    rp(url)
    .then(parse)
    .then(res.send('Bosses Updated.'))  
    .catch(err => console.log('Error:', err));
  });
})

app.listen('8081')
console.log('Running on port 8081');
exports = module.exports = app;

Output:

node server.js start
Running on port 8081
[ 'Obor' ]
[ 'Obor', 'Zulrah' ]
Error: Error: Can't set headers after they are sent.
    at ServerResponse.OutgoingMessage.setHeader (_http_outgoing.js:356:11)
    at ServerResponse.header (/Users/aaron/Personal Projects/node-scraper/node_modules/express/lib/response.js:767:10)
    at ServerResponse.send (/Users/aaron/Personal Projects/node-scraper/node_modules/express/lib/response.js:170:12)
    at rp.then.then (/Users/aaron/Personal Projects/node-scraper/server.js:31:21)
    at tryCatcher (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/util.js:16:23)
    at Promise._settlePromiseFromHandler (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:512:31)
    at Promise._settlePromise (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:569:18)
    at Promise._settlePromise0 (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:614:10)
    at Promise._settlePromises (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:693:18)
    at Async._drainQueue (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:133:16)
    at Async._drainQueues (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:143:10)
    at Immediate.Async.drainQueues (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:17:14)
    at runCallback (timers.js:672:20)
    at tryOnImmediate (timers.js:645:5)
    at processImmediate [as _immediateCallback] (timers.js:617:5)

Upvotes: 1

Views: 690

Answers (3)

HMR

Reputation: 39280

You can use Promise.all and catch each individual request so you don't lose the successful ones. Send the response once, after Promise.all resolves (that is, once all the requests have finished):

const Fail = function (reason) { this.reason = reason; };
const isFail = x => (x && x.constructor) === Fail;
const isNotFail = x => !isFail(x);

Promise.all(
  urls.map(url =>
    rp(url)
      .then(parse)
      .catch(err => new Fail([url, err])) // a failed request resolves to a Fail marker instead of rejecting
  )
)
.then(results => res.json(results));
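
Because a failed request resolves to a Fail marker instead of rejecting, Promise.all never short-circuits; the isNotFail helper above exists so you can split successes from failures before responding. For example, the final .then could become something like:

.then(results => {
  const bosses = results.filter(isNotFail);                  // the successful parses
  const failed = results.filter(isFail).map(f => f.reason);  // the [url, err] pairs
  console.log('Failed:', failed);
  res.json(bosses);
});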

If you are making many requests to a site you may want to limit them, either by how many requests are open at once or by how many you make within a certain time period. You can do this with throttling, but if your Express application is a public site where potentially many users can start the scraping, you'd better make sure the target site doesn't see your scraping as an attack.

const max = throttle(8); // at most 8 open connections (throttle is a helper, sketched below)
// const max = throttlePeriod(8, 1000); // at most 8 requests per second

Promise.all(
  urls.map(url =>
    max(rp)(url) // the throttled request
      .then(parse)
      .catch(err => new Fail([url, err]))
  )
)
.then(results => res.json(results));
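
The throttle helper is not defined in the answer; here is a minimal sketch of what such a concurrency limiter could look like (the name and shape are assumptions, not a published API):

// Minimal concurrency limiter: at most maxConcurrent wrapped calls are
// pending at once; the rest wait in a first-in-first-out queue.
function throttle(maxConcurrent) {
  let active = 0;
  const queue = [];

  function next() {
    if (active >= maxConcurrent || queue.length === 0) return;
    active += 1;
    const job = queue.shift();
    job.fn(...job.args)
      .then(job.resolve, job.reject)
      .then(() => { active -= 1; next(); }); // free the slot, start the next job
  }

  // Usage: const limited = throttle(8)(rp); limited(url) starts when a slot frees up.
  return fn => (...args) =>
    new Promise((resolve, reject) => {
      queue.push({ fn, args, resolve, reject });
      next();
    });
}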

Upvotes: 1

Jaromanda X

Reputation: 1

If you want to wait for all the urls to be processed before sending a response:

Promise.all(urls.map(function (url) {
  return rp(url).then(parse);
}))
.then(() => res.send('Bosses Updated.'))  
.catch(err => console.log('Error:', err));

or

Promise.all(urls.map(url => rp(url).then(parse)))
.then(() => res.send('Bosses Updated.'))  
.catch(err => console.log('Error:', err));
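
Note that on failure this only logs the error, so the client is left waiting without a response. A variation that also answers the client on error (the 500 status is just one reasonable choice):

Promise.all(urls.map(url => rp(url).then(parse)))
.then(() => res.send('Bosses Updated.'))
.catch(err => {
  console.log('Error:', err);
  res.status(500).send('Update failed.'); // respond so the request doesn't hang
});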

Upvotes: 2

Cr.

Reputation: 916

res.send sends the entire HTTP response to the client, including headers and body, which is why you cannot call it more than once per request.
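
A minimal illustration of this one-shot behaviour (the /demo route is made up for the example):

// The second send() fails because the first already flushed headers and body.
app.get('/demo', function (req, res) {
  res.send('first');            // writes headers + body and ends the response
  console.log(res.headersSent); // true: the response has already gone out
  res.send('second');           // throws "Can't set headers after they are sent."
});

In your forEach loop, res.send runs once per URL, so every call after the first one triggers this error.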

Upvotes: 1
