Robert Baindourov
Robert Baindourov

Reputation: 31

Best way to scrape and parse html in nodejs with request package

During my experimentation I found that I am unable to parse the text received from request.get with either the htmlparser or htmlparser2 package.

Compared with https.request, the string returned is not the same with respect to line breaks. I am scraping a human-readable page, and the https.request handler is able to aggregate the data so that the output matches the server response.

But with request.get I am receiving a response that has no line breaks. Therefore I can only assume that I am using a parsing package that is not supported by request, which leads me to ask:

What is the best way to actually parse the HTML received from the request.get and request.post calls?

Thank You.

Upvotes: 1

Views: 2302

Answers (1)

Robert Baindourov
Robert Baindourov

Reputation: 31

My apologies, I was using the response instead of the body. Here is the proper way:

var request = require('request');
var htmlparser = require('htmlparser2');
var select = require('soupselect').select

// Address of the page to scrape, passed to request.get at the bottom of the
// script. Left empty in this example; it must be filled in before running.
var url =  '';


/**
 * Callback passed to `request.get`: parses the downloaded page.
 *
 * When the request reports an error, the error is written to stderr and the
 * process exits with status 1. Otherwise `body` (not `response`) is fed to a
 * parser that is driven by the module-level `htmlHandler`.
 *
 * @param {*} err - Error reported by the request, or a falsy value on success.
 * @param {*} response - Response object; unused here.
 * @param {*} body - Response body handed to `parser.parseComplete`.
 */
function httpsHandler(err, response, body) {
  if (err) {
    // Log the error itself as well, so the failure can actually be diagnosed.
    console.error('to err is human', err);
    process.exit(1);
  }

  const parser = new htmlparser.Parser(htmlHandler);
  parser.parseComplete(body);
}


// Handler object given to the parser created in httpsHandler. Its callback
// receives (error, dom): a parse error aborts the process, otherwise the DOM
// is passed on to extractData.
const htmlHandler = new htmlparser.DefaultHandler((error, dom) => {
  if (error) {
    // Report on stderr, the same stream httpsHandler uses for request errors.
    console.error('error', error);
    process.exit(1);
  }
  // extractData only prints what it finds and returns nothing, so there is no
  // result to keep.
  extractData(dom);
});


/**
 * Returns the text carried by a DOM node: its own `data` when that is truthy,
 * otherwise the `data` of its first child. Yields `undefined` when the node
 * has neither, instead of throwing on a missing or empty `children` list.
 *
 * @param {object} node - A DOM node from the parsed document.
 * @returns {*} The node's text, or `undefined` when none is available.
 */
function nodeText(node) {
  if (node.data) {
    return node.data;
  }
  const firstChild = node.children && node.children[0];
  return firstChild ? firstChild.data : undefined;
}

/**
 * Prints the text of every child node of every element matched by the
 * `#ctl00_LeftColumnMiddle_Table1 table td` selector, one `console.log` call
 * per child node.
 *
 * @param {*} dom - Parsed DOM, passed straight to `select`.
 */
function extractData(dom) {
  const cells = select(dom, '#ctl00_LeftColumnMiddle_Table1 table td');

  cells.forEach((cell) => {
    // Tolerate matched elements that carry no `children` list at all.
    (cell.children || []).forEach((node) => {
      console.log(nodeText(node));
    });
  });
}

// Entry point: issue the GET for `url`, with httpsHandler as the completion
// callback.
request.get( url , httpsHandler );

Upvotes: 1

Related Questions