Reputation: 479
I am a beginner doing an assignment to scrape the content of this page using node.io:
http://www.nycourts.gov/reporter/3dseries/2013/2013_06966.htm.
I want to save the text content that is under the <P> tags as a string in a variable.
My code is this:
var nodeio = require('node.io');

// Job methods: fetch one page and emit whatever the 'P' selector matches.
var methods = {
    // This job takes no input; run() does all the work.
    input: false,
    run: function() {
        this.getHtml('http://www.nycourts.gov/reporter/3dseries/2013/2013_06966.htm', function(err, $) {
            //Handle any request / parsing errors
            if (err) {
                this.exit(err);
                // Do not touch `$` after a failed request.
                return;
            }

            var content = $('P');
            this.emit(content);
        });
    }
};

exports.job = new nodeio.Job({timeout:10}, methods);
This shows the error: No elements matching 'P'. Please help.
Upvotes: 1
Views: 91
Reputation: 7585
I also got Error: No elements matching 'P'
when running this command:
$ ./node_modules/.bin/node.io query http://www.nycourts.gov/reporter/3dseries/2013/2013_06966.htm P
The root cause is that the page has no closing </P> tags,
and node.io does not auto-correct such malformed HTML the way a modern web browser does. Querying <blockquote>, however, works well:
$ ./node_modules/.bin/node.io query http://www.nycourts.gov/reporter/3dseries/2013/2013_06966.htm blockquote
However, you can make it work by parsing the HTML document in a real browser using Selenium.
Here's an example JavaScript snippet that you can run with Node and a Selenium grid on your host to get what you want. You can also refer to my other answer to the question "How do you get webdriverjs working?":
var webdriverjs = require('webdriverjs');

// Remote session settings: the Selenium server address and the browser to drive.
var client = webdriverjs.remote({
    host: 'localhost',
    port: 4444,
    desiredCapabilities: {
        browserName: 'safari', // you can change this accordingly
        version: '7',
        platform: "MAC" // you can change this accordingly
    }
});

// Open the page in the browser, print the text matched by the "P" selector,
// then close the session.
client
    .init()
    .url('http://www.nycourts.gov/reporter/3dseries/2013/2013_06966.htm')
    .getText("P", function(err, text) {
        // Report a failed lookup instead of logging an undefined result.
        if (err) {
            console.error(err);
            return;
        }
        console.log(text);
    })
    .end();
Upvotes: 1