Reputation: 20560
I'm fetching this page with the request library in Node.JS, and parsing the body using cheerio.
Calling $.html()
on the parsed response body reveals that the title element for the page is:
<title>Le Relais de l'Entrec?te</title>
... when it should be:
<title>Le Relais de l'Entrecôte</title>
I've tried setting the options for the request library to include encoding: 'utf8'
, but that didn't seem to change anything.
How do I preserve these characters?
Upvotes: 16
Views: 10840
Reputation: 17541
You can use iconv (or better iconv-lite) for the conversion itself, but to detect the encoding you should check out the charset and jschardet modules. Here's an example of them both in action:
// Fetch a page as raw bytes, detect its charset, and transcode to UTF-8.
var charset = require('charset'),
    jschardet = require('jschardet'),
    Iconv = require('iconv').Iconv;
request.get({url: 'http://www.example.com', encoding: 'binary'}, function(err, res, body) {
  if (err) {
    // Don't silently ignore network failures.
    return console.error(err);
  }
  // Prefer the charset declared in the Content-Type header; fall back to
  // statistical detection on the body bytes.
  var enc = charset(res.headers, body) || jschardet.detect(body).encoding.toLowerCase();
  // Normalize 'utf-8' vs 'utf8' before comparing, otherwise UTF-8 pages
  // (reported as 'utf-8') would be pointlessly re-transcoded.
  if (enc.replace(/-/g, '') !== 'utf8') {
    var iconv = new Iconv(enc, 'UTF-8//TRANSLIT//IGNORE');
    // Buffer.from replaces the deprecated (and unsafe) new Buffer(...) API.
    body = iconv.convert(Buffer.from(body, 'binary')).toString('utf8');
  }
  console.log(body);
});
Upvotes: 33
Reputation: 2697
The page appears to be encoded with iso-8859-1. You'll need to tell request
to hand you back an un-encoded buffer by passing encoding: null
and use something like node-iconv to convert it.
If you're writing a generalized crawler, you'll have to figure out how to detect the encoding of each page you encounter to decode it correctly, otherwise the following should work for your case:
var request = require('request');
var iconv = require('iconv');
request.get({
  url: 'http://www.relaisentrecote.fr',
  // encoding: null makes request hand back a raw Buffer instead of a
  // (possibly mangled) decoded string.
  encoding: null,
}, function(err, res, body) {
  if (err) {
    // Surface request failures instead of swallowing them.
    return console.error(err);
  }
  // Transcode the raw ISO-8859-1 bytes to a UTF-8 string.
  var ic = new iconv.Iconv('iso-8859-1', 'utf-8');
  var buf = ic.convert(body);
  var utf8String = buf.toString('utf-8');
  // .. do something with utf8String ..
});
Upvotes: 23