Reputation: 4956
I have some strings in Dutch. I know how to convert their encoding using PHP:
// Convert $str from the Windows-1252 encoding to UTF-8.
$str = iconv( "Windows-1252", "UTF-8", $str );
What would be the equivalent in JavaScript?
Upvotes: 5
Views: 16736
Reputation: 430
The accepted answer didn't work for me, so let me show what worked.
My misinterpreted string came from a ReadableStream. When I ran await ReadableStreamVar.text(), the resulting text had all diacritics replaced with strange "�" characters. When I opened the same page manually in the browser, it displayed correctly, and typing document.characterSet in the console showed that the charset was "windows-1252", not UTF-8.
Ry's answer didn't work for me because the text method of the ReadableStream had already decoded the data with the wrong encoding, replacing every diacritic with the same � character.
Consulting ReadableStream's API, I didn't find any way to pass the desired charset to the .text method, so I tried a different approach: reading the ReadableStream as an ArrayBuffer (ReadableStream.arrayBuffer()), and it worked. In the resulting bytes, I could see that the diacritic characters had the char codes of the correct characters. So, I used the TextDecoder API (according to this answer), and my now-functional code looks like this:
/**
 * Downloads a resource and decodes its body as windows-1252 text.
 *
 * The body is read as raw bytes and decoded explicitly, instead of relying on
 * `response.text()`, so the charset used for decoding is the one chosen here.
 * The decoded text is also written to the console.
 *
 * @param {string} uri - Address of the resource to retrieve (sent as a GET request).
 * @returns {Promise<string>} The decoded text content.
 * @throws {Error} When the response status is not OK.
 */
async function getDocumentTextContent(uri) {
  const response = await fetch(uri, { method: "GET" });
  if (!response.ok) {
    throw new Error(`Problem retrieving the resource. Error message: ${response.statusText}`);
  }

  // The desired charset is selected here.
  const decoder = new TextDecoder("windows-1252");
  const rawBytes = new Uint8Array(await response.arrayBuffer());
  const decodedText = decoder.decode(rawBytes);

  console.log(decodedText);
  return decodedText;
}
Upvotes: 3
Reputation: 690
I did this using brute force, probably not the most elegant, but it works:
/**
 * Replacement table used by bruteForceWindows1252toUTF16.
 *
 * Each `win1252` key is the UTF-8 byte sequence of a character, written as a
 * string with one character per byte (a "binary string"); `utf16` is the
 * character itself. The table covers the printable characters Windows-1252
 * assigns to bytes 0x80-0x9F plus U+00A0 through U+00FF.
 */
const globalWin1252toUTF16table = [
  { win1252: '\xe2\x82\xac', utf16: '\u20AC' },
  { win1252: '\xe2\x80\x9a', utf16: '\u201A' },
  { win1252: '\xc6\x92', utf16: '\u0192' },
  { win1252: '\xe2\x80\x9e', utf16: '\u201E' },
  { win1252: '\xe2\x80\xa6', utf16: '\u2026' },
  { win1252: '\xe2\x80\xa0', utf16: '\u2020' },
  { win1252: '\xe2\x80\xa1', utf16: '\u2021' },
  { win1252: '\xcb\x86', utf16: '\u02C6' },
  { win1252: '\xe2\x80\xb0', utf16: '\u2030' },
  { win1252: '\xc5\xa0', utf16: '\u0160' },
  { win1252: '\xe2\x80\xb9', utf16: '\u2039' },
  { win1252: '\xc5\x92', utf16: '\u0152' },
  { win1252: '\xc5\xbd', utf16: '\u017D' },
  { win1252: '\xe2\x80\x98', utf16: '\u2018' },
  { win1252: '\xe2\x80\x99', utf16: '\u2019' },
  { win1252: '\xe2\x80\x9c', utf16: '\u201C' },
  { win1252: '\xe2\x80\x9d', utf16: '\u201D' },
  { win1252: '\xe2\x80\xa2', utf16: '\u2022' },
  { win1252: '\xe2\x80\x93', utf16: '\u2013' },
  { win1252: '\xe2\x80\x94', utf16: '\u2014' },
  { win1252: '\xcb\x9c', utf16: '\u02DC' },
  { win1252: '\xe2\x84\xa2', utf16: '\u2122' },
  { win1252: '\xc5\xa1', utf16: '\u0161' },
  { win1252: '\xe2\x80\xba', utf16: '\u203A' },
  { win1252: '\xc5\x93', utf16: '\u0153' },
  { win1252: '\xc5\xbe', utf16: '\u017E' },
  { win1252: '\xc5\xb8', utf16: '\u0178' },
  { win1252: '\xc2\xa0', utf16: '\u00A0' },
  { win1252: '\xc2\xa1', utf16: '\u00A1' },
  { win1252: '\xc2\xa2', utf16: '\u00A2' },
  { win1252: '\xc2\xa3', utf16: '\u00A3' },
  { win1252: '\xc2\xa4', utf16: '\u00A4' },
  { win1252: '\xc2\xa5', utf16: '\u00A5' },
  { win1252: '\xc2\xa6', utf16: '\u00A6' },
  { win1252: '\xc2\xa7', utf16: '\u00A7' },
  { win1252: '\xc2\xa8', utf16: '\u00A8' },
  { win1252: '\xc2\xa9', utf16: '\u00A9' },
  { win1252: '\xc2\xaa', utf16: '\u00AA' },
  { win1252: '\xc2\xab', utf16: '\u00AB' },
  { win1252: '\xc2\xac', utf16: '\u00AC' },
  { win1252: '\xc2\xad', utf16: '\u00AD' },
  { win1252: '\xc2\xae', utf16: '\u00AE' },
  { win1252: '\xc2\xaf', utf16: '\u00AF' },
  { win1252: '\xc2\xb0', utf16: '\u00B0' },
  { win1252: '\xc2\xb1', utf16: '\u00B1' },
  { win1252: '\xc2\xb2', utf16: '\u00B2' },
  { win1252: '\xc2\xb3', utf16: '\u00B3' },
  { win1252: '\xc2\xb4', utf16: '\u00B4' },
  { win1252: '\xc2\xb5', utf16: '\u00B5' },
  { win1252: '\xc2\xb6', utf16: '\u00B6' },
  { win1252: '\xc2\xb7', utf16: '\u00B7' },
  { win1252: '\xc2\xb8', utf16: '\u00B8' },
  { win1252: '\xc2\xb9', utf16: '\u00B9' },
  { win1252: '\xc2\xba', utf16: '\u00BA' },
  { win1252: '\xc2\xbb', utf16: '\u00BB' },
  { win1252: '\xc2\xbc', utf16: '\u00BC' },
  { win1252: '\xc2\xbd', utf16: '\u00BD' },
  { win1252: '\xc2\xbe', utf16: '\u00BE' },
  { win1252: '\xc2\xbf', utf16: '\u00BF' },
  { win1252: '\xc3\x80', utf16: '\u00C0' },
  { win1252: '\xc3\x81', utf16: '\u00C1' },
  { win1252: '\xc3\x82', utf16: '\u00C2' },
  { win1252: '\xc3\x83', utf16: '\u00C3' },
  { win1252: '\xc3\x84', utf16: '\u00C4' },
  { win1252: '\xc3\x85', utf16: '\u00C5' },
  { win1252: '\xc3\x86', utf16: '\u00C6' },
  { win1252: '\xc3\x87', utf16: '\u00C7' },
  { win1252: '\xc3\x88', utf16: '\u00C8' },
  { win1252: '\xc3\x89', utf16: '\u00C9' },
  { win1252: '\xc3\x8a', utf16: '\u00CA' },
  { win1252: '\xc3\x8b', utf16: '\u00CB' },
  { win1252: '\xc3\x8c', utf16: '\u00CC' },
  { win1252: '\xc3\x8d', utf16: '\u00CD' },
  { win1252: '\xc3\x8e', utf16: '\u00CE' },
  { win1252: '\xc3\x8f', utf16: '\u00CF' },
  { win1252: '\xc3\x90', utf16: '\u00D0' },
  { win1252: '\xc3\x91', utf16: '\u00D1' },
  { win1252: '\xc3\x92', utf16: '\u00D2' },
  { win1252: '\xc3\x93', utf16: '\u00D3' },
  { win1252: '\xc3\x94', utf16: '\u00D4' },
  { win1252: '\xc3\x95', utf16: '\u00D5' },
  { win1252: '\xc3\x96', utf16: '\u00D6' },
  { win1252: '\xc3\x97', utf16: '\u00D7' },
  { win1252: '\xc3\x98', utf16: '\u00D8' },
  { win1252: '\xc3\x99', utf16: '\u00D9' },
  { win1252: '\xc3\x9a', utf16: '\u00DA' },
  { win1252: '\xc3\x9b', utf16: '\u00DB' },
  { win1252: '\xc3\x9c', utf16: '\u00DC' },
  { win1252: '\xc3\x9d', utf16: '\u00DD' },
  { win1252: '\xc3\x9e', utf16: '\u00DE' },
  { win1252: '\xc3\x9f', utf16: '\u00DF' },
  { win1252: '\xc3\xa0', utf16: '\u00E0' },
  { win1252: '\xc3\xa1', utf16: '\u00E1' },
  { win1252: '\xc3\xa2', utf16: '\u00E2' },
  { win1252: '\xc3\xa3', utf16: '\u00E3' },
  { win1252: '\xc3\xa4', utf16: '\u00E4' },
  { win1252: '\xc3\xa5', utf16: '\u00E5' },
  { win1252: '\xc3\xa6', utf16: '\u00E6' },
  { win1252: '\xc3\xa7', utf16: '\u00E7' },
  { win1252: '\xc3\xa8', utf16: '\u00E8' },
  { win1252: '\xc3\xa9', utf16: '\u00E9' },
  { win1252: '\xc3\xaa', utf16: '\u00EA' },
  { win1252: '\xc3\xab', utf16: '\u00EB' },
  { win1252: '\xc3\xac', utf16: '\u00EC' },
  { win1252: '\xc3\xad', utf16: '\u00ED' },
  { win1252: '\xc3\xae', utf16: '\u00EE' },
  { win1252: '\xc3\xaf', utf16: '\u00EF' },
  { win1252: '\xc3\xb0', utf16: '\u00F0' },
  { win1252: '\xc3\xb1', utf16: '\u00F1' },
  { win1252: '\xc3\xb2', utf16: '\u00F2' },
  { win1252: '\xc3\xb3', utf16: '\u00F3' },
  { win1252: '\xc3\xb4', utf16: '\u00F4' },
  { win1252: '\xc3\xb5', utf16: '\u00F5' },
  { win1252: '\xc3\xb6', utf16: '\u00F6' },
  { win1252: '\xc3\xb7', utf16: '\u00F7' },
  { win1252: '\xc3\xb8', utf16: '\u00F8' },
  { win1252: '\xc3\xb9', utf16: '\u00F9' },
  { win1252: '\xc3\xba', utf16: '\u00FA' },
  { win1252: '\xc3\xbb', utf16: '\u00FB' },
  { win1252: '\xc3\xbc', utf16: '\u00FC' },
  { win1252: '\xc3\xbd', utf16: '\u00FD' },
  { win1252: '\xc3\xbe', utf16: '\u00FE' },
  { win1252: '\xc3\xbf', utf16: '\u00FF' },
];

// Byte-sequence -> character map, built once from the table above.
const win1252SequenceLookup = new Map(
  globalWin1252toUTF16table.map(({ win1252, utf16 }) => [win1252, utf16]),
);

// One alternation over every table key. Longer keys are listed first so that a
// shorter key can never shadow a longer one if the table is ever extended, and
// keys are escaped so they are always matched literally.
const win1252SequencePattern = new RegExp(
  [...win1252SequenceLookup.keys()]
    .sort((a, b) => b.length - a.length)
    .map((key) => key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
    .join('|'),
  'g',
);

/**
 * Replaces every byte sequence listed in globalWin1252toUTF16table with the
 * character it stands for.
 *
 * The input is scanned in a single pass, so text produced by one replacement
 * is never re-examined. Applying the table entries one after another would let
 * an already-decoded character combine with the character that follows it and
 * match a later entry (e.g. the bytes for "Ã" + NBSP collapsing into "à").
 *
 * @param {string} s - Text containing the table's byte sequences, one character per byte.
 * @returns {string} The text with every listed sequence replaced; all other
 *   characters are left untouched.
 */
function bruteForceWindows1252toUTF16(s) {
  return s.replace(win1252SequencePattern, (sequence) => win1252SequenceLookup.get(sequence));
}
Upvotes: 2
Reputation: 224942
Windows-1252 is a single-byte encoding, which is pretty convenient: you can just build a lookup table.
<?php
// Emit the UTF-8 form of every Windows-1252 byte value (0-255), in byte order.
$s = '';
foreach (range(0, 255) as $byte) {
    $utf8 = iconv('Windows-1252', 'UTF-8', chr($byte));
    // When iconv() reports failure for a byte, emit the UTF-8 encoding of the
    // replacement character (U+FFFD) in its place.
    $s .= ($utf8 === false) ? "\xef\xbf\xbd" : $utf8;
}
echo $s;
Assuming you want a regular JavaScript string as a result (rather than UTF-8) and that the input is a string where each character’s Unicode codepoint actually represents a Windows-1252 one, the resulting table can be read as UTF-8, put in a JavaScript string literal, and voilà:
/**
 * Decoding table for Windows-1252: the character at index `b` is the decoded
 * form of byte value `b`. Always exactly 256 characters long.
 *
 * The table is assembled from explicit escapes rather than one literal holding
 * the raw characters, because several entries are invisible (DEL U+007F,
 * no-break space U+00A0, soft hyphen U+00AD) and losing any of them in an
 * editor or copy/paste would silently shift every later index.
 */
const WINDOWS_1252 = (() => {
  // Bytes 0x80-0x9F are the only range where the table is not the identity
  // mapping. Byte values with no assigned character (0x81, 0x8D, 0x8F, 0x90,
  // 0x9D) decode to the replacement character U+FFFD.
  const highControlRange =
    '\u20ac\ufffd\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0160\u2039\u0152\ufffd\u017d\ufffd' +
    '\ufffd\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u0161\u203a\u0153\ufffd\u017e\u0178';

  let table = '';
  for (let byte = 0; byte < 0x100; byte++) {
    table +=
      byte >= 0x80 && byte <= 0x9f
        ? highControlRange.charAt(byte - 0x80)
        : String.fromCharCode(byte);
  }
  return table;
})();

/**
 * Decodes a "binary string" (one character per byte, char code = byte value)
 * holding Windows-1252 data into a regular JavaScript string.
 *
 * @param {string} binaryString - Input whose char codes are Windows-1252 byte values.
 * @returns {string} The decoded text. Characters whose code is above 0xFF are
 *   not byte values and are passed through unchanged instead of being dropped.
 */
function fromWindows1252(binaryString) {
  let text = '';
  for (let i = 0; i < binaryString.length; i++) {
    const codeUnit = binaryString.charCodeAt(i);
    // Indexing past the table would yield '' and silently delete the character.
    text += codeUnit < WINDOWS_1252.length ? WINDOWS_1252.charAt(codeUnit) : binaryString.charAt(i);
  }
  return text;
}
Upvotes: 5