rik
rik

Reputation: 491

How to decode HTML entities

I have a string variable containing HTML entities:

// Sample input: "&#x26;" is the hexadecimal numeric entity for "&".
var str = 'Some text &#x26; text';

I want to convert (decode) it to original characters:

Some text & text.

JavaScript doesn't have a built-in function to achieve the desired result. I can't use jQuery or DOM objects because I need it to work in Google Apps Script.

How can I do that in a simple way?

Upvotes: 13

Views: 8289

Answers (3)

Tiago Gouvêa
Tiago Gouvêa

Reputation: 16780

In 2024 none of those solutions (XML, XmlService) worked 😟 for me, so I did it 💪 manually.

Following is my hard work solution.

To use it just call const fixedString = htmlEntitiesDecode(crazyEncodedString);

/**
 * Decodes HTML entities in a string.
 *
 * Entities listed in the module-level `entities` table are replaced by their
 * `character`. Numeric references that are not in the table (decimal such as
 * `&#233;` or hexadecimal such as `&#x26;`) are decoded from their code point.
 * The string is decoded in a single pass, so the output of one substitution
 * is never decoded a second time (`&amp;lt;` yields `&lt;`, not `<`).
 *
 * @param {string} input - Text that may contain HTML entities.
 * @returns {string} The text with every recognised entity decoded; anything
 *   unrecognised is left untouched.
 */
function htmlEntitiesDecode(input) {
  // Build the lookup once per call. The first row for a given entity wins.
  const lookup = new Map();
  entities.forEach(function (substitution) {
    // Skip empty keys: an empty alternative would match at every position.
    if (substitution.entity !== '' && !lookup.has(substitution.entity)) {
      lookup.set(substitution.entity, substitution.character);
    }
  });

  // Entities are literal text, so regex metacharacters must be escaped.
  // Longer keys are tried first so a short key can never shadow a longer one.
  const literals = Array.from(lookup.keys())
    .sort(function (a, b) {
      return b.length - a.length;
    })
    .map(function (entity) {
      return entity.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    });

  // Table entries come first in the alternation so an explicit mapping
  // (e.g. "&#149;") takes precedence over the generic numeric fallback.
  const alternatives = [];
  if (literals.length > 0) {
    alternatives.push('(?:' + literals.join('|') + ')');
  }
  alternatives.push('&#(\\d+);', '&#[xX]([0-9a-fA-F]+);');
  const pattern = new RegExp(alternatives.join('|'), 'g');

  return input.replace(pattern, function (match, decimal, hex) {
    if (lookup.has(match)) {
      return lookup.get(match);
    }

    const codePoint =
      decimal !== undefined
        ? Number.parseInt(decimal, 10)
        : Number.parseInt(hex, 16);

    // Leave the reference as-is when it does not name a usable character:
    // NUL, values past the Unicode range, lone surrogates, and the
    // 0x80-0x9F block (HTML remaps that block instead of using the raw code
    // point, so only explicit table entries are trusted there).
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    const isRemappedBlock = codePoint >= 0x80 && codePoint <= 0x9f;
    if (
      codePoint < 1 ||
      codePoint > 0x10ffff ||
      isSurrogate ||
      isRemappedBlock
    ) {
      return match;
    }
    return String.fromCodePoint(codePoint);
  });
}

/**
 * Substitution table used to decode HTML entities: each row maps the entity
 * text as it appears in the encoded string to the character it stands for.
 *
 * Rows are unique. "&amp;" is deliberately the LAST row: a decoder that
 * applies the rows one after another would otherwise turn "&amp;ldquo;" into
 * "&ldquo;" and then decode it a second time.
 */
const entities = [
  // Accented Latin letters and ligatures
  { entity: "&Agrave;", character: "À" },
  { entity: "&Aacute;", character: "Á" },
  { entity: "&Acirc;", character: "Â" },
  { entity: "&Atilde;", character: "Ã" },
  { entity: "&Auml;", character: "Ä" },
  { entity: "&Aring;", character: "Å" },
  { entity: "&agrave;", character: "à" },
  { entity: "&aacute;", character: "á" },
  { entity: "&acirc;", character: "â" },
  { entity: "&atilde;", character: "ã" },
  { entity: "&auml;", character: "ä" },
  { entity: "&aring;", character: "å" },
  { entity: "&AElig;", character: "Æ" },
  { entity: "&aelig;", character: "æ" },
  { entity: "&szlig;", character: "ß" },
  { entity: "&Ccedil;", character: "Ç" },
  { entity: "&ccedil;", character: "ç" },
  { entity: "&Egrave;", character: "È" },
  { entity: "&Eacute;", character: "É" },
  { entity: "&Ecirc;", character: "Ê" },
  { entity: "&Euml;", character: "Ë" },
  { entity: "&egrave;", character: "è" },
  { entity: "&eacute;", character: "é" },
  { entity: "&ecirc;", character: "ê" },
  { entity: "&euml;", character: "ë" },
  { entity: "&fnof;", character: "ƒ" },
  { entity: "&Igrave;", character: "Ì" },
  { entity: "&Iacute;", character: "Í" },
  { entity: "&Icirc;", character: "Î" },
  { entity: "&Iuml;", character: "Ï" },
  { entity: "&igrave;", character: "ì" },
  { entity: "&iacute;", character: "í" },
  { entity: "&icirc;", character: "î" },
  { entity: "&iuml;", character: "ï" },
  { entity: "&Ntilde;", character: "Ñ" },
  { entity: "&ntilde;", character: "ñ" },
  { entity: "&Ograve;", character: "Ò" },
  { entity: "&Oacute;", character: "Ó" },
  { entity: "&Ocirc;", character: "Ô" },
  { entity: "&Otilde;", character: "Õ" },
  { entity: "&Ouml;", character: "Ö" },
  { entity: "&ograve;", character: "ò" },
  { entity: "&oacute;", character: "ó" },
  { entity: "&ocirc;", character: "ô" },
  { entity: "&otilde;", character: "õ" },
  { entity: "&ouml;", character: "ö" },
  { entity: "&Oslash;", character: "Ø" },
  { entity: "&oslash;", character: "ø" },
  { entity: "&OElig;", character: "Œ" },
  { entity: "&oelig;", character: "œ" },
  { entity: "&Scaron;", character: "Š" },
  { entity: "&scaron;", character: "š" },
  { entity: "&Ugrave;", character: "Ù" },
  { entity: "&Uacute;", character: "Ú" },
  { entity: "&Ucirc;", character: "Û" },
  { entity: "&Uuml;", character: "Ü" },
  { entity: "&ugrave;", character: "ù" },
  { entity: "&uacute;", character: "ú" },
  { entity: "&ucirc;", character: "û" },
  { entity: "&uuml;", character: "ü" },
  { entity: "&micro;", character: "µ" },
  { entity: "&times;", character: "×" },
  { entity: "&Yacute;", character: "Ý" },
  { entity: "&Yuml;", character: "Ÿ" },
  { entity: "&yacute;", character: "ý" },
  { entity: "&yuml;", character: "ÿ" },

  // Symbols and punctuation
  { entity: "&deg;", character: "°" },
  { entity: "&dagger;", character: "†" },
  { entity: "&Dagger;", character: "‡" },
  { entity: "&lt;", character: "<" },
  { entity: "&gt;", character: ">" },
  { entity: "&#177;", character: "±" },
  { entity: "&#171;", character: "«" },
  { entity: "&#187;", character: "»" },
  { entity: "&#191;", character: "¿" },
  { entity: "&#161;", character: "¡" },
  { entity: "&#183;", character: "·" },
  { entity: "&#149;", character: "•" },
  { entity: "&#153;", character: "™" },
  { entity: "&copy;", character: "©" },
  { entity: "&reg;", character: "®" },
  { entity: "&#167;", character: "§" },
  { entity: "&#182;", character: "¶" },
  { entity: "&quot;", character: "\"" },
  { entity: "&nbsp;", character: " " },
  // NOTE(review): decoded to an ASCII hyphen rather than an en dash;
  // presumably intentional, kept unchanged.
  { entity: "&ndash;", character: "-" },
  { entity: "&ldquo;", character: "“" },
  { entity: "&bull;", character: "•" },
  { entity: "&rdquo;", character: "”" },
  { entity: "&ordf;", character: "ª" },
  { entity: "&ordm;", character: "º" },

  // Keep last (see the note in the header comment).
  { entity: "&amp;", character: "&" },
];

Here is the gist with the code.

If I missed some symbol, please, comment here or there.

Upvotes: 2

vstepaniuk
vstepaniuk

Reputation: 868

You can use the Drive API Advanced Service for this. First you need to enable it. Then, when you insert (create) a new Google Doc file with data from an HTML blob, it automatically renders the HTML in your Doc. After that, you get the text of your Doc with the following code:

/**
 * Converts an HTML string to plain text by uploading it as a temporary
 * Google Doc and reading the Doc's body text back.
 *
 * Uses the Drive advanced service (`Drive.Files`), which has to be enabled
 * for the script.
 *
 * @param {string} html - HTML markup to convert.
 * @returns {string} The body text of the temporary document.
 */
function htmltotext(html) {
  const id = Drive.Files.insert(
    { title: 'temp', mimeType: MimeType.GOOGLE_DOCS },
    Utilities.newBlob(html, MimeType.HTML)
  ).id;

  try {
    const doc = DocumentApp.openById(id);
    const text = doc.getBody().getText();
    doc.saveAndClose();
    return text;
  } finally {
    // Remove the temp file completely (avoiding the trash). Running this in
    // `finally` keeps a failed open/read from leaving the file behind.
    Drive.Files.remove(id);
  }
}

Thanks to @tanaike for the suggestion.

Upvotes: 0

rik
rik

Reputation: 491

You can use built-in Xml Services (reference):

// Wrap the text in a throwaway <d> root element so it can be parsed as an
// XML document, then read the root element's text back.
// NOTE(review): presumably this throws if `str` is not well-formed XML
// content (e.g. a bare "<" or an entity XML does not define) — confirm.
var str = 'Some text &#x26; text';
var decode = XmlService.parse('<d>' + str + '</d>');
var strDecoded = decode.getRootElement().getText();

or you can use built-in E4X XML class.

// Same wrapping approach, using the E4X `XML` class instead of XmlService.
// NOTE(review): E4X is a non-standard extension; the Apps Script V8 runtime
// reportedly does not support the XML object — confirm before relying on
// this variant.
var str = 'Some text &#x26; text';
var decode = new XML('<d>' + str + '</d>');
var strDecoded = decode.toString();

Upvotes: 21

Related Questions