Reputation: 1048
I am processing xhtml using javascript. I am getting the text content for a div node by concatenating the nodeValue of all child nodes where nodeType == Node.TEXT_NODE.
The resulting string sometimes contains a non-breaking space entity. How do I replace this with a regular space character?
My div looks like this...
<div><b>Expires On</b> Sep 30, 2009 06:30&nbsp;AM</div>
The following suggestions found on the web did not work:
// Attempt 1: the pattern is anchored with ^ and $, so it matches only when the
// whole string is a run of non-\xa0 characters with optional \xa0 padding on
// either side; in that case the entire match is replaced with "".
var cleanText = text.replace(/^\xa0*([^\xa0]*)\xa0*$/g,"");
// Attempt 2: translate named entities with the helper declared below.
// NOTE(review): this call is spelled replaceHtmlEntities, while the helper below
// is declared as replaceHtmlEntites and is only assigned after this line runs.
var cleanText = replaceHtmlEntities(text);
/**
 * Replaces the named entities &nbsp; &amp; &quot; &lt; &gt; in a string with
 * the characters listed in the `translate` lookup table.
 * The regex and table are created once and captured by the returned function.
 * @param {string} s - text that may contain the named entities
 * @returns {string} the text with each matched entity substituted
 */
var replaceHtmlEntites = (function() {
// Captures the entity name (without & and ;) so it can be used as a table key.
var translate_re = /&(nbsp|amp|quot|lt|gt);/g;
var translate = {
"nbsp": " ",
"amp" : "&",
"quot": "\"",
"lt" : "<",
"gt" : ">"
};
return function(s) {
return ( s.replace(translate_re, function(match, entity) {
return translate[entity];
}) );
}
})();
Any suggestions?
Upvotes: 91
Views: 161165
Reputation: 1362
Maybe this helps someone ...Pure javascript function.
// Sample records whose string fields contain HTML-escaped characters.
// NOTE(review): the entity spellings were reconstructed from a rendering that had
// already decoded them; the apostrophe is assumed to be &#39; - confirm.
const array = [
  {text: 'test &amp; &amp; &quot;', id: 1},
  {text: 'test222 &quot; &#39; 22222 &quot;', id: 2},
];

// Entity -> literal character lookup used by htmlConvertBack.
const HTML_ENTITY_TO_CHAR = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': '\'',
};

// One alternation so every entity is decoded in a single pass.
const HTML_ENTITY_RE = /&(?:amp|lt|gt|quot|#39);/g;

/**
 * Converts the five basic HTML entities in a string back to literal characters.
 *
 * A single regex pass is used so that text produced by one substitution is never
 * re-scanned (e.g. "&amp;lt;" becomes "&lt;", not "<").
 *
 * @param {string} value - text that may contain HTML entities
 * @returns {string} the decoded text
 */
function htmlConvertBack(value) {
  return value.replace(HTML_ENTITY_RE, (entity) => HTML_ENTITY_TO_CHAR[entity]);
}

console.log('in', JSON.stringify(array));
for (const object of array) {
  for (const key of Object.keys(object)) {
    // Only string fields are decoded; other values (such as the numeric id) are left untouched.
    if (typeof object[key] === 'string') {
      object[key] = htmlConvertBack(object[key]);
    }
  }
}
console.log('out', JSON.stringify(array));
Upvotes: 0
Reputation: 11038
A way to hack this in is to replace any empty line containing two or more spaces with some newlines and a token. Then, after the markdown has been parsed, replace the paragraphs that contain just that token with line breaks.
// Replace lines that contain nothing but two or more spaces with an "EMPTY_LINE"
// token. The lookahead leaves the trailing newline unconsumed so that the next
// whitespace-only line (if any) can still match starting from it.
rawMdText = rawMdText.replace(/\n {2,}(?=\n)/g, "\n\nEMPTY_LINE\n");
// Put <br> at the end of every other line that ends with two or more spaces.
// The g flag is required; without it only the first such line is converted.
rawMdText = rawMdText.replace(/ {2,}\n/g, "<br>\n");
// parse
let rawHtml = markdownParse(rawMdText);
// Work on a separate variable so the untouched parser output stays available.
let mdHtml = rawHtml;
// For any paragraph that ends with a <br> (injected above) and is followed by
// one or more EMPTY_LINE paragraphs leading to another paragraph, condense
// them into one paragraph: emit one <br> per EMPTY_LINE token in the match.
mdHtml = mdHtml.replace(/(<br>\s*<\/p>\s*)(<p>EMPTY_LINE<\/p>\s*)+(<p>)/g, (match) => {
  const emptyLineCount = match.match(/EMPTY_LINE/g).length;
  return "<br>".repeat(emptyLineCount);
});
// For the remaining stand-alone EMPTY_LINE paragraphs, just replace them.
mdHtml = mdHtml.replace(/<p>EMPTY_LINE<\/p>/g, "<br>");
What this does is find every line that contains nothing but two or more spaces. It uses a lookahead so that the next match starts at the right place; without it, the replacement breaks when two such lines appear in a row.
Then markdown will parse those lines into paragraphs containing nothing but the token "EMPTY_LINE". So you can go through the rawHtml and replace those with line breaks.
As a bonus, the replace function will condense all line-break paragraphs into the upper and lower paragraphs if they exist.
In effect, you'd use it like this:
A line with spaces at end
and empty lines with spaces in between will condense into a multi-line paragraph.
A line with no spaces at end
and lines with spaces in between will be two paragraphs with extra lines between.
And the output would be this:
<p>
A line with spaces at end<br>
<br>
<br>
and empty lines with spaces in between will condense into a multi-line paragraph.
</p>
<p>A line with no spaces at end</p>
<br>
<br>
<p>and lines with spaces in between will be two paragraphs with extra lines between.</p>
Upvotes: 0
Reputation: 16261
for me replace doesn't work... try this code:
// Swap every literal "&quot;" entity for a double-quote character.
// split/join replaces all occurrences without needing a regex.
str = str.split("&quot;").join('"');
Upvotes: 1
Reputation: 4182
This removes everything between & and ; (which all such entities have), in case you just want to get rid of them.
// Strip each HTML entity (named such as &nbsp;, or numeric such as &#160; / &#xA0;).
// The pattern matches one entity at a time; a greedy /&.*;/ would delete everything
// from the first "&" to the last ";", including the ordinary text in between.
// replace() returns a new string, so the result is assigned back.
text = text.replace(/&#?\w+;/g, '');
Upvotes: 1
Reputation: 51
/**
 * Decodes the named entities &nbsp; &amp; &quot; &lt; &gt; in this string.
 *
 * NOTE(review): this extends the built-in String.prototype (and keeps the
 * original "Entites" spelling) so that existing callers continue to work.
 *
 * @returns {string} a copy of the string with each matched entity substituted
 */
String.prototype.replaceHtmlEntites = function() {
  const translateRe = /&(nbsp|amp|quot|lt|gt);/g;
  const translate = {nbsp: " ", amp: "&", quot: "\"", lt: "<", gt: ">"};
  // String(this) yields a primitive whether or not the caller runs in strict mode.
  return String(this).replace(translateRe, (match, entity) => translate[entity]);
};

// The method must be assigned before it is called: assignments are not hoisted.
let text = "&quot; &amp;&lt;&gt;";
text = text.replaceHtmlEntites();
try this.....this worked for me
Upvotes: 5
Reputation: 324707
This is much easier than you're making it. The text node will not have the literal string "&nbsp;" in it; it'll have the corresponding character with code 160.
/**
 * Returns a copy of the given string in which every non-breaking space
 * (character code 160) has been swapped for a regular space.
 *
 * @param {string} str - text that may contain non-breaking spaces
 * @returns {string} the text with regular spaces instead
 */
function replaceNbsps(str) {
  var nbsp = String.fromCharCode(160);
  var pieces = str.split(nbsp);
  return pieces.join(" ");
}
// Write the converted text back into the node.
textNode.nodeValue = replaceNbsps(textNode.nodeValue);
UPDATE
Even easier:
// \u00a0 is the character with code 160; the g flag replaces every occurrence.
textNode.nodeValue = textNode.nodeValue.replace(/\u00a0/g, " ");
Upvotes: 188
Reputation: 804
i used this, and it worked:
// Replace each literal "&nbsp;" entity with a regular space. Substituting the
// empty string instead would join the neighbouring words together.
var cleanText = text.replace(/&nbsp;/g, " ");
Upvotes: 7
Reputation: 109503
I think when you define a function with "var foo = function() {...};", the function is only defined after that line. In other words, try this:
/**
 * Translates the named HTML entities &nbsp; &amp; &quot; &lt; &gt; in a string
 * into the characters listed in the lookup table.
 * The regex and the table are built once and captured by the returned function.
 *
 * @param {string} s - text that may contain the named entities
 * @returns {string} the text with each matched entity substituted
 */
var replaceHtmlEntities = (function() {
  // The capture group is the entity name without "&" and ";", used as the table key.
  var translate_re = /&(nbsp|amp|quot|lt|gt);/g;
  var translate = {
    "nbsp": " ",
    "amp" : "&",
    "quot": "\"",
    "lt"  : "<",
    "gt"  : ">"
  };
  return function(s) {
    return s.replace(translate_re, function(match, entity) {
      return translate[entity];
    });
  };
})();
// Alias under the earlier misspelled name so code that calls it keeps working.
var replaceHtmlEntites = replaceHtmlEntities;
// Swap every non-breaking space character (\xa0) for a regular space. An anchored
// pattern such as /^\xa0*([^\xa0]*)\xa0*$/ would either blank the whole string
// or leave it untouched.
var cleanText = text.replace(/\xa0/g, " ");
// Then decode literal entities, feeding in the result of the previous step rather
// than the original text; no second "var" because cleanText is already declared.
cleanText = replaceHtmlEntities(cleanText);
Edit: Also, only use "var" the first time you declare a variable (you're using it twice on the cleanText variable).
Edit 2: The problem is the spelling of the function name. You have "var replaceHtmlEntites =". It should be "var replaceHtmlEntities ="
Upvotes: 8
Reputation: 9332
That first line is pretty messed up. It only needs to be:
// \xA0 is the character with code 160; the g flag replaces every occurrence with a space.
var cleanText = text.replace(/\xA0/g,' ');
That should be all you need.
Upvotes: 17
Reputation: 24177
If you only need to replace &nbsp; then you can use a far simpler regex:
// Replace every literal "&nbsp;" entity in the text with a regular space.
var textWithNBSpaceReplaced = originalText.replace(/&nbsp;/g, ' ');
Also, there is a typo in your div example: it says &nnbsp; instead of &nbsp;.
Upvotes: 43