Remove html tags using regex in javascript

Question

I want to remove all HTML tags except <code>a</code>, <code>img</code> and <code>iframe</code> from a document using this code: <pre><code>var regex = "<(?!a )(?!img )(?!iframe )([\s\S]*?)>"; var temp; while (source.match(regex)) { temp = source.match(regex)[0]; source = source.replace(temp, ""); } return source; </code></pre> It works in online regex testers, but for some reason it doesn't work on my page. For example, it returns the original string when the input is: <pre><code> "<span style="font-size: 16pt; line-height: 200%; color: rgb(131, 60, 11); background-image: initial; background-attachment: initial; background-size: initial; background-origin: initial; background-clip: initial; background-position: initial; background-repeat: initial;">test<o:p></o:p>" </code></pre> Please help!

plalx · Accepted Answer

You can do it without a regex. It's usually not a good idea to try parsing HTML with regexes, unless the use case is very simple...

The way I implemented stripHtmlElementsMatching, you can pass it any CSS selector and it will strip all matching entities.

Therefore, to remove anything but a, img, iframe you can pass :not(a):not(img):not(iframe).

PS: The htmlstripping-root custom tag is only there to avoid creating a parser element that interferes with the passed selector. For instance, if I used div as the parser element and you passed the selector div > div, all top-level divs would be removed even though they are not nested inside another div in your HTML string.

/**
 * Removes every element matching a CSS selector from an HTML string while
 * keeping each removed element's children (text and nested markup) in place.
 *
 * @param {string} text - HTML markup to process.
 * @param {string} [selector] - CSS selector naming the elements to unwrap.
 *     Any non-string value falls back to ':not(*)', which matches nothing,
 *     so the markup is returned as re-serialized by the parser.
 * @returns {string} The markup with all matching elements unwrapped.
 * @throws {DOMException} Propagated from querySelectorAll when `selector`
 *     is not a valid CSS selector.
 */
var stripHtmlElementsMatching = (function(doc) {

  // Parse inside a separate document rather than the live page. Documents
  // created through createHTMLDocument have no browsing context, so markup
  // assigned to innerHTML below should not run inline event handlers
  // (e.g. <img onerror=...>) while it is merely being filtered.
  var parsingDoc = doc.implementation.createHTMLDocument('');

  return function(text, selector) {

    // A custom tag name keeps the wrapper from ever matching (or helping to
    // match) a caller-supplied selector such as 'div > div'. No registration
    // is required: createElement accepts any valid hyphenated name, and the
    // old document.registerElement API no longer exists in browsers.
    var parser = parsingDoc.createElement('htmlstripping-root'),
        matchingEls, i, len, el;

    selector = typeof selector === 'string' ? selector : ':not(*)';
    parser.innerHTML = text;

    // querySelectorAll returns a static list, so replacing nodes while
    // iterating does not disturb the iteration.
    matchingEls = parser.querySelectorAll(selector);

    for (i = 0, len = matchingEls.length; i < len; i++) {
      el = matchingEls[i];
      // Swap the element for its own children, i.e. unwrap it.
      el.parentNode.replaceChild(newFragFrom(el.childNodes), el);
    }

    return parser.innerHTML;
  };

  /**
   * Moves all nodes of a live NodeList into a new DocumentFragment.
   *
   * @param {NodeList} nodes - Live child-node list to drain.
   * @returns {DocumentFragment} Fragment holding the moved nodes, in order.
   */
  function newFragFrom(nodes) {
    var frag = parsingDoc.createDocumentFragment();

    // `nodes` is live: appending nodes[0] elsewhere removes it from the
    // list, so the loop ends once every child has been moved.
    while (nodes.length) frag.appendChild(nodes[0]);

    return frag;
  }

})(document);


// Demo input for the stripping helper.
var text = 'test';

// Tag names whose elements must survive the stripping pass.
var tagsToKeep = ['a', 'img', 'iframe'];

// Chain one :not(tag) per kept tag, producing ':not(a):not(img):not(iframe)',
// i.e. a selector matching every element that is NOT on the keep list.
var sanitizeSelector = tagsToKeep.reduce(function(selectorSoFar, tagName) {
  return selectorSoFar + ':not(' + tagName + ')';
}, '');

// Unwrap everything the selector matches.
var sanitizedText = stripHtmlElementsMatching(text, sanitizeSelector);

// Append the result as a text node so it is shown literally, not parsed.
document.body.appendChild(document.createTextNode(sanitizedText));

Remove html tags using regex in javascript

Answers (2)

Related Questions