Reputation: 739
I'm validating HTML input (from an RSS feed) to be displayed in an MVC view.
I'm using the following whitelist approach to sanitise my HTML:
// Finds every candidate tag in the input: a '<' followed by any run of
// characters other than '>', terminated either by '>' or by the end of the
// input (so an unterminated trailing "<..." is also treated as a tag).
private static Regex _tags = new Regex("<[^>]*(>|$)",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);
// Accepts attribute-free opening or closing tags for: b, blockquote, code,
// dd, dt, dl, del, em, h1-h3, i, kbd, u, li, ol, p, pre, s, sub, sup, strong,
// strike and ul; or a <br>/<hr> tag with at most one whitespace character and
// an optional self-closing slash. The pattern is lower-case only; Sanitize
// lower-cases each tag before testing it. IgnorePatternWhitespace is what
// allows the pattern to be laid out over several lines.
private static Regex _whitelist = new Regex(@"
^</?(b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3)|i|kbd|u|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)>$|
^<(b|h)r\s?/?>$",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
// Accepts an anchor tag whose first attribute is href, where the href is
// either '#' followed by digits or an http/https/ftp URL built from a
// restricted character set, optionally followed by a non-empty title
// attribute; also accepts the closing </a>. No other attributes or attribute
// orders are accepted.
private static Regex _whitelist_a = new Regex(@"
^<a\s
href=""(\#\d+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
(\stitle=""[^""<>]+"")?\s?>$|
^</a>$",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
// Accepts an image tag whose first attribute is an http/https src built from
// the same restricted character set, followed by optional width and height
// (1-3 digits each), alt and title attributes in exactly this order, and an
// optional self-closing slash.
private static Regex _whitelist_img = new Regex(@"
^<img\s
src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
(\swidth=""\d{1,3}"")?
(\sheight=""\d{1,3}"")?
(\salt=""[^""<>]*"")?
(\stitle=""[^""<>]*"")?
\s?/?>$",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
/// <summary>
/// Removes from the raw HTML every tag that is not accepted by one of the
/// whitelist patterns (plain formatting tags, anchors, images). Only the tag
/// markup itself is removed; the text between tags is left in place.
/// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937
/// </summary>
/// <param name="html">Raw HTML to clean; null or empty is returned unchanged.</param>
/// <returns>The input with all non-whitelisted tags stripped out.</returns>
public static string Sanitize(string html)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    // Every candidate tag found in the untouched input.
    MatchCollection candidates = _tags.Matches(html);

    // Walk from the last match to the first so that removing a tag never
    // shifts the recorded index of a match that is still to be processed.
    int position = candidates.Count;
    while (position-- > 0)
    {
        Match candidate = candidates[position];

        // The whitelist patterns are lower-case only.
        string lowered = candidate.Value.ToLowerInvariant();

        bool permitted = _whitelist.IsMatch(lowered)
            || _whitelist_a.IsMatch(lowered)
            || _whitelist_img.IsMatch(lowered);
        if (permitted)
        {
            continue;
        }

        html = html.Remove(candidate.Index, candidate.Length);
    }

    return html;
}
I'd also like to allow video content from YouTube or Vimeo to be displayed using iframes or the HTML5 video tag.
Can anyone point me in the right direction for a regex that's a bit more flexible?
Here's my attempt for the iframe:
// Accepts an iframe tag whose first attribute is an http/https src hosted on
// player.vimeo.com or www.youtube.com, followed by optional width, height and
// frameborder attributes (1-3 digits each) and an optional bare
// allowfullscreen attribute, in exactly this order; also accepts the closing
// </iframe>. The pattern is lower-case only, so the tag must be lower-cased
// before it is tested.
//
// The dots in the host names are escaped on purpose: an unescaped '.' matches
// any character, which would let look-alike hosts such as "www-youtube.com"
// through the whitelist.
private static Regex _whitelist_iframe = new Regex(@"
^<iframe\s
src=""https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+""
(\swidth=""\d{1,3}"")?
(\sheight=""\d{1,3}"")?
(\sframeborder=""\d{1,3}"")?
(\sallowfullscreen)?
\s?>$|^</iframe>$",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
Upvotes: 0
Views: 799
Reputation: 739
The regex approach above was too strict, not to mention Kevin's well-made point!
Here's what I did:
I used the Html Agility Pack to parse the HTML and sanitised it as described in this Stack Overflow answer.
I also added some code to check the src attribute of images and iframes against a regex. (I'm pretty sure it could be done better.)
/// <summary>
/// Whitelist-based HTML sanitizer built on the Html Agility Pack.
/// Elements that are not whitelisted have their tags removed (their child
/// content is kept), attributes that are not whitelisted for an element are
/// dropped, and the URLs of images, iframes and links are validated.
/// </summary>
/// <remarks>
/// An instance keeps per-call working state in a field, so a single instance
/// should not be used from several threads at the same time.
/// </remarks>
public class HtmlSanitizer
{
    // Placeholder element name given to nodes that must be stripped. The
    // nodes are renamed first and removed in a second pass (see StripHtml).
    private const string RemovableNodeName = "removeableNode";

    // Absolute http/https URL made only of a restricted set of characters;
    // used for <img src>.
    private static readonly Regex _srcAttribute = new Regex(@"^https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
        | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // Absolute http/https URL on one of the allowed video hosts; used for
    // <iframe src>. The pattern is anchored at both ends and the dots in the
    // host names are escaped, so the allowed host has to be the real host of
    // the URL rather than merely appear somewhere inside the value.
    private static readonly Regex _iframeSrc = new Regex(@"^https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
        | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // Absolute http/https/ftp URL; used for <a href> so that script-bearing
    // schemes such as "javascript:" cannot survive sanitisation.
    private static readonly Regex _hrefAttribute = new Regex(@"^(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
        | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // Element name -> attribute names allowed on it (null = no attributes).
    private readonly IDictionary<string, string[]> _whitelist;

    // Element names that the current Sanitize call has marked for removal.
    private readonly List<string> _deletableNodesXpath = new List<string>();

    /// <summary>
    /// Creates a sanitizer with the built-in element/attribute whitelist.
    /// </summary>
    public HtmlSanitizer()
    {
        _whitelist = new Dictionary<string, string[]>
        {
            {"a", new[] {"href", "target", "title"}},
            {"img", new[] {"src", "alt", "width", "height"}},
            {"iframe", new[] {"src", "width", "height", "frameborder", "allowfullscreen" }},
            {"strong", null},
            {"em", null},
            {"blockquote", null},
            {"b", null},
            {"p", null},
            {"ul", null},
            {"ol", null},
            {"li", null},
            {"div", new[] {"align"}},
            {"strike", null},
            {"u", null},
            {"sub", null},
            {"sup", null},
            {"table", null},
            {"tr", null},
            {"td", null},
            {"th", null},
            {"dd", null},
            {"dt", null},
            {"dl", null},
            {"h1", null},
            {"h2", null},
            {"h3", null},
        };
    }

    /// <summary>
    /// Sanitises an HTML fragment against the whitelist.
    /// </summary>
    /// <param name="input">Raw HTML; may be null.</param>
    /// <returns>
    /// The sanitised HTML, or an empty string when <paramref name="input"/>
    /// is null, empty or whitespace only.
    /// </returns>
    public string Sanitize(string input)
    {
        if (input == null || input.Trim().Length < 1)
        {
            return string.Empty;
        }

        // Start every call from a clean slate. Without this, names recorded
        // by an earlier call would produce an XPath that matches nothing in
        // the current document, and the list would grow on every call.
        _deletableNodesXpath.Clear();

        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(input);
        SanitizeNode(htmlDocument.DocumentNode);
        string xPath = CreateXPath();
        return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), xPath);
    }

    /// <summary>
    /// Sanitises every child of <paramref name="parentNode"/>, last to first.
    /// </summary>
    private void SanitizeChildren(HtmlNode parentNode)
    {
        for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
        {
            SanitizeNode(parentNode.ChildNodes[i]);
        }
    }

    /// <summary>
    /// Applies the whitelist to one node and then recurses into its children.
    /// Non-whitelisted elements, and img/iframe elements whose src fails
    /// validation, are renamed to the placeholder so StripHtml can remove them.
    /// </summary>
    private void SanitizeNode(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            string[] allowedAttributes;
            if (!_whitelist.TryGetValue(node.Name, out allowedAttributes))
            {
                MarkForRemoval(node);
                if (node.HasChildNodes)
                {
                    SanitizeChildren(node);
                }
                return;
            }

            if (node.HasAttributes)
            {
                // Captured before the loop: MarkForRemoval renames the node,
                // so node.Name must not be used as a whitelist key afterwards.
                string elementName = node.Name;

                for (int i = node.Attributes.Count - 1; i >= 0; i--)
                {
                    HtmlAttribute currentAttribute = node.Attributes[i];
                    string attributeName = currentAttribute.Name;

                    if (allowedAttributes == null || !allowedAttributes.Contains(attributeName))
                    {
                        node.Attributes.Remove(currentAttribute);
                        continue;
                    }

                    string attributeValue = currentAttribute.Value ?? string.Empty;

                    if (attributeName == "src")
                    {
                        bool srcIsValid = true;
                        if (elementName == "img")
                        {
                            srcIsValid = _srcAttribute.IsMatch(attributeValue);
                        }
                        else if (elementName == "iframe")
                        {
                            srcIsValid = _iframeSrc.IsMatch(attributeValue);
                        }

                        if (!srcIsValid)
                        {
                            // The whole element is going away, so its
                            // remaining attributes no longer matter.
                            MarkForRemoval(node);
                            break;
                        }
                    }
                    else if (elementName == "a" && attributeName == "href"
                        && !_hrefAttribute.IsMatch(attributeValue))
                    {
                        // Keep the link text but drop the unsafe target.
                        node.Attributes.Remove(currentAttribute);
                    }
                }
            }
        }

        if (node.HasChildNodes)
        {
            SanitizeChildren(node);
        }
    }

    /// <summary>
    /// Renames <paramref name="node"/> to the placeholder element name and
    /// records that name (once) for the XPath built by CreateXPath.
    /// </summary>
    private void MarkForRemoval(HtmlNode node)
    {
        node.Name = RemovableNodeName;

        // Read the name back from the node so the recorded value is exactly
        // what the node now reports as its name.
        string recordedName = node.Name;
        if (!_deletableNodesXpath.Contains(recordedName))
        {
            _deletableNodesXpath.Add(recordedName);
        }
    }

    /// <summary>
    /// Re-parses <paramref name="html"/> and removes every node selected by
    /// <paramref name="xPath"/>, keeping the children of the removed nodes.
    /// </summary>
    /// <param name="html">Serialised HTML containing placeholder elements.</param>
    /// <param name="xPath">XPath selecting the placeholders; may be empty.</param>
    /// <returns>The HTML with the selected elements' tags removed.</returns>
    private string StripHtml(string html, string xPath)
    {
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);
        if (xPath.Length > 0)
        {
            HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes(xPath);

            // SelectNodes yields null rather than an empty collection when
            // nothing matches.
            if (invalidNodes != null)
            {
                foreach (HtmlNode node in invalidNodes)
                {
                    // true: keep the grandchildren, i.e. strip only the tag.
                    node.ParentNode.RemoveChild(node, true);
                }
            }
        }
        return htmlDoc.DocumentNode.WriteContentTo();
    }

    /// <summary>
    /// Builds an XPath union ("//a|//b") that selects every element name
    /// recorded for removal; returns an empty string when none were recorded.
    /// </summary>
    private string CreateXPath()
    {
        return string.Join("|", _deletableNodesXpath.Select(name => "//" + name).ToArray());
    }
}
Upvotes: 1