Axe
Axe

Reputation: 739

Whitelisting regex to allow YouTube iframes

I'm validating HTML input (from an RSS feed) that will be displayed in an MVC view.

I'm using the following whitelist approach to sanitise my HTML:

// The patterns below are compiled once and shared; they are readonly so the cached
// instances cannot be swapped out after type initialisation.
// None of the whitelist patterns uses RegexOptions.IgnoreCase, so they only match
// lower-case tag text.

// Matches anything that looks like a tag: '<', any run of non-'>' characters, then
// either '>' or the end of the input (an unterminated tag).
private static readonly Regex _tags = new Regex("<[^>]*(>|$)",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

// Attribute-free opening/closing formatting tags, plus <br> and <hr> (optionally self-closed).
private static readonly Regex _whitelist = new Regex(@"
^</?(b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3)|i|kbd|u|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)>$|
^<(b|h)r\s?/?>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

// Anchors: href must be '#<digits>' or an http/https/ftp URL, optionally followed by a
// title attribute, in exactly that order; also matches the closing </a>.
private static readonly Regex _whitelist_a = new Regex(@"
^<a\s
href=""(\#\d+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
(\stitle=""[^""<>]+"")?\s?>$|
^</a>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

// Images: src must be an http/https URL; width, height, alt and title are optional but
// must appear in exactly that order.
private static readonly Regex _whitelist_img = new Regex(@"
^<img\s
src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
(\swidth=""\d{1,3}"")?
(\sheight=""\d{1,3}"")?
(\salt=""[^""<>]*"")?
(\stitle=""[^""<>]*"")?
\s?/?>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);


/// <summary>
/// Removes every tag from the supplied HTML that is not accepted by one of the
/// whitelist patterns (<c>_whitelist</c>, <c>_whitelist_a</c>, <c>_whitelist_img</c>).
/// Only the tag markup itself is removed; text between tags is left in place.
/// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937
/// </summary>
/// <param name="html">Raw HTML to clean; may be null or empty.</param>
/// <returns>
/// The input with non-whitelisted tags removed, or the input unchanged when it is
/// null or empty.
/// </returns>
public static string Sanitize(string html)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    // All match offsets refer to the original string, so tags are processed from the
    // last one to the first: removing a later tag never shifts an earlier offset.
    MatchCollection candidates = _tags.Matches(html);
    int position = candidates.Count;
    while (--position >= 0)
    {
        Match candidate = candidates[position];

        // The whitelist patterns are case-sensitive, so compare a lower-cased copy.
        string normalized = candidate.Value.ToLowerInvariant();

        bool allowed = _whitelist.IsMatch(normalized)
                       || _whitelist_a.IsMatch(normalized)
                       || _whitelist_img.IsMatch(normalized);
        if (allowed)
        {
            continue;
        }

        html = html.Remove(candidate.Index, candidate.Length);
    }

    return html;
}

I'd also like to allow video content from YouTube or Vimeo to be displayed using iframes or the HTML5 video tag.

Can anyone point me in the right direction for a regex that's a bit more flexible?

Here's my attempt for the iframe:

// Iframes: src must be an http/https URL on player.vimeo.com or www.youtube.com; width,
// height, frameborder and allowfullscreen are optional but must appear in exactly that
// order. Also matches the closing </iframe>.
// The dots in the host names are escaped: an unescaped '.' matches any character, which
// would let look-alike hosts such as "wwwxyoutube.com" through.
private static readonly Regex _whitelist_iframe = new Regex(@"
             ^<iframe\s
            src=""https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+""
            (\swidth=""\d{1,3}"")?
            (\sheight=""\d{1,3}"")?
            (\sframeborder=""\d{1,3}"")?
            (\sallowfullscreen)?
            \s?>$|^</iframe>$",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

Upvotes: 0

Views: 799

Answers (1)

Axe
Axe

Reputation: 739

The regex approach above was too strict, not to mention Kevin's well-made point!

Here's what I did:

I used the Html Agility Pack to parse the HTML, and sanitised it as described in this Stack Overflow answer.

I also added some code to check the src attribute of images and iframes against a regex. (I'm pretty sure it could be done better.)

/// <summary>
/// Whitelist-based HTML sanitizer built on HtmlAgilityPack.
/// Elements that are not in the whitelist are unwrapped (the tag is removed, its children
/// are kept), attributes that are not whitelisted for their element are dropped, and
/// img/iframe elements whose <c>src</c> fails validation are removed.
/// </summary>
/// <remarks>
/// Each call to <see cref="Sanitize"/> mutates instance state, so a single instance must
/// not be used by several callers at the same time.
/// </remarks>
public class HtmlSanitizer
{
    // Placeholder element name given to every node that must be stripped in the second pass.
    private const string RemovableNodeName = "removeableNode";

    // img src: must be a complete http/https URL made only of the listed characters.
    private static readonly Regex _srcAttribute = new Regex(@"^https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
                         | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // iframe src: must be a complete http/https URL on one of the allowed video hosts.
    // Anchored with ^...$ so the allowed URL cannot merely appear somewhere inside a
    // hostile value, and the dots are escaped so they only match a literal '.'.
    private static readonly Regex _iframeSrc = new Regex(@"^https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
                         | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // a href: the raw value must start with a fragment, a rooted path, or one of the listed
    // schemes. A literal-prefix whitelist is used so that script URLs (including ones that
    // hide the scheme behind character entities or leading whitespace) are rejected.
    private static readonly Regex _hrefAttribute = new Regex(@"^(\#|/|(https?|ftp)://|mailto:)",
        RegexOptions.IgnoreCase | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    // Element name -> attribute names allowed on it (null means no attributes allowed).
    private readonly IDictionary<string, string[]> _whitelist;

    // Element names to strip in the second pass; rebuilt on every Sanitize call.
    private readonly List<string> _deletableNodesXpath = new List<string>();

    /// <summary>
    /// Creates a sanitizer with the built-in element/attribute whitelist.
    /// </summary>
    public HtmlSanitizer()
    {
        _whitelist = new Dictionary<string, string[]>
                        {
                            {"a", new[] {"href", "target", "title"}},
                            {"img", new[] {"src", "alt", "width", "height"}},
                            {"iframe", new[] {"src", "width", "height", "frameborder", "allowfullscreen" }},
                            {"strong", null},
                            {"em", null},
                            {"blockquote", null},
                            {"b", null},
                            {"p", null},
                            {"ul", null},
                            {"ol", null},
                            {"li", null},
                            {"div", new[] {"align"}},
                            {"strike", null},
                            {"u", null},
                            {"sub", null},
                            {"sup", null},
                            {"table", null},
                            {"tr", null},
                            {"td", null},
                            {"th", null},
                            {"dd", null},
                            {"dt", null},
                            {"dl", null},
                            {"h1", null},
                            {"h2", null},
                            {"h3", null},
                        };
    }

    /// <summary>
    /// Sanitizes an HTML fragment against the whitelist.
    /// </summary>
    /// <param name="input">Raw HTML; may be null, empty or whitespace.</param>
    /// <returns>
    /// The sanitized HTML, or <see cref="string.Empty"/> when <paramref name="input"/> is
    /// null or contains only whitespace.
    /// </returns>
    public string Sanitize(string input)
    {
        if (input == null || input.Trim().Length < 1)
        {
            return string.Empty;
        }

        // Start from a clean slate so names marked during a previous call cannot leak
        // into this call's XPath expression.
        _deletableNodesXpath.Clear();

        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(input);
        SanitizeNode(htmlDocument.DocumentNode);
        string xPath = CreateXPath();

        return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), xPath);
    }

    /// <summary>Sanitizes every child of <paramref name="parentNode"/>, last child first.</summary>
    private void SanitizeChildren(HtmlNode parentNode)
    {
        if (!parentNode.HasChildNodes)
        {
            return;
        }

        for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
        {
            SanitizeNode(parentNode.ChildNodes[i]);
        }
    }

    /// <summary>
    /// Sanitizes one node and, recursively, its descendants. Non-element nodes are left
    /// untouched apart from the recursion into their children.
    /// </summary>
    private void SanitizeNode(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            string[] allowedAttributes;
            if (!_whitelist.TryGetValue(node.Name, out allowedAttributes))
            {
                // Unknown element: mark it for stripping but still clean what it contains,
                // because its children survive the strip.
                MarkForRemoval(node);
                SanitizeChildren(node);
                return;
            }

            if (node.HasAttributes)
            {
                SanitizeAttributes(node, allowedAttributes);
            }
        }

        SanitizeChildren(node);
    }

    /// <summary>
    /// Drops attributes that are not whitelisted for the element and validates the URL
    /// attributes (img/iframe <c>src</c>, anchor <c>href</c>).
    /// </summary>
    /// <param name="node">A whitelisted element.</param>
    /// <param name="allowedAttributes">Attribute names allowed on the element, or null for none.</param>
    private void SanitizeAttributes(HtmlNode node, string[] allowedAttributes)
    {
        // Captured up front: the node may be renamed below, and the whitelist must never
        // be consulted with the placeholder name.
        string elementName = node.Name;

        for (int i = node.Attributes.Count - 1; i >= 0; i--)
        {
            HtmlAttribute currentAttribute = node.Attributes[i];

            if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name))
            {
                node.Attributes.Remove(currentAttribute);
                continue;
            }

            string value = currentAttribute.Value ?? string.Empty;

            if (currentAttribute.Name == "src" && !IsAllowedSource(elementName, value))
            {
                // The whole element goes away, so its remaining attributes are irrelevant.
                MarkForRemoval(node);
                return;
            }

            if (elementName == "a" && currentAttribute.Name == "href" && !_hrefAttribute.IsMatch(value))
            {
                // Keep the anchor and its text, drop only the unsafe target.
                node.Attributes.Remove(currentAttribute);
            }
        }
    }

    /// <summary>Returns whether <paramref name="value"/> is an acceptable <c>src</c> for the given element.</summary>
    private static bool IsAllowedSource(string elementName, string value)
    {
        if (elementName == "img")
        {
            return _srcAttribute.IsMatch(value);
        }

        if (elementName == "iframe")
        {
            return _iframeSrc.IsMatch(value);
        }

        return true;
    }

    /// <summary>
    /// Renames the node to the placeholder element and records that name (once) for the
    /// strip pass.
    /// </summary>
    private void MarkForRemoval(HtmlNode node)
    {
        node.Name = RemovableNodeName;

        // Record the name as the node now reports it, so the XPath uses the same spelling.
        if (!_deletableNodesXpath.Contains(node.Name))
        {
            _deletableNodesXpath.Add(node.Name);
        }
    }

    /// <summary>
    /// Re-parses the intermediate markup and removes every node selected by
    /// <paramref name="xPath"/>, keeping the removed nodes' children in place.
    /// </summary>
    /// <param name="html">Markup produced by the first pass.</param>
    /// <param name="xPath">XPath union of the element names to strip; empty when there are none.</param>
    /// <returns>The remaining markup.</returns>
    private string StripHtml(string html, string xPath)
    {
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);
        if (xPath.Length > 0)
        {
            HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes(xPath);

            // SelectNodes yields null rather than an empty collection when nothing matches.
            if (invalidNodes != null)
            {
                foreach (HtmlNode node in invalidNodes)
                {
                    node.ParentNode.RemoveChild(node, true);
                }
            }
        }
        return htmlDoc.DocumentNode.WriteContentTo();
    }

    /// <summary>
    /// Builds an XPath union ("//name1|//name2|...") over the recorded element names.
    /// </summary>
    /// <returns>The XPath expression, or an empty string when nothing was recorded.</returns>
    private string CreateXPath()
    {
        return string.Join("|", _deletableNodesXpath.Select(name => "//" + name).ToArray());
    }
}

Upvotes: 1

Related Questions