Reputation: 5500
I want to remove things like the following from HTML:
<!--[if gte mso 9]>
...
<![endif]-->
<!--[if gte mso 10]>
...
<![endif]-->
How can I do this in C# using HtmlAgilityPack?
I'm using
/// <summary>
/// Removes every node selected by the XPath query <c>"//" + tag</c>.
/// </summary>
/// <param name="node">Node on which the XPath query is evaluated.</param>
/// <param name="tag">Name appended to <c>"//"</c> to build the query.</param>
static void RemoveTag(HtmlNode node, string tag)
{
    // NOTE(review): the leading "//" is an absolute XPath, so this presumably
    // searches the whole document rather than only node's subtree - confirm.
    var matches = node.SelectNodes("//" + tag);
    if (matches == null)
    {
        // SelectNodes gave back null, so there is nothing to remove.
        return;
    }

    foreach (HtmlNode match in matches)
    {
        match.Remove();
    }
}
for normal tags.
Upvotes: 4
Views: 7121
Reputation: 5500
@Mark, I incorporated your third example to produce this, for reference:
/// <summary>
/// Loads the given HTML and strips script, link, style and meta elements,
/// as well as HTML comment nodes, from it.
/// </summary>
/// <param name="s">HTML markup to clean; the cleanup below is skipped when it is null.</param>
public static string CleanUpRteOutput(this string s)
{
    if (s != null)
    {
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(s);
        RemoveTag(doc, "script");
        RemoveTag(doc, "link");
        RemoveTag(doc, "style");
        RemoveTag(doc, "meta");
        // RemoveTag prepends "//", so this produces the XPath "//comment()",
        // whose node test matches comment nodes. A bare "comment" would only
        // match <comment> elements and leave <!-- ... --> nodes in place.
        RemoveTag(doc, "comment()");
...
and the RemoveTag function:
/// <summary>
/// Removes from the document every node selected by the XPath query <c>"//" + tag</c>.
/// </summary>
/// <param name="doc">Document whose root node is queried.</param>
/// <param name="tag">Name appended to <c>"//"</c> to build the query.</param>
static void RemoveTag(HtmlAgilityPack.HtmlDocument doc, string tag)
{
    HtmlAgilityPack.HtmlNodeCollection matches = doc.DocumentNode.SelectNodes("//" + tag);
    if (matches == null)
    {
        // A null result is treated as "no matches": nothing to remove.
        return;
    }

    foreach (var match in matches)
    {
        match.Remove();
    }
}
Upvotes: 0
Reputation: 283355
/// <summary>
/// Recursively visits <paramref name="node"/> and its descendants and removes
/// every node whose type is <see cref="HtmlNodeType.Comment"/>.
/// </summary>
/// <param name="node">Root of the subtree to process.</param>
public static void RemoveComments(HtmlNode node)
{
    // Work on a snapshot of the children: the recursive calls may remove
    // nodes from node.ChildNodes while we are stepping through them.
    var children = node.ChildNodes.ToArray();
    for (int i = 0; i < children.Length; i++)
    {
        RemoveComments(children[i]);
    }

    // Children are handled first; the node itself is checked last.
    if (node.NodeType != HtmlNodeType.Comment)
    {
        return;
    }

    node.Remove();
}
/// <summary>
/// Demo entry point: parses a sample HTML string, removes its comment nodes
/// via <c>RemoveComments</c>, and prints the resulting markup.
/// </summary>
static void Main(string[] args)
{
    // Sample input containing conditional comments and ordinary comments.
    string markup = @"<!--[if gte mso 9]>
...
<![endif]-->
<body>
<span>
<!-- comment -->
</span>
<!-- another comment -->
</body>
<!--[if gte mso 10]>
...
<![endif]-->";

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(markup);
    RemoveComments(document.DocumentNode);

    string cleaned = document.DocumentNode.OuterHtml;
    Console.WriteLine(cleaned);

    // Block until a line is read from standard input.
    Console.ReadLine();
}
Or a fun little LINQ-style:
/// <summary>
/// Lazily enumerates <paramref name="node"/> followed by all of its descendants,
/// each parent being yielded before its children.
/// </summary>
/// <param name="node">Root of the subtree to enumerate.</param>
/// <returns>The root node, then the nodes of each child subtree in order.</returns>
public static IEnumerable<HtmlNode> Walk(HtmlNode node)
{
    // The root comes first...
    yield return node;

    // ...then every child's subtree, flattened in child order.
    foreach (var descendant in node.ChildNodes.SelectMany(child => Walk(child)))
    {
        yield return descendant;
    }
}
...
// Collect the comment nodes into an array first, so that removing them
// does not disturb the traversal that found them.
var commentNodes = Walk(doc.DocumentNode).OfType<HtmlCommentNode>().ToArray();
foreach (var commentNode in commentNodes)
{
    commentNode.Remove();
}
Even easier (I forgot we could use XPath to find comment nodes):
// Sample input containing conditional comments and ordinary comments.
string html = @"
<!--[if gte mso 9]>
...
<![endif]-->
<body>
<span>
<!-- comment -->
</span>
<!-- another comment -->
</body>
<!--[if gte mso 10]>
...
<![endif]-->";

HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html);

// "//comment()" is an XPath node test that selects comment nodes.
HtmlNodeCollection commentNodes = doc.DocumentNode.SelectNodes("//comment()");
if (commentNodes != null)
{
    foreach (var commentNode in commentNodes)
    {
        commentNode.Remove();
    }
}
Upvotes: 12