Reputation: 162
In my program I have a string variable named content, to which I have assigned a small HTML document. For example:
// Sample HTML document. The double quotes around the href value are escaped
// with a backslash so that the string literal compiles.
string content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href=\"mailto:[email protected]\">[email protected]</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";
From this I want to extract only the text "This is a Medium Header Send me mail at [email protected] is a new sentence without a paragraph break."
This text is inside the H2 tag. How can I get it using C#?
Upvotes: 2
Views: 106
Reputation: 460288
Don't use string methods or regex to parse HTML. You can use HtmlAgilityPack instead.
// Sample HTML document to extract the header text from.
string content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href=\"mailto:[email protected]\">[email protected]</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";
// Parse the markup with HtmlAgilityPack instead of string methods or regex.
var doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(content);
// InnerText of the first H2 element; FirstOrDefault (requires System.Linq) yields
// null instead of throwing InvalidOperationException when the document has no H2.
string headerText = doc.DocumentNode.Descendants("H2").FirstOrDefault()?.InnerText;
Result:
This is a Medium Header Send me mail [email protected] is a new sentence without a paragraph break.
Upvotes: 7
Reputation: 2761
HtmlFormatHelper.cs:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace Tools
{
/// <summary>
/// A set of utilities for converting and formatting HTML text.
/// </summary>
public static class HtmlFormatHelper
{
    // Matches <br>, <br/>, <p>, <p/> (with any trailing whitespace) and </p>;
    // every match is turned into a line break.
    private static readonly Regex _regexLineBreak = new Regex(@"<(br|BR|p|P)\s{0,1}\/{0,1}>\s*|</[pP]>", RegexOptions.Singleline);

    // Matches any tag, including an unterminated one that runs to the end of the input.
    private static readonly Regex _regexStripFormatting = new Regex(@"<[^>]*(>|$)", RegexOptions.Singleline);

    // Matches a run of non-word characters sitting between a '>' and the next '<'.
    private static readonly Regex _regexTagWhiteSpace = new Regex(@"(>|$)(\W|\n|\r)+<", RegexOptions.Singleline);

    // Matches an anchor element; group 1 is the href value, group 2 is the link text.
    private static readonly Regex _regexHyperlink = new Regex(@"<a\s+[^>]*href\s*=\s*[""']?([^""'>]+)[""']?[^>]*>([^<]+)</a>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    /// <summary>
    /// Converts HTML to plain text: line-break tags become new lines, all other
    /// tags are removed and character entities are decoded.
    /// </summary>
    /// <param name="html">The HTML markup to convert; may be null.</param>
    /// <returns>The plain text, or null when <paramref name="html"/> is null.</returns>
    public static string HtmlToPlainText(string html)
    {
        if (html == null)
        {
            // Mirrors SoftHtmlEncode: null in, null out (Regex.Replace would throw on null).
            return null;
        }

        var text = _regexTagWhiteSpace.Replace(html, "><");
        text = _regexLineBreak.Replace(text, Environment.NewLine);
        text = _regexStripFormatting.Replace(text, string.Empty);

        // Decode entities only after the tags are gone. Decoding first would turn
        // encoded text such as "&lt;" into a literal '<' that the tag-stripping
        // regex would then delete together with the text that follows it.
        return System.Net.WebUtility.HtmlDecode(text);
    }

    /// <summary>
    /// Converts HTML to plain text with "smart" link formatting: a hyperlink whose
    /// target differs from its text is rendered as "text (url)".
    /// </summary>
    /// <param name="html">The HTML markup to convert; may be null.</param>
    /// <returns>The plain text, or null when <paramref name="html"/> is null.</returns>
    public static string HtmlToPlainTextSmart(string html)
    {
        if (html == null)
        {
            return null;
        }

        // Rewrite hyperlinks before the generic tag stripping removes them.
        string withLinks = _regexHyperlink.Replace(html, match =>
        {
            string url = match.Groups[1].Value.Trim();
            string text = match.Groups[2].Value.Trim();

            if (url.Length == 0 || string.Equals(url, text, StringComparison.InvariantCultureIgnoreCase))
            {
                // No target, or the target equals the link text: keep the anchor as is;
                // the tag stripping in HtmlToPlainText reduces it to its text.
                return match.Value;
            }

            // Target and text differ: show both.
            return string.Format("{0} ({1})", text, url);
        });

        return HtmlToPlainText(withLinks);
    }

    /// <summary>
    /// Encodes HTML in "soft" mode: only the angle brackets are replaced with
    /// their character entities; every other character is copied unchanged.
    /// </summary>
    /// <param name="html">The text to encode; may be null.</param>
    /// <returns>The encoded text, or null when <paramref name="html"/> is null.</returns>
    public static string SoftHtmlEncode(string html)
    {
        if (html == null)
        {
            return null;
        }

        var sb = new StringBuilder(html.Length);
        foreach (char c in html)
        {
            switch (c)
            {
                case '<':
                    sb.Append("&lt;");
                    break;
                case '>':
                    sb.Append("&gt;");
                    break;
                default:
                    sb.Append(c);
                    break;
            }
        }

        return sb.ToString();
    }
}
}
How to use:
// input string
string content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href=\"mailto:[email protected]\">[email protected]</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";
// extract the inner HTML of the body element; Singleline lets '.' span line breaks
// and the optional attribute part accepts tags such as <body class="x">.
// Groups[1].Value is an empty string when the document has no body element.
string htmlBody = Regex.Match(content, @"<body(?:\s[^>]*)?>(.*)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline).Groups[1].Value;
// plain text
string plainText = Tools.HtmlFormatHelper.HtmlToPlainText(htmlBody);
//: This is a Medium Header Send me mail [email protected] is a new sentence without a paragraph break.
// plain text (with url in brackets)
string plainTextSmart = Tools.HtmlFormatHelper.HtmlToPlainTextSmart(htmlBody);
//: This is a Medium Header Send me mail [email protected] (mailto:[email protected]).This is a new sentence without a paragraph break.
Upvotes: -1