Reputation: 11
I'm having a problem. I have a survey builder written with different types of questions (list, checkbox, dropdowns...) and one with a table to fill. The problem is now to store the table I have 3 different tables on my DB and to create the table later is a bit complex. I'm trying to do it, using a JavaScript editor so the user can create the table as in word. On my code behind I receive the table markup as an String and I want to store it as JSON on the DB.
I have this code here to convert the table to a DataSet object and then from here I can convert to XML and from here to JSON using the library JSON.NET. All of this is working fine, but the problem is when the table have "colspan" & "rowspan" parameters, is not working with them. Can you help me to finish this piece of code so everyone can have this as an example to parse HTML table to JSON in C#.
protected void Page_Load(object sender, EventArgs e)
{
string table = @"<table>
<tbody><tr>
<th rowspan='4'>Project</th><th rowspan='4'>Country</th><th colspan='3' rowspan='1'>Header 1</th><th colspan='2' rowspan='1'>Header 2</th>
</tr><tr>
<th colspan='1' rowspan='1'>Child Header 1</th><th colspan='2' rowspan='1'>Child Header 2</th><th colspan='1' rowspan='3'>Child Header 3</th><th colspan='1' rowspan='3'>Child Header 4</th>
</tr><tr>
<th colspan='1' rowspan='2'>Child Child Header 1</th><th colspan='1' rowspan='1'>tee</th><th colspan='1' rowspan='2'>ssss</th>
</tr><tr>
<th colspan='1' rowspan='1'>aas</th>
</tr><tr>
<td>EUS</td><td>ES</td><td> </td><td> </td><td> </td><td> </td><td> </td>
</tr><tr>
<td>ARP</td><td>IE</td><td> </td><td> </td><td> </td><td> </td><td> </td>
</tr><tr>
<td>ARM</td><td>UK</td><td> </td><td> </td><td> </td><td> </td><td> </td>
</tr><tr>
<td>SMRT</td><td>US</td><td> </td><td> </td><td> </td><td> </td><td> </td>
</tr><tr>
<td>CM</td><td></td><td> </td><td> </td><td> </td><td> </td><td> </td>
</tr>
</tbody></table>";
DataSet dataSet = HtmlTableParser.ParseDataSet(table);
StringWriter sw = new StringWriter();
dataSet.WriteXml(sw, XmlWriteMode.IgnoreSchema);
XmlDocument xd = new XmlDocument();
xd.LoadXml(sw.ToString());
string jsonText = JsonConvert.SerializeXmlNode(xd).Replace("_x0020_", " ");
}
/// <summary>
/// HtmlTableParser parses the contents of an html string into a System.Data DataSet or DataTable.
/// </summary>
public class HtmlTableParser
{
private const RegexOptions ExpressionOptions = RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnoreCase;
private const string CommentPattern = "<!--(.*?)-->";
private const string TablePattern = "<table[^>]*>(.*?)</table>";
private const string HeaderPattern = "<th[^>]*>(.*?)</th>";
private const string RowPattern = "<tr[^>]*>(.*?)</tr>";
private const string CellPattern = "<td[^>]*>(.*?)</td>";
/// <summary>
/// Given an HTML string containing n table tables, parse them into a DataSet containing n DataTables.
/// </summary>
/// <param name="html">An HTML string containing n HTML tables</param>
/// <returns>A DataSet containing a DataTable for each HTML table in the input HTML</returns>
public static DataSet ParseDataSet(string html)
{
DataSet dataSet = new DataSet();
MatchCollection tableMatches = Regex.Matches(
WithoutComments(html),
TablePattern,
ExpressionOptions);
foreach (Match tableMatch in tableMatches)
dataSet.Tables.Add(ParseTable(tableMatch.Value));
return dataSet;
}
/// <summary>
/// Given an HTML string containing a single table, parse that table to form a DataTable.
/// </summary>
/// <param name="tableHtml">An HTML string containing a single HTML table</param>
/// <returns>A DataTable which matches the input HTML table</returns>
public static DataTable ParseTable(string tableHtml)
{
string tableHtmlWithoutComments = WithoutComments(tableHtml);
DataTable dataTable = new DataTable();
MatchCollection rowMatches = Regex.Matches(
tableHtmlWithoutComments,
RowPattern,
ExpressionOptions);
dataTable.Columns.AddRange(tableHtmlWithoutComments.Contains("<th")
? ParseColumns(tableHtml)
: GenerateColumns(rowMatches));
ParseRows(rowMatches, dataTable);
return dataTable;
}
/// <summary>
/// Strip comments from an HTML stirng
/// </summary>
/// <param name="html">An HTML string potentially containing comments</param>
/// <returns>The input HTML string with comments removed</returns>
private static string WithoutComments(string html)
{
return Regex.Replace(html, CommentPattern, string.Empty, ExpressionOptions);
}
/// <summary>
/// Add a row to the input DataTable for each row match in the input MatchCollection
/// </summary>
/// <param name="rowMatches">A collection of all the rows to add to the DataTable</param>
/// <param name="dataTable">The DataTable to which we add rows</param>
private static void ParseRows(MatchCollection rowMatches, DataTable dataTable)
{
foreach (Match rowMatch in rowMatches)
{
// if the row contains header tags don't use it - it is a header not a row
if (!rowMatch.Value.Contains("<th"))
{
DataRow dataRow = dataTable.NewRow();
MatchCollection cellMatches = Regex.Matches(
rowMatch.Value,
CellPattern,
ExpressionOptions);
for (int columnIndex = 0; columnIndex < cellMatches.Count; columnIndex++)
dataRow[columnIndex] = cellMatches[columnIndex].Groups[1].ToString();
dataTable.Rows.Add(dataRow);
}
}
}
/// <summary>
/// Given a string containing an HTML table, parse the header cells to create a set of DataColumns
/// which define the columns in a DataTable.
/// </summary>
/// <param name="tableHtml">An HTML string containing a single HTML table</param>
/// <returns>A set of DataColumns based on the HTML table header cells</returns>
private static DataColumn[] ParseColumns(string tableHtml)
{
MatchCollection headerMatches = Regex.Matches(
tableHtml,
HeaderPattern,
ExpressionOptions);
return (from Match headerMatch in headerMatches
select new DataColumn(headerMatch.Groups[1].ToString())).ToArray();
}
/// <summary>
/// For tables which do not specify header cells we must generate DataColumns based on the number
/// of cells in a row (we assume all rows have the same number of cells).
/// </summary>
/// <param name="rowMatches">A collection of all the rows in the HTML table we wish to generate columns for</param>
/// <returns>A set of DataColumns based on the number of celss in the first row of the input HTML table</returns>
private static DataColumn[] GenerateColumns(MatchCollection rowMatches)
{
int columnCount = Regex.Matches(
rowMatches[0].ToString(),
CellPattern,
ExpressionOptions).Count;
return (from index in Enumerable.Range(0, columnCount)
select new DataColumn("Column " + Convert.ToString(index))).ToArray();
}
}
Upvotes: 1
Views: 6878
Reputation: 2562
This smells like very bad things. Why are using regular expressions to clean up HTML? The most famous answer ever on Stack Overflow pertains to this very thing. Do not do this.
Your requirements for parsing this HTML into this DataTable do not really make sense to me. What are you doing with the HTML table later than requires you to parse it into JSON now? From your question, it sounds like this is a fairly straightforward survey editor.
If you really need to parse this data into objects and store each individual field from this table into the database, please tell us why. It's possible to do something like this, but you I seriously urge you to reconsider parsing HTML.
Upvotes: 1