john
john

Reputation: 707

How to validate html table string using VBA?

My VBA string is an HTML table.

Let myTable = "<table><tr><th>LocationID</th><th>Lastname</th><th>Age</th>  </tr>  <tr>    <td>1234</td>    <td>Smith</td>    <td>50</td>  </tr>  <tr>    <td>1234</td>    <td>Jackson</td>    <td>94</td>  </tr><tr>    <td>1237</td>    <td>Doe</td>    <td>80</td>  </tr></table> 

Visually looks like this

LocationID   Lastname   Age
1234         Smith      50
1234         Jackson    94
1237         Doe        80

I need to make sure this string has the same LocationID for each row. How can I parse this string and use VBA to assert the first column has the same values? Is there a build in parser for html in VBA

Upvotes: 0

Views: 455

Answers (2)

Tim Williams
Tim Williams

Reputation: 166341

Here's an example approach:

Sub Tester()
    Dim arr
    With ActiveSheet
        'parse HTML (stored in a worksheet cell)
        arr = HTMLTableToArray(.Range("A1").Value)
        'put the array onto the sheet
        .Range("B3").Resize(UBound(arr, 1), UBound(arr, 2)).Value = arr
    End With
End Sub


'convert a regular HTML table (no merged cells etc) to a 2D array
Function HTMLTableToArray(HTMLText As String)
    Dim o As New HTMLDocument, tbl
    Dim nr As Long, nc As Long, r As Long, c As Long
    o.body.innerHTML = HTMLText
    DoEvents
    Set tbl = o.getElementsByTagName("table")(0)
    nr = tbl.Rows.Length
    nc = tbl.Rows(0).Cells.Length
    ReDim arr(1 To nr, 1 To nc) 'size output array
    'loop over rows and cells and fill the array
    For r = 1 To nr
        For c = 1 To nc
            arr(r, c) = tbl.Rows(r - 1).Cells(c - 1).innerText
        Next c
    Next r
    HTMLTableToArray = arr
End Function

Note: this assumes your HTML string is valid and contains a complete HTML table with no merged cells (ie. no use of rowspan or colspan)

Upvotes: 2

QHarr
QHarr

Reputation: 84465

You could read html from file into MSHTML.HTMLDocument using Microsoft HTML Object Library reference then gather a nodeList of all the rows. Assuming there are headers in first row then loop the the other rows and test the value of the firstChild of each row.

Option Explicit

Public Sub test()
    Dim html As MSHTML.HTMLDocument, firstColumnSecondRow As String, rows As Object, i As Long

    Set html = GetHTMLFileContent("C:\Users\User\Desktop\test.html")
    Set rows = html.querySelectorAll("tr")

    Select Case rows.Length

    Case 1
        Debug.Print False
        Exit Sub
    Case 2
        Debug.Print True
    Case Is >= 3
        firstColumnSecondRow = rows.item(1).firstChild.innerText

        For i = 3 To rows.Length - 1
            If rows.item(i).firstChild.innerText <> firstColumnSecondRow Then
                Debug.Print False
                Exit Sub
            End If
        Next
        Debug.Print True
    End Select
End Sub

Public Function GetHTMLFileContent(ByVal filePath As String) As MSHTML.HTMLDocument
    '"C:\Users\User\Desktop\test.html"
    Dim fso As Object, hFile As Object, hString As String, html As MSHTML.HTMLDocument
    Set html = New MSHTML.HTMLDocument
    Set fso = CreateObject("Scripting.FileSystemObject")
    Set hFile = fso.OpenTextFile(filePath)

    Do Until hFile.AtEndOfStream
        hString = hFile.ReadAll()
    Loop

    html.body.innerHTML = hString
    Set GetHTMLFileContent = html
End Function

Public Function GetHTMLFromFile(ByVal url As String) As String
    Dim fso As Object, f As Object, outputString As String

    Set fso = CreateObject("Scripting.FileSystemObject")
    Set f = fso.OpenTextFile("C:\Users\HarrisQ\Desktop\HTML.txt", 1)

    Do Until f.AtEndOfStream
        outputString = f.ReadAll()
    Loop
    f.Close

    GetHTMLFromFile = outputString
End Function

If you have the string already you can just directly assign to html document with

Set html = New MSHTML.HTMLDocument
html.body.innerHTML = yourTableString ' no need for function call to read html and return HTMLDocument

Upvotes: 2

Related Questions