Yousef
Yousef

Reputation: 403

Get Webpage table with hyperlinks and tables with VB in Excel

I am using this script to get text data of a webpage with Microsoft Excel, however, it returns only the text, but I want to get the hyperlink in a separated column. Could you please help me? It seems the command returns only text data, but I'm looking for saving the text and the corresponded URL, as text (of course not a hyperlink!).

I reviewed https://msdn.microsoft.com/en-us/library/office/ff836520.aspx but I could not find anything.

You may see the webpage with the provided url in the code.

Sub SaveUrl()
    Set shFirstQtr = Workbooks(1).Worksheets(1)
    Set qtQtrResults = shFirstQtr.QueryTables _
                       .Add(Connection:="URL;http://www.tsetmc.com/Loader.aspx?ParTree=111C1417", _
                            Destination:=shFirstQtr.Cells(1, 1))
    With qtQtrResults
        .WebFormatting = xlNone
        .WebSelectionType = xlSpecifiedTables
        .WebTables = "1"
        .Refresh
    End With
End Sub

Upvotes: 2

Views: 386

Answers (1)

omegastripes
omegastripes

Reputation: 12602

Here are the examples showing how to automate IE and retrieve the necessary data from DOM (run TestIE()), and to make request with XHR and parse response with RegEx (run TestXHR()):

Option Explicit

' The code to automate IE and retrieve the necessary data from DOM

Sub TestIE()

    Dim aText() As Variant
    Dim aHref() As Variant
    Dim aHrefExists() As Boolean
    Dim aRes() As Variant
    Dim lRowsCount As Long
    Dim lCellsCount As Long
    Dim i As Long
    Dim j As Long
    Dim lCellsTotal As Long
    Dim x As Long

    With CreateObject("InternetExplorer.Application")
        ' Make visible for debug
        .Visible = True
        ' Navigate to page
        .Navigate "http://www.tsetmc.com/Loader.aspx?ParTree=111C1417"
        ' Wait for IE ready
        Do While .ReadyState <> 4 Or .Busy
            DoEvents
        Loop
        ' Wait for document complete
        Do While .Document.ReadyState <> "complete"
            DoEvents
        Loop
        ' Wait for target table accessible
        Do While TypeName(.Document.getElementById("tblToGrid")) = "Null"
            DoEvents
        Loop
        ' Process target table
        With .Document.getElementById("tblToGrid")
            ' Get table size
            lRowsCount = .Rows.Length
            lCellsCount = .Rows(0).Cells.Length
            ' Create 2d arrays for texts and hyperlinks values, and for column url existance flag
            ReDim aText(1 To lRowsCount, 1 To lCellsCount)
            ReDim aHref(1 To lRowsCount, 1 To lCellsCount)
            ReDim aHrefExists(1 To lCellsCount)
            ' Process each table row
            For i = 1 To lRowsCount
                With .Rows(i - 1)
                    ' Process each cell
                    For j = 1 To lCellsCount
                        ' Retrieve text content
                        aText(i, j) = .Cells(j - 1).innerText
                        ' Retrieve hyperlink if exists
                        With .Cells(j - 1).getElementsByTagName("a")
                            If .Length = 1 Then
                                aHrefExists(j) = True
                                aHref(i, j) = .Item(0).href
                            End If
                        End With
                    Next
                End With
            Next
        End With
        .Quit
    End With
    ' Create resulting array that includes texts and urls
    lCellsTotal = lCellsCount
    For j = 1 To lCellsCount
        If aHrefExists(j) Then lCellsTotal = lCellsTotal + 1
    Next
    ReDim aRes(1 To lRowsCount, 1 To lCellsTotal)
    ' Populate array with texts and urls
    x = 1
    For j = 1 To lCellsCount
        For i = 1 To lRowsCount
            aRes(i, x) = aText(i, j)
        Next
        x = x + 1
        If aHrefExists(j) Then
            For i = 1 To lRowsCount
                aRes(i, x) = aHref(i, j)
            Next
            x = x + 1
        End If
    Next
    ' Result output to sheet 1
    With Sheets(1)
        .Cells.Delete
        Output .Cells(1, 1), aRes
    End With
End Sub

' The code to make request with XHR and parse response with RegEx

Sub TestXHR()

    Dim sRespText As String
    Dim oRERows As Object
    Dim oRECells As Object
    Dim aRes() As Variant
    Dim lRowsCount As Long
    Dim lCellsCount As Long
    Dim i As Long
    Dim j As Long
    Dim lCellsTotal As Long
    Dim x As Long

    ' Retrieve HTML content
    With CreateObject("MSXML2.XMLHttp")
        .Open "GET", "http://www.tsetmc.com/Loader.aspx?ParTree=111C1417", False
        .Send
        sRespText = .responseText
    End With
    ' Regular expression for table rows setup
    Set oRERows = CreateObject("VBScript.RegExp")
    With oRERows
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = "<tr.*?>[\s\S]*?</tr>"
    End With
    ' Regular expression for table cells setup
    Set oRECells = CreateObject("VBScript.RegExp")
    With oRECells
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = "<td.*?>(?:.*?<a.*?href=(""|')(.*?)\1.*?>(.*?)</a>.*?|(.*?))</td>"
    End With
    ' Execute 1st regexp on response
    With oRERows.Execute(sRespText)
        ' Get table size
        lRowsCount = .Count
        lCellsCount = oRECells.Execute(.Item(0).Value).Count
        ' Create 2d arrays for texts and hyperlinks values, and for column url existance flag
        ReDim aText(1 To lRowsCount, 1 To lCellsCount)
        ReDim aHref(1 To lRowsCount, 1 To lCellsCount)
        ReDim aHrefExists(1 To lCellsCount)
        ' Process each table row
        For i = 1 To lRowsCount
            ' Get 1st regexp match value, and execute 2nd regexp on it
            With oRECells.Execute(.Item(i - 1).Value)
            ' Process each cell
            For j = 1 To .Count
                With .Item(j - 1)
                    If .SubMatches(3) <> "" Then
                        ' Retrieve text content only
                        aText(i, j) = .SubMatches(3)
                    Else
                        ' Retrieve text content and hyperlink
                        aText(i, j) = .SubMatches(2)
                        aHref(i, j) = "http://www.tsetmc.com/" & .SubMatches(1)
                        aHrefExists(j) = True
                    End If
                End With
            Next
            End With
        Next
    End With
    ' Create resulting array that includes texts and urls
    lCellsTotal = lCellsCount
    For j = 1 To lCellsCount
        If aHrefExists(j) Then lCellsTotal = lCellsTotal + 1
    Next
    ReDim aRes(1 To lRowsCount, 1 To lCellsTotal)
    ' Populate array with texts and urls
    x = 1
    For j = 1 To lCellsCount
        For i = 1 To lRowsCount
            aRes(i, x) = aText(i, j)
        Next
        x = x + 1
        If aHrefExists(j) Then
            For i = 1 To lRowsCount
                aRes(i, x) = aHref(i, j)
            Next
            x = x + 1
        End If
    Next
    ' Result output to sheet 2
    With Sheets(2)
        .Cells.Delete
        Output .Cells(1, 1), aRes
    End With

End Sub

' Utility section

Sub Output(objDstRng As Range, arrCells As Variant)
    With objDstRng
        .Parent.Select
        With .Resize( _
                UBound(arrCells, 1) - LBound(arrCells, 1) + 1, _
                UBound(arrCells, 2) - LBound(arrCells, 2) + 1)
            .NumberFormat = "@"
            .Value = arrCells
            .Columns.AutoFit
        End With
    End With
End Sub

Both methods gives the same result (on sheet 1 and 2):

result

Upvotes: 1

Related Questions