Reputation: 403
I am using this script to get text data of a webpage with Microsoft Excel, however, it returns only the text, but I want to get the hyperlink in a separated column. Could you please help me? It seems the command returns only text data, but I'm looking for saving the text and the corresponded URL, as text (of course not a hyperlink!).
I reviewed https://msdn.microsoft.com/en-us/library/office/ff836520.aspx but I could not find anything.
You may see the webpage with the provided url in the code.
Sub SaveUrl()
Set shFirstQtr = Workbooks(1).Worksheets(1)
Set qtQtrResults = shFirstQtr.QueryTables _
.Add(Connection:="URL;http://www.tsetmc.com/Loader.aspx?ParTree=111C1417", _
Destination:=shFirstQtr.Cells(1, 1))
With qtQtrResults
.WebFormatting = xlNone
.WebSelectionType = xlSpecifiedTables
.WebTables = "1"
.Refresh
End With
End Sub
Upvotes: 2
Views: 386
Reputation: 12602
Here are the examples showing how to automate IE and retrieve the necessary data from DOM (run TestIE()
), and to make request with XHR and parse response with RegEx (run TestXHR()
):
Option Explicit
' The code to automate IE and retrieve the necessary data from DOM
Sub TestIE()
Dim aText() As Variant
Dim aHref() As Variant
Dim aHrefExists() As Boolean
Dim aRes() As Variant
Dim lRowsCount As Long
Dim lCellsCount As Long
Dim i As Long
Dim j As Long
Dim lCellsTotal As Long
Dim x As Long
With CreateObject("InternetExplorer.Application")
' Make visible for debug
.Visible = True
' Navigate to page
.Navigate "http://www.tsetmc.com/Loader.aspx?ParTree=111C1417"
' Wait for IE ready
Do While .ReadyState <> 4 Or .Busy
DoEvents
Loop
' Wait for document complete
Do While .Document.ReadyState <> "complete"
DoEvents
Loop
' Wait for target table accessible
Do While TypeName(.Document.getElementById("tblToGrid")) = "Null"
DoEvents
Loop
' Process target table
With .Document.getElementById("tblToGrid")
' Get table size
lRowsCount = .Rows.Length
lCellsCount = .Rows(0).Cells.Length
' Create 2d arrays for texts and hyperlinks values, and for column url existance flag
ReDim aText(1 To lRowsCount, 1 To lCellsCount)
ReDim aHref(1 To lRowsCount, 1 To lCellsCount)
ReDim aHrefExists(1 To lCellsCount)
' Process each table row
For i = 1 To lRowsCount
With .Rows(i - 1)
' Process each cell
For j = 1 To lCellsCount
' Retrieve text content
aText(i, j) = .Cells(j - 1).innerText
' Retrieve hyperlink if exists
With .Cells(j - 1).getElementsByTagName("a")
If .Length = 1 Then
aHrefExists(j) = True
aHref(i, j) = .Item(0).href
End If
End With
Next
End With
Next
End With
.Quit
End With
' Create resulting array that includes texts and urls
lCellsTotal = lCellsCount
For j = 1 To lCellsCount
If aHrefExists(j) Then lCellsTotal = lCellsTotal + 1
Next
ReDim aRes(1 To lRowsCount, 1 To lCellsTotal)
' Populate array with texts and urls
x = 1
For j = 1 To lCellsCount
For i = 1 To lRowsCount
aRes(i, x) = aText(i, j)
Next
x = x + 1
If aHrefExists(j) Then
For i = 1 To lRowsCount
aRes(i, x) = aHref(i, j)
Next
x = x + 1
End If
Next
' Result output to sheet 1
With Sheets(1)
.Cells.Delete
Output .Cells(1, 1), aRes
End With
End Sub
' The code to make request with XHR and parse response with RegEx
Sub TestXHR()
Dim sRespText As String
Dim oRERows As Object
Dim oRECells As Object
Dim aRes() As Variant
Dim lRowsCount As Long
Dim lCellsCount As Long
Dim i As Long
Dim j As Long
Dim lCellsTotal As Long
Dim x As Long
' Retrieve HTML content
With CreateObject("MSXML2.XMLHttp")
.Open "GET", "http://www.tsetmc.com/Loader.aspx?ParTree=111C1417", False
.Send
sRespText = .responseText
End With
' Regular expression for table rows setup
Set oRERows = CreateObject("VBScript.RegExp")
With oRERows
.Global = True
.MultiLine = True
.IgnoreCase = True
.Pattern = "<tr.*?>[\s\S]*?</tr>"
End With
' Regular expression for table cells setup
Set oRECells = CreateObject("VBScript.RegExp")
With oRECells
.Global = True
.MultiLine = True
.IgnoreCase = True
.Pattern = "<td.*?>(?:.*?<a.*?href=(""|')(.*?)\1.*?>(.*?)</a>.*?|(.*?))</td>"
End With
' Execute 1st regexp on response
With oRERows.Execute(sRespText)
' Get table size
lRowsCount = .Count
lCellsCount = oRECells.Execute(.Item(0).Value).Count
' Create 2d arrays for texts and hyperlinks values, and for column url existance flag
ReDim aText(1 To lRowsCount, 1 To lCellsCount)
ReDim aHref(1 To lRowsCount, 1 To lCellsCount)
ReDim aHrefExists(1 To lCellsCount)
' Process each table row
For i = 1 To lRowsCount
' Get 1st regexp match value, and execute 2nd regexp on it
With oRECells.Execute(.Item(i - 1).Value)
' Process each cell
For j = 1 To .Count
With .Item(j - 1)
If .SubMatches(3) <> "" Then
' Retrieve text content only
aText(i, j) = .SubMatches(3)
Else
' Retrieve text content and hyperlink
aText(i, j) = .SubMatches(2)
aHref(i, j) = "http://www.tsetmc.com/" & .SubMatches(1)
aHrefExists(j) = True
End If
End With
Next
End With
Next
End With
' Create resulting array that includes texts and urls
lCellsTotal = lCellsCount
For j = 1 To lCellsCount
If aHrefExists(j) Then lCellsTotal = lCellsTotal + 1
Next
ReDim aRes(1 To lRowsCount, 1 To lCellsTotal)
' Populate array with texts and urls
x = 1
For j = 1 To lCellsCount
For i = 1 To lRowsCount
aRes(i, x) = aText(i, j)
Next
x = x + 1
If aHrefExists(j) Then
For i = 1 To lRowsCount
aRes(i, x) = aHref(i, j)
Next
x = x + 1
End If
Next
' Result output to sheet 2
With Sheets(2)
.Cells.Delete
Output .Cells(1, 1), aRes
End With
End Sub
' Utility section
Sub Output(objDstRng As Range, arrCells As Variant)
With objDstRng
.Parent.Select
With .Resize( _
UBound(arrCells, 1) - LBound(arrCells, 1) + 1, _
UBound(arrCells, 2) - LBound(arrCells, 2) + 1)
.NumberFormat = "@"
.Value = arrCells
.Columns.AutoFit
End With
End With
End Sub
Both methods gives the same result (on sheet 1 and 2):
Upvotes: 1