GenZ
GenZ

Reputation: 23

Retrieving all Excel file links from a webpage

I'm trying to get all the downloadable Excel file links from the website, but having difficulty. Please help to guide me. Thanks.

Sub TYEX()

    Dim internet As Object
    Dim internetdata As Object
    Dim div_result As Object
    Dim header_links As Object
    Dim link As Object
    Dim URL As String

    Set internet = CreateObject("InternetExplorer.Application")
    internet.Visible = True

    URL = "https://www.jpx.co.jp/markets/public/short-selling/index.html"
    internet.Navigate URL

    Do Until internet.ReadyState >= 4
        DoEvents
    Loop

    Application.Wait Now + TimeSerial(0, 0, 5)

    Set internetdata = internet.Document
    Set div_result = internetdata.getElementById("readArea")

    Set header_links = div_result.getElementsByTagName("td")

    For Each h In header_links
        Set link = h.ChildNodes.item(0)
        Cells(Range("A" & Rows.Count).End(xlUp).Row + 1, 1) = link.href
    Next

    MsgBox "done"
End Sub

Upvotes: 2

Views: 184

Answers (2)

QHarr
QHarr

Reputation: 84465

You can use an attribute = value CSS selector with $ operator to say the href value must end with .xls. Then use querySelectorAll to retrieve all matched results. Using CSS selectors is a very fast and generally robust method.

Dim list As Object
Set list = ie.document.querySelectorAll("[href$='.xls']")

It is much faster to use XMLHTTP as well, rather than opening IE. Note that you can then pass these links to a function to perform a binary download or to URLMon for downloading.

Option Explicit   
Public Sub Links()
    Dim sResponse As String, html As HTMLDocument, list As Object, i As Long

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://www.jpx.co.jp/markets/public/short-selling/index.html", False
        .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
    End With

    Set html = New HTMLDocument
    With html
        .body.innerHTML = sResponse
        Set list = html.querySelectorAll("[href$='.xls']")
    End With
    For i = 0 To list.Length - 1
        Debug.Print Replace$(list.item(i), "about:", "https://www.jpx.co.jp")
    Next
End Sub

Example download function (though you could re-use your existing XMLHTTP object - this is just to illustrate):

Public Function DownloadFile(ByVal downloadFolder As String, ByVal downloadURL As String) As String
    Dim http As Object , tempArr As Variant
    Set http =  CreateObject("WinHttp.WinHttpRequest.5.1")
    http.Open "GET", downloadURL, False
    http.send
    On Error GoTo errhand
    With CreateObject("ADODB.Stream")
        .Open
        .Type = 1
        .write http.responseBody
        tempArr = Split(downloadURL, "/")
        tempArr = tempArr(UBound(tempArr))
        .SaveToFile downloadFolder & tempArr, 2  '< "/" on enter of downloadFolder. 2 for overwrite which is Ok if no file modifications.
        .Close
    End With
    DownloadFile = downloadFolder & tempArr
    Exit Function
errhand:
    If Err.Number <> 0 Then
        Debug.Print Err.Number, Err.Description
        MsgBox "Download failed"
    End If
    DownloadFile = vbNullString
End Function

References (VBE > Tools > References):

  1. Microsoft HTML Object Library

Upvotes: 3

K.Dᴀᴠɪs
K.Dᴀᴠɪs

Reputation: 10139

You had the idea down correctly, but here's a different approach:

Sub TYEX()

    Dim ie As Object
    Set ie = CreateObject("InternetExplorer.Application")
    With ie
        .navigate "https://www.jpx.co.jp/markets/public/short-selling/index.html"
        .Visible = True

        Do While .Busy Or .readyState < 4
            DoEvents
        Loop

        Dim doc As Object, tbl As Object
        Set doc = .document
        Set tbl = doc.getElementsByClassName("component-normal-table")(0).Children(0)

        Dim r As Long, xlsArr(), a As Object

        With tbl.Rows
            ReDim xlsArr(1 To .Length - 1)
            For r = 1 To .Length - 1   ' 0 is the table header
                xlsArr(r) = .Item(r).Children(1).innerHTML
            Next r
        End With

        With CreateObject("VBScript.RegExp")
            .Pattern = "<a href=""(\/markets.*?\.xls)"
            For r = 1 To UBound(xlsArr)
                xlsArr(r) = "https://www.jpx.co.jp" & .Execute(xlsArr(r))(0).SubMatches(0)
                Debug.Print xlsArr(r)
            Next
        End With
    End With

    'Add to sheet
    Dim ws As Worksheet, rng As Range
    Set ws = ThisWorkbook.Worksheets(1)
    With ws
        Set rng = .Range(.Cells(NextRow(ws), 1), .Cells(NextRow(ws) + UBound( _
                xlsArr) - 1, 1))
        rng.Value = Application.Transpose(xlsArr)
    End With

End Sub

Public Function NextRow(ByVal ws As Worksheet, Optional ByVal col As Variant = 1) As Long
    With ws
        NextRow = .Cells(.Rows.Count, col).End(xlUp).Row + 1
    End With
End Function

Breaking Down the Code

This will loop your html table rows. We start at 1, because 0 is actually just the table header.

With tbl.Rows
    ReDim xlsArr(1 To .Length - 1)
    For r = 1 To .Length - 1   ' 0 is the table header
        xlsArr(r) = .Item(r).Children(1).innerHTML
    Next r
End With

This uses regular expressions to extract the url from the innerHTML property. You can see how this particular regex works here: Regex101

With CreateObject("VBScript.RegExp")
    .Pattern = "<a href=""(\/markets.*?\.xls)"
    For r = 1 To UBound(xlsArr)
        xlsArr(r) = "https://www.jpx.co.jp" & .Execute(xlsArr(r))(0).SubMatches(0)
        Debug.Print xlsArr(r)
    Next
End With

You will size your range to be the same size as your array that contains the links, then write the array to the worksheet. This is usually much faster than writing cells one-by-one.

'Add to sheet
Dim ws As Worksheet, rng As Range
Set ws = ThisWorkbook.Worksheets(1)
With ws
    Set rng = .Range(.Cells(NextRow(ws), 1), .Cells(NextRow(ws) + UBound( _
            xlsArr) - 1, 1))
    rng.Value = Application.Transpose(xlsArr)
End With

Upvotes: 4

Related Questions