TCritical
TCritical

Reputation: 71

VBA: Web scraping with <ul and <li and <div and <span

I am using VBA to extract the data from HTML in <span code which is under <Div which is under <li which is under <ul

I'm trying to extract the "date and matter" from HTML. "Date" should be in A column and "Matter" should be in B Column in Excel.

The drawback of my code is, it is pulling all the Date and matter into single cell.

Sub GetDat()
    Dim IE As New InternetExplorer, html As HTMLDocument
    Dim elem As Object, data As String

    With IE
        .Visible = True
        .navigate "https://www.MyURL/sc/wo/Worders/index?id=76888564"
        Do While .readyState <> READYSTATE_COMPLETE: Loop
        Set html = .document
    End With

    data = ""

    For Each elem In html.getElementsByClassName("simple-list")(0).getElementsByTagName("li")
        data = data & " " & elem.innerText
    Next elem

    Range("A1").Value = data

    IE.Quit
End Sub

The output that I need is shown in the image:

HTML:

Upvotes: 3

Views: 1360

Answers (1)

QHarr
QHarr

Reputation: 84465

You could grab two nodeLists, one for dates and one for matters, and then loop those writing out to sheet. Match dates based on data-bind attribute value; matters on classname:

Dim dates As Object, matters As Object, i As Long, ws As Worksheet

Set ws = ThisWorkbook.Worksheets("Sheet1")
Set dates = ie.document.querySelectorAll("[data-bind^='text:createdDate']") '.wo-notes-col-1 [data-bind^='text:createdDate']
Set matters = ie.document.querySelectorAll(".wo-notes")

With ws

    For i = 0 To dates.Length - 1
        .Cells(i + 1, 1) = dates.Item(i).innertext
        .Cells(i + 1, 2) = matters.Item(i).innertext
    Next

End With

Example reading values from column C:

Option Explicit

Public Sub GetMatters()
    Dim ws As Worksheet, lastRow As Long, urls(), results(), ie As SHDocVw.InternetExplorer, r As Long

    Set ie = New SHDocVw.InternetExplorer
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    lastRow = ws.Cells(ws.Rows.Count, "C").End(xlUp).Row
    urls = Application.Transpose(ws.Range("C2:C" & lastRow).Value)
    ReDim results(1 To 1000, 1 To 2)

    With ie
        .Visible = True

        For i = LBound(urls) To UBound(urls)
            .navigate2 "https://www.MyURL/sc/wo/Worders/index?id=" & urls(i)
            While .Busy Or .readyState <> 4: DoEvents: Wend

            Dim dates As Object, matters As Object, i As Long

            Set dates = .document.querySelectorAll("[data-bind^='text:createdDate']") '.wo-notes-col-1 [data-bind^='text:createdDate']
            Set matters = .document.querySelectorAll(".wo-notes")

            For i = 0 To dates.Length - 1
                r = r + 1
                results(r, 1) = dates.Item(i).innertext
                results(r, 2) = matters.Item(i).innertext
            Next
            Set dates = Nothing: Set matter = dates
        Next
        .Quit
    End With

    ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub

References:

  1. document.querySelectorAll
  2. css selectors

Upvotes: 1

Related Questions