Serge Atareev
Serge Atareev

Reputation: 43

Optimizing Scraping and Looping

I want to find a collection and loop over it to load each page. I am attempting to:

  1. Find the number of pages - it's done (column D)
  2. Loop over links (column A) and past last page number as i
    • I know that I can loop to the next page, from page 1, with:
      For i = 1 To "number of last page (column D)"

| https://voronezh.leroymerlin.ru/catalogue/krovelnye-mastiki/                          |       | -7    | 1     |
| https://voronezh.leroymerlin.ru/catalogue/vodostok/                                   |       | -125  | 2     |
| https://voronezh.leroymerlin.ru/catalogue/rozetki-i-vyklyuchateli/                    |       | -898  | 10    |
| https://voronezh.leroymerlin.ru/catalogue/ramki-dlya-rozetok-i-vyklyuchateley/        |       | -398  | 5     |
| https://voronezh.leroymerlin.ru/catalogue/nakladki-dlya-rozetok-i-vyklyuchateley/     |       | -35   | 1     |
| https://voronezh.leroymerlin.ru/catalogue/podrozetniki/                               |       | -11   | 1     |
| https://voronezh.leroymerlin.ru/catalogue/silovye-kabeli/                             |       | -175  | 2     |

enter image description here

I tried the following code to loop all urls, but it is not working correctly.

Sub get_data()
Dim wsSheet As Worksheet, REZULTSheet As Worksheet, Rows As Long, http As New XMLHTTP60, html As New HTMLDocument
Dim i As Integer, topic As HTMLHtmlElement, link As Variant, x As Long, num_pages As Variant, links As Variant
Set wsSheet = Sheets("URLs_2")
Set REZULTSheet = Sheets("Products")
Rows = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).row
links = wsSheet.Range("A1:A" & Rows)
num_pages = wsSheet.Range("D1:D" & Rows)
REZULTSheet.Select

For i = 1 To ??? 'num_pages?
    Application.ScreenUpdating = False
    With http
        For Each link In links
            .Open "GET", link & "?display=90&sortby=1&page=" & i, False
            .setRequestHeader "User-Agent", "Mozilla/5.0"
            .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
            .send
            Do: DoEvents: Loop Until .readyState = 4
            html.body.innerHTML = .responseText
            For Each topic In html.getElementsByClassName("ui-product-card__info")
                With topic.getElementsByClassName("product-name")
                    If .Length Then x = x + 1: Cells(x, 2) = .item(0).innerText
                End With
                With topic.getElementsByClassName("price-section-inner")
                    If .Length Then Cells(x, 3) = .item(0).innerText
                End With
                With topic.getElementsByClassName("madein__text") '
                    If .Length Then Cells(x, 1) = .item(1).innerText
                End With
            Next topic        
        Next link
    End With
Next i
End Sub

Since most of the code repeats, is there any way to run a loop to reduce the amount of code.

Current test file

Upvotes: 0

Views: 112

Answers (2)

QHarr
QHarr

Reputation: 84465

What you really want to do is a re-factor (which I may add in with time) to reduce code complexity. For now, to fix your looping problem see how you can generate two 1D arrays containing urls and page counts and then use 1 loop over page counts and index into the other array using the current loop counter value.

Get rid of auto-instantiation, qualify with class names, use Long rather than Integer, worksheets rather than sheets.

In a re-factor work with arrays so as to write out faster to sheet.

Option Explicit

Public Sub GetData()
    Dim wsSheet As Worksheet, rezultSheet As Worksheet, rowCount As Long
    Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
    Dim i As Long, topic As MSHTML.HTMLHtmlElement
    Dim x As Long, pageCounts(), numPages As Long, page As Long, links()

    Application.ScreenUpdating = False

    On Error GoTo errHand

    Set http = New MSXML2.XMLHTTP60
    Set html = New MSHTML.HTMLDocument
    Set wsSheet = ThisWorkbook.Worksheets("URLs_2")
    Set rezultSheet = ThisWorkbook.Worksheets("Products")

    rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
    links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
    pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))

    For i = LBound(pageCounts) To UBound(pageCounts)
        numPages = pageCounts(i)

        With http
            For page = 1 To numPages
                .Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
                .setRequestHeader "User-Agent", "Mozilla/5.0"
                .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
                .send

                html.body.innerHTML = .responseText

                For Each topic In html.getElementsByClassName("ui-product-card__info")
                    x = x + 1
                    With topic.getElementsByClassName("product-name")
                        If .Length Then rezultSheet.Cells(x, 2) = .Item(0).innerText
                    End With
                    With topic.getElementsByClassName("price-section-inner")
                        If .Length Then rezultSheet.Cells(x, 3) = .Item(0).innerText
                    End With
                    With topic.getElementsByClassName("madein__text") '
                        If .Length Then rezultSheet.Cells(x, 1) = .Item(1).innerText
                    End With
                Next topic

                html.body.innerHTML = vbNullString
            Next
        End With
    Next
errHand:
    Application.ScreenUpdating = True
End Sub

Re-factoring (not my best I'm afraid but a starting point. You'll want some lower level error handling for example). It is more code but starts to allocate separate logical tasks into their own sub/func:

Option Explicit
Public wsSheet As Worksheet, rezultSheet As Worksheet

Public Sub GetData()

    Dim http As MSXML2.XMLHTTP60, rowCount As Long, pageCounts(), links()

    Application.ScreenUpdating = False

    On Error GoTo errHand

    Set http = New MSXML2.XMLHTTP60
    Set wsSheet = Sheets("URLs_2")
    Set rezultSheet = ThisWorkbook.Worksheets("Products")

    With rezultSheet.Cells
        .ClearContents
        .ClearFormats
    End With

    rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
    links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
    pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))

    GetResults http, pageCounts, links

errHand:
    Application.ScreenUpdating = True
    If Err.Number <> 0 Then Debug.Print Err.Number, Err.Description
End Sub


Public Sub GetResults(ByVal http As MSXML2.XMLHTTP60, ByRef pageCounts(), ByRef links())
    Dim i As Long, numPages As Long
    Dim page As Long, html As MSHTML.HTMLDocument

    Set html = New MSHTML.HTMLDocument

    For i = LBound(pageCounts) To UBound(pageCounts)
        numPages = pageCounts(i)

        With http
            For page = 1 To numPages
                .Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
                .setRequestHeader "User-Agent", "Mozilla/5.0"
                .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
                .send

                html.body.innerHTML = .responseText

                WriteOutResults html

                html.body.innerHTML = vbNullString
            Next
        End With
    Next
End Sub


Public Sub WriteOutResults(ByVal html As MSHTML.HTMLDocument)
    Dim topic As MSHTML.HTMLHtmlElement, results()
    Dim r As Long, productCards As Object

    Set productCards = html.getElementsByClassName("ui-product-card__info")

    ReDim results(1 To productCards.Length, 1 To 3)

    For Each topic In productCards
        r = r + 1
        With topic.getElementsByClassName("product-name")
            If .Length Then results(r, 2) = .Item(0).innerText
        End With
        With topic.getElementsByClassName("price-section-inner")
            If .Length Then results(r, 3) = .Item(0).innerText
        End With
        With topic.getElementsByClassName("madein__text") '
            If .Length Then results(r, 1) = .Item(1).innerText
        End With
    Next topic

    Dim lastRow As Long

    lastRow = GetLastRow(rezultSheet)
    rezultSheet.Cells(lastRow + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub

Public Function GetLastRow(ByVal sh As Worksheet) As Long
    On Error Resume Next
    GetLastRow = sh.Cells.Find(What:="*", _
                               After:=sh.Range("A1"), _
                               Lookat:=xlPart, _
                               LookIn:=xlFormulas, _
                               SearchOrder:=xlByRows, _
                               SearchDirection:=xlPrevious, _
                               MatchCase:=False).Row
    On Error GoTo 0
End Function

Upvotes: 2

FaneDuru
FaneDuru

Reputation: 42236

Code adapting @QHar solution, in order to work only in computer memory...

Option Explicit

Private Sub GetDataInMemory()
    Dim wsSheet As Worksheet, rezultSheet As Worksheet, rowCount As Long
    Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
    Dim i As Long, topic As MSHTML.HTMLHtmlElement
    Dim x As Long, pageCounts(), numPages As Long, page As Long, links()
    Dim Data() As String, k As Long

    Set http = New MSXML2.XMLHTTP60
    Set html = New MSHTML.HTMLDocument
    Set wsSheet = ThisWorkbook.Worksheets("URLs_2")
    Set rezultSheet = ThisWorkbook.Worksheets("Products")

    rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
    links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
    pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))

    ReDim Data(2, k)
    For i = LBound(pageCounts) To UBound(pageCounts)
        numPages = pageCounts(i)

        With http
            For page = 1 To numPages
                .Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
                .setRequestHeader "User-Agent", "Mozilla/5.0"
                .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
                .send
                html.body.innerHTML = .responseText
                For Each topic In html.getElementsByClassName("ui-product-card__info")
                    With topic.getElementsByClassName("product-name")
                        If .Length Then Data(1, x) = .Item(0).innerText
                    End With
                    With topic.getElementsByClassName("price-section-inner")
                        If .Length Then Data(2, x) = .Item(0).innerText
                    End With
                    With topic.getElementsByClassName("madein__text")
                        If .Length Then Data(0, x) = .Item(0).innerText
                    End With
                    x = x + 1: ReDim Preserve Data(2, x)
                Next topic

                html.body.innerHTML = vbNullString
            Next
        End With
    Next
    rezultSheet.Range("A1:C" & x - 1).Value = Application.Transpose(Data)
End Sub

Upvotes: 2

Related Questions