Reputation: 43
I want to find a collection and loop over it to load each page. I am attempting to:
i
For i = 1 To "number of last page (column D)"
| https://voronezh.leroymerlin.ru/catalogue/krovelnye-mastiki/ | | -7 | 1 |
| https://voronezh.leroymerlin.ru/catalogue/vodostok/ | | -125 | 2 |
| https://voronezh.leroymerlin.ru/catalogue/rozetki-i-vyklyuchateli/ | | -898 | 10 |
| https://voronezh.leroymerlin.ru/catalogue/ramki-dlya-rozetok-i-vyklyuchateley/ | | -398 | 5 |
| https://voronezh.leroymerlin.ru/catalogue/nakladki-dlya-rozetok-i-vyklyuchateley/ | | -35 | 1 |
| https://voronezh.leroymerlin.ru/catalogue/podrozetniki/ | | -11 | 1 |
| https://voronezh.leroymerlin.ru/catalogue/silovye-kabeli/ | | -175 | 2 |
I tried the following code to loop all urls, but it is not working correctly.
Sub get_data()
Dim wsSheet As Worksheet, REZULTSheet As Worksheet, Rows As Long, http As New XMLHTTP60, html As New HTMLDocument
Dim i As Integer, topic As HTMLHtmlElement, link As Variant, x As Long, num_pages As Variant, links As Variant
Set wsSheet = Sheets("URLs_2")
Set REZULTSheet = Sheets("Products")
Rows = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).row
links = wsSheet.Range("A1:A" & Rows)
num_pages = wsSheet.Range("D1:D" & Rows)
REZULTSheet.Select
For i = 1 To ??? 'num_pages?
Application.ScreenUpdating = False
With http
For Each link In links
.Open "GET", link & "?display=90&sortby=1&page=" & i, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
Do: DoEvents: Loop Until .readyState = 4
html.body.innerHTML = .responseText
For Each topic In html.getElementsByClassName("ui-product-card__info")
With topic.getElementsByClassName("product-name")
If .Length Then x = x + 1: Cells(x, 2) = .item(0).innerText
End With
With topic.getElementsByClassName("price-section-inner")
If .Length Then Cells(x, 3) = .item(0).innerText
End With
With topic.getElementsByClassName("madein__text") '
If .Length Then Cells(x, 1) = .item(1).innerText
End With
Next topic
Next link
End With
Next i
End Sub
Since most of the code repeats, is there any way to run a loop to reduce the amount of code.
Upvotes: 0
Views: 112
Reputation: 84465
What you really want to do is a re-factor (which I may add in with time) to reduce code complexity. For now, to fix your looping problem see how you can generate two 1D arrays containing urls and page counts and then use 1 loop over page counts and index into the other array using the current loop counter value.
Get rid of auto-instantiation, qualify with class names, use Long rather than Integer, worksheets rather than sheets.
In a re-factor work with arrays so as to write out faster to sheet.
Option Explicit
Public Sub GetData()
Dim wsSheet As Worksheet, rezultSheet As Worksheet, rowCount As Long
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Dim i As Long, topic As MSHTML.HTMLHtmlElement
Dim x As Long, pageCounts(), numPages As Long, page As Long, links()
Application.ScreenUpdating = False
On Error GoTo errHand
Set http = New MSXML2.XMLHTTP60
Set html = New MSHTML.HTMLDocument
Set wsSheet = ThisWorkbook.Worksheets("URLs_2")
Set rezultSheet = ThisWorkbook.Worksheets("Products")
rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))
For i = LBound(pageCounts) To UBound(pageCounts)
numPages = pageCounts(i)
With http
For page = 1 To numPages
.Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
html.body.innerHTML = .responseText
For Each topic In html.getElementsByClassName("ui-product-card__info")
x = x + 1
With topic.getElementsByClassName("product-name")
If .Length Then rezultSheet.Cells(x, 2) = .Item(0).innerText
End With
With topic.getElementsByClassName("price-section-inner")
If .Length Then rezultSheet.Cells(x, 3) = .Item(0).innerText
End With
With topic.getElementsByClassName("madein__text") '
If .Length Then rezultSheet.Cells(x, 1) = .Item(1).innerText
End With
Next topic
html.body.innerHTML = vbNullString
Next
End With
Next
errHand:
Application.ScreenUpdating = True
End Sub
Re-factoring (not my best I'm afraid but a starting point. You'll want some lower level error handling for example). It is more code but starts to allocate separate logical tasks into their own sub/func:
Option Explicit
Public wsSheet As Worksheet, rezultSheet As Worksheet
Public Sub GetData()
Dim http As MSXML2.XMLHTTP60, rowCount As Long, pageCounts(), links()
Application.ScreenUpdating = False
On Error GoTo errHand
Set http = New MSXML2.XMLHTTP60
Set wsSheet = Sheets("URLs_2")
Set rezultSheet = ThisWorkbook.Worksheets("Products")
With rezultSheet.Cells
.ClearContents
.ClearFormats
End With
rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))
GetResults http, pageCounts, links
errHand:
Application.ScreenUpdating = True
If Err.Number <> 0 Then Debug.Print Err.Number, Err.Description
End Sub
Public Sub GetResults(ByVal http As MSXML2.XMLHTTP60, ByRef pageCounts(), ByRef links())
Dim i As Long, numPages As Long
Dim page As Long, html As MSHTML.HTMLDocument
Set html = New MSHTML.HTMLDocument
For i = LBound(pageCounts) To UBound(pageCounts)
numPages = pageCounts(i)
With http
For page = 1 To numPages
.Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
html.body.innerHTML = .responseText
WriteOutResults html
html.body.innerHTML = vbNullString
Next
End With
Next
End Sub
Public Sub WriteOutResults(ByVal html As MSHTML.HTMLDocument)
Dim topic As MSHTML.HTMLHtmlElement, results()
Dim r As Long, productCards As Object
Set productCards = html.getElementsByClassName("ui-product-card__info")
ReDim results(1 To productCards.Length, 1 To 3)
For Each topic In productCards
r = r + 1
With topic.getElementsByClassName("product-name")
If .Length Then results(r, 2) = .Item(0).innerText
End With
With topic.getElementsByClassName("price-section-inner")
If .Length Then results(r, 3) = .Item(0).innerText
End With
With topic.getElementsByClassName("madein__text") '
If .Length Then results(r, 1) = .Item(1).innerText
End With
Next topic
Dim lastRow As Long
lastRow = GetLastRow(rezultSheet)
rezultSheet.Cells(lastRow + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub
Public Function GetLastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
GetLastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function
Upvotes: 2
Reputation: 42236
Code adapting @QHar solution, in order to work only in computer memory...
Option Explicit
Private Sub GetDataInMemory()
Dim wsSheet As Worksheet, rezultSheet As Worksheet, rowCount As Long
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Dim i As Long, topic As MSHTML.HTMLHtmlElement
Dim x As Long, pageCounts(), numPages As Long, page As Long, links()
Dim Data() As String, k As Long
Set http = New MSXML2.XMLHTTP60
Set html = New MSHTML.HTMLDocument
Set wsSheet = ThisWorkbook.Worksheets("URLs_2")
Set rezultSheet = ThisWorkbook.Worksheets("Products")
rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))
ReDim Data(2, k)
For i = LBound(pageCounts) To UBound(pageCounts)
numPages = pageCounts(i)
With http
For page = 1 To numPages
.Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
html.body.innerHTML = .responseText
For Each topic In html.getElementsByClassName("ui-product-card__info")
With topic.getElementsByClassName("product-name")
If .Length Then Data(1, x) = .Item(0).innerText
End With
With topic.getElementsByClassName("price-section-inner")
If .Length Then Data(2, x) = .Item(0).innerText
End With
With topic.getElementsByClassName("madein__text")
If .Length Then Data(0, x) = .Item(0).innerText
End With
x = x + 1: ReDim Preserve Data(2, x)
Next topic
html.body.innerHTML = vbNullString
Next
End With
Next
rezultSheet.Range("A1:C" & x - 1).Value = Application.Transpose(Data)
End Sub
Upvotes: 2