Reputation: 161
I am pulling some data from Yellow Pages, and the extraction itself works fine. My issue is with the page navigation: it navigates fine from page 1 to page 2, but when it tries to navigate to page 3 my code goes back to page 1 and extracts the data again. The data extraction is fine; the issue is the navigation.
This is what I have identified and believe to be the cause, but I do not know how to resolve it.
On page 1 the first item in the navigation bar is an empty placeholder ('emptyPageButton'). Once the site is on page 2, that placeholder is replaced by a '<< Previous' link that has the same class as the 'Next >>' link, so instead of going forward to the next page, which would be page 3, my code clicks the first match and goes back to page 1. If I specify that 10 pages should be extracted, it extracts pages 1 and 2 five times each, because it keeps going back and forth between the two pages.
I have made several attempts, but none of them work: I can get as far as page 2, and then it goes back to page 1.
WITH CLASS works up to page 2 then goes back to page 1
''' Loop-body fragment (attempt 1): find the paging link by class name and click it.
''' Stops once the number of pages entered in Sheet20 range J9 has been processed.
''' NOTE(review): Replace with an empty find string appears to leave the cell value unchanged - confirm the intent.
If pageNumber >= Replace(Worksheets("Sheet20").Range("J9").Value, "", "+") Then Exit Do
' Index (0) is the FIRST element carrying these classes. Per the page-2 markup shown below,
' from page 2 onward that first element is the "<< Previous" link, not "Next >>",
' which is why this attempt jumps back to page 1.
Set nextPageElement = HTML.getElementsByClassName("ypbtn btn-theme pageButton ")(0)
' Alternative lookups that were tried (left disabled):
'Set nextPageElement = HTML.getElementsByClassName("ypbtn btn-theme pageButton ")(1)
'Set nextPageElement = HTML.getElementsByClassName("ypbtn btn-theme pageButton ")(0).children (0)
'Set nextPageElement = HTML.getElementsByClassName("ypbtn btn-theme pageButton ")(1).children (0)
'Set nextPageElement = HTML.getElementsByClassName("ypbtn btn-theme pageButton ")(1).children (1)
'Set nextPageElement = HTML.getElementsByClassName("view_more_section_noScroll ")(0).getElementsByTagName("a")(1)
' No paging link found: leave the loop.
If nextPageElement Is Nothing Then Exit Do
nextPageElement.Click 'follow the link that was found (intended: next web page)
' Fixed 5-second pause to give the new page time to load.
Application.Wait Now + TimeValue("00:00:05")
WITH QUERY SELECTOR works up to page 2 then goes back to page 1
''' Loop-body fragment (attempt 2): find the paging link with a CSS selector and click it.
''' Stops once the number of pages entered in Sheet20 range J9 has been processed.
''' NOTE(review): Replace with an empty find string appears to leave the cell value unchanged - confirm the intent.
If pageNumber >= Replace(Worksheets("Sheet20").Range("J9").Value, "", "+") Then Exit Do
' querySelector returns only the FIRST match. Per the page-2 markup shown below, from page 2
' onward the first .pageButton inside the navigation bar is "<< Previous", so this goes backwards.
Set nextPageElement = HTML.querySelector(".view_more_section_noScroll .pageButton")
If Not nextPageElement Is Nothing Then
nextPageElement.Click
' Fixed 5-second pause to give the new page time to load.
Application.Wait Now + TimeValue("00:00:05")
Else:
' No paging link found: leave the loop.
Exit Do
End If
Snippet for page1
<div class="view_more_section_noScroll">
<div class="emptyPageButton"></div>
<span class="pageCount">
<span class="bold">
1 /
</span>
<span class="">
37</span>
</span>
<a href="/search/si/2/car+dealership/Toronto+ON" data-analytics='{"event_name":"click - load_more - Serp ","lk_se_id":"f32f0ee7-8492-46dd-87da-7b621c162879_Y2FyIGRlYWxlcnNoaXA_VG9yb250byBPTg","lk_name":"next_serp"}'
class="ypbtn btn-theme pageButton">Next
>></a>
</div>
Snippet for page2 and beyond
<div class="view_more_section_noScroll">
<a href="/search/si/1/car+dealership/Toronto+ON" data-analytics='{"event_name":"click - previous_page - Serp ","lk_se_id":"f32f0ee7-8492-46dd-87da-7b621c162879_Y2FyIGRlYWxlcnNoaXA_VG9yb250byBPTg","lk_name":"previous_serp"}'
class="ypbtn btn-theme pageButton"><< Previous</a>
<span class="pageCount">
<span class="bold">
2 /
</span>
<span class="">
37</span>
</span>
<a href="/search/si/3/car+dealership/Toronto+ON" data-analytics='{"event_name":"click - load_more - Serp ","lk_se_id":"f32f0ee7-8492-46dd-87da-7b621c162879_Y2FyIGRlYWxlcnNoaXA_VG9yb250byBPTg","lk_name":"next_serp"}'
class="ypbtn btn-theme pageButton">Next
>></a>
</div>
QUESTION, Can someone advise what the correct class or querySelector is for the navigation?
As always, thanks in advance.
'''########################## UPDATED THUR 8/4/2021 #####################
The full code is large, so I have reduced it a lot to make it much easier to read, as the ONLY ISSUE is the page navigation. This code should give you an idea of what I am trying to do. Currently it overwrites previously extracted results because I deleted something in the code by mistake; please ignore this for now, as ONLY THE PAGE NAVIGATION IS AN ISSUE.
Private Sub YellowPagesCa()
    ' Scrapes listing links from a yellowpages.ca search into column A of the
    ' "YellowPages" sheet, following the "Next >>" link from page to page.
    '
    ' Navigation fix: the paging bar contains TWO links with the class
    ' "pageButton" from page 2 onward ("<< Previous" first, then "Next >>").
    ' Selecting the first .pageButton therefore bounces between pages 1 and 2.
    ' The "Next >>" link is always the <a> that immediately follows the
    ' .pageCount span, so it is selected with the adjacent-sibling selector
    ' ".pageCount + a" instead.
    '
    ' Extraction now runs once per visited page (it previously ran only for the
    ' first page, before the navigation loop started).
    '
    ' Requires references to "Microsoft Internet Controls" and
    ' "Microsoft HTML Object Library", as the original code did.

    Const MAX_NEXT_CLICKS As Long = 5                       ' how many times "Next >>" is followed
    Const NEXT_LINK_SELECTOR As String = ".pageCount + a"   ' <a> directly after the page counter
    Const READY_COMPLETE As Long = 4                        ' readyState value for a fully loaded page

    Dim HTML As HTMLDocument
    Dim objIE As Object
    Dim pageNumber As Long          ' number of "Next >>" clicks performed so far
    Dim nextPageElement As Object   ' the "Next >>" link on the current page, if any
    Dim wsSheet As Worksheet

    Set wsSheet = ThisWorkbook.Worksheets("YellowPages")

    '+++++ Internet Explorer ++++++
    Set objIE = New InternetExplorer
    objIE.Visible = True
    objIE.navigate "https://www.yellowpages.ca/search/si/1/car+dealer/Toronto+ON"

    Do
        ' Wait until the browser has finished loading the current page.
        Do While objIE.Busy = True Or objIE.readyState <> READY_COMPLETE
            DoEvents
        Loop
        Set HTML = objIE.document

        ' Scrape the page that is currently displayed.
        ExtractListingLinks HTML, wsSheet

        '''############### PAGE NAVIGATION ##############
        If pageNumber >= MAX_NEXT_CLICKS Then Exit Do

        ' Nothing is returned when there is no link after the page counter,
        ' i.e. there is no further page to go to.
        Set nextPageElement = HTML.querySelector(NEXT_LINK_SELECTOR)
        If nextPageElement Is Nothing Then Exit Do

        nextPageElement.Click
        ' Fixed pause so the click has started the navigation before Busy/readyState are polled.
        Application.Wait Now + TimeValue("00:00:05")
        pageNumber = pageNumber + 1
    Loop

    objIE.Quit ' end and clear browser
    Set objIE = Nothing
    Set HTML = Nothing
    Set nextPageElement = Nothing
    Complete.show
End Sub

Private Sub ExtractListingLinks(ByVal doc As HTMLDocument, ByVal target As Worksheet)
    ' Appends one row to column A of target for every listing on the page:
    ' the listing's link URL, or "-" when the listing has no name link.
    Dim listings As Object
    Dim listing As Object
    Dim nameLink As Object
    Dim nextRow As Long

    Set listings = doc.getElementsByClassName("listing_right_section")
    For Each listing In listings
        DoEvents
        ' First empty row below the last used cell in column A.
        nextRow = target.Cells(target.Rows.Count, "A").End(xlUp).Row + 1

        Set nameLink = listing.getElementsByClassName("listing__name--link listing__link jsListingName")(0)
        If nameLink Is Nothing Then
            target.Cells(nextRow, "A").Value = "-"
        Else
            target.Cells(nextRow, "A").Value = nameLink.href
        End If
    Next listing
End Sub
Upvotes: 0
Views: 87
Reputation: 161
Thanks to QHarr's answer I was able to fix the issue by using parts of it. I combined my own class-based and querySelector code with the `.pageCount + a` selector from QHarr's answer, and I can now navigate the pages fine.
Do
    ' Stop once the number of pages entered in Sheet20 J9 has been processed.
    If pageNumber >= Replace(Worksheets("Sheet20").Range("J9").Value, "", "+") Then Exit Do

    ' Look up the "Next >>" link itself: it is the <a> immediately after the
    ' .pageCount span. Guarding on this element (rather than on the first
    ' .pageButton, which is "<< Previous" from page 2 onward) means the loop
    ' exits cleanly when there is no next link, instead of raising a runtime
    ' error by calling .Click on Nothing.
    Set nextPageElement = HTML.querySelector(".pageCount + a")
    If Not nextPageElement Is Nothing Then
        nextPageElement.Click
        ' Fixed 5-second pause to give the new page time to load.
        Application.Wait Now + TimeValue("00:00:05")
    Else
        Exit Do
    End If
Upvotes: 0
Reputation: 84475
You could loop while
ie.document.querySelectorAll(".pageCount + a").Length <> 0
and click the `next` button inside that loop with:
ie.document.querySelector(".pageCount + a").click
or
ie.Navigate2 ie.document.querySelector(".pageCount + a").href
This will terminate when there is no more `next` button.
Alternatively, extract the page count from the first page and loop to that number of pages, substituting the current page number into the url (e.g. replacing 1 with 2 to get page 2)
Option Explicit
Public Sub PrintSomeInfo()
    ' Visits every results page of a yellowpages.ca search by rewriting the
    ' page number inside the URL, rather than clicking the paging links.
    ' The total page count is read from the page counter on the first page.
    Dim browser As SHDocVw.InternetExplorer
    Dim pageNoInUrl As Object
    Dim pageCount As Long
    Dim pageIndex As Long

    Set browser = New SHDocVw.InternetExplorer

    ' Matches ".../si/<number>/..." as three groups so that only the middle
    ' group (the page number) is swapped out on replacement.
    Set pageNoInUrl = CreateObject("VBScript.RegExp")
    pageNoInUrl.Global = False
    pageNoInUrl.MultiLine = False
    pageNoInUrl.Pattern = "(si\/)(\d+)(\/)"

    browser.Visible = True
    browser.Navigate2 "https://www.yellowpages.ca/search/si/1/car+dealership/Toronto+ON"
    Do While browser.Busy Or browser.readyState <> READYSTATE_COMPLETE
        DoEvents
    Loop

    ' The span following the bold "current page" span holds the total page count.
    pageCount = CLng(browser.document.querySelector(".pageCount .bold + span").innerText)

    ' Page one is already loaded, so start from page two.
    For pageIndex = 2 To pageCount
        browser.Navigate2 pageNoInUrl.Replace(browser.document.url, "$1" & CStr(pageIndex) & "$3")
        Do While browser.Busy Or browser.readyState <> READYSTATE_COMPLETE
            DoEvents
        Loop
        'do something with new page
    Next pageIndex

    Stop    ' breakpoint: pause here for inspection before the browser is closed
    browser.Quit
End Sub
Regex:
The regex pattern matches 3 groups in the url and then substitutes the second group, the current page number, with the new page number:
Upvotes: 1