Reputation: 9538
I am trying to scrape a site with this code to extract names and contacts ...
Sub Test()
Dim htmlDoc As Object
Dim htmlDoc2 As Object
Dim elem As Variant
Dim tag As Variant
Dim dns As String
Dim pageSource As String
Dim pageSource2 As String
Dim url As String
Dim row As Long
row = 2
dns = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/"
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", dns, True
.send
While .readyState <> 4: DoEvents: Wend
If .statusText <> "OK" Then
MsgBox "ERROR" & .Status & " - " & .statusText, vbExclamation
Exit Sub
End If
pageSource = .responseText
End With
Set htmlDoc = CreateObject("htmlfile")
htmlDoc.body.innerHTML = pageSource
Dim xx 'Got error here Set xx = htmlDoc.getElementsByClassName("ldb-contact-summary")
Set htmlDoc = Nothing
Set htmlDoc2 = Nothing
End Sub
When trying to use this line
Set xx = htmlDoc.getElementsByClassName("ldb-contact-summary")
I got an error 'Object doesn't support that property or method' (438) Can you help me please as I am not so good at scraping issues?
Upvotes: 2
Views: 647
Reputation: 84465
As you mention all the pages in the comment above I will use a class to hold the XMLHTTP object and provide it with methods to extract the data, whilst incorporating a method to find the number of results pages and loop them. Testing this gave me 251 rows of results.
Note: Discovered through debugging that keeping the SetRequestHeader was causing, for you, requests for human verification. Removing this meant the XMLHTTP method worked. It worked with and without for me.
Class clsHTTP
Option Explicit
Private http As Object
Private Sub Class_Initialize()
Set http = CreateObject("MSXML2.XMLHTTP")
End Sub
Public Function GetString(ByVal url As String) As String
Dim sResponse As String
With http
.Open "GET", url, False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
GetString = sResponse
End With
End Function
Public Function GetInfo(ByVal html As HTMLDocument) As Variant
Dim names As Object, telNums As Object, i As Long, namesArray(), telsArray()
Set names = html.querySelectorAll("[class*='ldb-contact-name']")
Set telNums = html.querySelectorAll(".ldb-phone-number")
ReDim namesArray(0 To names.Length - 1)
ReDim telsArray(0 To telNums.Length - 1)
For i = 0 To names.Length - 1
namesArray(i) = names.item(i).innerText
telsArray(i) = telNums.item(i).innerText
Next
GetInfo = Array(namesArray, telsArray)
End Function
Standard module 1
Option Explicit
Public Sub GetReviewData()
Dim sResponse As String, html As HTMLDocument, http As clsHTTP
Dim numPages As Long, pageNum As Long, url As String
Dim results As Collection, item As Variant, ws As Worksheet
url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/"
Set http = New clsHTTP
Set html = New HTMLDocument
Set results = New Collection
Set ws = ThisWorkbook.Worksheets("Sheet1")
With html
.body.innerHTML = http.GetString(url)
numPages = .querySelectorAll("[data-idx]").item(html.querySelectorAll("[data-idx]").Length - 2).innerText
results.Add http.GetInfo(html)
If numPages > 1 Then
For pageNum = 2 To numPages
url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page=" & pageNum & "&showAdvancedItems=false®ionID=17762&locationText=Detroit%20MI"
.body.innerHTML = http.GetString(url)
results.Add http.GetInfo(html)
Next
End If
Dim numResults As Long
If results.Count > 0 Then
Application.ScreenUpdating = False
For Each item In results
numResults = UBound(item(0)) + 1
With ws
.Cells(GetLastRow(ws, 1) + 1, 1).Resize(numResults, 1) = Application.Transpose(item(0))
.Cells(GetLastRow(ws, 2) + 1, 2).Resize(numResults, 1) = Application.Transpose(item(1))
End With
Next
Application.ScreenUpdating = True
End If
End With
End Sub
Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
With ws
GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
End With
End Function
Selenium:
Option Explicit
Public Sub GetReviewData()
Dim html As HTMLDocument
Dim numPages As Long, pageNum As Long, url As String
Dim results As Collection, item As Variant, ws As Worksheet
Dim d As WebDriver, elements As WebElements
url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page=1&showAdvancedItems=false®ionID=17762&locationText=Detroit%20MI"
Set html = New HTMLDocument
Set results = New Collection
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set d = New ChromeDriver
With d
.Start "Chrome"
.get url
Set elements = .FindElementsByCss("[data-idx]")
numPages = elements(elements.Count - 1).Text
html.body.innerHTML = .PageSource
results.Add GetInfo(html)
If numPages > 1 Then
For pageNum = 2 To numPages
Application.Wait Now + TimeSerial(0, 0, 2)
url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page=" & pageNum & "&showAdvancedItems=false®ionID=17762&locationText=Detroit%20MI"
.get url
html.body.innerHTML = .PageSource
results.Add GetInfo(html)
Next
End If
Dim numResults As Long
If results.Count > 0 Then
Application.ScreenUpdating = False
For Each item In results
numResults = UBound(item(0)) + 1
With ws
.Cells(GetLastRow(ws, 1) + 1, 1).Resize(numResults, 1) = Application.Transpose(item(0))
.Cells(GetLastRow(ws, 2) + 1, 2).Resize(numResults, 1) = Application.Transpose(item(1))
End With
Next
Application.ScreenUpdating = True
End If
End With
End Sub
Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
With ws
GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
End With
End Function
Public Function GetInfo(ByVal html As HTMLDocument) As Variant
Dim names As Object, telNums As Object, i As Long, namesArray(), telsArray()
Set names = html.querySelectorAll("[class*='ldb-contact-name']")
Set telNums = html.querySelectorAll(".ldb-phone-number")
ReDim namesArray(0 To names.Length - 1)
ReDim telsArray(0 To telNums.Length - 1)
For i = 0 To names.Length - 1
namesArray(i) = names.item(i).innerText
telsArray(i) = telNums.item(i).innerText
Next
GetInfo = Array(namesArray, telsArray)
End Function
Upvotes: 2
Reputation: 22440
To get the names and their corresponding phone numbers, you can try the below snippet:
Sub GetProfileInfo()
Const URL$ = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page="
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim post As HTMLDivElement, R&, P&
For p = 1 To 3 'put here the highest number you wanna traverse
With Http
.Open "GET", URL & p, False
.send
Html.body.innerHTML = .responseText
End With
For Each post In Html.getElementsByClassName("ldb-contact-summary")
With post.querySelectorAll(".ldb-contact-name a")
If .Length Then R = R + 1: Cells(R, 1) = .item(0).innerText
End With
With post.getElementsByClassName("ldb-phone-number")
If .Length Then Cells(R, 2) = .item(0).innerText
End With
Next post
Next p
End Sub
Reference to add to the library to execute the above script:
Microsoft xml, v6.0
Microsoft Html Object Library
Upvotes: 4