Reputation: 115
I want to extract data from a URL.
I want data in an Excel column.
Title Name (Image 1 and 2)
Street Address
AddressLocality
postalcode
adressregion
addresscountry
Here is my code.
url = Sheets("ExtData").Range("A" & N).Value
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", url, False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Set html = New HTMLDocument
Dim titles As Object, addresses As Object, storesTextToDecipher As Object
With html
.body.innerHTML = sResponse
Set titles = .querySelectorAll(".jcn [title]")
Set addresses = .querySelectorAll(".desk-add.jaddt")
Til = titles.Item(i).outerHTML
Add = addresses.Item(i).innerText
Sheets("ExtData").Range("B" & N) = .getElementsByClassName("Title").Item(0)
Sheets("ExtData").Range("C" & N) = .getElementById("comp_add").outerHTML
Range("A" & N + 1).Select
End With
Upvotes: 2
Views: 2962
Reputation: 84465
You can use a css class selector
Option Explicit
Public Sub GetInfo()
Dim html As HTMLDocument
Set html = New HTMLDocument '< VBE > Tools > References > Microsoft Scripting Runtime
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.justdial.com/Agartala/Abhay-Varieties-Nor-Banamalipu/9999PX381-X381-141028162716-U1Z5_BZDET", False
.send
html.body.innerHTML = .responseText
End With
Debug.Print html.querySelector(".ph_hdr").innerText
End Sub
If you want separate lines then use
Dim items() As String, i As Long
items = Split(html.querySelector(".ph_hdr").innerText, ", ")
For i = LBound(items) To UBound(items)
If items(i) <> vbNullString Then
Activesheet.Cells(i + 1, 1) = items(i)
End If
Next
An odd situation is that normally I would grab all the script tags with json using a css selector of script\[type='application/ld+json'\]
and loop that looking for the info1. However, despite the info being present when I look at a text file write out, the moment I use a DOM parser I cannot find that info. So, much as I hate to advocate regex with html, here is a regex solution:
Option Explicit
Public Sub GetInfo()
Dim html As HTMLDocument, s As String, re As Object
Set re = CreateObject("vbscript.regexp")
Set html = New HTMLDocument '< VBE > Tools > References > Microsoft Scripting Runtime
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.justdial.com/Agartala/Abhay-Varieties-Nor-Banamalipu/9999PX381-X381-141028162716-U1Z5_BZDET", False
.send
s = .responseText
html.body.innerHTML = s
Debug.Print html.querySelector(".fn").innerText
Debug.Print Trim$(Replace$(GetString(re, s, "title>(.*)<"), Chr$(34), vbNullString))
Debug.Print Trim$(Replace$(GetString(re, s, "streetAddress"":(.*"")"), Chr$(34), vbNullString))
Debug.Print Trim$(Replace$(GetString(re, s, "addressLocality"":(.*"")"), Chr$(34), vbNullString))
Debug.Print Trim$(Replace$(GetString(re, s, "postalCode"":(.*"")"), Chr$(34), vbNullString))
Debug.Print Trim$(Replace$(GetString(re, s, "addressRegion"":(.*"")"), Chr$(34), vbNullString))
Debug.Print Trim$(Replace$(GetString(re, s, "addressCountry"":(.*"")"), Chr$(34), vbNullString))
End With
End Sub
Public Function GetString(ByVal re As Object, ByVal inputString As String, ByVal pattern As String) As Variant
Dim matches As Object
With re
.Global = True
.MultiLine = True
.IgnoreCase = True
.pattern = pattern
If .test(inputString) Then
Set matches = .Execute(inputString)
GetString = matches(0).SubMatches(0)
Exit Function
End If
End With
GetString = "No match"
End Function
"address":
Upvotes: 2