Reputation: 775
I have created a macro which scrapes relevant information from Brief profiles (BP)
that can be searched for at: https://echa.europa.eu/information-on-chemicals
This works using an XMLHTTP request to the URL of the Brief Profile
and works fine.
I now wish to create a macro which searches the same website to find the URL(href) of the brief profile.
As a beginner to VBA I have successfully achieved this using a browser but I wish to convert this to XML HTTP request to improve efficiency.
Using IE Browser Automation:
Sub Gethref()
Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDoc As MSHTML.HTMLDocument
Dim HTMLInput As MSHTML.IHTMLElement
Dim HTMLButtom As MSHTML.IHTMLElement
Dim HTMLhref As MSHTML.IHTMLElement
'Go to Website
IE.Visible = True
IE.navigate "https://echa.europa.eu/information-on-chemicals"
'Check Website is ready for search and set HTMLDoc to IE.Document for elements
Do While IE.readyState <> READYSTATE_COMPLETE
Loop
Set HTMLDoc = IE.document
'Set value of Searchbox to keyword
Set HTMLInput = HTMLDoc.getElementById("_disssimplesearch_WAR_disssearchportlet_sskeywordKey")
HTMLInput.Value = "Potassium mercaptoacetate"
'Search for Result
Set HTMLButton = HTMLDoc.getElementById("_disssimplesearchhomepage_WAR_disssearchportlet_searchButton")
HTMLButton.Click
'Check page has loaded
Do While IE.readyState = READYSTATE_COMPLETE or IE.Busy
Loop
Set HTMLDoc = IE.document
'Find Desired href
Set HTMLhref = HTMLDoc.getElementsByClassName("briefProfileLink")(0)
Debug.Print HTMLhref.getAttribute("href")
End Sub
This should print the href for Potassium mercaptoacetate
as https://echa.europa.eu/brief-profile/-/briefprofile/100.000.602
I have started attempted to convert as much as I can using XML HTTP but Im running into issues which I dont quite understand
Using XML HTTP Request (Not working)
Sub Gethref()
Dim XMLPage As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim HTMLInput As MSHTML.IHTMLElement
Dim HTMLButtom As MSHTML.IHTMLElement
Dim HTMLhref As MSHTML.IHTMLElement
'Go to Website
XMLPage.Open "GET", "https://echa.europa.eu/information-on-chemicals", False
XMLPage.send
'Set value of Searchbox to keyword
Set HTMLInput = HTMLDoc.getElementById("_disssimplesearch_WAR_disssearchportlet_sskeywordKey")
HTMLInput.Value
'Search for Result
Set HTMLButton = HTMLDoc.getElementById("_disssimplesearchhomepage_WAR_disssearchportlet_searchButton")
HTMLButton.Click
'Check page has loaded
HTMLDoc.body.innerHTML = IE.document.responseText
'Find Desired href
Set HTMLhref = HTMLDoc.getElementsByClassName("briefProfileLink")(0)
Debug.Print HTMLhref.getAttribute("href")
End Sub
I will update as I make progress with this but if anyone can offer help it will be great.
Upvotes: 1
Views: 366
Reputation: 22440
Okay, this should do it. Turn out that you need to issue post http requests with appropriate parameters to get required response containing desired links.
Public Sub GetContent()
Const Url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, I&, R&
Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
Set oHtml = New HTMLDocument
Set oHttp = CreateObject("MSXML2.XMLHTTP")
Set MyDict = CreateObject("Scripting.Dictionary")
Set Ws = ThisWorkbook.Worksheets("Sheet1")
searchKeyword = "Acetone"
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
payload = ""
For Each DictKey In MyDict
payload = IIf(Len(DictKey) = 0, WorksheetFunction.encodeURL(DictKey) & "=" & WorksheetFunction.encodeURL(MyDict(DictKey)), _
payload & "&" & WorksheetFunction.encodeURL(DictKey) & "=" & WorksheetFunction.encodeURL(MyDict(DictKey)))
Next DictKey
With oHttp
.Open "POST", Url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send (payload)
oHtml.body.innerHTML = .responseText
End With
With oHtml.querySelectorAll("table.table > tbody > tr > td > a.substanceNameLink")
For I = 0 To .Length - 1
R = R + 1: Ws.Cells(R, 1) = .item(I).getAttribute("href")
Next I
End With
End Sub
If you are interested in the first link only, try the following instead of the last with block:
MsgBox oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
Or you can directly copy those parameters from dev tool and use them:
Public Sub GetContent()
Const Url = "https://echa.europa.eu/search-for-chemicals?"
Dim oHttp As Object, oHtml As HTMLDocument
Dim payload$, Ws As Worksheet, urlSuffix$
Set oHtml = New HTMLDocument
Set oHttp = CreateObject("MSXML2.XMLHTTP")
Set Ws = ThisWorkbook.Worksheets("Sheet1")
urlSuffix = "p_auth=69hDou3E&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=" & _
"_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=" & _
"doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals" & _
"%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview" & _
"%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
payload = "_disssimplesearchhomepage_WAR_disssearchportlet_formDate=1621042609544&_disssimplesearch_WAR_disssearchportlet_searchOccurred=" & _
"true&_disssimplesearch_WAR_disssearchportlet_sskeywordKey=Acetone&_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer" & _
"=true&_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox=on"
With oHttp
.Open "POST", Url & urlSuffix, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send (payload)
oHtml.body.innerHTML = .responseText
End With
Debug.Print oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
End Sub
Upvotes: 3