Nick
Nick

Reputation: 775

Converting Web Browser Automation to XmlHTTP Request

I have created a macro which scrapes relevant information from Brief profiles (BP) that can be searched for at: https://echa.europa.eu/information-on-chemicals

This works using an XMLHTTP request to the URL of the Brief Profile and works fine.

I now wish to create a macro which searches the same website to find the URL(href) of the brief profile.

As a beginner to VBA I have successfully achieved this using a browser but I wish to convert this to XML HTTP request to improve efficiency.

Using IE Browser Automation:

Sub Gethref()

Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDoc As MSHTML.HTMLDocument
Dim HTMLInput As MSHTML.IHTMLElement
Dim HTMLButtom As MSHTML.IHTMLElement

Dim HTMLhref As MSHTML.IHTMLElement

'Go to Website
IE.Visible = True
IE.navigate "https://echa.europa.eu/information-on-chemicals"

'Check Website is ready for search and set HTMLDoc to IE.Document for elements
Do While IE.readyState <> READYSTATE_COMPLETE
Loop

Set HTMLDoc = IE.document

'Set value of Searchbox to keyword
Set HTMLInput = HTMLDoc.getElementById("_disssimplesearch_WAR_disssearchportlet_sskeywordKey")
HTMLInput.Value = "Potassium mercaptoacetate"

'Search for Result
Set HTMLButton = HTMLDoc.getElementById("_disssimplesearchhomepage_WAR_disssearchportlet_searchButton")
HTMLButton.Click

'Check page has loaded
Do While IE.readyState = READYSTATE_COMPLETE or IE.Busy
Loop
Set HTMLDoc = IE.document

'Find Desired href
Set HTMLhref = HTMLDoc.getElementsByClassName("briefProfileLink")(0)
Debug.Print HTMLhref.getAttribute("href")

End Sub

This should print the href for Potassium mercaptoacetate as https://echa.europa.eu/brief-profile/-/briefprofile/100.000.602

I have started attempted to convert as much as I can using XML HTTP but Im running into issues which I dont quite understand

Using XML HTTP Request (Not working)

Sub Gethref()

    Dim XMLPage As New MSXML2.XMLHTTP60
    Dim HTMLDoc As New MSHTML.HTMLDocument
    Dim HTMLInput As MSHTML.IHTMLElement
    Dim HTMLButtom As MSHTML.IHTMLElement

Dim HTMLhref As MSHTML.IHTMLElement

'Go to Website
    XMLPage.Open "GET", "https://echa.europa.eu/information-on-chemicals", False
    XMLPage.send
    
'Set value of Searchbox to keyword
    Set HTMLInput = HTMLDoc.getElementById("_disssimplesearch_WAR_disssearchportlet_sskeywordKey")
    HTMLInput.Value

'Search for Result
    Set HTMLButton = HTMLDoc.getElementById("_disssimplesearchhomepage_WAR_disssearchportlet_searchButton")
    HTMLButton.Click

'Check page has loaded
HTMLDoc.body.innerHTML = IE.document.responseText

'Find Desired href
    Set HTMLhref = HTMLDoc.getElementsByClassName("briefProfileLink")(0)
    Debug.Print HTMLhref.getAttribute("href")

End Sub

I will update as I make progress with this but if anyone can offer help it will be great.

Upvotes: 1

Views: 366

Answers (1)

SIM
SIM

Reputation: 22440

Okay, this should do it. Turn out that you need to issue post http requests with appropriate parameters to get required response containing desired links.

Public Sub GetContent()
    Const Url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
    Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, I&, R&
    Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
    
    Set oHtml = New HTMLDocument
    Set oHttp = CreateObject("MSXML2.XMLHTTP")
    Set MyDict = CreateObject("Scripting.Dictionary")
    Set Ws = ThisWorkbook.Worksheets("Sheet1")

    searchKeyword = "Acetone"
    
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
    MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
    MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"

    payload = ""
    For Each DictKey In MyDict
        payload = IIf(Len(DictKey) = 0, WorksheetFunction.encodeURL(DictKey) & "=" & WorksheetFunction.encodeURL(MyDict(DictKey)), _
        payload & "&" & WorksheetFunction.encodeURL(DictKey) & "=" & WorksheetFunction.encodeURL(MyDict(DictKey)))
    Next DictKey
    
    With oHttp
        .Open "POST", Url, False
        .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
        .send (payload)
        oHtml.body.innerHTML = .responseText
    End With
    
    With oHtml.querySelectorAll("table.table > tbody > tr > td > a.substanceNameLink")
        For I = 0 To .Length - 1
            R = R + 1: Ws.Cells(R, 1) = .item(I).getAttribute("href")
        Next I
    End With
End Sub

If you are interested in the first link only, try the following instead of the last with block:

MsgBox oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")

Or you can directly copy those parameters from dev tool and use them:

Public Sub GetContent()
    Const Url = "https://echa.europa.eu/search-for-chemicals?"
    Dim oHttp As Object, oHtml As HTMLDocument
    Dim payload$, Ws As Worksheet, urlSuffix$
    
    Set oHtml = New HTMLDocument
    Set oHttp = CreateObject("MSXML2.XMLHTTP")
    Set Ws = ThisWorkbook.Worksheets("Sheet1")
    
    urlSuffix = "p_auth=69hDou3E&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=" & _
                "_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=" & _
                "doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals" & _
                "%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview" & _
                "%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"

    payload = "_disssimplesearchhomepage_WAR_disssearchportlet_formDate=1621042609544&_disssimplesearch_WAR_disssearchportlet_searchOccurred=" & _
              "true&_disssimplesearch_WAR_disssearchportlet_sskeywordKey=Acetone&_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer" & _
              "=true&_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox=on"
    
    With oHttp
        .Open "POST", Url & urlSuffix, False
        .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
        .send (payload)
        oHtml.body.innerHTML = .responseText
    End With
    
    Debug.Print oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
End Sub

Upvotes: 3

Related Questions