MITHU
MITHU

Reputation: 164

Failed to scrape the names of a few products from a webpage using the requests module

I'm trying to scrape the names of the sofas from this webpage using the requests module, as shown below. When I observe the network activity for that request, I see the same logic applied there as I've tried below, but I always end up getting status 400. How can I scrape the names of the sofas from that webpage using the requests module?

import json
import requests

# GraphQL endpoint of the shop. The `hash` query value is a series of
# 32-hex-digit tokens joined by %23 (a URL-encoded '#').
url = "https://www.wayfair.de/graphql?hash=22125c9a747b989fb251b014c2628213%23adda13419b757ac36f2b6c49a3bcd81c%23d0e1538c8199ca3253f0ba6c6f96b840%23f905cea77203534e928de4366ee3779d%2356bc66914e917daca7b7b6fbf0a6bcbb%2385a60692d411b1b94fcaf7e769b299f7%23937b5986d523ec2cd8ea2e68291b69e4%23226dd1f891ead00134d0c77687add073%2369fcc9e7651b988309f3197763ba4579%23d878a5fca245086d6cd18dec8a364f30%2383d1f1132338d12cf533926fa527c5a7%233d54fb8f7ef35c34629aa853236e0fb2%23201c0252e9b59dc9bb102e35e1c303fe%2382032c3ee9d609c60f37d9172e2dfebe%2370ac03a548872795355b4b2d9ee86698%23c5a76122f3b3d9dddfaccf682f3c5e19%23adc79abec9f62457c811fbde536bcc59%23e5ba723f1dbb69724c115115fd923a47%23787e38482534a0d1832e19a21a36700e%239c61a681637cd2ae4bc71227fe3cf711%2348c76b09e216b294dd5eaba432f05598%23ddeed8a890c5e7e562f69812d05c1cc7"

# JSON body of the POST. categoryId 479496 is the same number that appears in
# the Referer path below (sofas-c479496); the pagination entry asks for page 2
# with 48 items per page.
payload = {"variables":{"categoryId":479496,"browseInput":{"sortId":189,"filters":[],"pagination":{"page":2,"itemsPerPage":48},"boostedSkus":[],"isAjax":True,"skipLoadPricingModel":False},"usePricingField":True},"_pageId":"i1g5H18AMh07TKM2idhtCw==","_isPageRequest":True}

# Request headers, installed on the session before the POST is sent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    # NOTE(review): this advertises br and zstd; requests presumably decodes
    # those only when the optional brotli/zstandard packages are installed --
    # confirm, otherwise res.json() may be handed a still-compressed body.
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept': 'application/json',
    'Origin': 'https://www.wayfair.de',
    'Referer': 'https://www.wayfair.de/moebel/sb0/sofas-c479496.html',
    'X-Parent-Txid': 'I/bHHmXsMEuIX3rEX/ejAg==',
    'Apollographql-Client-Name': '@wayfair/sf-ui-browse',
    'Apollographql-Client-Version': '6c7f9310e161d153c0d6b90b5f2961dae465981e',
}

# POST the payload as JSON through a session that carries the headers defined
# above, then show the HTTP status code and the decoded JSON body.
with requests.Session() as session:
    session.headers.update(headers)
    response = session.post(url, json=payload)
    print(response.status_code)
    print(response.json())

Upvotes: 1

Views: 69

Answers (1)

Andrej Kesely
Andrej Kesely

Reputation: 195573

It seems that the HTML page already contains the same JSON data that the GraphQL endpoint sends, so you can just get it from there:

import json
import re

import requests


def get_data(page_no, *, html_text=None, timeout=30):
    """Return the product entries embedded in one page of the sofa listing.

    The listing page carries its data as a JSON object assigned to
    ``window["WEBPACK_ENTRY_DATA"]`` inside the HTML, so the products can be
    read from the page itself instead of from the GraphQL endpoint.

    Args:
        page_no: Page number, sent as the ``curpage`` query parameter.
        html_text: Already-downloaded page HTML. When given, no request is
            made and this text is parsed instead.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under
        ``application -> props -> browse -> browse_grid_objects``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The page has no ``WEBPACK_ENTRY_DATA`` assignment, or the
            JSON that follows it is malformed.
        KeyError: The embedded JSON lacks the expected nesting.
    """
    if html_text is None:
        url = f"https://www.wayfair.de/moebel/sb0/sofas-c479496.html?curpage={page_no}"

        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0"
        }

        # A timeout keeps a stalled connection from hanging forever, and an
        # error status is reported as such rather than as a parse failure.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        html_text = response.text

    # Locate the assignment; the lookahead leaves the match end positioned
    # exactly on the opening brace of the JSON object.
    marker = re.search(
        r'window\["WEBPACK_ENTRY_DATA"\]=(?=\{"application":)', html_text
    )
    if marker is None:
        raise ValueError('window["WEBPACK_ENTRY_DATA"] not found in the page')

    # raw_decode() stops at the end of the first complete JSON value, so any
    # script code following the assignment on the same line is ignored
    # (a greedy ".*;" would swallow it and make json.loads fail).
    data, _ = json.JSONDecoder().raw_decode(html_text, marker.end())

    objs = data["application"]["props"]["browse"]["browse_grid_objects"]
    return objs


# Walk the listing pages and print one "SKU  name" line per product,
# followed by an empty line after each page.
for page_no in range(1, 3):  # <-- increase number of pages here
    for item in get_data(page_no):
        sku = item['sku']
        name = item['product_name']
        print(f"{sku:<15} {name}")
    print()

Prints:


...

D100155833      3-Sitzer Sofa Anease                                                                                                                                                                               
D001649432      Sofa Dantzler                                                                                                                                                                                      
D004037217      Sofa Forsyth                                                                                                                                                                                       
D003327297      Zweisitzer Maxen                                                                                                                                                                                   
D110028633      Zweiersofa Bricyn                                                                                                                                                                                  
D100155840      3-Sitzer Sofa Anease                                                                                                                                                                               
DOID7952        Vidaxl 3-Sitzer-Sofa Mit Hocker 180 Cm Stoff 214                                                                                                                                                   
D100169432      Sofa Maurizia                                                                                                                                                                                      
D003971364      Sofa Abhinaya mit Bettfunktion                                                                                                                                                                     
VOX2313         Sofa Rodeo aus Echtleder                                                                                                                                                                           

...

Upvotes: 1

Related Questions