Reputation: 3241
I need to scrape Google Shopping, for example this link https://www.google.com/?gfe_rd=cr&ei=BtcRWeX_D8aAsAHDgZ2QAw#q=hooker+furniture+5183-75300&tbm=shop
but in the response from the server I just receive text without any items, and even in Google Chrome's source code viewer I cannot see the item details. What request will get me the details of all the items?
Upvotes: 3
Views: 5156
Reputation: 1724
You can achieve this using parsel
and requests
libraries as it can be done without selenium
since everything you need is in the HTML (not rendered via JavaScript).
Make sure you send a real browser user-agent header, because the default requests
user-agent is python-requests,
so Google can tell that a script is sending the request and may block it. Check what your user-agent is. There are lists of user-agents available (if you need to rotate the user-agent on each request).
If it's difficult to figure out which CSS selector(s) to use to extract the right data, have a look at the SelectorGadget Chrome extension, which lets you click on the desired element in your browser and returns a CSS selector for it.
Code that extracts image data from the inline JSON using regular expression, and other data with CSS selector with full example in the online IDE:
import requests, json, re
from parsel import Selector
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
# Query-string parameters for the Google search request.
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": "minecraft",
    "hl": "en",     # language
    "gl": "us",     # country of the search, US -> USA
    "tbm": "shop",  # google search shopping tab
}

# Send a browser-like User-Agent instead of the library default.
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
# Stop here on a 4xx/5xx response: parsing an error page would only produce
# empty results further down with no hint about what went wrong.
html.raise_for_status()

# Parsed document shared by the extraction functions below.
selector = Selector(html.text)
def get_original_images():
    """Return the decoded image URLs of the product cards on the page.

    Every element matched by ``.Qlx7of .sh-dgr__grid-result`` is looked up by
    its ``data-pck`` attribute in the page's inline scripts, which are
    expected to contain a ``var _u='<url>';var _i='<data-pck>';`` pair. The
    ``_u`` value is unicode-escape decoded before being returned.

    Returns:
        list[str]: Decoded URLs in card order. Cards without a ``data-pck``
        attribute or without a matching script entry are skipped, so the
        list can be shorter than the number of cards.
    """
    # Terminate every <script> element with a newline: "." does not match a
    # newline, so the lazy "(.*?)" below cannot run from one script into the
    # next.
    all_script_tags = "".join(
        script.replace("</script>", "</script>\n")
        for script in selector.css("script").getall()
    )

    image_urls = []

    for result in selector.css(".Qlx7of .sh-dgr__grid-result"):
        image_id = result.attrib.get("data-pck")
        if not image_id:
            # No id to look up; previously this raised KeyError.
            continue

        # https://regex101.com/r/udjFUq/1
        # The id is escaped so it is always matched literally.
        match = re.search(
            rf"var\s?_u='(.*?)';var\s?_i='{re.escape(image_id)}';",
            all_script_tags,
        )
        if match:
            # The URL is stored with escape sequences; decode them.
            url_decode = bytes(match.group(1), 'ascii').decode('unicode-escape')
            image_urls.append(url_decode)

    return image_urls
def _google_url(href):
    """Prefix a site-relative href with the Google origin; pass empty values through."""
    return "https://www.google.com" + href if href else href


def _first_match(pattern, text, group=0):
    """Return the requested group of the first match of pattern in text.

    An empty or missing text is returned unchanged; None is returned when the
    pattern does not match.
    """
    if not text:
        return text
    match = re.search(pattern, text)
    return match.group(group) if match else None


def get_suggested_search_data():
    """Extract the product cards of the page, print them as JSON and return them.

    Each ``.Qlx7of .i0X6df`` element is paired positionally with a URL from
    ``get_original_images()``; iteration stops at the shorter of the two.
    Fields whose element is absent from a card are reported as ``None``.

    Returns:
        list[dict]: One dictionary per card with the keys ``title``,
        ``product_link``, ``product_rating``, ``product_reviews``, ``price``,
        ``store``, ``thumbnail``, ``store_link``, ``delivery``,
        ``store_rating``, ``store_reviews``, ``store_reviews_link`` and
        ``compare_prices_link``.
    """
    google_shopping_data = []

    for result, thumbnail in zip(selector.css(".Qlx7of .i0X6df"), get_original_images()):
        title = result.css(".tAxDx::text").get()
        # All four links go through _google_url so that a missing href yields
        # None instead of a TypeError from "str + None".
        product_link = _google_url(result.css(".Lq5OHe::attr(href)").get())
        product_rating = result.css(".NzUzee .Rsc7Yb::text").get()
        product_reviews = result.css(".NzUzee > div::text").get()
        price = result.css(".a8Pemb::text").get()
        store = result.css(".aULzUe::text").get()
        store_link = _google_url(result.css(".eaGTj div a::attr(href)").get())
        delivery = result.css(".vEjMR::text").get()

        store_rating_value = result.css(".zLPF4b .XEeQ2 .QIrs8::text").get()
        # https://regex101.com/r/kAr8I5/1
        store_rating = _first_match(r"^\S+", store_rating_value)

        store_reviews_value = result.css(".zLPF4b .XEeQ2 .ugFiYb::text").get()
        # https://regex101.com/r/axCQAX/1
        # group=1 selects the capture group, i.e. the token without an
        # optional leading "(".
        store_reviews = _first_match(r"^\(?(\S+)", store_reviews_value, group=1)

        store_reviews_link = _google_url(result.css(".zLPF4b .XEeQ2 .QhE5Fb::attr(href)").get())
        compare_prices_link = _google_url(result.css(".Ldx8hd .iXEZD::attr(href)").get())

        google_shopping_data.append({
            "title": title,
            "product_link": product_link,
            "product_rating": product_rating,
            "product_reviews": product_reviews,
            "price": price,
            "store": store,
            "thumbnail": thumbnail,
            "store_link": store_link,
            "delivery": delivery,
            "store_rating": store_rating,
            "store_reviews": store_reviews,
            "store_reviews_link": store_reviews_link,
            "compare_prices_link": compare_prices_link,
        })

    print(json.dumps(google_shopping_data, indent=2, ensure_ascii=False))
    # Also hand the data back so callers can use it, not just read the printout.
    return google_shopping_data
Portion of the output:
[
{
"title": "Minecraft Mini Mob 4-Piece Figure Mood Light Set | Battery Operated",
"product_link": "https://www.google.com/shopping/product/15256303704867209410?q=minecraft&hl=en&gl=us&prds=eto:1254008264419549404_0,pid:12683928239145059141,rsk:PC_5607977610062065270&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ8wII7xY",
"product_rating": "5.0",
"product_reviews": null,
"price": "$29.99",
"store": "Oriental Trading Company",
"thumbnail": "https://encrypted-tbn1.gstatic.com/shopping?q=tbn:ANd9GcS7Xddy5pF2gPiRFpF0E1YumatHuyBW3HYiltvZrimFoP_r3yAGWWMcYcnhaRrb7prHSAc93lWBEGQEGJ9NUCBkvQuvMCfxFXWXjY6oqrLebAmDtqcwpY6l&usqp=CAE",
"store_link": "https://www.google.com/url?url=https://www.orientaltrading.com/minecraft-mini-mob-4-piece-figure-mood-light-set-battery-operated-a2-14260956.fltr%3Fsku%3D14260956%26cm_mmc%3DGooglePLA-_-Free-_-Google-_-14260956%26BP%3DPS544&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQguUECPEW&usg=AOvVaw0KxuR61pE4aEt37xEXBI2O",
"delivery": "Delivery by Wed, Dec 7",
"store_rating": "4.7",
"store_reviews": "45",
"store_reviews_link": "https://www.google.com/url?url=https://www.google.com/shopping/ratings/account/metrics%3Fq%3Dorientaltrading.com%26c%3DUS%26v%3D19%26hl%3Den&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ9-wCCPkW&usg=AOvVaw2WL-Mo7EBJ9N8C4NlQEJ_n",
"compare_prices_link": "https://www.google.com/shopping/product/15256303704867209410/offers?q=minecraft&hl=en&gl=us&prds=eto:1254008264419549404_0,pid:12683928239145059141,rsk:PC_5607977610062065270&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ3q4ECPoW"
}, # other results
{
"title": "Minecraft Explorer Kit - Build Minecraft in The Real World",
"product_link": "https://www.google.com/shopping/product/10073223339448590299?q=minecraft&hl=en&gl=us&prds=eto:6849135307273759460_0,pid:14322876622065709117&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ8wIIzRg",
"product_rating": null,
"product_reviews": null,
"price": "$99.99",
"store": "Make-A-Fort",
"thumbnail": "https://encrypted-tbn2.gstatic.com/shopping?q=tbn:ANd9GcTJ55chkN9FYuwRQbupSWJRdSS70Y8XHKxQEUvOOuwHKbuBaSekHcWo9wndDFA-5_ZMlIdJFpWqMwpyMd9RDmUEiQ_DpaSaigwmPHBceO5rg885VEh_YbacBw&usqp=CAE",
"store_link": "https://www.google.com/url?url=https://www.makeafort.fun/shop/original-fort-kits/1mek%3Futm_source%3Dgoogle-shopping%26utm_medium%3Dcpc&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQguUECM4Y&usg=AOvVaw3ZzxgI8ILnCg0-Nd78JH7F",
"delivery": "Delivery by Thu, Dec 8",
"store_rating": null,
"store_reviews": null,
"store_reviews_link": null,
"compare_prices_link": "https://www.google.com/shopping/product/10073223339448590299/offers?q=minecraft&hl=en&gl=us&prds=eto:6849135307273759460_0,pid:14322876622065709117&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ3q4ECNEY"
}
]
Alternatively you can do it using Google Shopping Results API by SerpApi:
from serpapi import GoogleSearch
import requests, lxml, os, json
# Request parameters for the SerpApi Google Shopping search.
params = dict(
    q="minecraft",         # search query
    tbm="shop",            # shop results
    location="Dallas",     # location from where search comes from
    hl="en",               # language of the search
    gl="us",               # country of the search
    # https://docs.python.org/3/library/os.html#os.getenv
    api_key=os.getenv("API_KEY"),  # your serpapi api
)

# where data extraction happens on the SerpApi backend
search = GoogleSearch(params)

# JSON -> Python dict
results = search.get_dict()

# Keep only the shopping results and pretty-print them without escaping
# non-ASCII characters.
google_shopping_data = results["shopping_results"]
print(json.dumps(google_shopping_data, ensure_ascii=False, indent=2))
Part of the output:
[
# other results
{
"position": 80,
"title": "Minecraft Steve Vacuform Mask",
"link": "https://www.fun.com/minecraft-steve-vacuform-mask.html?mpid=191051&srsltid=AYJSbAfU8d_TRhvnvhvi9-U79_BB8bgh_dTHGkD75Dt6mq8nK0apj3hUOjY",
"product_link": "https://www.google.com/shopping/product/15914996745618368243?gl=us",
"product_id": "15914996745618368243",
"serpapi_product_api": "https://serpapi.com/search.json?device=desktop&engine=google_product&gl=us&google_domain=google.com&hl=en&location=Dallas&product_id=15914996745618368243",
"source": "Fun.com",
"price": "$12.99",
"extracted_price": 12.99,
"rating": 4.1,
"reviews": 40,
"extensions": [
"15% OFF"
],
"thumbnail": "https://encrypted-tbn0.gstatic.com/shopping?q=tbn:ANd9GcQe1LOeSKWgFvhVt_bct6rRohpAvl2023AqbnqE78dxwocrz7Sbre-tQ5s9M26_4q8bp86eRzI9PvfXwaBLmaESZlXwxH5HF9monqhr7jyChYqSLHWo9PcUFmU&usqp=CAE",
"tag": "15% OFF",
"delivery": "$4.99 delivery"
}
]
If you want to better understand what the code shown above does, there's a dedicated blog post on scraping the Google Shopping tab with Python.
Disclaimer: I work for SerpApi.
Upvotes: 4