Amit Bhandari
Amit Bhandari

Reputation: 3134

How to convert data in a particular HTML tag to a dictionary

I am trying to scrape the Google Images page using the following piece of code.

# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup

# Google Images search URL (tbm=isch) for the query "batman wallpaper hd".
site= "https://www.google.co.in/search?q=batman+wallpaper+hd&source=lnms&tbm=isch"

# Browser-like headers sent along with the request.
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}


# Build the request with the custom headers (urllib2 exists only in Python 2).
req = urllib2.Request(site,headers=hdr)

# Fetch the page; the response object is handed directly to BeautifulSoup below.
page = urllib2.urlopen(req)

soup = BeautifulSoup(page, 'html.parser')

# Print each direct child of the first <div> whose data-ri attribute is "16".
# NOTE(review): soup.find() returns None when no such <div> exists, in which
# case accessing .children raises AttributeError.
for child in soup.find("div", {"data-ri":"16"}).children:
    print child

And I am getting this output:

<a class="rg_l" href="#" jsaction="fire.ivg_o;mouseover:str.hmov;mouseout:str.hmou" jsname="hSRGPd" rel="noopener" style="background:rgb(11,18,24)"><img alt="Image result for batman wallpaper hd" class="rg_ic rg_i" jsaction="load:str.tbn" name="NCsi46a6Dm2_HM:" onload="typeof google==='object'&amp;&amp;google.aft&amp;&amp;google.aft(this)"/><div class="_aOd rg_ilm"><div class="rg_ilmbg"><span class="rg_ilmn"> 2880 × 1800 - wallpapertag.com </span></div></div></a>
<div class="rg_meta notranslate" jsname="ik8THc">{"id":"NCsi46a6Dm2_HM:","isu":"wallpapertag.com","itg":0,"ity":"jpg","oh":1800,"ou":"https://wallpapertag.com/wallpaper/full/b/8/5/84668-vertical-batman-wallpaper-hd-2880x1800-full-hd.jpg","ow":2880,"pt":"Batman wallpaper HD ·① Download free High Resolution wallpapers ...","rid":"vLHnAF3_eWR-KM","rmt":0,"rt":0,"ru":"https://wallpapertag.com/batman-wallpaper-hd","s":"2880x1800 Batman Wallpapers - HD Wallpapers Inn","st":"Wallpapertag.com","th":177,"tu":"https://encrypted-tbn0.gstatic.com/images?q\u003dtbn:ANd9GcSAIP3lqGZ0a2wkgqIecGZtCEMKAx8Qk5lp89FaV6ovmygejjf1YA","tw":284}</div>

I want to read the value of the "ou" key, which is the link to the wallpaper. Can someone please help me parse that link into a variable? I am a beginner in Python. Thanks in advance.

Upvotes: 3

Views: 4239

Answers (2)

Dmitriy Zub
Dmitriy Zub

Reputation: 1724

To scrape both the thumbnails and the original-size images, you need to extract the data from the <script> tags using regex.

Code and full example in the online IDE that scrapes and downloads Google Images:

import requests, lxml, re, json
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with the search request.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# Query-string parameters for the search request.
params = {
    "q": "batman wallpaper",  # query
    "tbm": "isch",            # search type: images
    "hl": "en",               # language
    "ijn": "0",               # batch of 100 images
}

# Without a timeout, requests waits indefinitely if the server never answers.
html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
# Fail loudly on an HTTP error status instead of silently parsing an error page.
html.raise_for_status()
soup = BeautifulSoup(html.text, 'lxml')

# Matches one thumbnail entry: ["https://encrypted-tbn0.gstatic.com/images?...",<int>,<int>]
# https://regex101.com/r/NnRg27/1
_THUMBNAIL_PATTERN = re.compile(
    r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]')


def _decode_escaped_url(raw_url):
    """Return *raw_url* with its backslash escapes decoded twice.

    The URLs are extracted from the ``str()`` of a list, whose repr doubles
    every backslash, so one ``unicode-escape`` pass still leaves escapes such
    as ``\\u003d`` in place; the second pass resolves them.
    https://stackoverflow.com/a/4004439/15164646 (comment by Frédéric Hamidi)
    """
    for _ in range(2):
        # 'backslashreplace' re-escapes any non-ASCII character instead of
        # raising UnicodeEncodeError; for pure-ASCII text this is identical
        # to bytes(raw_url, 'ascii').
        raw_url = raw_url.encode('ascii', 'backslashreplace').decode('unicode-escape')
    return raw_url


def get_images_data():
    """Print metadata, thumbnail URLs and full-resolution URLs of the results.

    Reads the module-level ``soup`` object (the parsed search page). Output
    goes to stdout in three sections, each in page order; nothing is returned.
    """
    print('\nGoogle Images Metadata:')
    for image_card in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
        anchor = image_card.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')
        source_tag = image_card.select_one('.fxgdke')
        if anchor is None or source_tag is None:
            # select_one() returns None when the expected markup is absent;
            # skip such a card rather than crash on it.
            continue
        title = anchor['title']
        source = source_tag.text
        link = anchor['href']
        print(f'{title}\n{source}\n{link}\n')

    all_script_tags = soup.select('script')

    # Payload passed to every AF_initDataCallback(...) call, concatenated.
    # https://regex101.com/r/48UZhY/4
    # The payload is searched as plain text; it is not parsed as JSON.
    matched_images_data = ''.join(
        re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(
        r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data)

    # The patterns below run on the str() of the list, not on its elements:
    # that representation is what the original expressions were written
    # against, so it is kept as is.
    grid_state_text = str(matched_google_image_data)

    # findall() already returns a list: an empty result stays empty instead
    # of becoming [''] and printing a blank line.
    matched_google_images_thumbnails = _THUMBNAIL_PATTERN.findall(grid_state_text)

    print('Google Image Thumbnails:')  # in order
    for thumbnail in matched_google_images_thumbnails:
        print(_decode_escaped_url(thumbnail))

    # Remove the thumbnails already matched so that only the full-resolution
    # entries remain for the next pattern.
    removed_matched_google_images_thumbnails = _THUMBNAIL_PATTERN.sub('', grid_state_text)

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(
        r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
        removed_matched_google_images_thumbnails)

    print('\nFull Resolution Images:')  # in order
    for full_res_image in matched_google_full_resolution_images:
        print(_decode_escaped_url(full_res_image))

-----------------
'''
Google Images Metadata:
Batman's Throne by Mizuri Official on Artstation | Batman artwork, Batman  wallpaper, Batman
pinterest.com
https://www.pinterest.com/pin/682436149771063635/
..

Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcROCc-723eFmX7pigzuvsbIa9sCRIorG1TZMOa9KjFlDhgjrVd1B2AsW8dbAvU2MCjpAAM&usqp=CAU
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRi4CcR3vRBPGg_4JsE4hb_rni5mzD1zKkwztbrxcZrhBrSUH-cxLdwwI08MB0yDFa1z8E&usqp=CAU
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ1iAgOw8uW2jmmLtZQdnWYT6sP5h1-QR4SyVZxLxNZOELr90NbN5X8ChgVH3P9Unc3OD4&usqp=CAU
...

Full Resolution Images:
https://i.pinimg.com/originals/cf/f8/20/cff820cb3ea56f6b09a8fea0c33f8cb2.jpg
https://cdn.wallpapersafari.com/87/73/rbEyqd.jpg
https://wallpaperaccess.com/full/197277.jpg
...
'''

Alternatively, you can achieve the same thing by using Google Images API from SerpApi. It's a paid API with a free plan.

The difference in your case is that you only need to iterate over structured JSON and pick out the data you want, rather than building everything from scratch and maintaining it over time.

import os, json # json for pretty output
from serpapi import GoogleSearch

def get_google_images():
    """Query SerpApi for Google Images results and pretty-print them as JSON.

    The API key is read from the ``API_KEY`` environment variable. The
    ``images_results`` entry of the response is printed to stdout; nothing
    is returned.
    """
    api_response = GoogleSearch({
        "api_key": os.getenv("API_KEY"),
        "engine": "google",
        "q": "batman wallpaper",
        "tbm": "isch",
    }).get_dict()

    image_entries = api_response['images_results']
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    pretty_json = json.dumps(image_entries, indent=2, ensure_ascii=False)
    print(pretty_json)

------------
'''
[
  {
    "position": 90,
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ2sBG4WzQCO_z4z3XyJsjZiT4gwhr46mT00Q&usqp=CAU",
    "source": "wallpapers-clan.com",
    "title": "Batman Wallpaper for iPhone & Android - Wallpapers Clan",
    "link": "https://wallpapers-clan.com/wallpapers/batman-dark-purple/",
    "original": "https://wallpapers-clan.com/wp-content/uploads/2021/05/batman-dark-purple-wallpaper-scaled.jpg",
    "is_product": false
  }
]
'''

P.S - I wrote a more in-depth blog post about how to scrape Google Images.

Disclaimer, I work for SerpApi.

Upvotes: 0

Octavio Durana
Octavio Durana

Reputation: 96

You could use a JSON parser. The code below prints only the value of the "ou" key:

from bs4 import BeautifulSoup
import json

# Sample markup: an outer <div> wrapping the rg_meta <div> whose text is JSON.
html = '<div><div class="rg_meta notranslate" jsname="ik8THc">{"id":"NCsi46a6Dm2_HM:","isu":"wallpapertag.com","itg":0,"ity":"jpg","oh":1800,"ou":"https://wallpapertag.com/wallpaper/full/b/8/5/84668-vertical-batman-wallpaper-hd-2880x1800-full-hd.jpg","ow":2880,"pt":"Batman wallpaper HD ·① Download free High Resolution wallpapers ...","rid":"vLHnAF3_eWR-KM","rmt":0,"rt":0,"ru":"https://wallpapertag.com/batman-wallpaper-hd","s":"2880x1800 Batman Wallpapers - HD Wallpapers Inn","st":"Wallpapertag.com","th":177,"tu":"https://encrypted-tbn0.gstatic.com/images?q\u003dtbn:ANd9GcSAIP3lqGZ0a2wkgqIecGZtCEMKAx8Qk5lp89FaV6ovmygejjf1YA","tw":284}</div></div>'

soup = BeautifulSoup(html, "html.parser")

# Walk the direct children of the first <div> and decode each nested <div>.
outer_div = soup.find("div")
for node in outer_div.children:
    if node.name != 'div':
        continue
    # The nested <div>'s text is a JSON object; "ou" holds the image URL.
    data_content = json.loads(node.text)
    print(data_content["ou"])

Upvotes: 2

Related Questions