sam_unrelax
sam_unrelax

Reputation: 51

Python Beautifulsoup4 unable to extract json data from the source document

I'm new to Python as well as the BeautifulSoup library. I'm writing a script to scrape some images from a web page, but the website stores the images as JSON inside its source code. There is one other problem: the page also stores the images of the related listings.

I need to get all the images that have the attribute "full_screen", but only from the first group in the source code, because I don't want the other listings' images; I only need the images of the listing on the current page.

My code :

import os
import requests
from bs4 import BeautifulSoup, Tag
import json


def getResponse(url):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request is retried indefinitely: every time the download fails,
    "retrying..." is printed and the request is issued again.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend.
    """
    while True:
        try:
            page = requests.get(url)
        except requests.RequestException:
            # Retry only on request failures. A bare ``except`` would also
            # swallow KeyboardInterrupt (making the loop impossible to stop
            # with Ctrl-C) and would hide genuine programming errors.
            print("retrying...")
            continue
        # Parsing happens outside the ``try`` so a parser problem surfaces
        # immediately instead of triggering an endless re-download.
        return BeautifulSoup(page.text, 'html.parser')


url = "https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html"

# Download and parse the listing page.
soup = getResponse(url)

# Collect every <script> element found in the document.
script = soup.find_all("script")

# Take the text of the eighth <script> element and try to decode all of it
# as JSON.
# NOTE(review): presumably that text is JavaScript that merely contains JSON,
# which would explain the decoding failure -- confirm against the live page.
eighth_script_text = script[7].text
val = json.loads(eighth_script_text)

print(val)

Source document example :

{"homepage":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/338\/248\/MODE\/6cf3ec\/7481797-75cceo.jpg","cts":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/668\/452\/MODE\/782bc1\/7481797-75cceo.jpg","small":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/260\/185\/MODE\/686c22\/7481797-75cceo.jpg","medium":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/668\/452\/MODE\/782bc1\/7481797-75cceo.jpg","thumb":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/95\/95\/MODE\/2f9a70\/7481797-75cceo.jpg","new_big":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/856\/550\/MODE\/7cbb67\/7481797-75cceo.jpg","new_small":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/416\/272\/MODE\/724ffe\/7481797-75cceo.jpg","full_screen":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/1312\/894\/MODE\/57d3b7\/7481797-75cceo.jpg"}},{"type":"property_image","id":"118819718","attributes":{"id":"118819718","path":"7481797-a0120o.jpg","number":2,"version":"537f08c43e0437e41778534772d1659a","is_default":false},"links":{"homepage":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/338\/248\/MODE\/a56d8f\/7481797-a0120o.jpg","cts":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/668\/452\/MODE\/094349\/7481797-a0120o.jpg","small":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/260\/185\/MODE\/b5637b\/7481797-a0120o.jpg","medium":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/668\/452\/MODE\/094349\/7481797-a0120o.jpg","thumb":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/95\/95\/MODE\/8d79d7\/7481797-a0120o.jpg","new_big":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/856\/550\/MODE\/30ee
0f\/7481797-a0120o.jpg","new_small":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/416\/272\/MODE\/ee84d8\/7481797-a0120o.jpg","full_screen":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/1312\/894\/MODE\/8afdf1\/7481797-a0120o.jpg"}},{"type":"property_image","id":"118819719","attributes":{"id":"118819719","path":"7481797-f337do.jpg","number":3,"version":"3523f4921a89e87ea7d4b752038e93ef","is_default":false},"links":

Error :

No JSON object could be decoded

Could anybody please help me get the first group of images with the "full_screen" key?

Pyfiddle link : https://pyfiddle.io/fiddle/8e039908-e713-43be-9513-ef4bab9dfb9d/?i=true

Upvotes: 0

Views: 119

Answers (2)

import requests
import re


def main(url):
    """Print the text assigned to ``location`` in the page's source.

    The page is downloaded and searched for ``location = ``; everything that
    follows, up to but not including the next ``;``, is printed.

    Args:
        url: Address of the page to download.

    Raises:
        ValueError: If the page contains no ``location = ...`` assignment.
    """
    r = requests.get(url)
    found = re.search(r'location = ([^;]+)', r.text)
    if found is None:
        # re.search returns None when nothing matches; fail with a clear
        # message instead of an AttributeError from calling .group on None.
        raise ValueError(
            "no 'location = ...' assignment found in {}".format(url))
    match = found.group(1)
    print(match)


# Run the extraction against the listing page used in the question.
main("https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html")

Output:

{
    id: "3037",
    payload: {"data":{"type":"location","id":"3037","attributes":{"name":"Botanica Tower","path":"1.50.3037","path_name":"Dubai, Dubai Marina","location_type":"TOWER","review_score":3.7142856000000002,"reviews_count":3,"image_token":"60040b9695bbc9b791d1c121e17a91366de3eba1","coordinates":{"lon":55.142415,"lat":25.085046999999999},"level":2,"abbreviation":"","url_slug":"dubai-marina-botanica-tower","children_count":0},"links":{"building_reviews":"\/en\/building-reviews\/dubai\/dubai-marina-botanica-tower.html","image_location":"https:\/\/www.propertyfinder.ae\/images\/pf_portal\/tower\/60040b9695bbc9b791d1c121e17a91366de3eba1\/desktop"}}}
  }

Regex Demo:

Check

Or, if you are targeting the data key, use the version below:

import requests
import re
import json


def main(url):
    """Print the ``payload`` of the page's ``location`` object as parsed JSON.

    The page is downloaded, the text following ``payload: `` inside the
    ``location = {...}`` assignment is captured, decoded with ``json.loads``
    and printed.

    Args:
        url: Address of the page to download.

    Raises:
        ValueError: If no ``location = {... payload: {...}}`` block is found.
            ``json.loads`` may additionally raise if the captured text is not
            valid JSON.
    """
    r = requests.get(url)
    # Skip lazily from "location = {" to the first "payload: ", then capture
    # greedily from the opening '{' to the last '}' on that same line
    # ('.' does not match newlines).
    found = re.search(r'location = {[\s\S]+?payload: ({.+})', r.text)
    if found is None:
        # re.search returns None when nothing matches; fail with a clear
        # message instead of an AttributeError from calling .group on None.
        raise ValueError(
            "no 'location' payload found in {}".format(url))
    goal = json.loads(found.group(1))
    print(goal)


# Run the extraction against the listing page used in the question.
main("https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html")

Output:

{
    "data": {
        "type": "location",
        "id": "3037",
        "attributes": {
            "name": "Botanica Tower",
            "path": "1.50.3037",
            "path_name": "Dubai, Dubai Marina",
            "location_type": "TOWER",
            "review_score": 3.7142856,
            "reviews_count": 3,
            "image_token": "60040b9695bbc9b791d1c121e17a91366de3eba1",        
            "coordinates": {
                "lon": 55.142415,
                "lat": 25.085047
            },
            "level": 2,
            "abbreviation": "",
            "url_slug": "dubai-marina-botanica-tower",
            "children_count": 0
        },
        "links": {
            "building_reviews": "/en/building-reviews/dubai/dubai-marina-botanica-tower.html",
            "image_location": "https://www.propertyfinder.ae/images/pf_portal/tower/60040b9695bbc9b791d1c121e17a91366de3eba1/desktop"
        }
    }
}

Upvotes: 1

chitown88
chitown88

Reputation: 28630

Well, the easiest thing would be to go through the API, but you can also do it through the <script> tags. Note that not all properties have the 'full_screen' attribute:

With <script> tag:

import os
import requests
from bs4 import BeautifulSoup, Tag
import json

def getResponse(url):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request is retried indefinitely: every time the download fails,
    "retrying..." is printed and the request is issued again.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend.
    """
    while True:
        try:
            page = requests.get(url)
        except requests.RequestException:
            # Retry only on request failures. A bare ``except`` would also
            # swallow KeyboardInterrupt (making the loop impossible to stop
            # with Ctrl-C) and would hide genuine programming errors.
            print("retrying...")
            continue
        # Parsing happens outside the ``try`` so a parser problem surfaces
        # immediately instead of triggering an endless re-download.
        return BeautifulSoup(page.text, 'html.parser')

url = "https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html"
soup = getResponse(url)
script = soup.find_all("script")

# Carve the JSON text out of the eighth <script> element in three steps:
# keep what follows the last "payload: " marker, cut that at the first ';',
# then drop the final '}' together with anything after it.
after_payload = script[7].text.split('payload: ')[-1]
up_to_semicolon = after_payload.split(';')[0]
jsonStr = up_to_semicolon.rsplit('}', 1)[0]
val = json.loads(jsonStr)

# Print the 'full_screen' link of every included entry that has one.
properties = val['included']
for prop in properties:
    if 'links' not in prop:
        continue
    links = prop['links']
    if 'full_screen' not in links:
        continue
    print(links['full_screen'])

With API:

import requests

url = 'https://www.propertyfinder.ae/en/api/search'

# Query-string parameters sent with the search request.
# NOTE(review): 'er[category_id]' looks like a truncated 'filter[category_id]'
# -- confirm against the API before relying on it.
payload = {
    'er[category_id]': '2',
    'filter[furnished]': '0',
    'filter[locations_ids][]': '3037',
    'filter[price_type]': 'y',
    'filter[property_type_id]': '1',
    'page[limit]': '9999',
    'page[number]': '1',
    'sort': 'mr',
    'include': 'properties,properties.property_type,properties.property_images,properties.location_tree,properties.agent,properties.broker,smart_ads,smart_ads.agent,smart_ads.broker,smart_ads.property_type,smart_ads.property_images,smart_ads.location_tree,direct_from_developer,direct_from_developer.property_type,direct_from_developer.property_images,direct_from_developer.location_tree,direct_from_developer.agent,direct_from_developer.broker,cts,cts.agent,cts.broker,cts.property_type,cts.property_images,cts.location_tree,similar_properties,similar_properties.agent,similar_properties.broker,similar_properties.property_type,similar_properties.property_images,similar_properties.location_tree,agent_smart_ads,agent_smart_ads.broker,agent_smart_ads.languages,agent_properties_smart_ads,agent_properties_smart_ads.agent,agent_properties_smart_ads.broker,agent_properties_smart_ads.location_tree,agent_properties_smart_ads.property_type',
}

# Issue the GET request and decode the JSON body of the response.
response = requests.get(url, params=payload)
val = response.json()

# Print the 'full_screen' link of every included entry that has one.
properties = val['included']
for prop in properties:
    if 'links' not in prop:
        continue
    links = prop['links']
    if 'full_screen' not in links:
        continue
    print(links['full_screen'])

Output:

https://www.propertyfinder.ae/property/eaefddc999df314f589016fbb9df0c1e/1312/894/MODE/62d644/7468078-5a70co.jpg
https://www.propertyfinder.ae/property/9dae7d23cc50000baa36f55cde632fec/1312/894/MODE/b59dc8/7468078-d15a9o.jpg
https://www.propertyfinder.ae/property/2aaaa29b083099436f8dea3d018ba0f0/1312/894/MODE/94e390/7468078-84666o.jpg
https://www.propertyfinder.ae/property/a6ab186660629c4b6d494c2e66bd2b71/1312/894/MODE/97a052/7468078-eb879o.jpg
....

Upvotes: 0

Related Questions