Reputation: 51
I'm new to Python as well as the BeautifulSoup library. I'm writing a script to scrape some images from a webpage, but the website stores the images as JSON inside its source code. There is one other problem: the page also stores the images of the related listings.
I need to get all the images that have the "full_screen" attribute, but only from the first group in the source code, because I don't want the other listings' images; I only need the images of the current page's listing.
My code :
import os
import requests
from bs4 import BeautifulSoup, Tag
import json
def getResponse(url):
    """Fetch ``url`` and return its HTML parsed into a ``BeautifulSoup`` tree.

    The request is retried indefinitely until it succeeds; ``"retrying..."``
    is printed after every failed attempt.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    while True:
        try:
            page = requests.get(url)
        except requests.exceptions.RequestException:
            # Only request failures are worth retrying. A bare ``except:``
            # would also trap KeyboardInterrupt/SystemExit, which makes this
            # endless loop impossible to stop with Ctrl-C.
            print("retrying...")
        else:
            # Parsing happens outside the ``try`` so that a parser error
            # surfaces immediately instead of being retried forever.
            return BeautifulSoup(page.text, 'html.parser')
# Listing page to scrape.
url = "https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html"
soup = getResponse(url)

# Collect every <script> element and pick the eighth one by position.
script = soup.find_all("script")
eighthScript = script[7]

# The element's raw text is handed to the JSON parser unchanged.
val = json.loads(eighthScript.text)
print(val)
Source document example :
{"homepage":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/338\/248\/MODE\/6cf3ec\/7481797-75cceo.jpg","cts":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/668\/452\/MODE\/782bc1\/7481797-75cceo.jpg","small":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/260\/185\/MODE\/686c22\/7481797-75cceo.jpg","medium":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/668\/452\/MODE\/782bc1\/7481797-75cceo.jpg","thumb":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/95\/95\/MODE\/2f9a70\/7481797-75cceo.jpg","new_big":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/856\/550\/MODE\/7cbb67\/7481797-75cceo.jpg","new_small":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/416\/272\/MODE\/724ffe\/7481797-75cceo.jpg","full_screen":"https:\/\/www.propertyfinder.ae\/property\/2c86eb83cbe5c9588b9347ef0c0f50b9\/1312\/894\/MODE\/57d3b7\/7481797-75cceo.jpg"}},{"type":"property_image","id":"118819718","attributes":{"id":"118819718","path":"7481797-a0120o.jpg","number":2,"version":"537f08c43e0437e41778534772d1659a","is_default":false},"links":{"homepage":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/338\/248\/MODE\/a56d8f\/7481797-a0120o.jpg","cts":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/668\/452\/MODE\/094349\/7481797-a0120o.jpg","small":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/260\/185\/MODE\/b5637b\/7481797-a0120o.jpg","medium":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/668\/452\/MODE\/094349\/7481797-a0120o.jpg","thumb":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/95\/95\/MODE\/8d79d7\/7481797-a0120o.jpg","new_big":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/856\/550\/MODE\/30ee
0f\/7481797-a0120o.jpg","new_small":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/416\/272\/MODE\/ee84d8\/7481797-a0120o.jpg","full_screen":"https:\/\/www.propertyfinder.ae\/property\/537f08c43e0437e41778534772d1659a\/1312\/894\/MODE\/8afdf1\/7481797-a0120o.jpg"}},{"type":"property_image","id":"118819719","attributes":{"id":"118819719","path":"7481797-f337do.jpg","number":3,"version":"3523f4921a89e87ea7d4b752038e93ef","is_default":false},"links":
Error :
No JSON object could be decoded
Could anybody please help me get the first group of images with the "full_screen" key?
Pyfiddle link : https://pyfiddle.io/fiddle/8e039908-e713-43be-9513-ef4bab9dfb9d/?i=true
Upvotes: 0
Views: 119
Reputation: 11515
import requests
import re
def main(url):
    """Download ``url`` and print the value assigned to ``location``.

    The page source is searched for the text ``location = `` and everything
    that follows it, up to but not including the next ``;``, is printed.

    Args:
        url: Address of the page to download.

    Raises:
        ValueError: If the page source contains no ``location = `` assignment.
    """
    r = requests.get(url)
    match = re.search(r'location = ([^;]+)', r.text)
    if match is None:
        # re.search returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on ``None.group``.
        raise ValueError("no 'location = ...' assignment found in the page source")
    print(match.group(1))


main("https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html")
Output:
{
id: "3037",
payload: {"data":{"type":"location","id":"3037","attributes":{"name":"Botanica Tower","path":"1.50.3037","path_name":"Dubai, Dubai Marina","location_type":"TOWER","review_score":3.7142856000000002,"reviews_count":3,"image_token":"60040b9695bbc9b791d1c121e17a91366de3eba1","coordinates":{"lon":55.142415,"lat":25.085046999999999},"level":2,"abbreviation":"","url_slug":"dubai-marina-botanica-tower","children_count":0},"links":{"building_reviews":"\/en\/building-reviews\/dubai\/dubai-marina-botanica-tower.html","image_location":"https:\/\/www.propertyfinder.ae\/images\/pf_portal\/tower\/60040b9695bbc9b791d1c121e17a91366de3eba1\/desktop"}}}
}
Regex Demo:
Or, if you are targeting the `data` key, use the version below:
import requests
import re
import json
def main(url):
    """Download ``url``, decode the ``payload`` of ``location`` and print it.

    The page source is searched for ``location = {`` followed (lazily) by
    ``payload: `` and a ``{...}`` object; that object is parsed as JSON and
    the resulting Python object is printed.

    Args:
        url: Address of the page to download.

    Raises:
        ValueError: If no ``location`` payload is found in the page source
            (``json.JSONDecodeError``, a ``ValueError`` subclass, is raised
            when the captured text is not valid JSON).
    """
    r = requests.get(url)
    # ``.`` does not match a newline here, so the captured object is the text
    # from the first '{' after "payload: " to the last '}' on that same line.
    match = re.search(r'location = {[\s\S]+?payload: ({.+})', r.text)
    if match is None:
        # re.search returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on ``None.group``.
        raise ValueError("no 'location' payload found in the page source")
    goal = json.loads(match.group(1))
    print(goal)


main("https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html")
Output:
{
"data": {
"type": "location",
"id": "3037",
"attributes": {
"name": "Botanica Tower",
"path": "1.50.3037",
"path_name": "Dubai, Dubai Marina",
"location_type": "TOWER",
"review_score": 3.7142856,
"reviews_count": 3,
"image_token": "60040b9695bbc9b791d1c121e17a91366de3eba1",
"coordinates": {
"lon": 55.142415,
"lat": 25.085047
},
"level": 2,
"abbreviation": "",
"url_slug": "dubai-marina-botanica-tower",
"children_count": 0
},
"links": {
"building_reviews": "/en/building-reviews/dubai/dubai-marina-botanica-tower.html",
"image_location": "https://www.propertyfinder.ae/images/pf_portal/tower/60040b9695bbc9b791d1c121e17a91366de3eba1/desktop"
}
}
}
Upvotes: 1
Reputation: 28630
Well, the easiest thing would be to go through the API, but you can also do it through the `<script>` tags. Note that not all properties have the 'full_screen' attribute.
With the `<script>` tag:
import os
import requests
from bs4 import BeautifulSoup, Tag
import json
def getResponse(url):
    """Fetch ``url`` and return its HTML parsed into a ``BeautifulSoup`` tree.

    The request is retried indefinitely until it succeeds; ``"retrying..."``
    is printed after every failed attempt.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    while True:
        try:
            page = requests.get(url)
        except requests.exceptions.RequestException:
            # Only request failures are worth retrying. A bare ``except:``
            # would also trap KeyboardInterrupt/SystemExit, which makes this
            # endless loop impossible to stop with Ctrl-C.
            print("retrying...")
        else:
            # Parsing happens outside the ``try`` so that a parser error
            # surfaces immediately instead of being retried forever.
            return BeautifulSoup(page.text, 'html.parser')
# Listing page to scrape.
url = "https://www.propertyfinder.ae/en/rent/apartment-for-rent-dubai-dubai-marina-botanica-tower-7469382.html"
soup = getResponse(url)
script = soup.find_all("script")

# Carve the JSON out of the eighth <script> element in three steps:
#   1. keep what follows the last "payload: " marker,
#   2. cut at the first ';',
#   3. drop the final '}' and anything after it.
afterPayload = script[7].text.split('payload: ')[-1]
beforeSemicolon = afterPayload.split(';')[0]
jsonStr = beforeSemicolon.rsplit('}', 1)[0]
val = json.loads(jsonStr)

# Print the 'full_screen' link of every included entry that has one.
properties = val['included']
for prop in properties:
    if 'links' not in prop:
        continue
    links = prop['links']
    if 'full_screen' in links:
        print(links['full_screen'])
With API:
import requests
# Search endpoint queried instead of scraping the HTML page.
url = 'https://www.propertyfinder.ae/en/api/search'

# Query-string parameters sent with the request.
# NOTE(review): 'er[category_id]' looks like a truncated 'filter[category_id]'
# (every sibling key is 'filter[...]') -- confirm against the API before use.
payload = {
    'er[category_id]': '2',
    'filter[furnished]': '0',
    'filter[locations_ids][]': '3037',
    'filter[price_type]': 'y',
    'filter[property_type_id]': '1',
    'page[limit]': '9999',
    'page[number]': '1',
    'sort': 'mr',
    'include': 'properties,properties.property_type,properties.property_images,properties.location_tree,properties.agent,properties.broker,smart_ads,smart_ads.agent,smart_ads.broker,smart_ads.property_type,smart_ads.property_images,smart_ads.location_tree,direct_from_developer,direct_from_developer.property_type,direct_from_developer.property_images,direct_from_developer.location_tree,direct_from_developer.agent,direct_from_developer.broker,cts,cts.agent,cts.broker,cts.property_type,cts.property_images,cts.location_tree,similar_properties,similar_properties.agent,similar_properties.broker,similar_properties.property_type,similar_properties.property_images,similar_properties.location_tree,agent_smart_ads,agent_smart_ads.broker,agent_smart_ads.languages,agent_properties_smart_ads,agent_properties_smart_ads.agent,agent_properties_smart_ads.broker,agent_properties_smart_ads.location_tree,agent_properties_smart_ads.property_type',
}

# Issue the GET request and decode the JSON body.
response = requests.get(url, params=payload)
val = response.json()

# Print the 'full_screen' link of every included entry that has one.
properties = val['included']
for prop in properties:
    if 'links' not in prop:
        continue
    links = prop['links']
    if 'full_screen' in links:
        print(links['full_screen'])
Output:
https://www.propertyfinder.ae/property/eaefddc999df314f589016fbb9df0c1e/1312/894/MODE/62d644/7468078-5a70co.jpg
https://www.propertyfinder.ae/property/9dae7d23cc50000baa36f55cde632fec/1312/894/MODE/b59dc8/7468078-d15a9o.jpg
https://www.propertyfinder.ae/property/2aaaa29b083099436f8dea3d018ba0f0/1312/894/MODE/94e390/7468078-84666o.jpg
https://www.propertyfinder.ae/property/a6ab186660629c4b6d494c2e66bd2b71/1312/894/MODE/97a052/7468078-eb879o.jpg
....
Upvotes: 0