Reputation: 77
I want to scrape the post description from a particular Instagram post page (e.g. https://www.instagram.com/p/BoFlrM7gwnK/). I have some code that fetches the recent posts from an Instagram profile page, but it outputs a lot of information I don't need, such as some of the page's scripts.
from random import choice
import json
from pprint import pprint
import requests
from bs4 import BeautifulSoup
# Fallback pool of User-Agent strings, used when the scraper is not given its
# own list. The literal is split with implicit string concatenation so that it
# stays a single valid string while keeping the lines short.
_user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/65.0.3325.181 Safari/537.36',
]
class InstagramScraper:
    """Read data from the ``window._sharedData`` JSON embedded in Instagram pages.

    Args:
        user_agents: Optional list of User-Agent strings to pick from at
            random for each request. When it is not a non-empty list, the
            module-level ``_user_agents`` pool is used instead.
        proxy: Optional proxy URL, applied to both ``http`` and ``https``.
        timeout: Timeout handed to ``requests.get`` so that a stalled
            connection cannot block forever. Pass ``None`` to disable it.
    """

    # Assignment prefix that precedes the JSON payload inside the script tag.
    _SHARED_DATA_PREFIX = 'window._sharedData ='

    def __init__(self, user_agents=None, proxy=None, timeout=10):
        self.user_agents = user_agents
        self.proxy = proxy
        self.timeout = timeout

    def __random_agent(self):
        """Return a random User-Agent from the instance list or the default pool."""
        if self.user_agents and isinstance(self.user_agents, list):
            return choice(self.user_agents)
        return choice(_user_agents)

    def __request_url(self, url):
        """Fetch ``url`` and return the response body as text.

        Raises:
            requests.HTTPError: If ``raise_for_status()`` rejects the
                response. The original error is chained as the cause and the
                response object is attached to the new exception.
            requests.RequestException: Any other request failure propagates
                unchanged, keeping its concrete type, message and traceback.
        """
        try:
            response = requests.get(
                url,
                headers={'User-Agent': self.__random_agent()},
                proxies={'http': self.proxy, 'https': self.proxy},
                timeout=self.timeout,
            )
            response.raise_for_status()
        except requests.HTTPError as exc:
            raise requests.HTTPError(
                'Received non 200 status code from Instagram',
                response=exc.response,
            ) from exc
        return response.text

    @staticmethod
    def _parse_shared_data(script_text):
        """Decode the JSON object assigned to ``window._sharedData``.

        Only the leading assignment and the trailing ``;`` are removed.
        Semicolons inside the JSON payload (for example in captions or HTML
        entities) are left untouched.

        Raises:
            ValueError: If the remaining text is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        raw = script_text.strip()
        prefix = InstagramScraper._SHARED_DATA_PREFIX
        if raw.startswith(prefix):
            raw = raw[len(prefix):]
        return json.loads(raw.strip().rstrip(';'))

    @staticmethod
    def extract_json_data(html):
        """Extract and decode the shared-data JSON from a page's HTML.

        The first ``<script>`` in ``<body>`` that contains the
        ``window._sharedData =`` assignment is used; if none does, the first
        script in the body is tried.

        Raises:
            ValueError: If the page has no ``<body>``, the body has no
                ``<script>`` tag, or the script content is not valid JSON.
        """
        soup = BeautifulSoup(html, 'html.parser')
        body = soup.find('body')
        scripts = body.find_all('script') if body is not None else []
        if not scripts:
            raise ValueError('No <script> tag found in the page body')
        prefix = InstagramScraper._SHARED_DATA_PREFIX
        script_tag = next(
            (tag for tag in scripts if prefix in tag.text),
            scripts[0],
        )
        return InstagramScraper._parse_shared_data(script_tag.text)

    @staticmethod
    def _flatten_metrics(user):
        """Reduce a user mapping to its truthy values, collapsing count dicts.

        ``edge_owner_to_timeline_media`` and falsy values are skipped. A dict
        that has a ``count`` key is replaced by that count; a dict without
        one is kept as-is instead of raising ``KeyError``.
        """
        results = {}
        for key, value in user.items():
            if key == 'edge_owner_to_timeline_media' or not value:
                continue
            if isinstance(value, dict) and 'count' in value:
                value = value['count']
            results[key] = value
        return results

    @staticmethod
    def _edge_nodes(edges):
        """Return the non-empty dict found under ``node`` in each edge."""
        nodes = (edge.get('node') for edge in edges)
        return [node for node in nodes if node and isinstance(node, dict)]

    def profile_page_metrics(self, profile_url):
        """Return a flat dict of the profile's user fields.

        Errors from the request, the JSON extraction, or a missing key in the
        expected structure propagate to the caller.
        """
        html = self.__request_url(profile_url)
        json_data = self.extract_json_data(html)
        user = json_data['entry_data']['ProfilePage'][0]['graphql']['user']
        return self._flatten_metrics(user)

    def profile_page_recent_posts(self, profile_url):
        """Return the list of post nodes embedded in the profile page.

        Errors from the request, the JSON extraction, or a missing key in the
        expected structure propagate to the caller.
        """
        html = self.__request_url(profile_url)
        json_data = self.extract_json_data(html)
        user = json_data['entry_data']['ProfilePage'][0]['graphql']['user']
        edges = user['edge_owner_to_timeline_media']['edges']
        return self._edge_nodes(edges)
# Example usage: fetch the recent posts embedded in a public profile page and
# pretty-print the raw post nodes.
scraper = InstagramScraper()
results = scraper.profile_page_recent_posts(
    'https://www.instagram.com/selenagomez/'
)
pprint(results)
Is there a way to get the information for just one specific post from its URL? Any help would be appreciated.
Upvotes: 2
Views: 1111
Reputation: 19164
Just duplicate the profile_page_recent_posts() method and adapt it to the post page's JSON structure, for example:
def get_single_posts(self, post_url):
    """Return the caption text and shortcode of a single post page.

    Args:
        post_url: URL of the post page to fetch.

    Returns:
        A one-element list holding a dict with the keys ``text`` and
        ``shortcode``. ``text`` is an empty string when the post has no
        caption edges.
    """
    response = self.__request_url(post_url)
    json_data = self.extract_json_data(response)
    # Resolve the shared path once instead of repeating it for every field.
    media = json_data['entry_data']['PostPage'][0]['graphql']['shortcode_media']
    caption_edges = media['edge_media_to_caption']['edges']
    # An empty edge list would make edges[0] raise IndexError.
    post_text = caption_edges[0]['node']['text'] if caption_edges else ''
    return [{'text': post_text, 'shortcode': media['shortcode']}]
Output (the method returns a list containing one dict):
[{'text': 'Mood lol....', 'shortcode': 'BoFlrM7gwnK'}]
To find the values you want, save json_data to a file and use a JSON viewer to pick out the right keys.
Upvotes: 2