Taras
Taras

Reputation: 77

BeautifulSoup Instagram post html scraping

I want to scrape the post description from a particular Instagram post page (e.g. https://www.instagram.com/p/BoFlrM7gwnK/). I have some code that gets the recent posts from an Instagram profile page, but it outputs a lot of information I don't need, such as some of the page's scripts.

from random import choice
import json
from pprint import pprint

import requests
from bs4 import BeautifulSoup

# Fallback User-Agent pool used when the scraper is not given its own list.
# The single entry is written as two adjacent literals (implicit
# concatenation) so the long string stays on readable lines.
_user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
]


class InstagramScraper:
    """Read profile data from the JSON blob embedded in an Instagram page.

    The page HTML is fetched with ``requests``; the first ``<script>`` tag
    inside ``<body>`` is expected to hold a ``window._sharedData = {...};``
    assignment, which is decoded with ``json.loads``.
    """

    # JavaScript assignment that precedes the JSON payload in the script tag.
    _SHARED_DATA_PREFIX = 'window._sharedData ='

    def __init__(self, user_agents=None, proxy=None):
        """Store the optional User-Agent pool and proxy URL.

        Args:
            user_agents: Optional list of User-Agent strings to pick from.
            proxy: Optional proxy URL used for both http and https requests.
        """
        self.user_agents = user_agents
        self.proxy = proxy

    def __random_agent(self):
        """Return a random User-Agent from the instance list or the module default."""
        if self.user_agents and isinstance(self.user_agents, list):
            return choice(self.user_agents)
        return choice(_user_agents)

    def __request_url(self, url):
        """GET *url* and return the response body as text.

        Raises:
            requests.HTTPError: If the response status is an error status.
                The original error is chained and its response attached.
            requests.RequestException: Any other request failure, propagated
                unchanged so its concrete type and message are kept.
        """
        headers = {'User-Agent': self.__random_agent()}
        proxies = {'http': self.proxy, 'https': self.proxy}
        try:
            response = requests.get(url, headers=headers, proxies=proxies)
            response.raise_for_status()
        except requests.HTTPError as err:
            raise requests.HTTPError(
                'Received non 200 status code from Instagram',
                response=err.response,
            ) from err
        return response.text

    @staticmethod
    def _parse_shared_data(raw_script):
        """Decode the JSON object assigned to ``window._sharedData``.

        Only the leading assignment and the trailing statement terminator
        are removed, so semicolons inside JSON string values survive.
        """
        payload = raw_script.strip()
        prefix = InstagramScraper._SHARED_DATA_PREFIX
        if payload.startswith(prefix):
            payload = payload[len(prefix):]
        payload = payload.strip().rstrip(';')
        return json.loads(payload)

    @staticmethod
    def extract_json_data(html):
        """Return the decoded JSON from the first ``<script>`` in ``<body>``.

        Args:
            html: Page markup as a string.

        Returns:
            The decoded JSON value (a dict for the pages handled here).
        """
        soup = BeautifulSoup(html, 'html.parser')
        body = soup.find('body')
        script_tag = body.find('script')
        return InstagramScraper._parse_shared_data(script_tag.text)

    def _profile_user(self, profile_url):
        """Fetch *profile_url* and return its ``graphql.user`` dictionary."""
        html = self.__request_url(profile_url)
        json_data = self.extract_json_data(html)
        return json_data['entry_data']['ProfilePage'][0]['graphql']['user']

    def profile_page_metrics(self, profile_url):
        """Return the truthy top-level fields of a profile as a flat dict.

        The ``edge_owner_to_timeline_media`` key is skipped. Truthy dict
        values are replaced by their ``'count'`` entry; other truthy values
        are copied as-is; falsy values are omitted.
        """
        results = {}
        for key, value in self._profile_user(profile_url).items():
            if key == 'edge_owner_to_timeline_media' or not value:
                continue
            results[key] = value['count'] if isinstance(value, dict) else value
        return results

    def profile_page_recent_posts(self, profile_url):
        """Return the ``node`` dicts of the posts listed on a profile page.

        Edges whose ``node`` is missing, empty, or not a dict are dropped.
        """
        user = self._profile_user(profile_url)
        edges = user['edge_owner_to_timeline_media']['edges']
        nodes = (edge.get('node') for edge in edges)
        return [node for node in nodes if node and isinstance(node, dict)]


# Example usage: fetch the posts listed on a public profile page and
# pretty-print the raw post nodes.
k = InstagramScraper()

results = k.profile_page_recent_posts(
    'https://www.instagram.com/selenagomez/',
)
pprint(results)

Is there a way to get just the info for a specific post from its URL? Any help would be appreciated.

Upvotes: 2

Views: 1111

Answers (1)

ewwink
ewwink

Reputation: 19164

Just duplicate the profile_page_recent_posts() method and adapt it to the post page's JSON, for example:

def get_single_posts(self, post_url):
    """Return the caption text and shortcode of a single Instagram post.

    Intended to be added as a method of ``InstagramScraper``: it relies on
    that class's ``__request_url`` and ``extract_json_data``.

    Args:
        post_url: URL of the post page to fetch.

    Returns:
        A one-element list holding a dict with the keys ``'text'`` and
        ``'shortcode'``. ``'text'`` is an empty string when the caption
        edge list is empty.
    """
    response = self.__request_url(post_url)
    json_data = self.extract_json_data(response)
    media = json_data['entry_data']['PostPage'][0]['graphql']['shortcode_media']

    # Guard the [0] lookup: an empty edge list (presumably a post without a
    # caption -- verify against real payloads) would raise IndexError.
    caption_edges = media['edge_media_to_caption']['edges']
    post_text = caption_edges[0]['node']['text'] if caption_edges else ''

    return [{'text': post_text, 'shortcode': media['shortcode']}]

Output:

[{'text': 'Mood lol....', 'shortcode': 'BoFlrM7gwnK'}]

To find the value you want, save json_data to a file and use a JSON viewer to select the right keys.

Upvotes: 2

Related Questions