Andy Peterson
Andy Peterson

Reputation: 33

In bs4 get text between elements

HTML Source:

<script type="text/javascript">window._sharedData = {"activity_counts":null,"config":{"csrf_token":"P8DvqEB5AxkRuWyoNWhrZ3Bi2scbrVm9","viewer":null},"supports_es6":true,"country_code":"NL","language_code":"en","locale":"en_US","entry_data":{"ProfilePage":[{"logging_page_id":"profilePage_4469324900","show_suggested_profiles":false,"graphql":{"user":{"biography":"","blocked_by_viewer":false,"country_block":false,"external_url":null,"external_url_linkshimmed":null,"edge_followed_by":{"count":143},"followed_by_viewer":false,"edge_follow":{"count":43},"follows_viewer":false,"full_name":"\u0627\u062c\u0627\u0631\u0647 \u0648\u06cc\u0644\u0627 \u062f\u0631 \u06af\u0631\u062f\u0646\u0647 ..................  ;</script>

<script type="text/javascript">
  (function() {
    var docElement = document.documentElement;
    var classRE = new RegExp('(^|\\s)no-js(\\s|$)');
    var className = docElement.className;
    docElement.className = className.replace(classRE, '$1js$2');

Now I want the output to show just everything after window._sharedData =


{"activity_counts":null,"config":{"csrf_token":"P8DvqEB5AxkRuWyoNWhrZ3Bi2scbrVm9","viewer":null},"supports_es6":true,"count .......

Here is my code:

# Address of the page to scrape (left blank here).
url = ''
# Fetch the page through a requests session.
s = requests.session()
response = s.get(url)
# Parse the HTML, then collect every <script> element into `soup`.
parsed = bs(response.text, 'html.parser')
soup = parsed.findAll('script')

Upvotes: 2

Views: 674

Answers (1)

Jon Clements
Jon Clements

Reputation: 142166

Use bs4 to find the first script tag whose text starts with what you're looking for, then take that tag's text content and split off the leading prefix, e.g.:

import json
import requests
import bs4

# Prefix that introduces the JSON payload inside the inline script.
key = 'window._sharedData = '

# Download the page and parse it.
soup = bs4.BeautifulSoup(requests.get(url).text, 'html.parser')
# First <script> whose text starts with the prefix; the `L and` guard
# rejects falsy values (e.g. None) before startswith() is called.
script_tag = soup.find('script', text=lambda L: L and L.startswith(key))
if script_tag:
    # raw string of data in script: everything after the prefix
    text_data = script_tag.text.partition(key)[2]
    # remove the trailing ; (and any surrounding whitespace) and you've
    # json data... interpret as such
    data = json.loads(text_data.rstrip('; \t\r\n'))
else:
    # didn't find a match - up to you what to do here...
    # `data` is set to None so later code can test for it instead of
    # hitting a NameError.
    data = None

If you managed to find the relevant script tag, then data will be a Python dictionary of:

{'activity_counts': None,
 'config': {'csrf_token': '1Srrhc6GQmmC19TdM3nLFsDOORtJMpCj', 'viewer': None},
 'supports_es6': False,
 'country_code': 'GB',
 'language_code': 'en',
 'locale': 'en_US',
 'entry_data': {'ProfilePage': [{'logging_page_id': 'profilePage_4469324900',
    'show_suggested_profiles': False,
    'graphql': {'user': {'biography': '',
      'blocked_by_viewer': False,
      'country_block': False,
      'external_url': None,
      'external_url_linkshimmed': None,
      'edge_followed_by': {'count': 143},
      'followed_by_viewer': False,
      'edge_follow': {'count': 43},
      'follows_viewer': False,
      'full_name': 'اجاره ویلا در گردنه حیران',
      'has_channel': False,
      'has_blocked_viewer': False,
      'highlight_reel_count': 0,
      'has_requested_viewer': False,
      'id': '4469324900',
      'is_business_account': False,
      'is_private': False,
      'is_verified': False,
      'edge_mutual_followed_by': {'count': 0, 'edges': []},
      'profile_pic_url': '',
      'profile_pic_url_hd': '',
      'requested_by_viewer': False,
      'username': 'mehran_eblaghi',
      'connected_fb_page': None,
      'edge_owner_to_timeline_media': {'count': 2,
       'page_info': {'has_next_page': False,
        'end_cursor': 'AQBnocogeHdSL1DSSxRdiYR4D1RguUeEj5Ap1do1KIy4U_NutZIe9ZCyRpDExD4TL9k'},
       'edges': [{'node': {'__typename': 'GraphImage',
          'id': '1429655015362664538',
          'edge_media_to_caption': {'edges': [{'node': {'text': 'درصورت نیاز به ویلاتماس بگیرید 09112815125'}}]},
          'shortcode': 'BPXJ6luDBha',
          'edge_media_to_comment': {'count': 10},
          'comments_disabled': False,
          'taken_at_timestamp': 1484648180,
          'dimensions': {'height': 1080, 'width': 1080},
          'display_url': '',
          'edge_liked_by': {'count': 42},
          'edge_media_preview_like': {'count': 42},
          'gating_info': None,
          'media_preview': 'ACoqZEv32OQcHHpViMrKoJxkE59cVnFC7HB6epP+NSBGhXdkc8e/5+lNRYc0UaYiBO7J9aR3DDg4IOCaoR3IOFwcr1x0wf8AGkF0vJCHB75POP5UrNF3XyNBpARwQe1VN+OKiEqA5AIbr+Hr0q7tVud/Xn7p/wAahptj0RnKcEkVKXwMY/lTEbHNSbt3OK6rbadF1OaMrXXNZ3elr/oN345A5/xpxk56cGoy5JpBKR9KVutvxKUntzO3+H0/p9SSRz0x/wDqqrg1P5mRj161HVxVuljKcru6lf8ACw8DNH61KANo+lMIFGvRrbt/wSLq7unu+v8AwBhIHamnHYfqKdimEClZ919z/wAwutrfl/kKKKYAKkqlfr+Vv1Ynbof/2Q==',
          'owner': {'id': '4469324900'},
          'thumbnail_src': '',
          'thumbnail_resources': [{'src': '',
            'config_width': 150,
            'config_height': 150},
           {'src': '',
            'config_width': 240,
            'config_height': 240},
           {'src': '',
            'config_width': 320,
            'config_height': 320},
           {'src': '',
            'config_width': 480,
            'config_height': 480},
           {'src': '',
            'config_width': 640,
            'config_height': 640}],
          'is_video': False,
          'accessibility_caption': None}},
        {'node': {'__typename': 'GraphImage',
          'id': '1429628539162724247',
          'edge_media_to_caption': {'edges': []},
          'shortcode': 'BPXD5T1jgeX',
          'edge_media_to_comment': {'count': 3},
          'comments_disabled': False,
          'taken_at_timestamp': 1484645024,
          'dimensions': {'height': 1080, 'width': 1080},
          'display_url': '',
          'edge_liked_by': {'count': 42},
          'edge_media_preview_like': {'count': 42},
          'gating_info': None,
          'media_preview': 'ACoqdDpYeEP0J5wTSHR2C5yPzP8AhVn7YVi45xgYNWbWbzjtII4z1BFVzMjQpxacigHILZ6c/wD1qr6jaFDuwMH0rdBjDYGMjg5pJYVkXDcjNF9bkOCvzLc4dhim5rR1K3EEmByD0rNq7miNRmLIQOScVYsJGik+YEbhgfnn+VY4ncd6kSaR+nJH6etZhaxqyXa/PzyW4/Opri+/dqqn+77dqwZFZTyOvI96aZGHB4x7UrBa5PevvlJ69P5VVp5DPz1pm0+hqwL6wRZ5GB7k/wD6qsRwRxncuc/Wq6E0McEY4osTctCJGYM7EsvTt/jUhhiJ3MN59W5/+tUAozSsFywWUYCgY/pTOKhYZGaQGixLP//Z',
          'owner': {'id': '4469324900'},
          'thumbnail_src': '',
          'thumbnail_resources': [{'src': '',
            'config_width': 150,
            'config_height': 150},
           {'src': '',
            'config_width': 240,
            'config_height': 240},
           {'src': '',
            'config_width': 320,
            'config_height': 320},
           {'src': '',
            'config_width': 480,
            'config_height': 480},
           {'src': '',
            'config_width': 640,
            'config_height': 640}],
          'is_video': False,
          'accessibility_caption': None}}]},
      'edge_saved_media': {'count': 0,
       'page_info': {'has_next_page': False, 'end_cursor': None},
       'edges': []},
      'edge_media_collections': {'count': 0,
       'page_info': {'has_next_page': False, 'end_cursor': None},
       'edges': []}}},
    'felix_onboarding_video_resources': {'mp4': '/static/videos/felix-onboarding/onboardingVideo.mp4/9d16838ca7f9.mp4',
     'poster': '/static/images/felix-onboarding/onboardingVideoPoster.png/8fdba7cf2120.png'}}]},
 'gatekeepers': {'cb': True,
  'sf': True,
  'ld': True,
  'seo': True,
  'seoht': True,
  'saa': True,
  'phone_qp': True},
 'knobs': {'acct:ntb': 0, 'cb': 0, 'captcha': 0},
 'qe': {'form_navigation_dialog': {'g': '', 'p': {}},
  'cred_man': {'g': 'test', 'p': {'use_on_landing': 'true'}},
  'iab': {'g': '', 'p': {}},
  'app_upsell_li': {'g': '', 'p': {}},
  'app_upsell': {'g': '', 'p': {}},
  'stale_fix': {'g': '', 'p': {}},
  'profile_header_name': {'g': '', 'p': {}},
  'bc3l': {'g': '', 'p': {}},
  'direct_conversation_reporting': {'g': '', 'p': {}},
  'general_reporting': {'g': '', 'p': {}},
  'reporting': {'g': '', 'p': {}},
  'acc_recovery_link': {'g': '', 'p': {}},
  'notif': {'g': '', 'p': {}},
  'fb_unlink': {'g': '', 'p': {}},
  'mobile_stories_doodling': {'g': '', 'p': {}},
  'show_copy_link': {'g': '', 'p': {}},
  'mobile_logout': {'g': '', 'p': {}},
  'p_edit': {'g': '', 'p': {}},
  '404_as_react': {'g': '', 'p': {}},
  'acc_recovery': {'g': '', 'p': {}},
  'collections': {'g': '', 'p': {}},
  'comment_ta': {'g': '', 'p': {}},
  'su': {'g': '', 'p': {}},
  'disc_ppl': {'g': '', 'p': {}},
  'ebd_ul': {'g': 'launch', 'p': {'is_enabled': 'true'}},
  'ebdsim_li': {'g': '', 'p': {}},
  'ebdsim_lo': {'g': '', 'p': {}},
  'empty_feed': {'g': '', 'p': {}},
  'bundles': {'g': '', 'p': {}},
  'exit_story_creation': {'g': '', 'p': {}},
  'appsell': {'g': '', 'p': {}},
  'imgopt': {'g': '', 'p': {}},
  'follow_button': {'g': '', 'p': {}},
  'loggedout': {'g': '', 'p': {}},
  'loggedout_upsell': {'g': 'control_without_new_loggedout_upsell_content_03_15_18',
   'p': {'has_new_loggedout_upsell_content': 'false'}},
  'msisdn': {'g': '', 'p': {}},
  'bg_sync': {'g': '', 'p': {}},
  'onetaplogin': {'g': '', 'p': {}},
  'login_poe': {'g': '', 'p': {}},
  'private_lo': {'g': '', 'p': {}},
  'profile_tabs': {'g': '', 'p': {}},
  'push_notifications': {'g': '', 'p': {}},
  'reg': {'g': '', 'p': {}},
  'reg_vp': {'g': 'test_group_1', 'p': {'hide_value_prop': 'true'}},
  'report_media': {'g': '', 'p': {}},
  'report_profile': {'g': '', 'p': {}},
  'scroll_log': {'g': '', 'p': {}},
  'sidecar_swipe': {'g': '', 'p': {}},
  'su_universe': {'g': '', 'p': {}},
  'stale': {'g': '', 'p': {}},
  'stories_lo': {'g': 'test_05_01', 'p': {'location': 'true'}},
  'stories': {'g': '', 'p': {}},
  'tp_pblshr': {'g': '', 'p': {}},
  'video': {'g': '', 'p': {}},
  'gdpr_eu_tos': {'g': 'control_05_01',
   'p': {'gdpr_required': 'true',
    'eu_new_user_flow': 'age_two_button',
    'tos_version': 'eu'}},
  'gdpr_row_tos': {'g': '', 'p': {}},
  'fd_gr': {'g': '', 'p': {}},
  'felix': {'g': '', 'p': {}},
  'felix_clear_fb_cookie': {'g': '', 'p': {}},
  'felix_creation_duration_limits': {'g': '', 'p': {}},
  'felix_creation_enabled': {'g': '', 'p': {}},
  'felix_creation_fb_crossposting': {'g': '', 'p': {}},
  'felix_creation_fb_crossposting_v2': {'g': '', 'p': {}},
  'felix_creation_validation': {'g': '', 'p': {}},
  'felix_creation_video_upload': {'g': '', 'p': {}},
  'felix_early_onboarding': {'g': '', 'p': {}},
  'unfollow_confirm': {'g': '', 'p': {}},
  'profile_enhance_li': {'g': '', 'p': {}},
  'profile_enhance_lo': {'g': '', 'p': {}},
  'phone_confirm': {'g': '', 'p': {}},
  'comment_enhance': {'g': '', 'p': {}},
  'mweb_topical_explore': {'g': '', 'p': {}},
  'web_nametag': {'g': '', 'p': {}},
  'image_downgrade': {'g': '', 'p': {}},
  'image_downgrade_lite': {'g': '', 'p': {}},
  'follow_all_fb': {'g': '', 'p': {}},
  'lite_direct_upsell': {'g': '', 'p': {}},
  'web_loggedout_noop': {'g': '', 'p': {}},
  'stories_video_preload': {'g': '', 'p': {}},
  'lite_stories_video_preload': {'g': '', 'p': {}},
  'a2hs_heuristic_uc': {'g': '', 'p': {}},
  'a2hs_heuristic_non_uc': {'g': '', 'p': {}},
  'web_hashtag': {'g': '', 'p': {}},
  'header_scroll': {'g': '', 'p': {}},
  'rout': {'g': '', 'p': {}},
  'websr': {'g': '', 'p': {}},
  'web_lo_follow': {'g': '', 'p': {}},
  'web_share': {'g': '', 'p': {}},
  'lite_rating': {'g': '', 'p': {}},
  'web_embeds_share': {'g': '', 'p': {}},
  'web_share_lo': {'g': '', 'p': {}},
  'web_embeds_logged_out': {'g': 'test_comment_input',
   'p': {'show_comment_input': 'true'}},
  'sl': {'g': '', 'p': {}},
  'reg_nux': {'g': '', 'p': {}},
  'web_datasaver_mode': {'g': '', 'p': {}},
  'lite_datasaver_mode': {'g': '', 'p': {}},
  'lite_video_upload': {'g': '', 'p': {}}},
 'hostname': '',
 'platform': 'web',
 'rhx_gis': 'b9d7a25d3e0772990918069a0652bc21',
 'nonce': 'E+077618aJD12ZjcMWUynA==',
 'zero_data': {},
 'rollout_hash': '2502ae2429f4',
 'bundle_variant': 'base',
 'probably_has_app': False}

Upvotes: 1

Related Questions