Reputation: 1070
I try to crawl this link by sending json requests. My first request would be :
# Query parameters for MarketWatch's headline JSON endpoint.
# NOTE(review): the docId value '1222737422 ' has a trailing space — looks accidental, confirm.
parameters1 = {'ticker':'XOM', 'countryCode':'US',
               'dateTime':'', 'docId':'1222737422 ',
               'docType':'806','sequence':'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',
               'messageNumber':'','count':'10',
               'channelName':'/news/latest/company/us/xom', 'topic':'',
               '_':'' }
firstUrl = "http://www.marketwatch.com/news/headline/getheadlines"
# NOTE(review): `header` is not defined in this snippet — presumably a dict of HTTP headers; verify.
html1 = requests.get(firstUrl, params = parameters1, headers = header)
# Parse the response body as JSON (the endpoint returns an array of headline objects).
html_json1=(json.loads(html1.text))
For sending the next requests, I have to extract docId from the corresponding HTML and add it to the new parameters, but I don't know how to do that. Do you have any idea how to get the new HTML file after sending these JSON requests?
Upvotes: 0
Views: 126
Reputation: 787
import requests
import json
from bs4 import BeautifulSoup
def main():
    """Crawl MarketWatch's paginated headline feed for XOM.

    Fetches the stock page once to seed the pagination cursor
    (the ``data-uniqueid`` of the last visible headline), then
    repeatedly calls the JSON headline endpoint, advancing the
    cursor from the last item of each response.

    Raises:
        Exception: if any HTTP request does not return status 200.
    """
    html_url = 'http://www.marketwatch.com/investing/stock/xom'
    resp = requests.get(html_url)
    if resp.status_code != 200:
        raise Exception("http request failed: %s" % resp)
    soup = BeautifulSoup(resp.text, 'lxml')
    # get value of `data-uniqueid` from last news node of 'MarketWatch News on XOM'
    li_node = soup.select("#mwheadlines > div.headlinewrapper > ol > li[data-uniqueid]")[-1]
    unique_id = li_node['data-uniqueid']
    print('got unique_id=%r, from %r' % (unique_id, li_node.text.replace('\n', ' ').strip()))

    baseUrl = 'http://www.marketwatch.com/news/headline/getheadlines'
    parameters = {
        'ticker': 'XOM',
        'countryCode': 'US',
        'docType': '806',
        'docId': '',  # (optional) initial value extracted from the HTML page
        'sequence': 'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',  # initial value extracted from the HTML page
        'messageNumber': '8589',  # initial value extracted from the HTML page
        'count': '10',
        'channelName': '/news/latest/company/us/xom',
    }
    parameters.update(extract_page_params(unique_id))

    while True:
        resp = requests.get(baseUrl, params=parameters)
        # Fail loudly on a bad status instead of crashing later with a
        # confusing JSON decode error.
        if resp.status_code != 200:
            raise Exception("http request failed: %s" % resp)
        data = resp.json()  # array of (up to) 10 headline dicts
        if not data:
            # no more headlines — stop cleanly instead of IndexError below
            print("no more data, stopping")
            break
        first = data[0]   # first item of the page
        last = data[-1]   # last item of the page — drives the next request
        print("\ngot %d data, url: %s" % (len(data), resp.url))
        print("\tfirst: %-42s, %s" % (first['UniqueId'], first['SeoHeadlineFragment']))
        print("\t last: %-42s, %s" % (last['UniqueId'], last['SeoHeadlineFragment']))
        print("")
        # advance the pagination cursor using the last item's UniqueId
        parameters.update(extract_page_params(last['UniqueId']))
        input("press <enter> to get next")
def extract_page_params(uid):
    """Turn a MarketWatch UniqueId into pagination query parameters.

    A UniqueId comes in one of two forms:
      * ``'<sequence>:<messageNumber>'`` (e.g.
        ``'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2:8499'``) — split on ``':'``
        into the ``sequence`` and ``messageNumber`` parameters;
      * a plain document id — used as the ``docId`` parameter.

    Unused keys are returned as empty strings so the caller can
    ``dict.update`` the request parameters in one step.
    """
    if ':' not in uid:
        return {'sequence': '', 'messageNumber': '', 'docId': uid}
    seq, msg_no = uid.split(':')
    return {'sequence': seq, 'messageNumber': msg_no, 'docId': ''}
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
This is my code to solve your problem.
Since you are new to programming, I have added some comments.
You can directly copy it and run it with Python 3 (Python 2 should also work).
Upvotes: 1
Reputation: 2513
You can use Beautiful Soup to extract data from the HTML. It is a Python library for pulling data out of HTML documents.
Upvotes: 0