Reputation: 1070
I try to crawl this link by sending json requests. My first request would be :
# Query parameters for MarketWatch's headline JSON endpoint.
# NOTE(review): the docId value '1222737422 ' has a trailing space — looks accidental, confirm.
parameters1 = {'ticker':'XOM', 'countryCode':'US',
               'dateTime':'', 'docId':'1222737422 ',
               'docType':'806','sequence':'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',
               'messageNumber':'','count':'10',
               'channelName':'/news/latest/company/us/xom', 'topic':'',
               '_':'' }
firstUrl = "http://www.marketwatch.com/news/headline/getheadlines"
# NOTE(review): `header` is not defined in this snippet — presumably a dict of HTTP headers; verify.
html1 = requests.get(firstUrl, params = parameters1, headers = header)
# Parse the response body as JSON (the endpoint returns an array of headline objects).
html_json1=(json.loads(html1.text))
For sending the next requests, I have to extract docId from the corresponding HTML and add it to the new parameters, but I don't know how to do that. Do you have any idea how to get the new HTML file after sending these JSON requests?
Upvotes: 0
Views: 126
Reputation: 787
import requests
import json
from bs4 import BeautifulSoup
def main():
    """Crawl MarketWatch's paginated headline feed for XOM.

    Fetches the stock page once to seed the pagination cursor
    (the ``data-uniqueid`` of the last visible headline), then
    repeatedly calls the JSON headline endpoint, advancing the
    cursor from the last item of each response.

    Raises:
        Exception: if any HTTP request does not return status 200.
    """
    html_url = 'http://www.marketwatch.com/investing/stock/xom'
    resp = requests.get(html_url)
    if resp.status_code != 200:
        raise Exception("http request failed: %s" % resp)
    soup = BeautifulSoup(resp.text, 'lxml')
    # get value of `data-uniqueid` from last news node of 'MarketWatch News on XOM'
    li_node = soup.select("#mwheadlines > div.headlinewrapper > ol > li[data-uniqueid]")[-1]
    unique_id = li_node['data-uniqueid']
    print('got unique_id=%r, from %r' % (unique_id, li_node.text.replace('\n', ' ').strip()))

    baseUrl = 'http://www.marketwatch.com/news/headline/getheadlines'
    parameters = {
        'ticker': 'XOM',
        'countryCode': 'US',
        'docType': '806',
        'docId': '',  # (optional) initial value extracted from the HTML page
        'sequence': 'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',  # initial value extracted from the HTML page
        'messageNumber': '8589',  # initial value extracted from the HTML page
        'count': '10',
        'channelName': '/news/latest/company/us/xom',
    }
    parameters.update(extract_page_params(unique_id))

    while True:
        resp = requests.get(baseUrl, params=parameters)
        # Fail loudly on a bad status instead of crashing later with a
        # confusing JSON decode error.
        if resp.status_code != 200:
            raise Exception("http request failed: %s" % resp)
        data = resp.json()  # array of (up to) 10 headline dicts
        if not data:
            # no more headlines — stop cleanly instead of IndexError below
            print("no more data, stopping")
            break
        first = data[0]   # first item of the page
        last = data[-1]   # last item of the page — drives the next request
        print("\ngot %d data, url: %s" % (len(data), resp.url))
        print("\tfirst: %-42s, %s" % (first['UniqueId'], first['SeoHeadlineFragment']))
        print("\t last: %-42s, %s" % (last['UniqueId'], last['SeoHeadlineFragment']))
        print("")
        # advance the pagination cursor using the last item's UniqueId
        parameters.update(extract_page_params(last['UniqueId']))
        input("press <enter> to get next")
def extract_page_params(uid):
    """Turn a MarketWatch UniqueId into pagination query parameters.

    A UniqueId comes in one of two forms:
      * ``'<sequence>:<messageNumber>'`` (e.g.
        ``'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2:8499'``) — split on ``':'``
        into the ``sequence`` and ``messageNumber`` parameters;
      * a plain document id — used as the ``docId`` parameter.

    Unused keys are returned as empty strings so the caller can
    ``dict.update`` the request parameters in one step.
    """
    if ':' not in uid:
        return {'sequence': '', 'messageNumber': '', 'docId': uid}
    seq, msg_no = uid.split(':')
    return {'sequence': seq, 'messageNumber': msg_no, 'docId': ''}
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
This is my code to solve your problem.
Since you are new to programming, I have added some comments.
You can directly copy it and run it with Python 3 (Python 2 should also work).
Upvotes: 1
Reputation: 2513
You can use Beautiful Soup to extract data from the HTML. It is a Python library for pulling data out of HTML documents.
Upvotes: 0