Reputation: 899
I would like to estimate the impact of news on the Dow Jones index. To do this, I wrote a Python HTML parser using the BeautifulSoup library. I extract an article and store it in an XML file for further analysis with the NLTK library. How can I increase the speed of parsing? The code below does the required task, but very slowly.
Here is the code of the HTML parser:
import urllib2
import re
import xml.etree.cElementTree as ET
import nltk
from bs4 import BeautifulSoup
from datetime import date
from dateutil.rrule import rrule, DAILY
from nltk.corpus import stopwords
from collections import defaultdict


def main_parser():
    # starting date
    a = date(2014, 3, 27)
    # ending date
    b = date(2014, 3, 27)
    articles = ET.Element("articles")
    f = open('~/Documents/test.xml', 'w')
    # loop through the links and, for each link, extract the text of the article and store it in the xml file
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime("%d") + ".html"
        page = urllib2.urlopen(url)
        # use html5lib ??? possibility to use another parser
        soup = BeautifulSoup(page.read(), "html5lib")
        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)
        for links in soup.find_all("div", "headlineMed"):
            anchor_tag = links.a
            if not 'video' in anchor_tag['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(links.text[-11:])
                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(anchor_tag.text)
                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(anchor_tag['href']).encode('utf-8')
                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        # get text and remove all stop words
                        article_text.text = str(remove_stop_words(extract_article(anchor_tag['href']))).encode('ascii', 'ignore')
                    except Exception:
                        pass
                except Exception:
                    pass
    tree = ET.ElementTree(articles)
    tree.write("~/Documents/test.xml", "utf-8")


# getting the article text from the specific url
def extract_article(url):
    plain_text = ""
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "html5lib")
    tag = soup.find_all("p")
    # replace all html tags
    plain_text = re.sub(r'<p>|</p>|[|]|<span class=.*</span>|<a href=.*</a>', "", str(tag))
    plain_text = plain_text.replace(", ,", "")
    return str(plain_text)


def remove_stop_words(text):
    text = nltk.word_tokenize(text)
    filtered_words = [w for w in text if not w in stopwords.words('english')]
    return ' '.join(filtered_words)
Upvotes: 2
Views: 5040
Reputation: 319
You want to pick the best parser.
We benchmarked most of the parsers / platforms while building http://serpapi.com
Here is a full article on Medium: https://medium.com/@vikoky/fastest-html-parser-available-now-f677a68b81dd
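As a quick, hypothetical sketch (not from the benchmark above), one way to compare BeautifulSoup's parsers on a single archive page from the question is to parse the same bytes with each backend and time it; the URL is just the example date from the question, lxml and html5lib must be installed, and absolute timings will vary by machine:

import time
import urllib2
from bs4 import BeautifulSoup

# fetch one archive page once, then parse the same document with each parser
html = urllib2.urlopen("http://www.reuters.com/resources/archive/us/20140327.html").read()

for parser in ("html.parser", "lxml", "html5lib"):
    start = time.time()
    BeautifulSoup(html, parser)
    print("%s: %.2f seconds" % (parser, time.time() - start))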
Upvotes: 0
Reputation: 473773
Several fixes can be applied (without changing the modules you are currently using):
- use the lxml parser instead of html5lib - it is much, much (and 3 more muches) faster
- parse only the relevant part of the document with SoupStrainer (note that html5lib doesn't support SoupStrainer - it will always parse the whole document, slowly)
Here is how the code would look after the changes. A brief performance test shows at least a 3x improvement:
import urllib2
import xml.etree.cElementTree as ET
from datetime import date

from bs4 import SoupStrainer, BeautifulSoup
import nltk
from dateutil.rrule import rrule, DAILY
from nltk.corpus import stopwords


def main_parser():
    a = b = date(2014, 3, 27)
    articles = ET.Element("articles")
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime("%d") + ".html"

        # only the headline divs are parsed, and with the faster lxml parser
        links = SoupStrainer("div", "headlineMed")
        soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=links)

        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)
        for link in soup.find_all('a'):
            if not 'video' in link['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(link.text[-11:])

                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(link.text)

                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(link['href']).encode('utf-8')

                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        article_text.text = str(remove_stop_words(extract_article(link['href']))).encode('ascii', 'ignore')
                    except Exception:
                        pass
                except Exception:
                    pass
    tree = ET.ElementTree(articles)
    tree.write("~/Documents/test.xml", "utf-8")


def extract_article(url):
    # parse only the <p> tags of the article page
    paragraphs = SoupStrainer('p')
    soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=paragraphs)
    return soup.text


def remove_stop_words(text):
    text = nltk.word_tokenize(text)
    filtered_words = [w for w in text if not w in stopwords.words('english')]
    return ' '.join(filtered_words)
Note that I've removed the regular expression processing from extract_article() - it looks like you can just get the whole text from the p tags. I might have introduced some problems - please check that everything is correct.
Another solution would be to use lxml for everything, from parsing (replacing BeautifulSoup) to creating the XML (replacing xml.etree.ElementTree).
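For illustration only, here is a rough sketch of that idea, using lxml.html for the article pages and lxml.etree for the output document; the function names and the (time, name, link, text) tuple layout are assumptions of this sketch, not part of the answer's code:

import urllib2
from lxml import etree, html

def extract_article_lxml(url):
    # parse the article page with lxml.html and join the text of all <p> tags
    tree = html.fromstring(urllib2.urlopen(url).read())
    return " ".join(p.text_content() for p in tree.findall(".//p"))

def write_articles_xml(rows, path):
    # rows is an iterable of (time, name, link, text) tuples
    root = etree.Element("articles")
    for article_time, article_name, article_link, article_text in rows:
        item = etree.SubElement(root, "article")
        etree.SubElement(item, "article_time").text = article_time
        etree.SubElement(item, "article_name").text = article_name
        etree.SubElement(item, "article_link").text = article_link
        etree.SubElement(item, "article_text").text = article_text
    etree.ElementTree(root).write(path, encoding="utf-8", xml_declaration=True)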
Another solution (definitely the fastest) would be to switch to the Scrapy web-scraping framework. It is simple and very fast, and comes with all kinds of batteries included: link extractors, XML exporters, database pipelines and so on. Worth looking into.
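To give a feel for what that would involve, here is a minimal, hypothetical Scrapy spider for a single archive day, assuming a reasonably recent Scrapy version; the spider name, selectors and yielded fields are assumptions of this sketch. Running it with scrapy runspider reuters_spider.py -o articles.xml would use Scrapy's built-in XML exporter, and stop-word removal could live in an item pipeline:

import scrapy

class ReutersArchiveSpider(scrapy.Spider):
    name = "reuters_archive"
    start_urls = ["http://www.reuters.com/resources/archive/us/20140327.html"]

    def parse(self, response):
        # follow every headline link that is not a video page
        for href in response.css("div.headlineMed a::attr(href)").extract():
            if "video" not in href:
                yield scrapy.Request(response.urljoin(href), callback=self.parse_article)

    def parse_article(self, response):
        # one item per article: the link and the joined <p> text
        yield {
            "article_link": response.url,
            "article_text": " ".join(response.css("p::text").extract()),
        }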
Hope that helps.
Upvotes: 1