Reputation: 899
I would like to estimate the impact of news on the Dow Jones index. To do this, I wrote a Python HTML parser using the BeautifulSoup library. I extract an article and store it in an XML file for further analysis with the NLTK library. How can I increase the speed of parsing? The code below does the required task, but very slowly.
Here is the code of the HTML parser:
import urllib2
import re
import xml.etree.cElementTree as ET
import nltk
from bs4 import BeautifulSoup
from datetime import date
from dateutil.rrule import rrule, DAILY
from nltk.corpus import stopwords
from collections import defaultdict


def main_parser():
    # starting date
    a = date(2014, 3, 27)
    # ending date
    b = date(2014, 3, 27)
    articles = ET.Element("articles")
    f = open('~/Documents/test.xml', 'w')
    # loop through the links and, for each link, extract the text of the article and store it in the xml file
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime("%d") + ".html"
        page = urllib2.urlopen(url)
        # use html5lib ??? possibility to use another parser
        soup = BeautifulSoup(page.read(), "html5lib")
        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)
        for links in soup.find_all("div", "headlineMed"):
            anchor_tag = links.a
            if not 'video' in anchor_tag['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(links.text[-11:])
                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(anchor_tag.text)
                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(anchor_tag['href']).encode('utf-8')
                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        # get text and remove all stop words
                        article_text.text = str(remove_stop_words(extract_article(anchor_tag['href']))).encode('ascii', 'ignore')
                    except Exception:
                        pass
                except Exception:
                    pass
    tree = ET.ElementTree(articles)
    tree.write("~/Documents/test.xml", "utf-8")


# getting the article text from the specific url
def extract_article(url):
    plain_text = ""
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "html5lib")
    tag = soup.find_all("p")
    # replace all html tags
    plain_text = re.sub(r'<p>|</p>|[|]|<span class=.*</span>|<a href=.*</a>', "", str(tag))
    plain_text = plain_text.replace(", ,", "")
    return str(plain_text)


def remove_stop_words(text):
    text = nltk.word_tokenize(text)
    filtered_words = [w for w in text if not w in stopwords.words('english')]
    return ' '.join(filtered_words)
Upvotes: 2
Views: 5040
Reputation: 319
You want to pick the best parser.
We benchmarked most of the parsers / platforms while building http://serpapi.com
Here is a full article on Medium: https://medium.com/@vikoky/fastest-html-parser-available-now-f677a68b81dd
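As a quick, hypothetical sketch (not from the benchmark above), one way to compare BeautifulSoup's parsers on a single archive page from the question is to parse the same bytes with each backend and time it; the URL is just the example date from the question, lxml and html5lib must be installed, and absolute timings will vary by machine:

import time
import urllib2
from bs4 import BeautifulSoup

# fetch one archive page once, then parse the same document with each parser
html = urllib2.urlopen("http://www.reuters.com/resources/archive/us/20140327.html").read()

for parser in ("html.parser", "lxml", "html5lib"):
    start = time.time()
    BeautifulSoup(html, parser)
    print("%s: %.2f seconds" % (parser, time.time() - start))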
Upvotes: 0
Reputation: 473773
Several fixes can be applied (without changing the modules you are currently using):
- use the lxml parser instead of html5lib - it is much, much (and 3 more muches) faster
- parse only the relevant part of the document with SoupStrainer (note that html5lib doesn't support SoupStrainer - it will always parse the whole document, slowly)
Here is how the code would look after the changes. A brief performance test shows at least a 3x improvement:
import urllib2
import xml.etree.cElementTree as ET
from datetime import date

from bs4 import SoupStrainer, BeautifulSoup
import nltk
from dateutil.rrule import rrule, DAILY
from nltk.corpus import stopwords


def main_parser():
    a = b = date(2014, 3, 27)
    articles = ET.Element("articles")
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime("%d") + ".html"

        # only the headline divs are parsed, and with the faster lxml parser
        links = SoupStrainer("div", "headlineMed")
        soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=links)

        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)
        for link in soup.find_all('a'):
            if not 'video' in link['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(link.text[-11:])

                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(link.text)

                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(link['href']).encode('utf-8')

                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        article_text.text = str(remove_stop_words(extract_article(link['href']))).encode('ascii', 'ignore')
                    except Exception:
                        pass
                except Exception:
                    pass
    tree = ET.ElementTree(articles)
    tree.write("~/Documents/test.xml", "utf-8")


def extract_article(url):
    # parse only the <p> tags of the article page
    paragraphs = SoupStrainer('p')
    soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=paragraphs)
    return soup.text


def remove_stop_words(text):
    text = nltk.word_tokenize(text)
    filtered_words = [w for w in text if not w in stopwords.words('english')]
    return ' '.join(filtered_words)
Note that I've removed the regular expression processing from extract_article() - it looks like you can just get the whole text from the p tags. I might have introduced some problems - please check that everything is correct.
Another solution would be to use lxml for everything, from parsing (replacing BeautifulSoup) to creating the XML (replacing xml.etree.ElementTree).
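For illustration only, here is a rough sketch of that idea, using lxml.html for the article pages and lxml.etree for the output document; the function names and the (time, name, link, text) tuple layout are assumptions of this sketch, not part of the answer's code:

import urllib2
from lxml import etree, html

def extract_article_lxml(url):
    # parse the article page with lxml.html and join the text of all <p> tags
    tree = html.fromstring(urllib2.urlopen(url).read())
    return " ".join(p.text_content() for p in tree.findall(".//p"))

def write_articles_xml(rows, path):
    # rows is an iterable of (time, name, link, text) tuples
    root = etree.Element("articles")
    for article_time, article_name, article_link, article_text in rows:
        item = etree.SubElement(root, "article")
        etree.SubElement(item, "article_time").text = article_time
        etree.SubElement(item, "article_name").text = article_name
        etree.SubElement(item, "article_link").text = article_link
        etree.SubElement(item, "article_text").text = article_text
    etree.ElementTree(root).write(path, encoding="utf-8", xml_declaration=True)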
Another solution (definitely the fastest) would be to switch to the Scrapy web-scraping framework. It is simple and very fast, and comes with all kinds of batteries included: link extractors, XML exporters, database pipelines and so on. Worth looking into.
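To give a feel for what that would involve, here is a minimal, hypothetical Scrapy spider for a single archive day, assuming a reasonably recent Scrapy version; the spider name, selectors and yielded fields are assumptions of this sketch. Running it with scrapy runspider reuters_spider.py -o articles.xml would use Scrapy's built-in XML exporter, and stop-word removal could live in an item pipeline:

import scrapy

class ReutersArchiveSpider(scrapy.Spider):
    name = "reuters_archive"
    start_urls = ["http://www.reuters.com/resources/archive/us/20140327.html"]

    def parse(self, response):
        # follow every headline link that is not a video page
        for href in response.css("div.headlineMed a::attr(href)").extract():
            if "video" not in href:
                yield scrapy.Request(response.urljoin(href), callback=self.parse_article)

    def parse_article(self, response):
        # one item per article: the link and the joined <p> text
        yield {
            "article_link": response.url,
            "article_text": " ".join(response.css("p::text").extract()),
        }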
Hope that helps.
Upvotes: 1