Kevin Lu

Reputation: 5

Are there any ways to optimize Beautiful Soup parsing over a large number of files?

I downloaded the entirety of the PubMed Central archives for text mining, and as a preprocessing step I am parsing the XML into JSON to reduce its size and strip out information that is irrelevant or difficult to mine, such as the bibliography. However, the whole archive is about 25 GB, and the current ETA is roughly 50 hours. Below is my Python script. I have already tried multiprocessing, which sped things up by about a factor of 3. I also timed the stages and found that the bottleneck (about 90% of the runtime) is the line tree = BS(f.read(), features='lxml-xml'), so I don't think the regexes are the issue. Are there any other ways to increase the speed?

import glob
import json
import multiprocessing as mp
import os
import re

from bs4 import BeautifulSoup as BS
from tqdm import tqdm

skipped = 0
files = tuple(glob.iglob(r'*\*.nxml'))
pbar = tqdm(total=len(files))

def xml2json(filename, logging=False):
    if logging:
        tqdm.write("Now parsing {}".format(filename))

    with open(filename, 'r', encoding='utf-8') as f:
        # start = time.time()
        tree = BS(f.read(), features='lxml-xml')
        # print("elapsed time " + str(time.time() - start))

    dic = {
        'publisher': {
            'name': "",  # tree.find('publisher-name').text,
            'loc': "",  # tree.find('publisher-loc').text
        },
        "id": tree.find('article-id').text,
        'title': tree.find('article-title').text.strip(),
        'contributors': [],
        "pub": {
            "volume": "",
            "issue": "",
            "day": "",
            "month": "",
            "year": "",
        },
        "abstract": "",
        "body": "",
        "body_headings": ""
    }

    # start = time.time()

    for tag in ("volume", "issue", "day", "month", "year"):
        node = tree.find(tag)
        if node:
            dic["pub"][tag] = node.text

    # Fill in the publisher fields declared above ("name" and "loc").
    node = tree.find('publisher-name')
    if node:
        dic["publisher"]["name"] = node.text

    node = tree.find('publisher-loc')
    if node:
        dic["publisher"]["loc"] = node.text

    contributors = []
    branch = tree.find("contrib-group")
    if branch:
        for node in branch.find_all("contrib"):
            contributors.append("{}, {}".format(node.find("surname").text, node.find("given-names").text))
        dic["contributors"] = contributors

    abstract = ""
    branch = tree.find("abstract")
    if not branch:
        return None

    for node in branch.find_all(["p"]):
        if node.text == "Supporting Information":
            break
        text = "\n" + node.text.replace("\n", "").strip()
        text = re.sub("[\(\[].*?[\)\]]", "", text)
        text = re.sub(" {2,}", " ", text)
        text = re.sub(" \.", ".", text)
        abstract += text
    dic["abstract"] = abstract

    body = ""
    body_headings = ""
    branch = tree.find("body")
    if not branch:
        return None
    for node in branch.find_all(["title", "p"]):
        if node.text == "Supporting Information":
            break
        if node.name == "title":
            text = "\n"
        else:
            text = ""
        text += "\n" + node.text.replace("\n", "").strip()
        text = re.sub("[\(\[].*?[\)\]]", "", text)
        text = re.sub(" {2,}", " ", text)
        text = re.sub(" (\.|\,)", "\g<1>", text)
        body_headings += text
        if node.name == "p":
            body += text

    dic["body"] = body
    dic["body_headings"] = body_headings

    # print(time.time() - start)

    return dic

def parse(file):
    _, name = os.path.split(file)
    name, _ = os.path.splitext(name)
    dic = xml2json(file, logging=False)
    if not dic:
        # Nothing usable (no abstract or body); report back so the main
        # process can count it instead of writing an empty JSON file.
        return True
    with open("json/{}.json".format(name[3:]), "w") as f:
        json.dump(dic, f)
    return False

def callback(was_skipped):
    # The callback runs in the main process; incrementing a global inside the
    # worker would only update that worker's copy, so the final count would
    # always be 0.
    global skipped
    if was_skipped:
        skipped += 1
    pbar.update(1)

def error_callback(e):
    print(e)


if __name__ == '__main__':
    tqdm.write("Found {} files...".format(len(files)))
    pool = mp.Pool()

    for filepath in files:
        pool.apply_async(parse, (filepath,), callback=callback, error_callback=error_callback)

    pool.close()
    pool.join()
    pbar.close()

    print("Done, skipped {}".format(skipped))

Upvotes: 0

Views: 250

Answers (1)

tdelaney

Reputation: 77357

BeautifulSoup builds its tree out of Python objects, which is not as efficient as C, even when it hands the low-level parsing off to lxml. Additionally, HTML is messier than XML, which adds to the burden of parsing. I believe .nxml files are fully conformant XML, so using the C-based lxml parser directly should be much faster.
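
As a rough sketch (untested against your data; xml2dict and first_text are just illustrative names of my own, and the tag names are copied from your script), the same lookups can be done with lxml.etree directly:

from lxml import etree

def xml2dict(filename):
    # lxml parses in C and only wraps nodes in Python objects on demand,
    # so building and querying the tree is far cheaper than building a soup.
    tree = etree.parse(filename)

    def first_text(tag, default=""):
        # Text of the first matching descendant anywhere in the document.
        return tree.findtext(".//{}".format(tag), default=default)

    dic = {
        "id": first_text("article-id"),
        "title": first_text("article-title").strip(),
        "pub": {tag: first_text(tag) for tag in ("volume", "issue", "day", "month", "year")},
    }

    abstract = tree.find(".//abstract")
    if abstract is not None:
        # itertext() gathers text from nested tags such as <italic>, which is
        # roughly what .text gives you in BeautifulSoup.
        dic["abstract"] = "\n".join("".join(p.itertext()) for p in abstract.iterfind(".//p"))
    return dic

The ElementTree-style calls (parse, find, findtext, itertext) cover everything the script currently asks BeautifulSoup for, so the regex cleanup, JSON dumping and multiprocessing code can stay as they are.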

Upvotes: 0
