Reputation: 5
I downloaded the entire PubMed Central archive for text mining. As a preprocessing step, I am parsing each article to JSON to reduce the size and strip out information that is irrelevant or hard to mine, such as the bibliography. However, the archive is 25 GB, and the current ETA is about 50 hours. Below is my Python script. I have already tried multiprocessing, which sped things up by roughly a factor of 3. I also timed the code and found that about 90% of the runtime is spent on the line tree = BS(f.read(), features='lxml-xml'), so I don't think the regex is the issue. Are there any other ways to increase the speed?
import glob
import json
import multiprocessing as mp
import os
import re
from bs4 import BeautifulSoup as BS
from tqdm import tqdm
skipped = 0
files = tuple(glob.iglob(r'*\*.nxml'))
pbar = tqdm(total=len(files))

def xml2json(filename, logging=False):
    if logging:
        tqdm.write("Now parsing {}".format(filename))

    with open(filename, 'r', encoding='utf-8') as f:
        # start = time.time()
        tree = BS(f.read(), features='lxml-xml')
        # print("elapsed time " + str(time.time() - start))

    dic = {
        'publisher': {
            'name': "",  # tree.find('publisher-name').text,
            'loc': "",   # tree.find('publisher-loc').text
        },
        "id": tree.find('article-id').text,
        'title': tree.find('article-title').text.strip(),
        'contributors': [],
        "pub": {
            "volume": "",
            "issue": "",
            "day": "",
            "month": "",
            "year": "",
        },
        "abstract": "",
        "body": "",
        "body_headings": ""
    }

    # start = time.time()
    # Publication metadata
    for tag in ("volume", "issue", "day", "month", "year"):
        node = tree.find(tag)
        if node:
            dic["pub"][tag] = node.text

    node = tree.find('publisher-name')
    if node:
        dic["publisher"]["name"] = node.text
    node = tree.find('publisher-loc')
    if node:
        dic["publisher"]["loc"] = node.text

    # Contributors as "surname, given-names"
    contributors = []
    branch = tree.find("contrib-group")
    if branch:
        for node in branch.find_all("contrib"):
            contributors.append("{}, {}".format(node.find("surname").text, node.find("given-names").text))
    dic["contributors"] = contributors

    # Abstract: skip articles without one
    abstract = ""
    branch = tree.find("abstract")
    if not branch:
        return None
    for node in branch.find_all(["p"]):
        if node.text == "Supporting Information":
            break
        text = "\n" + node.text.replace("\n", "").strip()
        text = re.sub(r"[\(\[].*?[\)\]]", "", text)
        text = re.sub(r" {2,}", " ", text)
        text = re.sub(r" \.", ".", text)
        abstract += text
    dic["abstract"] = abstract

    # Body text, with and without section headings
    body = ""
    body_headings = ""
    branch = tree.find("body")
    if not branch:
        return None
    for node in branch.find_all(["title", "p"]):
        if node.text == "Supporting Information":
            break
        if node.name == "title":
            text = "\n"
        else:
            text = ""
        text += "\n" + node.text.replace("\n", "").strip()
        text = re.sub(r"[\(\[].*?[\)\]]", "", text)
        text = re.sub(r" {2,}", " ", text)
        text = re.sub(r" (\.|\,)", r"\g<1>", text)
        body_headings += text
        if node.name == "p":
            body += text
    dic["body"] = body
    dic["body_headings"] = body_headings
    # print(time.time() - start)
    return dic

def parse(file):
    _, name = os.path.split(file)
    name, _ = os.path.splitext(name)
    with open("json/{}.json".format(name[3:]), "w") as f:
        dic = xml2json(file, logging=False)
        if dic:
            json.dump(dic, f)
        else:
            global skipped
            skipped += 1
            # tqdm.write("Skipping!")


def callback(m):
    # print(m)
    pbar.update(1)


def error_callback(e):
    print(e)


if __name__ == '__main__':
    tqdm.write("Found {} files...".format(len(files)))
    pool = mp.Pool()
    for filepath in files:
        pool.apply_async(parse, (filepath,), callback=callback, error_callback=error_callback)
    pool.close()
    pool.join()
    pbar.close()
    print("Done, skipped {}".format(skipped))
Upvotes: 0
Views: 250
Reputation: 77357
BeautifulSoup builds its tree out of Python objects, which is not as efficient as working in C. HTML is also messier than XML, which adds to the burden of a general-purpose parser. I believe .nxml files are fully conformant XML, so parsing them directly with the C-based lxml parser should be much faster.
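To make that concrete, here is a minimal sketch of what the parsing step could look like with lxml.etree in place of BeautifulSoup. It assumes the elements you need are not in a default namespace (which is typical for PMC's JATS .nxml); the regex clean-up, contributor extraction, and multiprocessing wrapper from your script should carry over unchanged. The names xml2dict_lxml, element_text and find_text are just illustrative.

from lxml import etree


def element_text(node):
    # Rough equivalent of BeautifulSoup's .text: all descendant text joined together.
    return ''.join(node.itertext()) if node is not None else ''


def xml2dict_lxml(filename):
    # etree.parse runs in C (libxml2) and replaces the
    # BS(f.read(), features='lxml-xml') call, which is the bottleneck.
    root = etree.parse(filename).getroot()

    def find_text(tag):
        # Mirrors tree.find(tag).text from the BeautifulSoup version.
        return element_text(root.find('.//' + tag))

    dic = {
        'id': find_text('article-id'),
        'title': find_text('article-title').strip(),
        'pub': {tag: find_text(tag) for tag in ('volume', 'issue', 'day', 'month', 'year')},
    }

    abstract = root.find('.//abstract')
    body = root.find('.//body')
    if abstract is None or body is None:
        # Same behaviour as the original: skip articles missing either part.
        return None
    dic['abstract'] = '\n'.join(element_text(p).strip() for p in abstract.iter('p'))
    dic['body'] = '\n'.join(element_text(p).strip() for p in body.iter('p'))
    return dic

The main saving is that lxml keeps the parse tree in C-level nodes instead of constructing a Python object for every element; ''.join(node.itertext()) stands in for BeautifulSoup's .text.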
Upvotes: 0