Reputation: 7961
I tried parsing this huge XML document using XML minidom. While it worked fine on a sample file, it choked the system when trying to process the real file (about 400 MB).
I tried adapting code (it processes data in a streaming fashion rather than loading it all into memory at once) from Code Review for my XML file, but I am having trouble isolating the datasets due to the nested nature of the elements. I have worked on simple XML files before, but not on a memory-intensive task like this.
Is this the right approach? How do I associate the Inventory and Publisher IDs to each book? That is how I am planning to relate the 2 tables eventually.
Any feedback is much appreciated.
book.xml
<BookDatabase>
<BookHeader>
<Name>BookData</Name>
<BookUniverse>All</BookUniverse>
<AsOfDate>2010-05-02</AsOfDate>
<Version>1.1</Version>
</BookHeader>
<InventoryBody>
<Inventory ID="12">
<PublisherClass ID="34">
<Publisher>
<PublisherDetails>
<Name>Microsoft Press</Name>
<Type>Tech</Type>
<ID>7462</ID>
</PublisherDetails>
</Publisher>
</PublisherClass>
<BookList>
<Listing>
<BookListSummary>
<Date>2009-01-30</Date>
</BookListSummary>
<Book>
<BookDetail ID="67">
<BookName>Code Complete 2</BookName>
<Author>Steve McConnell</Author>
<Pages>960</Pages>
<ISBN>0735619670</ISBN>
</BookDetail>
<BookDetail ID="78">
<BookName>Application Architecture Guide 2</BookName>
<Author>Microsoft Team</Author>
<Pages>496</Pages>
<ISBN>073562710X</ISBN>
</BookDetail>
</Book>
</Listing>
</BookList>
</Inventory>
<Inventory ID="64">
<PublisherClass ID="154">
<Publisher>
<PublisherDetails>
<Name>O'Reilly Media</Name>
<Type>Tech</Type>
<ID>7484</ID>
</PublisherDetails>
</Publisher>
</PublisherClass>
<BookList>
<Listing>
<BookListSummary>
<Date>2009-03-30</Date>
</BookListSummary>
<Book>
<BookDetail ID="98">
<BookName>Head First Design Patterns</BookName>
<Author>Kathy Sierra</Author>
<Pages>688</Pages>
<ISBN>0596007124</ISBN>
</BookDetail>
</Book>
</Listing>
</BookList>
</Inventory>
</InventoryBody>
</BookDatabase>
Python Code :
import sys
import os
#import MySQLdb
from lxml import etree
CATEGORIES = set(['BookHeader', 'Inventory', 'PublisherClass', 'PublisherDetails', 'BookDetail'])
SKIP_CATEGORIES = set(['BookHeader'])
DATA_ITEMS = ["Name", "Type", "ID", "BookName", "Author", "Pages", "ISBN"]
def clear_element(element):
    """Release the memory held by *element* and by any earlier siblings.

    Clears the element's own content, then deletes every preceding sibling
    from the parent so already-processed subtrees can be garbage collected.
    Relies on lxml-specific APIs (getprevious/getparent).
    """
    element.clear()
    parent = element.getparent()
    # Deleting index 0 repeatedly removes all siblings that precede us.
    while element.getprevious() is not None:
        del parent[0]
def extract_book_elements(context):
    """Yield each parsed element whose tag is in CATEGORIES.

    After the consumer resumes iteration, the yielded element (and its
    already-seen siblings) are freed via clear_element to keep memory flat.
    """
    for _event, node in context:
        if node.tag not in CATEGORIES:
            continue
        yield node
        clear_element(node)
def fast_iter2(context):
    """Stream over *context* and print a dict of scraped fields per element.

    For every element yielded by extract_book_elements, collects its ID
    attribute and the text of any child listed in DATA_ITEMS, then prints
    the result unless the element's tag is in SKIP_CATEGORIES.
    """
    for book_counter, element in enumerate(extract_book_elements(context)):
        # NOTE(review): BookDetail has child elements, so .text here is only
        # whitespace/None — presumably a placeholder; verify what you want.
        books = [book.text for book in element.findall("BookDetail")]
        bookdetail = {
            'element': element.tag,
            'ID': element.get('ID'),
        }
        for data_item in DATA_ITEMS:
            data = element.find(data_item)
            if data is not None:
                # BUG FIX: store the element's text, not the Element object
                # (the original output printed "<Element Name at 0x...>").
                bookdetail[data_item] = data.text
        if bookdetail['element'] not in SKIP_CATEGORIES:
            #populate_database(bookdetail, books, cursor)
            print(bookdetail, books)
        print("========>", book_counter, "<=======")
def main():
    """Entry point: stream-parse book.xml and print the scraped records."""
    #cursor = connectToDatabase()
    #cursor.execute("""SET NAMES utf8""")
    # BUG FIX: listen only for "end" events. With ("start", "end") every
    # element is visited twice — at "start" its children are not parsed yet,
    # and clearing on the first visit produced the duplicate
    # {'ID': None, ...} rows seen in the output.
    context = etree.iterparse("book.xml", events=("end",))
    #fast_iter(context, cursor)
    fast_iter2(context)
    #cursor.close()

if __name__ == '__main__':
    main()
Python Output :
$ python lxmletree_book.py
========> 0 <=======
========> 1 <=======
{'ID': '12', 'element': 'Inventory'} []
========> 2 <=======
{'ID': '34', 'element': 'PublisherClass'} []
========> 3 <=======
{'Name': <Element Name at 0x105140af0>, 'Type': <Element Type at 0x105140b40>, 'ID': <Element ID at 0x105140b90>, 'element': 'PublisherDetails'} []
========> 4 <=======
{'ID': None, 'element': 'PublisherDetails'} []
========> 5 <=======
{'ID': None, 'element': 'PublisherClass'} []
========> 6 <=======
{'ISBN': <Element ISBN at 0x105140eb0>, 'Name': <Element Name at 0x105140dc0>, 'Author': <Element Author at 0x105140e10>, 'ID': '67', 'element': 'BookDetail', 'Pages': <Element Pages at 0x105140e60>} []
========> 7 <=======
{'ID': None, 'element': 'BookDetail'} []
========> 8 <=======
{'ISBN': <Element ISBN at 0x1051460a0>, 'Name': <Element Name at 0x105140f50>, 'Author': <Element Author at 0x105140fa0>, 'ID': '78', 'element': 'BookDetail', 'Pages': <Element Pages at 0x105146050>} []
========> 9 <=======
{'ID': None, 'element': 'BookDetail'} []
========> 10 <=======
{'ID': None, 'element': 'Inventory'} []
========> 11 <=======
{'ID': '64', 'element': 'Inventory'} []
========> 12 <=======
{'ID': '154', 'element': 'PublisherClass'} []
========> 13 <=======
{'Name': <Element Name at 0x105146230>, 'Type': <Element Type at 0x105146280>, 'ID': <Element ID at 0x1051462d0>, 'element': 'PublisherDetails'} []
========> 14 <=======
{'ID': None, 'element': 'PublisherDetails'} []
========> 15 <=======
{'ID': None, 'element': 'PublisherClass'} []
========> 16 <=======
{'ISBN': <Element ISBN at 0x1051465f0>, 'Name': <Element Name at 0x105146500>, 'Author': <Element Author at 0x105146550>, 'ID': '98', 'element': 'BookDetail', 'Pages': <Element Pages at 0x1051465a0>} []
========> 17 <=======
{'ID': None, 'element': 'BookDetail'} []
========> 18 <=======
{'ID': None, 'element': 'Inventory'} []
========> 19 <=======
Desired Output (eventually stored in MySQL - for now a List in Python):
Publishers
InventoryID PublisherClassID Name Type ID
12 34 Microsoft Press Tech 7462
64 154 O'Reilly Media Tech 7484
Books
PublisherID BookDetailID Name Author Pages ISBN
7462 67 Code Complete 2 Steve McConnell 960 0735619670
7462 78 Application Architecture Guide 2 Microsoft Team 496 073562710X
7484 98 Head First Design Patterns Kathy Sierra 688 0596007124
Upvotes: 8
Views: 3600
Reputation: 879481
You might try something like this:
import MySQLdb
from lxml import etree
import config
def fast_iter(context, func, args=(), kwargs=None):
    """Call ``func(elem, *args, **kwargs)`` for each element in *context*,
    freeing each element (and its already-processed siblings) afterwards so
    memory stays flat while stream-parsing a huge document.

    # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    # Author: Liza Daly
    """
    # BUG FIX: the original used mutable defaults (args=[], kwargs={}),
    # which are shared across calls — the classic Python pitfall.
    if kwargs is None:
        kwargs = {}
    for event, elem in context:
        func(elem, *args, **kwargs)
        elem.clear()
        # Drop every sibling parsed before this element.
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context
def extract_paper_elements(element, cursor):
    """Scrape one <Inventory> element and emit Publisher and Book rows.

    *element* is an Inventory element (iterparse is configured with
    tag='Inventory', so that is all this function ever sees); *cursor* is a
    DB cursor used by the commented-out INSERTs.
    """
    pub = {}
    pub['InventoryID'] = element.attrib['ID']
    try:
        pub['PublisherClassID'] = element.xpath('PublisherClass/@ID')[0]
    except IndexError:
        pub['PublisherClassID'] = None
    # BUG FIX: the original repeated the xpath lookup here unconditionally,
    # re-raising the IndexError the handler above had just swallowed.
    for key in ('Name', 'Type', 'ID'):
        try:
            pub[key] = element.xpath(
                'PublisherClass/Publisher/PublisherDetails/{k}/text()'.format(k=key))[0]
        except IndexError:
            pub[key] = None
    sql = '''INSERT INTO Publishers (InventoryID, PublisherClassID, Name, Type, ID)
             VALUES (%s, %s, %s, %s, %s)
          '''
    args = [pub.get(key) for key in
            ('InventoryID', 'PublisherClassID', 'Name', 'Type', 'ID')]
    print(args)
    # cursor.execute(sql, args)
    # Each book row reuses pub['ID'] (the publisher's ID) as its foreign key.
    for bookdetail in element.xpath('descendant::BookList/Listing/Book/BookDetail'):
        pub['BookDetailID'] = bookdetail.attrib['ID']
        for key in ('BookName', 'Author', 'Pages', 'ISBN'):
            try:
                pub[key] = bookdetail.xpath('{k}/text()'.format(k=key))[0]
            except IndexError:
                pub[key] = None
        sql = '''INSERT INTO Books
                 (PublisherID, BookDetailID, Name, Author, Pages, ISBN)
                 VALUES (%s, %s, %s, %s, %s, %s)
              '''
        args = [pub.get(key) for key in
                ('ID', 'BookDetailID', 'BookName', 'Author', 'Pages', 'ISBN')]
        # cursor.execute(sql, args)
        print(args)
def main():
    """Open the database, stream-parse book.xml one <Inventory> at a time,
    and hand each one (plus the cursor) to extract_paper_elements."""
    connection = MySQLdb.connect(host=config.HOST, user=config.USER,
                                 passwd=config.PASS, db=config.MYDB)
    cursor = connection.cursor()
    # tag='Inventory' means the callback only ever sees Inventory elements.
    context = etree.iterparse("book.xml", events=("end",), tag='Inventory')
    fast_iter(context, extract_paper_elements, args=(cursor,))
    cursor.close()
    connection.commit()
    connection.close()

if __name__ == '__main__':
    main()
A few notes on the code above:

- Unlike fast_iter2, the original fast_iter separates the reusable utility from the specific processing function (extract_paper_elements). fast_iter2 mixes the two together, leaving you with no reusable code.
- If you use the tag parameter in etree.iterparse("book.xml", events=("end",), tag='Inventory'), then your processing function extract_paper_elements will only see Inventory elements.
- Given an Inventory element, you can use the xpath method to burrow down and scrape the desired data.
- The args and kwargs parameters were added to fast_iter so the cursor can be passed through to extract_paper_elements.

Upvotes: 7
Reputation: 348
So if you want something quick and dirty you need to simply remember the last publisher parsed and assign it to each bookdetail until the next PublisherDetails is found. Add something like this to your fast_iter2 function after you extract the bookdetail:
# Remember the most recent publisher so each book can be linked to it.
# Assumes the document order guarantees a PublisherDetails element is always
# seen before the first BookDetail of that publisher — TODO confirm.
if bookdetail['element'] == "PublisherDetails" and bookdetail['ID'] is not None:
    CurrentPublisher = bookdetail
if bookdetail['element'] == "BookDetail":
    # NOTE(review): .text assumes CurrentPublisher['ID'] holds an lxml
    # Element — which it does in the question's fast_iter2, since that code
    # stores Element objects rather than their text. If you fix fast_iter2
    # to store strings, drop the .text here.
    bookdetail['PublisherID'] = CurrentPublisher['ID'].text
Upvotes: 1