Reputation: 460
I have a python module that was already written for me to download and parse data from googles patent listing. The code works great until I do anything before 2005. I have no knowledge of python except how to run the module. How do I fix it?
The traceback I receive is:
Traceback (most recent call last):
File "C:\Users\John\Desktop\FINAL BART ALL INFO-Magic Bullet.py", line 46, in <module>
assert xml_file is not None
AssertionError
And this is the code I'm using:
#Ignore all this information
import urllib2, os, zipfile
from lxml import etree
#-------------------------------------------------------------------------------
#Ignore all this information
def xmlSplitter(data,separator=lambda x: x.startswith('<?xml')):
buff = []
for line in data:
if separator(line):
if buff:
yield ''.join(buff)
buff[:] = []
buff.append(line)
yield ''.join(buff)
def first(seq,default=None):
"""Return the first item from sequence, seq or the default(None) value"""
for item in seq:
return item
return default
#-------------------------------------------------------------------------------
#This is where you change the internet source file- Use the file extensions from the sheet provided.
datasrc = "http://storage.googleapis.com/patents/grant_full_text/2003/pg030107.zip"
#http://commondatastorage.googleapis.com/patents/grant_full_text/2012/ipg120117.zip
filename = datasrc.split('/')[-1]
#-------------------------------------------------------------------------------
#Ignore all this information
if not os.path.exists(filename):
with open(filename,'wb') as file_write:
r = urllib2.urlopen(datasrc)
file_write.write(r.read())
zf = zipfile.ZipFile(filename)
xml_file = first([ x for x in zf.namelist() if x.endswith('.xml')])
assert xml_file is not None
#-------------------------------------------------------------------------------
#output set your folder location here, keep double \\ between
outFolder = "C:\\PatentFiles\\"
outFilename = os.path.splitext(filename)[0]
#-------------------------------------------------------------------------------
#These outputs are the names of the files-Ignore all this information
output = outFolder + outFilename + "_general.txt"
output2 = outFolder + outFilename + "_USCL.txt"
output3 = outFolder + outFilename + "_citation.txt"
output4 = outFolder + outFilename + "_inventor.txt"
#Open files
outFile = open(output, "w")
outFile2 = open(output2, "w")
outFile3 = open(output3, "w")
outFile4 = open(output4, "w")
#write the headers
outFile.write("Patent No.|GrantDate|Application Date|Number of Claims|Examiners|US Primary Main Classification|Assignee|Assignee Address City_State_Country|First Inventor|First Inventor Address City_State_Country| \n")
outFile2.write("Patent No.|Primary|U.S Classification| \n")
outFile3.write ("Patent No.|Citation|Citation Date|Who Cited This| \n")
outFile4.write ("Patent No.|Inventor Last Name|First Name|City|State|Country|Nationality Country|Residence Country|\n")
#-------------------------------------------------------------------------------
#Here is the count- adjust this each time you run the program for the first time.
#Run at 10 for the 1st run then 5500 afterward.
count = 0
for item in xmlSplitter(zf.open(xml_file)):
count += 1
#5500
if count > 10: break
doc = etree.XML(item)
#-------------------------------------------------------------------------------
#This is where the python starts parsing the infomation.
#This is the Start of the General Infomation file.
docID = "~".join(doc.xpath('//publication-reference/document-id/country/text()|//publication-reference/document-id/doc-number/text()'))
docID = docID.replace("D0","D")
docID = docID.replace("H000","H")
docID = docID.replace("PP0","PP")
docID = docID.replace("PP0","PP")
docID = docID.replace("RE0","RE")
docID = docID.replace("~0","~")
docID = docID.replace("US~","")
grantdate = first(doc.xpath('//publication-reference/document-id/date/text()'))
applicationdate = first(doc.xpath('//application-reference/document-id/date/text()'))
claimsNum = first(doc.xpath('//number-of-claims/text()'))
assignee1 = "-".join(doc.xpath('//assignees/assignee/addressbook/orgname/text()|//assignees/assignee/addressbook/last-name/text()|//assignees/assignee/addressbook/first-name/text()'))
assignee1 = assignee1.replace('-',', ')
assignee2 = "_".join(doc.xpath('//assignee/addressbook/address/*/text()'))
assignees = str(assignee1.encode("UTF-8")) + "|" + str(assignee2.encode("UTF-8"))
inventors1 = first(doc.xpath('//applicants/applicant/addressbook/last-name/text()'))
inventor2 = first(doc.xpath('//applicants/applicant/addressbook/first-name/text()'))
inventor3 = first(doc.xpath('//applicants/applicant/addressbook/address/city/text()'))
inventor4 = first(doc.xpath('//applicants/applicant/addressbook/address/state/text()'))
inventor5 = first(doc.xpath('//applicants/applicant/addressbook/address/country/text()'))
inventor = str(inventor2.encode("UTF-8") if inventor2 else inventor2) + " " + str(inventors1.encode("UTF-8") if inventors1 else inventors1)
inventors2 = str(inventor3.encode("UTF-8") if inventor3 else inventor3) + "_" + str(inventor4) + "_" + str(inventor5)
inventors = str(inventor) + "|" + str(inventors2)
examiners = "~".join(doc.xpath('//examiners/primary-examiner/first-name/text()|//examiners/primary-examiner/last-name/text()'))
examiners = examiners.replace("~",", ")
uscl1 = first(doc.xpath('//classification-national/main-classification/text()'))
#END FIRST TEXT FILE #-------------------------------------------------------------------------------
#This begings the USCL file
notprimary = first(doc.xpath('//publication-reference/document-id/country/text()'))
notprimary = notprimary.replace("US","0")
primary1 = first(doc.xpath('//publication-reference/document-id/country/text()'))
primary1 = primary1.replace("US","1")
uscl2 = "~".join(doc.xpath('//us-bibliographic-data-grant/classification-national/*/text()|//sequence-cwu/publication-reference/document-id/country/text()'))
#-------------------------NOTE--------------------------------------------------
#--------------------------NOTE-------------------------------------------------
#-----------------------NOTE----------------------------------------------------
#NOTE- RUN through count 10 then remove pound signs from two below
uscl2 = uscl2.replace("US~", str(primary1) + "|")
uscl2 = uscl2.replace("~", "|" + "\n" + str(docID) + "|" + str(notprimary) + "|")
uscl2 = uscl2.replace("US", "|")
#END SECOND TEXT FILE #-------------------------------------------------------------------------------
#Begin the Citation file
citation = '~'.join(doc.xpath('//publication-reference/document-id/country/text()|//references-cited/citation/patcit/document-id/country/text()|//references-cited/citation/patcit/document-id/doc-number/text()|//references-cited/citation/patcit/document-id/kind/text()|//references-cited/citation/patcit/document-id/date/text()|//references-cited/citation/category/text()'))
#Here is the start of the patent connectors- in the patents they exist at the end. They are replaced in this code to make pipes | for the final output
citation = citation.replace("~A~", "$@")
citation = citation.replace("~S~", "$@")
citation = citation.replace("~S1~", "$@")
citation = citation.replace("~B1~", "$@")
citation = citation.replace("~B2~", "$@")
citation = citation.replace("~A1~", "$@")
citation = citation.replace("~H~", "$@")
citation = citation.replace("~E~", "$@")
#citation = citation.replace("~QQ~", "$@")
#make unique citation changes here-for example when "US" or "DE" in imbeded in citation see below
citation = citation.replace("05225US~", "05225U$|" )
citation = citation.replace("063106 DE", "063106D!" )
citation = citation.replace("US~US~", "US~" )
citation = citation.replace("PCT/US", "PCT/U$")
citation = citation.replace("PCTUS", "PCTU$")
citation = citation.replace("WO US", "WO U$")
citation = citation.replace("WO~US", "WO~ U$")
#fixes for cites without pipes-see below -DONT TOUCH THESE
citation = citation.replace("US~cited by examiner", "||cited by examiner" )
citation = citation.replace("US~cited by other", "||cited by other" )
#Here are the changes to return each citation into a unique row
#If a country is only listed in the columns in Excel they need a fix like this, If KR is alone then use the code:::: citation = citation.replace("KR~", "Foreign -KR-" )
citation = citation.replace("$@", "|")
citation = citation.replace("~US~", "|" + "\n" + str(docID) +"|")
citation = citation.replace("US~", "")
citation = citation.replace("~JP~", "|" + "\n" + str(docID) +"|"+ "Foreign -JP-")
citation = citation.replace("JP~", "Foreign -JP-" )
citation = citation.replace("~GB~", "|" + "\n" + str(docID) +"|"+ "Foreign -GB-")
citation = citation.replace("GB~", "Foreign -GB-" )
citation = citation.replace("~WO~", "|" + "\n" + str(docID) +"|"+ "Foreign -WO-")
citation = citation.replace("WO~", "Foreign -WO-" )
citation = citation.replace("~CA~", "|" + "\n" + str(docID) +"|"+ "Foreign -CA-")
citation = citation.replace("~DE~EP~", "~DE~ EP-" )
citation = citation.replace("~DE~", "|" + "\n" + str(docID) +"|"+ "Foreign -DE-")
citation = citation.replace("DE~", "Foreign -DE-" )
citation = citation.replace("~KR~", "|" + "\n" + str(docID) +"|"+ "Foreign -KR-")
citation = citation.replace("KR~", "Foreign -KR-" )
citation = citation.replace("~EM~", "|" + "\n" + str(docID) +"|"+ "Foreign -EM-")
citation = citation.replace("~CH~", "|" + "\n" + str(docID) +"|"+ "Foreign -CH-")
citation = citation.replace("~DE~", "|" + "\n" + str(docID) +"|"+ "Foreign -DE-")
citation = citation.replace("~SE~", "|" + "\n" + str(docID) +"|"+ "Foreign -SE-")
citation = citation.replace("~FR~", "|" + "\n" + str(docID) +"|"+ "Foreign -FR-")
citation = citation.replace("~FR~EP~", "~FR~ EP-" )
citation = citation.replace("FR~", "Foreign -FR-" )
citation = citation.replace("~CN~", "|" + "\n" + str(docID) +"|"+ "Foreign -CN-")
citation = citation.replace("~TW~", "|" + "\n" + str(docID) +"|"+ "Foreign -TW-")
citation = citation.replace("~TW", "|" + "\n" + str(docID) +"|"+ "Foreign -TW-")
citation = citation.replace("TW~", "Foreign -TW-" )
citation = citation.replace("~NL~", "|" + "\n" + str(docID) +"|"+ "Foreign -NL-")
citation = citation.replace("~BR~", "|" + "\n" + str(docID) +"|"+ "Foreign -BR-")
citation = citation.replace("~AU~", "|" + "\n" + str(docID) +"|"+ "Foreign -AU-")
citation = citation.replace("~ES~", "|" + "\n" + str(docID) +"|"+ "Foreign -ES-")
citation = citation.replace("~IT~", "|" + "\n" + str(docID) +"|"+ "Foreign -IT-")
citation = citation.replace("~SU~", "|" + "\n" + str(docID) +"|"+ "Foreign -SU-")
citation = citation.replace("~AT~", "|" + "\n" + str(docID) +"|"+ "Foreign -AT-")
citation = citation.replace("~BE~", "|" + "\n" + str(docID) +"|"+ "Foreign -BE-")
citation = citation.replace("~DK~", "|" + "\n" + str(docID) +"|"+ "Foreign -DK-")
citation = citation.replace("~RU~", "|" + "\n" + str(docID) +"|"+ "Foreign -RU-")
citation = citation.replace("RU~", "Foreign -RU-" )
#citation = citation.replace("~QQ~", "|" + "\n" + str(docID) +"|"+ "Foreign -QQ-")
#These are just end of citation fixes-DONT TOUCH THESE
citation = citation.replace("cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other", "cited by other" )
citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner" )
citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other" )
citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other" )
citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner" )
citation = citation.replace("cited by other~cited by other", "cited by other" )
citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
citation = citation.replace("cited by other~cited by examiner", "cited by other" )
citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )
citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner" )
citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other" )
citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other" )
citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner" )
citation = citation.replace("cited by other~cited by other", "cited by other" )
citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
citation = citation.replace("cited by other~cited by examiner", "cited by other" )
citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )
citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner" )
citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other" )
citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other" )
citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner" )
citation = citation.replace("cited by other~cited by other", "cited by other" )
citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
citation = citation.replace("cited by other~cited by examiner", "cited by other" )
citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )
citation = citation.replace("cited by other~cited by other", "cited by other" )
citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
citation = citation.replace("cited by other~cited by examiner", "cited by other" )
citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )
citation = citation.replace("~", "|" )
citation = citation.replace("US", "||")
#make unique post-processing citation changes here-If needed for the end of the scripts
citation = citation.replace("CA|", "Foreign -CA-" )
citation = citation.replace("EP|", "Foreign -EP-" )
citation = citation.replace("CN|", "Foreign -CN-" )
citation = citation.replace("$", "S")
citation = citation.replace("D!", "DE")
#citation = citation.replace(" ", " " )
#END CITATION FILE-------------------------------------------------------------------------------
#START the inventors file
inventor1 = doc.xpath('//applicants/applicant/addressbook/last-name/text()|//applicants/applicant/addressbook/first-name/text()|//applicants/applicant/addressbook/address/city/text()|//applicants/applicant/addressbook/address/state/text()|//applicants/applicant/addressbook/address/country/text()|//applicants/applicant/nationality/*/text()|//applicants/applicant/residence/*/text()|//sequence-cwu/publication-reference/document-id/country/text()|//sequence-cwu/number/text()')
inventor1 = '~'.join(inventor1).replace('\n-','')
#For files after 2009 use this to replace State errors in the Excel- If the output is short then use this to add in a None value for State
inventor1 = inventor1.replace('~KR~omitted','~None~KR~omitted')
inventor1 = inventor1.replace('~GB~omitted','~None~GB~omitted')
inventor1 = inventor1.replace('~IT~omitted','~None~IT~omitted')
inventor1 = inventor1.replace('~JP~omitted','~None~JP~omitted')
inventor1 = inventor1.replace('~FR~omitted','~None~FR~omitted')
inventor1 = inventor1.replace('~BR~omitted','~None~BR~omitted')
inventor1 = inventor1.replace('~NO~omitted','~None~NO~omitted')
inventor1 = inventor1.replace('~HK~omitted','~None~HK~omitted')
inventor1 = inventor1.replace('~CA~omitted','~None~CA~omitted')
inventor1 = inventor1.replace('~TW~omitted','~None~TW~omitted')
inventor1 = inventor1.replace('~SE~omitted','~None~SE~omitted')
inventor1 = inventor1.replace('~CH~omitted','~None~CH~omitted')
inventor1 = inventor1.replace('~DE~omitted','~None~DE~omitted')
inventor1 = inventor1.replace('~SG~omitted','~None~SG~omitted')
inventor1 = inventor1.replace('~IN~omitted','~None~IN~omitted')
inventor1 = inventor1.replace('~IL~omitted','~None~IL~omitted')
inventor1 = inventor1.replace('~CN~omitted','~None~CN~omitted')
inventor1 = inventor1.replace('~FI~omitted','~None~FI~omitted')
inventor1 = inventor1.replace('~ZA~omitted','~None~ZA~omitted')
inventor1 = inventor1.replace('~NL~omitted','~None~NL~omitted')
inventor1 = inventor1.replace('~AT~omitted','~None~AT~omitted')
inventor1 = inventor1.replace('~AU~omitted','~None~AU~omitted')
inventor1 = inventor1.replace('~BE~omitted','~None~BE~omitted')
inventor1 = inventor1.replace('~CZ~omitted','~None~CZ~omitted')
inventor1 = inventor1.replace('~RU~omitted','~None~RU~omitted')
inventor1 = inventor1.replace('~IE~omitted','~None~IE~omitted')
inventor1 = inventor1.replace('~AR~omitted','~None~AR~omitted')
inventor1 = inventor1.replace('~MY~omitted','~None~MY~omitted')
inventor1 = inventor1.replace('~SK~omitted','~None~SK~omitted')
inventor1 = inventor1.replace('~ES~omitted','~None~ES~omitted')
inventor1 = inventor1.replace('~NZ~omitted','~None~NZ~omitted')
inventor1 = inventor1.replace('~HU~omitted','~None~HU~omitted')
inventor1 = inventor1.replace('~UA~omitted','~None~UA~omitted')
inventor1 = inventor1.replace('~DK~omitted','~None~DK~omitted')
inventor1 = inventor1.replace('~TH~omitted','~None~TH~omitted')
inventor1 = inventor1.replace('~MX~omitted','~None~MX~omitted')
#inventor1 = inventor1.replace('~QQ~omitted','~None~QQ~omitted')
#For the 2005-2008 files use these lines
inventor1 = inventor1.replace('~NO~NO~NO','~None~NO~NO~NO')
inventor1 = inventor1.replace('~NZ~NZ~NZ','~None~NZ~NZ~NZ')
inventor1 = inventor1.replace('~RU~RU~RU','~None~RU~RU~RU')
inventor1 = inventor1.replace('~RO~RO~RO','~None~RO~RO~RO')
inventor1 = inventor1.replace('~SE~SE~SE','~None~SE~SE~SE')
inventor1 = inventor1.replace('~SG~SG~SG','~None~SG~SG~SG')
inventor1 = inventor1.replace('~SI~SI~SI','~None~SI~SI~SI')
inventor1 = inventor1.replace('~TH~TH~TH','~None~TH~TH~TH')
inventor1 = inventor1.replace('~TR~TR~TR','~None~TR~TR~TR')
inventor1 = inventor1.replace('~TW~TW~TW','~None~TW~TW~TW')
inventor1 = inventor1.replace('~VE~VE~VE','~None~VE~VE~VE')
inventor1 = inventor1.replace('~ZA~ZA~ZA','~None~ZA~ZA~ZA')
inventor1 = inventor1.replace('~AN~AN~AN','~None~AN~AN~AN')
inventor1 = inventor1.replace('~AR~AR~AR','~None~AR~AR~AR')
inventor1 = inventor1.replace('~BA~BA~BA','~None~BA~BA~BA')
inventor1 = inventor1.replace('~PH~PH~PH','~None~PH~PH~PH')
inventor1 = inventor1.replace('~HR~HR~HR','~None~HR~HR~HR')
inventor1 = inventor1.replace('~LT~LT~LT','~None~LT~LT~LT')
inventor1 = inventor1.replace('~EE~EE~EE','~None~EE~EE~EE')
inventor1 = inventor1.replace('~BJ~BJ~BJ','~None~BJ~BJ~BJ')
inventor1 = inventor1.replace('~CR~CR~CR','~None~CR~CR~CR')
inventor1 = inventor1.replace('~PL~PL~PL','~None~PL~PL~PL')
inventor1 = inventor1.replace('~CO~CO~CO','~None~CO~CO~CO')
inventor1 = inventor1.replace('~UA~UA~UA','~None~UA~UA~UA')
inventor1 = inventor1.replace('~KW~KW~KW','~None~KW~KW~KW')
inventor1 = inventor1.replace('~CL~CL~CL','~None~CL~CL~CL')
inventor1 = inventor1.replace('~CY~CY~CY','~None~CY~CY~CY')
inventor1 = inventor1.replace('~LI~LI~LI','~None~LI~LI~LI')
inventor1 = inventor1.replace('~SA~SA~SA','~None~SA~SA~SA')
#inventor1 = inventor1.replace('~QQ~QQ~QQ','~None~QQ~QQ~QQ')
#For lines that don't return use these lines in the code for 2009-
inventor1 = inventor1.replace('omitted~US~','omitted~US' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~FR~','omitted~FR' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~DK~','omitted~DK' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~KR~','omitted~KR' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~JP~','omitted~JP' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~GB~','omitted~GB' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~IT~','omitted~IT' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~CH~','omitted~CH' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~SG~','omitted~SG' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~DE~','omitted~DE' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~IN~','omitted~IN' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~TW~','omitted~TW' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('omitted~CN~','omitted~CN' +"|"+ '\n' + str(docID) +"|")
#inventor1 = inventor1.replace('omitted~QQ~','omitted~QQ' +"|"+ '\n' + str(docID) +"|")
#for lines 2005-2008 use this line for returning countries
inventor1 = inventor1.replace('AT~AT~AT~','AT~AT~AT' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('AN~AN~AN~','AN~AN~AN' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('AR~AR~AR~','AR~AR~AR' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('AU~AU~AU~','AU~AU~AU' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('AZ~AZ~AZ~','AZ~AZ~AZ' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('BA~BA~BA~','BA~BA~BA' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('BE~BE~BE~','BE~BE~BE' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('BR~BR~BR~','BR~BR~BR' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('BS~BS~BS~','BS~BS~BS' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('CA~CA~CA~','CA~CA~CA' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('CH~CH~CH~','CH~CH~CH' +"|"+ '\n' + str(docID) +"|")
inventor1 = inventor1.replace('CN~CN~CN~','CN~CN~CN' +"|"+ '\n' + str(docID) +"|")
#inventor1 = inventor1.replace('QQ~QQ~QQ~','QQ~QQ~QQ' +"|"+ '\n' + str(docID) +"|")
#special case fixes- these are for strange names fixes in the code that may not create the correct amount of columns.
inventor1 = inventor1.replace('~None~None~NO~','~None~NO~')
inventor1 = inventor1.replace('Ramandeep~Chandigarh','Ramandeep|None~Chandigarh')
inventor1 = inventor1.replace('Esk~eh~r','Eskehr')
inventor1 = inventor1.replace('Baychar~Eastport','Baychar~None~Eastport')
inventor1 = inventor1.replace('US~1', '||||||')
inventor1 = inventor1.replace('~','|')
#End the inventor file
#-------------------------------------------------------------------------------
#Here are the output print fields- you can change one if you want but remember to comment out all but the one you wish to view.
print "DocID: {0}\nGrantDate: {1}\nApplicationDate: {2}\nNumber of Claims: {3}\nExaminers: {4}\nAssignee: {5}\nInventor: {6}\nUS Cl.: {7}\n".format(docID,grantdate,applicationdate,claimsNum,examiners.encode("UTF-8"),assignees,inventors,uscl1)
#print "DocID: {0}\nU.S Cl: {1}\nPrimary: {2}\n".format(docID,uscl2,primary1)
#print "DocID: {0}\nCitation: {1}\n".format(docID,citation.encode("UTF-8"))
#print "DocID: {0}\nTitle: {1}\nInventors: {2}\n".format(docID,appID,inventor1.encode("UTF-8"))
#------------------------------------------------------------------------------- IGNORE Everything else below this.
#Output first general info bits
outFile.write(str(docID) +"|"+ str(grantdate) +"|"+ str(applicationdate) + "|"+ str(claimsNum) + "|"+ str(examiners.encode("UTF-8")) + "|"+ str(uscl1) + "|"+ str(assignees) + "|"+ str(inventors) +"|"+"\n")
#Output Classifications only
outFile2.write(str(docID) +"|"+ str(uscl2) +"|"+ "\n")
#Output Citations only
outFile3.write(str(docID) +"|"+ str(citation) +"|"+"\n")
#Output inventors only
outFile4.write(str(docID) + "|"+ str(inventor1.encode("UTF-8")) + "|" +"\n")
outFile.close()
outFile2.close()
outFile3.close()
outFile4.close()
print "output files complete"
Upvotes: 0
Views: 491
Reputation: 176
The problem you are seeing is not a python problem. The code unzips a zipfile and expects to find an xml file inside. The assert statement is a chack statement to make sure an xml file was found. It is designed to halt your program if it doesn't find an xml file. If you download the zipfile assigned to datasrc
you will find an empty zipfile. When it tries to find the xml file, it doesn't find one, so xml_file = None
. Then when it reaches the assert
statement, it raises the Assertion Error.
You could probably take out the assert
and run the code just fine, but then when your program crashes you won't know why. Having it there gives you a convenient way to catch the failure when, where, and why it happens.
Upvotes: 1