Johnny B
Johnny B

Reputation: 460

Using python to parse XML file

I have a Python module that was written for me to download and parse data from Google's patent listings. The code works great until I point it at any file from before 2005. I have no knowledge of Python beyond how to run the module. How do I fix it?

The traceback I receive is:

Traceback (most recent call last): 
  File "C:\Users\John\Desktop\FINAL BART ALL INFO-Magic Bullet.py", line 46, in <module> 
    assert xml_file is not None
AssertionError

And this is the code I'm using:

#Ignore all this information 
import urllib2, os, zipfile
from lxml import etree
#-------------------------------------------------------------------------------
#Ignore all this information 
def xmlSplitter(data,separator=lambda x: x.startswith('<?xml')):
  """Split a stream of concatenated XML documents into separate strings.

  data is any iterable of lines. separator is a predicate that returns true
  for a line that begins a new document (by default, a line starting with
  '<?xml'). Lines are buffered until the next separator line, and each
  buffered chunk is yielded joined into one string. Lines that appear before
  the first separator are yielded as a chunk of their own.

  Nothing is yielded for empty input, so the caller never receives an empty
  string to hand to the XML parser.
  """
  buff = []
  for line in data:
    # A separator closes the document collected so far (if there is one).
    if separator(line) and buff:
      yield ''.join(buff)
      buff = []
    buff.append(line)
  # Emit the trailing document only when something was actually collected.
  if buff:
    yield ''.join(buff)

def first(seq,default=None):
  """Give back the leading element of seq; fall back to default (None) if seq is empty."""
  return next(iter(seq), default)
#-------------------------------------------------------------------------------
# Internet source file: change this URL to pick another archive, using the
# file names from the sheet provided. Example of a later archive:
# http://commondatastorage.googleapis.com/patents/grant_full_text/2012/ipg120117.zip
datasrc = "http://storage.googleapis.com/patents/grant_full_text/2003/pg030107.zip"
# Local file name: everything after the last "/" of the URL.
filename = datasrc.rpartition('/')[2]
#-------------------------------------------------------------------------------
# Download the archive (only if it is not already on disk) and locate the XML
# document inside it.
if not os.path.exists(filename):
  # Read the whole payload before creating the local file, so a failed
  # download cannot leave an empty file behind that later runs would reuse.
  response = urllib2.urlopen(datasrc)
  try:
    payload = response.read()
  finally:
    response.close()
  with open(filename,'wb') as file_write:
    file_write.write(payload)

zf = zipfile.ZipFile(filename)
# Compare the extension case-insensitively so a member named "*.XML" is found too.
xml_file = first([ x for x in zf.namelist() if x.lower().endswith('.xml')])
if xml_file is None:
  # Raised explicitly (rather than via a bare assert) so the check survives
  # "python -O" and the traceback says what is actually wrong.
  raise AssertionError(
    "No .xml file found inside %s (downloaded from %s); archive members: %r"
    % (filename, datasrc, zf.namelist()))
#-------------------------------------------------------------------------------
# Output folder: set your folder location here. Keep the doubled backslashes
# and the trailing separator, because file names are appended to this string.
outFolder = "C:\\PatentFiles\\"
# Create the folder when it is missing; otherwise the open() calls below fail
# with an IOError after the archive has already been downloaded.
if not os.path.isdir(outFolder):
  os.makedirs(outFolder)
# Base name shared by all output files: the archive name without its extension.
outFilename = os.path.splitext(filename)[0]
#-------------------------------------------------------------------------------
# Full paths of the four output files (one per report).
output = outFolder + outFilename + "_general.txt"
output2 = outFolder + outFilename + "_USCL.txt"
output3 = outFolder + outFilename + "_citation.txt"
output4 = outFolder + outFilename + "_inventor.txt"
# Open the files; they stay open while the documents are processed and are
# closed at the end of the script.
outFile = open(output, "w")
outFile2 = open(output2, "w")
outFile3 = open(output3, "w")
outFile4 = open(output4, "w")
# Write the pipe-delimited header row of each file.
outFile.write("Patent No.|GrantDate|Application Date|Number of Claims|Examiners|US Primary Main Classification|Assignee|Assignee Address City_State_Country|First Inventor|First Inventor Address City_State_Country| \n")
outFile2.write("Patent No.|Primary|U.S Classification| \n")
outFile3.write ("Patent No.|Citation|Citation Date|Who Cited This| \n")
outFile4.write ("Patent No.|Inventor Last Name|First Name|City|State|Country|Nationality Country|Residence Country|\n")
#-------------------------------------------------------------------------------
# Main loop: parse each XML document in the archive and append one record per
# document to each of the four output files.
#Here is the count- adjust this each time you run the program for the first time.
#Run at 10 for the 1st run then 5500 afterward.
count = 0
for item in xmlSplitter(zf.open(xml_file)):
  count += 1
  # Stop once the limit is exceeded (10 for a trial run; use 5500 afterward).
  #5500
  if count > 10: break  
  doc = etree.XML(item)
  #-------------------------------------------------------------------------------
  #This is where the python starts parsing the information.
  #This is the Start of the General Information file.
  # Country and doc-number text nodes are joined with "~"; the replacements
  # below then remove a "0" after the D/H/PP/RE prefixes and after the "~"
  # marker, and finally drop the "US~" country prefix.
  docID = "~".join(doc.xpath('//publication-reference/document-id/country/text()|//publication-reference/document-id/doc-number/text()'))
  docID = docID.replace("D0","D") 
  docID = docID.replace("H000","H")
  # "PP0" is replaced twice, so up to two zeros after "PP" are removed.
  docID = docID.replace("PP0","PP")
  docID = docID.replace("PP0","PP")
  docID = docID.replace("RE0","RE")
  docID = docID.replace("~0","~")
  docID = docID.replace("US~","")

  # first() returns None when the XPath matches nothing; str(None) then ends
  # up as the literal text "None" in the output.
  grantdate = first(doc.xpath('//publication-reference/document-id/date/text()'))
  applicationdate = first(doc.xpath('//application-reference/document-id/date/text()'))
  claimsNum = first(doc.xpath('//number-of-claims/text()'))

  # Assignee names are joined with "-" and then every "-" is turned into ", "
  # (this also rewrites hyphens that are part of a name).
  assignee1 = "-".join(doc.xpath('//assignees/assignee/addressbook/orgname/text()|//assignees/assignee/addressbook/last-name/text()|//assignees/assignee/addressbook/first-name/text()'))
  assignee1 = assignee1.replace('-',', ')
  assignee2 = "_".join(doc.xpath('//assignee/addressbook/address/*/text()'))
  assignees = str(assignee1.encode("UTF-8")) + "|" + str(assignee2.encode("UTF-8"))  

  # First applicant only: last name, first name, city, state, country.
  inventors1 = first(doc.xpath('//applicants/applicant/addressbook/last-name/text()'))
  inventor2 = first(doc.xpath('//applicants/applicant/addressbook/first-name/text()'))
  inventor3 = first(doc.xpath('//applicants/applicant/addressbook/address/city/text()'))
  inventor4 = first(doc.xpath('//applicants/applicant/addressbook/address/state/text()'))
  inventor5 = first(doc.xpath('//applicants/applicant/addressbook/address/country/text()'))
  # Values are UTF-8 encoded only when present; a missing value stays None.
  inventor = str(inventor2.encode("UTF-8") if inventor2 else inventor2) + " " + str(inventors1.encode("UTF-8") if inventors1 else inventors1)
  inventors2 = str(inventor3.encode("UTF-8") if inventor3 else inventor3) + "_" + str(inventor4) + "_" + str(inventor5)
  inventors = str(inventor) + "|" + str(inventors2)

  # Primary examiner name parts joined with "~", then shown as ", "-separated.
  examiners = "~".join(doc.xpath('//examiners/primary-examiner/first-name/text()|//examiners/primary-examiner/last-name/text()'))
  examiners = examiners.replace("~",", ")

  uscl1 = first(doc.xpath('//classification-national/main-classification/text()'))

  #END FIRST TEXT FILE #-------------------------------------------------------------------------------
  #This begins the USCL file
  # NOTE(review): first() returns None when the XPath matches nothing, in
  # which case the .replace() calls below raise AttributeError.
  # The country code is reused as a flag: "US" becomes "0" (not primary) ...
  notprimary = first(doc.xpath('//publication-reference/document-id/country/text()'))
  notprimary = notprimary.replace("US","0")

  # ... and "US" becomes "1" (primary).
  primary1 = first(doc.xpath('//publication-reference/document-id/country/text()'))
  primary1 = primary1.replace("US","1")

  uscl2 = "~".join(doc.xpath('//us-bibliographic-data-grant/classification-national/*/text()|//sequence-cwu/publication-reference/document-id/country/text()'))
  #-------------------------NOTE--------------------------------------------------
  #--------------------------NOTE-------------------------------------------------
  #-----------------------NOTE----------------------------------------------------
  #NOTE- RUN through count 10 then remove pound signs from two below
  # The leading "US~" becomes the primary flag; every remaining "~" starts a
  # new output row prefixed with the document id and the not-primary flag.
  uscl2 = uscl2.replace("US~", str(primary1) + "|")
  uscl2 = uscl2.replace("~", "|" + "\n" + str(docID) + "|" + str(notprimary) + "|")
  uscl2 = uscl2.replace("US", "|") 

  #END SECOND TEXT FILE #-------------------------------------------------------------------------------
  #Begin the Citation file
  # All citation fields are flattened into a single "~"-joined string. The
  # replacements that follow are order-dependent: they convert that string
  # into pipe-delimited rows, one row per cited document.
  citation = '~'.join(doc.xpath('//publication-reference/document-id/country/text()|//references-cited/citation/patcit/document-id/country/text()|//references-cited/citation/patcit/document-id/doc-number/text()|//references-cited/citation/patcit/document-id/kind/text()|//references-cited/citation/patcit/document-id/date/text()|//references-cited/citation/category/text()'))

  #Here is the start of the patent connectors- in the patents they exist at the end. They are replaced in this code to make pipes | for the final output
  # Kind codes between two "~" are swapped for the temporary marker "$@",
  # which is turned into "|" further down.
  citation = citation.replace("~A~", "$@")
  citation = citation.replace("~S~", "$@")
  citation = citation.replace("~S1~", "$@")
  citation = citation.replace("~B1~", "$@")
  citation = citation.replace("~B2~", "$@")
  citation = citation.replace("~A1~", "$@")
  citation = citation.replace("~H~", "$@")
  citation = citation.replace("~E~", "$@")


  #citation = citation.replace("~QQ~", "$@")

  #make unique citation changes here-for example when "US" or "DE" is embedded in citation see below
  # "$" and "!" act as placeholders that protect these letters from the
  # country-code replacements; they are restored near the end of this section.
  citation = citation.replace("05225US~", "05225U$|" )
  citation = citation.replace("063106 DE", "063106D!" )
  citation = citation.replace("US~US~", "US~" )
  citation = citation.replace("PCT/US", "PCT/U$")
  citation = citation.replace("PCTUS", "PCTU$")
  citation = citation.replace("WO US", "WO U$")
  citation = citation.replace("WO~US", "WO~ U$")

  #fixes for cites without pipes-see below -DONT TOUCH THESE
  citation = citation.replace("US~cited by examiner", "||cited by examiner" )
  citation = citation.replace("US~cited by other", "||cited by other" )


  #Here are the changes to return each citation into a unique row
  #If a country is only listed in the columns in Excel they need a fix like this, If KR is alone then use the code:::: citation = citation.replace("KR~", "Foreign -KR-" )
  # Each "~CC~" country marker starts a new row prefixed with the document id;
  # non-US countries are additionally labelled "Foreign -CC-".
  citation = citation.replace("$@", "|")
  citation = citation.replace("~US~", "|" + "\n" + str(docID) +"|")
  citation = citation.replace("US~", "")
  citation = citation.replace("~JP~", "|" + "\n" + str(docID) +"|"+ "Foreign -JP-")
  citation = citation.replace("JP~", "Foreign -JP-" )
  citation = citation.replace("~GB~", "|" + "\n" + str(docID) +"|"+ "Foreign -GB-")
  citation = citation.replace("GB~", "Foreign -GB-" )
  citation = citation.replace("~WO~", "|" + "\n" + str(docID) +"|"+ "Foreign -WO-")
  citation = citation.replace("WO~", "Foreign -WO-" )
  citation = citation.replace("~CA~", "|" + "\n" + str(docID) +"|"+ "Foreign -CA-")
  citation = citation.replace("~DE~EP~", "~DE~ EP-" )
  citation = citation.replace("~DE~", "|" + "\n" + str(docID) +"|"+ "Foreign -DE-")
  citation = citation.replace("DE~", "Foreign -DE-" )
  citation = citation.replace("~KR~", "|" + "\n" + str(docID) +"|"+ "Foreign -KR-")
  citation = citation.replace("KR~", "Foreign -KR-" )
  citation = citation.replace("~EM~", "|" + "\n" + str(docID) +"|"+ "Foreign -EM-")
  citation = citation.replace("~CH~", "|" + "\n" + str(docID) +"|"+ "Foreign -CH-")
  # NOTE(review): this repeats the "~DE~" replacement made a few lines above;
  # it looks redundant - confirm before removing.
  citation = citation.replace("~DE~", "|" + "\n" + str(docID) +"|"+ "Foreign -DE-")
  citation = citation.replace("~SE~", "|" + "\n" + str(docID) +"|"+ "Foreign -SE-")
  citation = citation.replace("~FR~", "|" + "\n" + str(docID) +"|"+ "Foreign -FR-")
  # NOTE(review): "~FR~" was already replaced on the previous line, so this
  # "~FR~EP~" fix presumably never matches (compare the DE/EP ordering above).
  citation = citation.replace("~FR~EP~", "~FR~ EP-" )
  citation = citation.replace("FR~", "Foreign -FR-" )
  citation = citation.replace("~CN~", "|" + "\n" + str(docID) +"|"+ "Foreign -CN-")
  citation = citation.replace("~TW~", "|" + "\n" + str(docID) +"|"+ "Foreign -TW-")
  citation = citation.replace("~TW", "|" + "\n" + str(docID) +"|"+ "Foreign -TW-")
  citation = citation.replace("TW~", "Foreign -TW-" )
  citation = citation.replace("~NL~", "|" + "\n" + str(docID) +"|"+ "Foreign -NL-")
  citation = citation.replace("~BR~", "|" + "\n" + str(docID) +"|"+ "Foreign -BR-")
  citation = citation.replace("~AU~", "|" + "\n" + str(docID) +"|"+ "Foreign -AU-")
  citation = citation.replace("~ES~", "|" + "\n" + str(docID) +"|"+ "Foreign -ES-")
  citation = citation.replace("~IT~", "|" + "\n" + str(docID) +"|"+ "Foreign -IT-")
  citation = citation.replace("~SU~", "|" + "\n" + str(docID) +"|"+ "Foreign -SU-")
  citation = citation.replace("~AT~", "|" + "\n" + str(docID) +"|"+ "Foreign -AT-")
  citation = citation.replace("~BE~", "|" + "\n" + str(docID) +"|"+ "Foreign -BE-")
  citation = citation.replace("~DK~", "|" + "\n" + str(docID) +"|"+ "Foreign -DK-")
  citation = citation.replace("~RU~", "|" + "\n" + str(docID) +"|"+ "Foreign -RU-")
  citation = citation.replace("RU~", "Foreign -RU-" )


  #citation = citation.replace("~QQ~", "|" + "\n" + str(docID) +"|"+ "Foreign -QQ-")

  #These are just end of citation fixes-DONT TOUCH THESE
  # Runs of "cited by ..." categories joined by "~" are collapsed to a single
  # category; the patterns are applied repeatedly to shorten longer runs.
  citation = citation.replace("cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other" )
  citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner", "cited by other" )
  citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )
  citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other" )
  citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner", "cited by other" )
  citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )
  citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other" )
  citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner", "cited by other" )
  citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )
  citation = citation.replace("cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner", "cited by other" )
  citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )

  # Every remaining "~" becomes a column separator.
  citation = citation.replace("~", "|" )

  # Any "US" still left at this point is replaced by two empty columns.
  citation = citation.replace("US", "||")

  #make unique post-processing citation changes here-If needed for the end of the scripts
  citation = citation.replace("CA|", "Foreign -CA-" )
  citation = citation.replace("EP|", "Foreign -EP-" )
  citation = citation.replace("CN|", "Foreign -CN-" )
  # Restore the placeholders introduced above: "$" back to "S", "D!" back to "DE".
  citation = citation.replace("$", "S")
  citation = citation.replace("D!", "DE")

  #citation = citation.replace(" ", " " )

  #END CITATION FILE-------------------------------------------------------------------------------

  #START the inventors file
  # All applicant fields (name, address, nationality, residence) of every
  # applicant are flattened into one "~"-joined string; the replacements below
  # pad missing state values and split the string into one row per inventor.
  inventor1 = doc.xpath('//applicants/applicant/addressbook/last-name/text()|//applicants/applicant/addressbook/first-name/text()|//applicants/applicant/addressbook/address/city/text()|//applicants/applicant/addressbook/address/state/text()|//applicants/applicant/addressbook/address/country/text()|//applicants/applicant/nationality/*/text()|//applicants/applicant/residence/*/text()|//sequence-cwu/publication-reference/document-id/country/text()|//sequence-cwu/number/text()')
  inventor1 = '~'.join(inventor1).replace('\n-','')

  #For files after 2009 use this to replace State errors in the Excel- If the output is short then use this to add in a None value for State
  # Inserts a "None" state column in front of "<country>~omitted".
  inventor1 = inventor1.replace('~KR~omitted','~None~KR~omitted')
  inventor1 = inventor1.replace('~GB~omitted','~None~GB~omitted')
  inventor1 = inventor1.replace('~IT~omitted','~None~IT~omitted')
  inventor1 = inventor1.replace('~JP~omitted','~None~JP~omitted')
  inventor1 = inventor1.replace('~FR~omitted','~None~FR~omitted')
  inventor1 = inventor1.replace('~BR~omitted','~None~BR~omitted')
  inventor1 = inventor1.replace('~NO~omitted','~None~NO~omitted')
  inventor1 = inventor1.replace('~HK~omitted','~None~HK~omitted')
  inventor1 = inventor1.replace('~CA~omitted','~None~CA~omitted')
  inventor1 = inventor1.replace('~TW~omitted','~None~TW~omitted')
  inventor1 = inventor1.replace('~SE~omitted','~None~SE~omitted')
  inventor1 = inventor1.replace('~CH~omitted','~None~CH~omitted')
  inventor1 = inventor1.replace('~DE~omitted','~None~DE~omitted')
  inventor1 = inventor1.replace('~SG~omitted','~None~SG~omitted')
  inventor1 = inventor1.replace('~IN~omitted','~None~IN~omitted')
  inventor1 = inventor1.replace('~IL~omitted','~None~IL~omitted')
  inventor1 = inventor1.replace('~CN~omitted','~None~CN~omitted')
  inventor1 = inventor1.replace('~FI~omitted','~None~FI~omitted')
  inventor1 = inventor1.replace('~ZA~omitted','~None~ZA~omitted')
  inventor1 = inventor1.replace('~NL~omitted','~None~NL~omitted')
  inventor1 = inventor1.replace('~AT~omitted','~None~AT~omitted')
  inventor1 = inventor1.replace('~AU~omitted','~None~AU~omitted')
  inventor1 = inventor1.replace('~BE~omitted','~None~BE~omitted')
  inventor1 = inventor1.replace('~CZ~omitted','~None~CZ~omitted')
  inventor1 = inventor1.replace('~RU~omitted','~None~RU~omitted')
  inventor1 = inventor1.replace('~IE~omitted','~None~IE~omitted')
  inventor1 = inventor1.replace('~AR~omitted','~None~AR~omitted')
  inventor1 = inventor1.replace('~MY~omitted','~None~MY~omitted')
  inventor1 = inventor1.replace('~SK~omitted','~None~SK~omitted')
  inventor1 = inventor1.replace('~ES~omitted','~None~ES~omitted')
  inventor1 = inventor1.replace('~NZ~omitted','~None~NZ~omitted')
  inventor1 = inventor1.replace('~HU~omitted','~None~HU~omitted')
  inventor1 = inventor1.replace('~UA~omitted','~None~UA~omitted')
  inventor1 = inventor1.replace('~DK~omitted','~None~DK~omitted')
  inventor1 = inventor1.replace('~TH~omitted','~None~TH~omitted')
  inventor1 = inventor1.replace('~MX~omitted','~None~MX~omitted')


  #inventor1 = inventor1.replace('~QQ~omitted','~None~QQ~omitted')

  #For the 2005-2008 files use these lines
  # Inserts a "None" state column in front of a country code repeated three times.

  inventor1 = inventor1.replace('~NO~NO~NO','~None~NO~NO~NO')
  inventor1 = inventor1.replace('~NZ~NZ~NZ','~None~NZ~NZ~NZ')
  inventor1 = inventor1.replace('~RU~RU~RU','~None~RU~RU~RU')
  inventor1 = inventor1.replace('~RO~RO~RO','~None~RO~RO~RO')
  inventor1 = inventor1.replace('~SE~SE~SE','~None~SE~SE~SE')
  inventor1 = inventor1.replace('~SG~SG~SG','~None~SG~SG~SG')
  inventor1 = inventor1.replace('~SI~SI~SI','~None~SI~SI~SI')
  inventor1 = inventor1.replace('~TH~TH~TH','~None~TH~TH~TH')
  inventor1 = inventor1.replace('~TR~TR~TR','~None~TR~TR~TR')
  inventor1 = inventor1.replace('~TW~TW~TW','~None~TW~TW~TW')
  inventor1 = inventor1.replace('~VE~VE~VE','~None~VE~VE~VE')
  inventor1 = inventor1.replace('~ZA~ZA~ZA','~None~ZA~ZA~ZA')
  inventor1 = inventor1.replace('~AN~AN~AN','~None~AN~AN~AN')
  inventor1 = inventor1.replace('~AR~AR~AR','~None~AR~AR~AR')
  inventor1 = inventor1.replace('~BA~BA~BA','~None~BA~BA~BA')
  inventor1 = inventor1.replace('~PH~PH~PH','~None~PH~PH~PH')
  inventor1 = inventor1.replace('~HR~HR~HR','~None~HR~HR~HR')
  inventor1 = inventor1.replace('~LT~LT~LT','~None~LT~LT~LT')
  inventor1 = inventor1.replace('~EE~EE~EE','~None~EE~EE~EE')
  inventor1 = inventor1.replace('~BJ~BJ~BJ','~None~BJ~BJ~BJ')
  inventor1 = inventor1.replace('~CR~CR~CR','~None~CR~CR~CR')
  inventor1 = inventor1.replace('~PL~PL~PL','~None~PL~PL~PL')
  inventor1 = inventor1.replace('~CO~CO~CO','~None~CO~CO~CO')
  inventor1 = inventor1.replace('~UA~UA~UA','~None~UA~UA~UA')
  inventor1 = inventor1.replace('~KW~KW~KW','~None~KW~KW~KW')
  inventor1 = inventor1.replace('~CL~CL~CL','~None~CL~CL~CL')
  inventor1 = inventor1.replace('~CY~CY~CY','~None~CY~CY~CY')
  inventor1 = inventor1.replace('~LI~LI~LI','~None~LI~LI~LI')
  inventor1 = inventor1.replace('~SA~SA~SA','~None~SA~SA~SA')

  #inventor1 = inventor1.replace('~QQ~QQ~QQ','~None~QQ~QQ~QQ')

  #For lines that don't return use these lines in the code for 2009-
  # After "omitted~<country>~" a new row is started, prefixed with the document id.
  inventor1 = inventor1.replace('omitted~US~','omitted~US' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~FR~','omitted~FR' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~DK~','omitted~DK' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~KR~','omitted~KR' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~JP~','omitted~JP' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~GB~','omitted~GB' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~IT~','omitted~IT' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~CH~','omitted~CH' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~SG~','omitted~SG' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~DE~','omitted~DE' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~IN~','omitted~IN' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~TW~','omitted~TW' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~CN~','omitted~CN' +"|"+ '\n' + str(docID) +"|")


  #inventor1 = inventor1.replace('omitted~QQ~','omitted~QQ' +"|"+ '\n' + str(docID) +"|")

  #for lines 2005-2008 use this line for returning countries
  # After a country code repeated three times a new row is started, prefixed
  # with the document id.
  inventor1 = inventor1.replace('AT~AT~AT~','AT~AT~AT' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('AN~AN~AN~','AN~AN~AN' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('AR~AR~AR~','AR~AR~AR' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('AU~AU~AU~','AU~AU~AU' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('AZ~AZ~AZ~','AZ~AZ~AZ' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('BA~BA~BA~','BA~BA~BA' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('BE~BE~BE~','BE~BE~BE' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('BR~BR~BR~','BR~BR~BR' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('BS~BS~BS~','BS~BS~BS' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('CA~CA~CA~','CA~CA~CA' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('CH~CH~CH~','CH~CH~CH' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('CN~CN~CN~','CN~CN~CN' +"|"+ '\n' + str(docID) +"|")


  #inventor1 = inventor1.replace('QQ~QQ~QQ~','QQ~QQ~QQ' +"|"+ '\n' + str(docID) +"|")

  #special case fixes- these are for strange names fixes in the code that may not create the correct amount of columns.
  inventor1 = inventor1.replace('~None~None~NO~','~None~NO~')
  inventor1 = inventor1.replace('Ramandeep~Chandigarh','Ramandeep|None~Chandigarh')
  inventor1 = inventor1.replace('Esk~eh~r','Eskehr')
  inventor1 = inventor1.replace('Baychar~Eastport','Baychar~None~Eastport')

  # "US~1" becomes six empty columns; every remaining "~" becomes a column separator.
  inventor1 = inventor1.replace('US~1', '||||||')
  inventor1 = inventor1.replace('~','|') 

  #End the inventor file
  #-------------------------------------------------------------------------------

  #Here are the output print fields- you can change one if you want but remember to comment out all but the one you wish to view.
  # Console preview of the general-information record (Python 2 print statement).
  print "DocID: {0}\nGrantDate: {1}\nApplicationDate: {2}\nNumber of Claims: {3}\nExaminers: {4}\nAssignee: {5}\nInventor: {6}\nUS Cl.: {7}\n".format(docID,grantdate,applicationdate,claimsNum,examiners.encode("UTF-8"),assignees,inventors,uscl1)
  #print "DocID: {0}\nU.S Cl: {1}\nPrimary: {2}\n".format(docID,uscl2,primary1)
  #print "DocID: {0}\nCitation: {1}\n".format(docID,citation.encode("UTF-8"))
  #print "DocID:    {0}\nTitle:    {1}\nInventors: {2}\n".format(docID,appID,inventor1.encode("UTF-8"))

  #------------------------------------------------------------------------------- IGNORE Everything else below this.
  # Each record is written as one pipe-delimited line, beginning with the document id.
  #Output first general info bits
  outFile.write(str(docID) +"|"+ str(grantdate) +"|"+ str(applicationdate) + "|"+ str(claimsNum) + "|"+ str(examiners.encode("UTF-8")) + "|"+ str(uscl1) + "|"+ str(assignees) + "|"+ str(inventors)  +"|"+"\n")

  #Output Classifications only
  outFile2.write(str(docID) +"|"+ str(uscl2) +"|"+ "\n")

  #Output Citations only
  # NOTE(review): unlike the other writes, citation is passed to str() without
  # .encode("UTF-8"); presumably this fails on non-ASCII citation text - confirm.
  outFile3.write(str(docID) +"|"+ str(citation) +"|"+"\n")

  #Output inventors only
  outFile4.write(str(docID)  + "|"+ str(inventor1.encode("UTF-8")) + "|" +"\n")


outFile.close()
outFile2.close()
outFile3.close()
outFile4.close()
print "output files complete"

Upvotes: 0

Views: 491

Answers (1)

jdj081
jdj081

Reputation: 176

The problem you are seeing is not a Python problem. The code opens a zip file and expects to find an XML file inside. The assert statement is a check to make sure an XML file was found; it is designed to halt your program if none is found. If you download the zip file assigned to datasrc, you will find that it is empty. When the code looks for the XML file it doesn't find one, so xml_file is None. Then, when it reaches the assert statement, it raises the AssertionError.

You could probably take out the assert and run the code just fine, but then when your program crashes you won't know why. Having it there gives you a convenient way to catch the failure when, where, and why it happens.

Upvotes: 1

Related Questions