Convert PDF to Excel/csv/xlsx

Question

My intention is to convert the text of a PDF file into an Excel/CSV file as follows:

PDF file: (Source File)

#_________________________________________________________________________
appliance
n. 1. See server appliance. 2. See information appliance. 3. A device with a single or limited ......

appliance server
n. 1. An inexpensive computing .....2. See server appliance. 

application
n. A program designed ......
#________________________________________________________________________

Excel File : (Target File)
#________________________________________________________________________
appliance              , n. ,          1. See server appliance    ,
appliance server       , n. ,          1. An inexpensive co       ,
application            , n. ,          A program designed ......  ,  
#________________________________________________________________________

I have converted the PDF into text and am trying to split it with "," and then convert the text file into a CSV file. But I am stuck after converting the PDF to a text file.

import os
from os import chdir, getcwd, listdir, path
import PyPDF2
from time import strftime
def check_path(prompt):
    """Prompt until the user enters a path that exists, then return it.

    (str) -> str

    Args:
        prompt: Text shown to the user each time a path is requested.

    Returns:
        The first entered path for which ``path.exists`` is true.
    """
    # raw_input only exists on Python 2; fall back to input on Python 3 so
    # the entered text is read as a plain string on either interpreter.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    abs_path = read_line(prompt)
    while not path.exists(abs_path):
        print("\nThe specified path does not exist.\n")
        abs_path = read_line(prompt)
    return abs_path
# Convert every .pdf found under a user-supplied folder into a UTF-8 .txt
# file with the same base name, written next to the source PDF.
print("\n")
folder = check_path("Provide absolute path for the folder: ")

# Collect the PDFs from the whole tree. Each file name is joined with the
# directory os.walk is currently visiting (root), not the top-level folder,
# so files inside sub-folders get their real path.
pdf_paths = []
for root, _dirs, files in os.walk(folder):
    for filename in files:
        if filename.endswith('.pdf'):
            pdf_paths.append(os.path.join(root, filename))

# Iterate the collected paths directly: no manual index to overrun the list
# or to be overwritten by the page counter below.
for pdf_path in pdf_paths:
    # Swap only the extension; a ".pdf" elsewhere in the name is left alone.
    txt_path = os.path.splitext(pdf_path)[0] + ".txt"

    # Load the PDF into PyPDF2 and keep the file open while pages are
    # extracted; the handle is closed when the with-block ends.
    with open(pdf_path, "rb") as pdf_file:
        pdf = PyPDF2.PdfFileReader(pdf_file)
        # Extract text page by page, one newline after each page.
        content = "".join(
            pdf.getPage(page_number).extractText() + "\n"
            for page_number in range(pdf.getNumPages())
        )

    print (strftime("%H:%M:%S"), " pdf  -> txt ")

    # The text is encoded explicitly, so write it in binary mode; this works
    # on both Python 2 and Python 3 and the file is always closed.
    with open(txt_path, "wb") as text_file:
        text_file.write(content.encode("UTF-8"))

Convert PDF to Excel/csv/xlsx

Answers (1)

Related Questions