Hugo
Hugo

Reputation: 49

Incorrect output: Extracting text from pdf's,docx's pptx's will not output in their own spearte line

I created a function that will open each file in a directory and extract the text from each file and output it in an excel sheet using Pandas. The indexing for each file type seems to be working just fine.However the extracted text from each file comes out next to each other in a list and not separated and next to their corresponding file.

See bottom of script for current output and the out put I want.

** I believe the problem lies in the loader() function which takes in a path, goes through each directory file checks the file .ext and extracts the text.

Thank you!

import re
#import PyPDF4
import pathlib
from pathlib import Path 
import shutil
from datetime import datetime
import time
from configparser import ConfigParser
import glob
import fileinput
import pandas as pd
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import docx2txt
from pptx import Presentation
import more_itertools as mit



p = Path('C:/Users/XXXX/Desktop/test')


txt_files = list(p.rglob('*txt'))
PDF_files = list(p.rglob('*pdf'))
csv_files = list(p.rglob('*csv'))
docx_files = list(p.rglob('*docx'))
pptx_files = list(p.rglob('*pptx'))
#excel_files = list(p.rglob('xls'))




def pdf_to_text(x):

    # PDFMiner 
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Extract text
    fp = open(x, 'rb')
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
    fp.close()

    # Get text from StringIO
    text = sio.getvalue()

    # Cleanup
    device.close()
    sio.close()

    return text

#-------------------------------------------------------------------------------

def loader(path):
    with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
        docx_out,pptx_out,pdf_out = [],[],[]
        if path.suffix == ".pdf":
            for name1 in PDF_files:
                pdf_out.append(pdf_to_text(name1))
            return pdf_out
        elif path.suffix == ".docx":
            for name2 in docx_files:
                docx_out.append(docx2txt.process(name2))
            return docx_out
        elif path.suffix == ".pptx":
            for file in pptx_files:
                prs = Presentation(file)
                for slide in prs.slides:
                    for shape in slide.shapes:
                        if not shape.has_text_frame:
                            continue
                        for paragraph in shape.text_frame.paragraphs:
                            for run in paragraph.runs:
                                pptx_out.append(run.text)
            return pptx_out
        else:
            return f.readlines()

print(pdf_out)



def file_generator():
    files = txt_files+PDF_files+csv_files+docx_files+pptx_files
    for item in files:
        yield {
            "path": item,
            "name": item.name[0:],
            "created": time.ctime(item.stat().st_ctime),
            "modified": time.ctime(item.stat().st_mtime),
            "content": loader(item) 
        }


def to_xlsx():
    df = pd.DataFrame.from_dict(file_generator())
    df.head()
    df.to_excel("tester4.xlsx")

if __name__ == "__main__":
    to_xlsx()
#------------------------------------------------------------
OUTPUT EXAMPLE

current output:                                
  content
["content_test1","content_test2"]  test1.pdf
["content_test1","content_test2"]  test2.pdf

What I want:
["content_test1"]  test1.pdf
["content_test2"]  test2.pdf

Upvotes: 0

Views: 87

Answers (1)

Tyler Quiring
Tyler Quiring

Reputation: 40

The appends called by each filetype_out function look like they are adding the contents of each file to the end of the list pertaining to that filetype. If you want to generate a unique list with the contents of each individual file, I'd recommend creating a separate dict for each filetype, which then includes individual lists for each file processed. Taking the PDFs as an example:

def loader(path):
    with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
        docx_out,pptx_out,pdf_out = {},{},{}
        if path.suffix == ".pdf":
            for name1 in PDF_files:
                name1_contents = []
                name1_contents.append(pdf_to_text(name1))
                pdf_out[name1] = name1_contents
            return pdf_out

To then print out your results in a similar way as you have been:

for name, contents in pdf_out:
    print(contents + '  ' + name)

Upvotes: 1

Related Questions