Avery Delancy
Avery Delancy

Reputation: 1

PDF splitting code is not outputting pdfs after running

I was recently working on a task to modify some Python code that takes a PDF and, based on the requirements, splits it into separate PDFs and outputs them into a folder. The code is able to read through the PDFs and reports the number of pages it went through in order to create the files, but it does not create the documents in the output folder.

After making changes to allow it to read through the pdfs better, I was no longer able to get any outputs.

import re
import time
import os
from PyPDF2 import PdfReader, PdfWriter
import fitz
import parameters

# Folder the source PDFs are read from. os.path.join replaces the hard-coded
# backslash so the path is valid on every OS and the literal no longer
# contains the invalid escape sequence "\C".
input_dir = os.path.join(parameters.inputs_foldername, 'Certificates')
# exist_ok=True replaces the bare `except: pass`, which also hid genuine
# failures such as permission errors.
os.makedirs(input_dir, exist_ok=True)

# Folder the split PDFs are written to. This is a relative path, so it is
# resolved against the current working directory of the process.
output_dir = os.path.join('Outputs_' + parameters.batch_name, 'Split Certificates')
os.makedirs(output_dir, exist_ok=True)

# Split every file found in input_dir into one output PDF per parid.
for path in os.listdir(input_dir):
    full_path = os.path.join(input_dir, path)

    t0 = time.time()

    n_pages = 0  # number of pages in this file, reported in the summary line
    new = True  # True while no output document is currently being collected
    pdf_writer = None  # writer holding the pages of the current document
    parid = None  # id matched on the page text; used as the output file name

    # The same file is opened twice: fitz supplies the page text, PyPDF2
    # supplies the page objects that are copied into the writer.
    with fitz.open(full_path) as doc:
        pdf = PdfReader(full_path)
        n_pages = len(pdf.pages)
        for i, page in enumerate(pdf.pages):
            text = doc[i].get_text()
            is_page_one = "Page 1 of" in text

            if not is_page_one and new:
                # Start of a new document: its id is the seven digits that
                # directly follow a "*  *  *" line in the page text.
                match = re.search(r"(?<=\*  \*  \*\n)\d{7}", text)
                if match is None:
                    print(f"Pattern not found in text: {text}")
                    continue
                parid = match.group()
                # The previous writer was already saved when its "Page 1 of"
                # page was seen, so it is not written again under the new
                # parid here (that file would be overwritten later anyway).
                pdf_writer = PdfWriter()
                pdf_writer.add_page(page)
                new = False
            elif not is_page_one:
                # Further page of the document that is being collected.
                pdf_writer.add_page(page)
            elif not new:
                # A "Page 1 of" page closes the document being collected.
                # NOTE(review): the "Page 1 of" page itself is not added to
                # any writer - confirm this is intended.
                with open(os.path.join(output_dir, f"{parid}.pdf"), "wb") as out:
                    pdf_writer.write(out)
                new = True

    # Save the last pdf_writer after the loop
    if pdf_writer is not None:
        with open(os.path.join(output_dir, f"{parid}.pdf"), "wb") as out:
            pdf_writer.write(out)

    t1 = time.time()
    # Report the real page count; the loop index is no longer used for this,
    # so an empty file reports 0 pages instead of 1.
    print(f"{n_pages} pages processed in {int(t1 - t0)} seconds.")

Upvotes: 0

Views: 42

Answers (1)

AKX
AKX

Reputation: 168967

As I mentioned in my comments, it's hard to help without knowing your PDFs or the details, but I think you might maybe be looking for something like this... maybe. The idea is to have a function that processes the PDF and yields pairs of parid (whatever that is) and the related page; the other function uses itertools.groupby to collate those into groups (assuming the pages are sequential per parid; I think that was the assumption in the original code too) and copy them out to a writer.

import glob
import itertools
import os
import re
import time

import fitz
import parameters
from PyPDF2 import PdfReader, PdfWriter

# Source PDFs are read from <inputs_foldername>/Certificates.
input_dir = '{}/Certificates'.format(parameters.inputs_foldername)
os.makedirs(input_dir, exist_ok=True)
# Split PDFs are written to Outputs_<batch_name>/Split Certificates.
output_dir = 'Outputs_{}/Split Certificates'.format(parameters.batch_name)
os.makedirs(output_dir, exist_ok=True)


def get_pages_and_parids(pdf_path):
    """
    Yield a (parid, page) pair for pages of the PDF at `pdf_path`.

    A page whose text lacks "Page 1 of" is searched for a parid (seven digits
    directly after a "*  *  *" line). If one is found it becomes the current
    parid; if none is found the page is skipped entirely. A page containing
    "Page 1 of" is yielded with whatever parid is current, which is None
    until the first parid has been seen.
    """
    current_parid = None
    with fitz.open(pdf_path) as doc:
        reader = PdfReader(pdf_path)
        for index, page in enumerate(reader.pages):
            page_text = doc[index].get_text()
            if "Page 1 of" not in page_text:
                match = re.search(r"(?<=\*  \*  \*\n)\d{7}", page_text)
                if match is None:
                    # No id on this page: drop it rather than yield it.
                    continue
                current_parid = match.group()
            yield current_parid, page


def process_pdf(full_path):
    """
    Split the PDF at `full_path` into one output file per parid.

    Consecutive (parid, page) pairs from `get_pages_and_parids` that share a
    parid are written to `<output_dir>/<parid>.pdf`. Pages yielded without a
    parid are counted but not written. A one-line summary with both counts
    and the elapsed time is printed at the end.
    """
    t0 = time.time()
    n_pages = 0
    n_pages_written = 0
    pairs = get_pages_and_parids(full_path)
    for parid, group in itertools.groupby(pairs, key=lambda pair: pair[0]):
        pages = [page for _, page in group]
        # Count pages, not groups: incrementing once per group made the
        # "pages processed" figure report the number of groups instead.
        n_pages += len(pages)
        if not parid:
            continue
        pdf_writer = PdfWriter()
        for page in pages:
            pdf_writer.add_page(page)
        n_pages_written += len(pages)
        # groupby only merges adjacent pairs, so a parid that reappears later
        # in the file overwrites the output written for its earlier run.
        with open(f'{output_dir}/{parid}.pdf', "wb") as out:
            pdf_writer.write(out)
    t1 = time.time()
    print(f"{n_pages} pages processed, {n_pages_written} written in {full_path} in {t1 - t0} seconds.")


def main():
    """Run `process_pdf` on every `*.pdf` file directly inside `input_dir`."""
    pdf_pattern = input_dir + "/*.pdf"
    for pdf_path in glob.glob(pdf_pattern):
        process_pdf(pdf_path)

# Only start the split when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Upvotes: 0

Related Questions