Reputation: 1
I was recently working on a task to modify some Python code that takes a PDF and, based on the requirements, splits it into separate PDFs and writes them to a folder. The code is able to read through the PDFs and reports the number of pages it went through in order to create the files, but it does not actually create the documents in the output folder.
After making changes to allow it to read through the pdfs better, I was no longer able to get any outputs.
import re
import time
import os
from PyPDF2 import PdfReader, PdfWriter
import fitz
import parameters
# Folder holding the certificate PDFs to split.
# Built with os.path.join instead of a '\Certificates' literal: '\C' is an
# invalid escape sequence in a normal string and only works by accident.
input_dir = os.path.join(parameters.inputs_foldername, 'Certificates')
# exist_ok=True tolerates a folder that already exists, while still surfacing
# real failures (e.g. permissions) that the former bare `except: pass` hid.
# Unlike os.mkdir, makedirs also creates missing parent folders.
os.makedirs(input_dir, exist_ok=True)

# Folder receiving the split PDFs, named after the batch.
output_dir = os.path.join('Outputs_' + parameters.batch_name, 'Split Certificates')
os.makedirs(output_dir, exist_ok=True)
# Split every file found in input_dir into one output PDF per parid.
#
# Page classification, as implemented below:
#   * a page whose text lacks "Page 1 of" starts a new output document when
#     none is open, otherwise it is appended to the open document;
#   * a page whose text contains "Page 1 of" closes the open document and is
#     itself not copied to any output.
for path in os.listdir(input_dir):
    full_path = os.path.join(input_dir, path)
    t0 = time.time()
    page_count = 0
    new = True          # True while no output document is open
    pdf_writer = None   # writer for the document currently being collected
    parid = None        # id used as the file name of the open document

    # fitz is used for text extraction, PyPDF2 for copying the pages.
    with fitz.open(full_path) as doc:
        pdf = PdfReader(full_path)
        page_count = len(pdf.pages)
        for i in range(page_count):
            text = doc[i].get_text()
            is_separator = "Page 1 of" in text

            if not is_separator and new:
                # The id is the seven digits right after a "* * *" line.
                match = re.search(r"(?<=\* \* \*\n)\d{7}", text)
                if match is None:
                    print(f"Pattern not found in text: {text}")
                    continue
                parid = match.group()
                # Any earlier writer was already saved when its document was
                # closed, so it is not written again here: doing so would
                # store the old pages under the new parid.
                pdf_writer = PdfWriter()
                pdf_writer.add_page(pdf.pages[i])
                new = False
            elif not is_separator:
                pdf_writer.add_page(pdf.pages[i])
            elif not new:
                with open(os.path.join(output_dir, f"{parid}.pdf"), "wb") as out:
                    pdf_writer.write(out)
                new = True

    # Save the document that is still open when the file ends.
    if not new:
        with open(os.path.join(output_dir, f"{parid}.pdf"), "wb") as out:
            pdf_writer.write(out)

    t1 = time.time()
    # Report the real page count; the loop index is no longer adjusted by hand.
    print(str(page_count) + " pages processed in " + str(int(t1 - t0)) + " seconds.")
Upvotes: 0
Views: 42
Reputation: 168967
As I mentioned in my comments, it's hard to help without knowing your PDFs or the details, but I think you may be looking for something like this. The idea is to have one function that processes the PDF and yields pairs of `parid` (whatever that is) and the related page; the other function uses `itertools.groupby` to collate those pairs into groups (assuming the pages are sequential per `parid`; I think that was the assumption in the original code too) and copies each group out to a writer.
import glob
import itertools
import os
import re
import time
import fitz
import parameters
from PyPDF2 import PdfReader, PdfWriter
# Folder the certificate PDFs are read from; created if it is missing.
input_dir = '{}/Certificates'.format(parameters.inputs_foldername)
os.makedirs(input_dir, exist_ok=True)

# Folder the split PDFs are written to, named after the batch; created if
# it is missing.
output_dir = 'Outputs_{}/Split Certificates'.format(parameters.batch_name)
os.makedirs(output_dir, exist_ok=True)
def get_pages_and_parids(pdf_path):
    """
    Yield (parid, page) pairs for the PDF at `pdf_path`.

    Page text is read through fitz; the yielded page objects come from
    PdfReader. A page whose text lacks "Page 1 of" is searched for seven
    digits directly preceded by "* * *" and a newline: on a hit those digits
    become the current parid, on a miss the page is skipped. A page whose
    text contains "Page 1 of" is yielded with the most recently found parid
    (None if none has been found yet).
    """
    current_parid = None
    with fitz.open(pdf_path) as doc:
        reader = PdfReader(pdf_path)
        for index, page in enumerate(reader.pages):
            page_text = doc[index].get_text()
            if "Page 1 of" not in page_text:
                found = re.search(r"(?<=\* \* \*\n)\d{7}", page_text)
                if found is None:
                    # No id on this page: drop it and keep the previous parid.
                    continue
                current_parid = found.group()
            yield current_parid, page
def process_pdf(full_path):
    """
    Split the PDF at `full_path` into one output PDF per parid.

    Consecutive pages that share a parid are collected into a single
    `{output_dir}/{parid}.pdf`. Pages grouped under an empty parid are counted
    but not written. A summary line with both counts and the elapsed time is
    printed at the end.

    Because only consecutive pages are grouped, a parid that reappears later
    in the file overwrites the output written for its earlier run. Pages that
    get_pages_and_parids() never yields cannot be counted here.
    """
    t0 = time.time()
    n_pages = 0
    n_pages_written = 0
    groups = itertools.groupby(get_pages_and_parids(full_path), key=lambda pair: pair[0])
    for parid, pages in groups:
        if not parid:
            # Count pages, not groups, so the summary really reports pages.
            n_pages += sum(1 for _ in pages)
            continue
        pdf_writer = PdfWriter()
        for _, page in pages:
            pdf_writer.add_page(page)
            n_pages += 1
            n_pages_written += 1
        with open(f'{output_dir}/{parid}.pdf', "wb") as out:
            pdf_writer.write(out)
    t1 = time.time()
    print(f"{n_pages} pages processed, {n_pages_written} written in {full_path} in {t1 - t0} seconds.")
def main():
    """Run process_pdf on every `*.pdf` file directly inside `input_dir`."""
    pattern = input_dir + "/*.pdf"
    matches = glob.glob(pattern)
    for pdf_file in matches:
        process_pdf(pdf_file)


# Only run the batch when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Upvotes: 0