Reputation: 43832
I would like to take a multi-page pdf file and create separate pdf files per page.
I have downloaded reportlab and have browsed the documentation, but it seems aimed at pdf generation. I haven't yet seen anything about processing PDF files themselves.
Is there an easy way to do this in python?
Upvotes: 115
Views: 161280
Reputation: 711
# pip install pymupdf
import fitz # Importing the fitz library
# Path of the PDF file to open.
pdf_path = "2021.pdf"

# Open the PDF. Without a document there is nothing left for the script to
# do, so report the problem and stop here instead of carrying on with `pdf`
# undefined (which would only surface later as a NameError).
# NOTE(review): depending on the PyMuPDF version, a missing file may be
# reported with PyMuPDF's own exception type rather than the builtin
# FileNotFoundError -- confirm against the installed version.
try:
    pdf = fitz.open(pdf_path)
except FileNotFoundError:
    print("File not found")
    raise SystemExit(1)
The fitz library is used for working with PDF files and images.
The PDF is opened from the path "2021.pdf". If the file doesn't exist at the specified path, the code catches a FileNotFoundError and prints "File not found".
# iterate over the pages starting from the 7th page (index 6)
# Render every page from index 6 onwards (i.e. from the 7th page of the
# document) to its own JPEG file.
for i, page in enumerate(pdf[6:], start=6):
    try:
        # Render the page to a pixmap without an alpha (transparency) channel.
        pix = page.get_pixmap(alpha=False)
        # Pixmap.save() derives the image format from the file extension.
        # File names use 1-based page numbers, hence i + 1.
        pix.save(f"page_{i+1}.jpg")
    except Exception as e:
        # Best effort: report the failure and continue with the next page.
        print(e)
pdf[6:] refers to the 7th page onwards.
Each page is rendered as a pixmap, which is an image representation of the page. The alpha=False parameter is used to specify that the image should not include transparency.
The image is saved as a JPEG file (page_{i+1}.jpg). The enumeration starts at 6, but we increment it by 1 to reflect the actual page numbers starting from 7.
# close the PDF
# Report completion, then close the document opened earlier as `pdf`.
print("Successful")
pdf.close()
Upvotes: 1
Reputation: 759
The PyPDF2 package gives you the ability to split up a single PDF into multiple ones.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
# `path` and `fname` must exist before the loop runs; point `path` at the PDF
# to split. The output base name is the input file name without directory
# and extension.
path = 'pdf_forms/myform.pdf'
fname = os.path.splitext(os.path.basename(path))[0]

pdf = PdfFileReader(path)
for page in range(pdf.getNumPages()):
    # A fresh writer per page, so every output file holds exactly one page.
    pdf_writer = PdfFileWriter()
    pdf_writer.addPage(pdf.getPage(page))

    # Output names are 1-based: <fname>_page_1.pdf, <fname>_page_2.pdf, ...
    output_filename = '{}_page_{}.pdf'.format(fname, page+1)
    with open(output_filename, 'wb') as out:
        pdf_writer.write(out)
    print('Created: {}'.format(output_filename))
Changes for PyPDF2 3.0.0
import os
from PyPDF2 import PdfReader, PdfWriter
# Input PDF, the base name for the output files, and where to put them.
path = 'pdf_forms/myform.pdf'
fname = 'fname'
output_dir = 'pdf_forms/splitted'

# open(..., 'wb') does not create missing directories, so make sure the
# target directory exists before writing into it.
os.makedirs(output_dir, exist_ok=True)

pdf = PdfReader(path)
for page_number, page in enumerate(pdf.pages, start=1):
    # A fresh writer per page, so every output file holds exactly one page.
    pdf_writer = PdfWriter()
    pdf_writer.add_page(page)

    # Output names are 1-based: <fname>_page_1.pdf, <fname>_page_2.pdf, ...
    output_filename = '{}/{}_page_{}.pdf'.format(output_dir, fname, page_number)
    with open(output_filename, 'wb') as out:
        pdf_writer.write(out)
    print('Created: {}'.format(output_filename))
Source: https://www.blog.pythonlibrary.org/2018/04/11/splitting-and-merging-pdfs-with-python/
Upvotes: 10
Reputation: 31
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import sys
import glob
# Work from the directory that contains this script, so the relative output
# paths below end up next to it.
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)

# Directory to scan for PDFs: the executable's directory when `sys.frozen`
# is truthy, otherwise the directory of this source file. The same name must
# be used here and in the loop below.
if getattr(sys, 'frozen', False):
    __location__ = os.path.dirname(os.path.realpath(sys.executable))
elif __file__:
    __location__ = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))

for file in glob.glob(__location__ + "/*.pdf"):
    if not file.endswith('.pdf'):
        continue

    # Prefix the outputs with the source file's base name so that pages from
    # different PDFs do not overwrite each other's "Page N.pdf".
    source_name = os.path.splitext(os.path.basename(file))[0]

    # `file` already carries the __location__ prefix from the glob pattern.
    # The context manager closes the input even if a page fails to write.
    with open(file, 'rb') as pdf_file:
        pdf_reader = PdfFileReader(pdf_file)
        pageNumbers = pdf_reader.getNumPages()
        for i in range(pageNumbers):
            # One writer per page, so each output file holds a single page.
            pdf_writer = PdfFileWriter()
            pdf_writer.addPage(pdf_reader.getPage(i))
            output_name = source_name + ' - Page ' + str(i+1) + '.pdf'
            with open(output_name, 'wb') as split_motive:
                pdf_writer.write(split_motive)
Upvotes: 0
Reputation: 721
Updated solution for the latest release of PyPDF2 (3.0.0) that extracts a range of pages.
from PyPDF2 import PdfReader, PdfWriter
# Source document and the inclusive, 1-based page range to extract.
file_name = r'c:\temp\junk.pdf'
pages = (121, 130)
first_page, last_page = pages

reader = PdfReader(file_name)
writer = PdfWriter()

# Walk the document with 1-based page numbers and keep only the pages that
# fall inside the requested range.
for page_num, page in enumerate(reader.pages, start=1):
    if first_page <= page_num <= last_page:
        writer.add_page(page)

# All selected pages go into a single output file.
with open(f'{file_name}_page_{pages[0]}-{pages[1]}.pdf', 'wb') as out:
    writer.write(out)
Upvotes: 13
Reputation: 5652
from PyPDF2 import PdfWriter, PdfReader
# Keep the source file open while pages are copied out of it; the context
# manager closes it afterwards instead of leaking the handle.
with open("document.pdf", "rb") as source:
    inputpdf = PdfReader(source)
    for i, page in enumerate(inputpdf.pages):
        # One writer per page, so each output file holds a single page.
        output = PdfWriter()
        output.add_page(page)
        # Output files are numbered from 0: document-page0.pdf, ...
        with open("document-page%s.pdf" % i, "wb") as outputStream:
            output.write(outputStream)
etc.
Upvotes: 243
Reputation: 61
The earlier answers that use PyPDF2 for splitting PDFs no longer work with the latest version. The authors recommend using pypdf instead, and PyPDF2==3.0.1 will be the last version of PyPDF2. The function needs to be modified as follows:
import os
from PyPDF2 import PdfReader, PdfWriter
def split_pdfs(input_file_path):
    """Split a PDF into one single-page PDF per page.

    The pages are written to the ``outputs`` directory (created if missing)
    as ``<input base name>_<index>.pdf``, where ``index`` starts at 0.

    Args:
        input_file_path: Path of the PDF file to split.

    Returns:
        The list of output file paths, in page order.
    """
    os.makedirs("outputs", exist_ok=True)

    # Use only the file's base name without its extension, so inputs that
    # live in another directory (or whose extension is not exactly four
    # characters long) still produce a valid path directly under "outputs".
    base_name = os.path.splitext(os.path.basename(input_file_path))[0]

    out_paths = []
    # Keep the source open while its pages are copied; the context manager
    # closes it afterwards instead of leaking the handle.
    with open(input_file_path, "rb") as source:
        inputpdf = PdfReader(source)
        for i, page in enumerate(inputpdf.pages):
            # One writer per page, so each output file holds a single page.
            output = PdfWriter()
            output.add_page(page)
            out_file_path = f"outputs/{base_name}_{i}.pdf"
            with open(out_file_path, "wb") as output_stream:
                output.write(output_stream)
            out_paths.append(out_file_path)
    return out_paths
Note: The same function will work with pypdf as well. Import PdfReader and PdfWriter from pypdf rather than PyPDF2.
Upvotes: 5
Reputation: 3110
import fitz
src = fitz.open("source.pdf")
for page in src:
    page_index = page.number
    # Fresh, empty document that receives just this one page.
    single_page_doc = fitz.open()
    single_page_doc.insert_pdf(src, from_page=page_index, to_page=page_index)
    # Write the one-page document out and close it before the next page.
    single_page_doc.save(f"page-{page.number}.pdf")
    single_page_doc.close()
Upvotes: 5
Reputation: 109
I know that this code is not related to Python; however, I felt like posting this piece of R code, which is simple, flexible and works amazingly well. The pdftools package in R makes splitting and merging PDFs easy.
library(pdftools) #Rpackage
# Extract pages 1 through 51 of the source PDF into "subset.pdf".
source_pdf <- 'D:\\file\\20.02.20\\22 GT 2017.pdf'
pdf_subset(source_pdf, pages = 1:51, output = "subset.pdf")
Upvotes: 4
Reputation: 1026
I missed a solution here that splits the PDF into two parts which together contain all pages, so I am adding mine in case somebody is looking for the same:
from PyPDF2 import PdfFileWriter, PdfFileReader
def split_pdf_to_two(filename, page_number):
    """Split a PDF into two files at ``page_number``.

    The first ``page_number`` pages are written to ``part1.pdf`` and the
    remaining pages to ``part2.pdf``, both relative to the current working
    directory. If ``page_number`` is not smaller than the number of pages,
    an error message is printed and no files are written.

    Args:
        filename: Path of the PDF file to split.
        page_number: Number of pages that go into the first part.
    """
    # Keep the source open while pages are copied out of it; the context
    # manager closes it afterwards, including on errors.
    with open(filename, "rb") as source:
        pdf_reader = PdfFileReader(source)
        total_pages = pdf_reader.getNumPages()

        # Explicit check rather than `assert`: assertions are removed when
        # Python runs with -O, which would silently disable the validation.
        if page_number >= total_pages:
            print("Error: The PDF you are cutting has less pages than you want to cut!")
            return

        pdf_writer1 = PdfFileWriter()
        pdf_writer2 = PdfFileWriter()

        # Pages [0, page_number) go to the first part, the rest to the second.
        for page in range(page_number):
            pdf_writer1.addPage(pdf_reader.getPage(page))
        for page in range(page_number, total_pages):
            pdf_writer2.addPage(pdf_reader.getPage(page))

        with open("part1.pdf", 'wb') as file1:
            pdf_writer1.write(file1)
        with open("part2.pdf", 'wb') as file2:
            pdf_writer2.write(file2)
Upvotes: 11