Reputation: 81
I am trying to strip out only the first page of multiple PDF files and combine them into one file. (I receive 150 PDF files a day; the first page is the invoice, which I need, and the following 3 to 12 pages are just backup, which I do not need.) So the input is 150 PDF files of varying size, and the output I want is one PDF file containing only the first page of each of the 150 files.
What I seem to have done is to have merged all the pages EXCEPT the first page (which is the only one I need).
# Get all PDF documents in current directory
import os

pdf_files = []
for filename in os.listdir("."):
    if filename.endswith(".pdf"):
        pdf_files.append(filename)
# Case-insensitive sort so the merge order does not depend on capitalisation.
pdf_files.sort(key=str.lower)

# Take first page from each PDF
from PyPDF2 import PdfFileWriter, PdfFileReader

# NOTE(review): the original indentation of this snippet was lost; the nesting
# below is reconstructed from statement order and should be confirmed.
for filename in pdf_files:
    reader = PdfFileReader(filename)
    # NOTE(review): a fresh writer is created for every input file, so pages
    # collected from earlier files are discarded.
    writer = PdfFileWriter()
    # NOTE(review): page indices are zero-based, so starting the range at 1
    # skips the first page and adds every page after it. This is the
    # behaviour the question reports and is intentionally left as asked.
    for pageNum in range(1, reader.numPages):
        page = reader.getPage(pageNum)
        writer.addPage(page)

with open("CombinedFirstPages.pdf", "wb") as fp:
    writer.write(fp)
Upvotes: 8
Views: 11571
Reputation: 43
The other answers don't work if the PDF has no text and contains only images. The code below should work for any kind of PDF (see the relevant pypdf documentation).
from pathlib import Path

from pypdf import PdfWriter

# Collect every PDF under the folder (recursively), in sorted path order.
pdf_files = sorted(Path("<path-to-folder-with-files>").glob('**/*.pdf'))

# Take first page from each PDF
pdf_writer = PdfWriter()
for file in pdf_files:
    # pages=(0, 1) is a (start, stop) range, so only page index 0 is appended.
    pdf_writer.append(file, pages=(0, 1))

# Write once, after the first page of every input has been collected.
with open("CombinedFirstPages.pdf", "wb") as fp:
    pdf_writer.write(fp)
Upvotes: 2
Reputation: 1
I made some changes. The following piece of code worked for me.
import os

from PyPDF2 import PdfWriter, PdfReader

# The combined file is written into the directory that is being scanned, so
# it must be excluded from the inputs or a second run would merge it as well.
OUTPUT_NAME = "CombinedFirstPages.pdf"

# Get all PDF documents in current directory
pdf_files = [
    filename
    for filename in os.listdir(".")
    if filename.endswith(".pdf") and filename != OUTPUT_NAME
]
# Case-insensitive sort so the merge order does not depend on capitalisation.
pdf_files.sort(key=str.lower)

# Take first page from each PDF
pdf_writer = PdfWriter()
for filename in pdf_files:
    reader = PdfReader(filename)
    # Index 0 is the first page of the document.
    pdf_writer.add_page(reader.pages[0])

# Write once, after the first page of every input has been collected.
with open(OUTPUT_NAME, "wb") as fp:
    pdf_writer.write(fp)
Upvotes: 0
Reputation: 2091
Try this:
# Get all PDF documents under the target folder (searched recursively)
import os

# NOTE(review): PdfFileReader/PdfFileWriter, getPage and addPage are the older
# PyPDF2 names; newer releases use PdfReader/PdfWriter, .pages and add_page.
# Confirm which names the installed version provides.
from PyPDF2 import PdfFileReader, PdfFileWriter

your_target_folder = "."
# Absolute path of the result, so it can be recognised and skipped if the walk
# below encounters a combined file left over from a previous run.
output_path = os.path.abspath("CombinedFirstPages.pdf")

pdf_files = []
for dirpath, _, filenames in os.walk(your_target_folder):
    for items in filenames:
        file_full_path = os.path.abspath(os.path.join(dirpath, items))
        # Case-insensitive extension check, so ".PDF" files are included too.
        if not file_full_path.lower().endswith(".pdf"):
            continue
        if file_full_path == output_path:
            continue
        pdf_files.append(file_full_path)
pdf_files.sort(key=str.lower)

# Take first page from each PDF
writer = PdfFileWriter()
for file_path in pdf_files:
    reader = PdfFileReader(file_path)
    page = reader.getPage(0)
    writer.addPage(page)

# Write once, after the first page of every input has been collected.
with open(output_path, "wb") as output:
    writer.write(output)
Upvotes: 2
Reputation: 3332
This script takes all the PDF files in the current working directory and converts the first page of each one to a PNG image.
#pip install pdf2image
import os
import tempfile
from pdf2image import convert_from_path
output_folder=os.getcwd() #current work directory
def pdf_to_png(pdf_name, source, destino):
    """Render the first page of a PDF as a PNG file.

    Args:
        pdf_name: File name of the PDF inside ``source``.
        source: Directory that contains the PDF.
        destino: Directory the PNG is written to.

    Returns:
        Whatever ``convert_from_path`` returns (presumably a list holding the
        single rendered image; confirm against the pdf2image documentation).
    """
    # The PNG reuses the PDF's base name; splitext drops the extension
    # whatever its length instead of assuming exactly four characters.
    output_stem, _ = os.path.splitext(pdf_name)
    return convert_from_path(
        pdf_path=os.path.join(source, pdf_name),
        dpi=100,
        output_folder=destino,
        fmt="png",
        output_file=output_stem,
        # single_file=True limits the conversion to one output file.
        single_file=True,
    )
# Convert every PDF found in output_folder, writing each PNG next to its PDF.
for filename in os.listdir(output_folder):
    if filename.endswith(".pdf"):
        pdf_to_png(filename, output_folder, output_folder)
# NOTE(review): original indentation was lost; the completion message is
# assumed to be printed once, after all files have been processed.
print("ok!")
Upvotes: -1