Reputation: 166
I am trying to check whether PDF files are corrupted in a Windows environment and came up with the following Python code.
I just want to know whether this is the best way to detect corrupted PDF files, or whether there is an easier way.
Note: C:\Temp\python\sample-map (1).pdf
is the corrupted PDF file
Here is the sample code
import os
import subprocess
import re
from subprocess import Popen, PIPE
def checkFile(fullfile):
    """Run the ``file`` utility on one path and return its raw result.

    Args:
        fullfile: Path of the file to inspect.

    Returns:
        A ``(exitcode, out, err)`` tuple: the integer exit status of
        ``file -b`` plus its captured stdout and stderr as ``bytes``.
        If the ``file`` executable cannot be started, the tuple is
        ``(127, b"", <error message as bytes>)``.
    """
    # The argument list is handed straight to the OS instead of using
    # shell=True: with a list and shell=True only "file" reaches the shell on
    # POSIX (the other items become arguments of the shell itself), and on
    # Windows the path would needlessly be routed through cmd.exe.
    # -b, --brief : do not prepend filenames to output lines
    try:
        proc = subprocess.run(
            ["file", "-b", fullfile],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except OSError as exc:
        # ``file`` is missing or not executable. Report it through the same
        # tuple (127 is the conventional "command not found" status) so
        # callers that inspect exitcode/err keep working without a try block.
        return 127, b"", str(exc).encode("utf-8", "replace")
    return proc.returncode, proc.stdout, proc.stderr
def searchFiles(dirpath):
    """Inspect every entry of *dirpath* with ``checkFile`` and print a verdict.

    For each readable entry, ``OK <path>`` or ``ERROR <path>`` is printed.
    An entry is an ERROR when ``file`` exits non-zero, writes anything to
    stderr, or its description does not start with ``PDF`` followed by
    whitespace. Unreadable entries and an unreadable *dirpath* are reported
    with a message instead.

    Args:
        dirpath: Directory whose direct entries are checked.

    Returns:
        None. All results are printed to stdout.
    """
    pwdpath = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" % pwdpath)

    if not os.access(dirpath, os.R_OK):
        print("Path is not valid")
        return

    print("Path %s validation OK \n" % dirpath)
    for name in os.listdir(dirpath):
        fullfile = os.path.join(dirpath, name)
        if not os.access(fullfile, os.R_OK):
            # The placeholder used to be "$s", which made the % operator
            # raise TypeError instead of printing this message.
            print("%s : File not readable" % fullfile)
            continue

        code, out, error = checkFile(fullfile)
        # Raw-string pattern: a brief ``file`` description of a PDF starts
        # with "PDF" followed by whitespace. The output is decoded only when
        # the cheaper checks have passed, as before.
        if (
            code != 0
            or str(error, "utf-8") != ""
            or not re.match(r"PDF\s", str(out, "utf-8"))
        ):
            print("ERROR " + fullfile + "\n################")
        else:
            print("OK " + fullfile + "\n################")
if __name__ == "__main__":
    # Raw string: "\T" and "\p" are not valid escape sequences, so the plain
    # literal only worked by accident and triggers an invalid-escape warning.
    # The resulting path value is unchanged.
    searchFiles(r'C:\Temp\python')
sample output :
$ "C:/Program Files (x86)/Python37-32/python.exe" c:/Users/myuser/python/check_pdf_file.py
running path : c:\Users\myuser\python
Path C:\Temp\python validation OK
OK C:\Temp\python\Induction Guide.pdf
################
ERROR C:\Temp\python\sample-map (1).pdf
################
OK C:\Temp\python\sample-map.pdf
################
Upvotes: 2
Views: 7385
Reputation: 41
As of April 2023, PdfFileReader
is deprecated, which causes the accepted answer's checkFile
function to always return False (its bare except swallows the resulting deprecation error). Here is the updated code:
import os
import argparse
import pandas as pd
from PyPDF2 import PdfReader
def check_file(fullfile):
    """Return True if *fullfile* can be opened and parsed as a PDF.

    The file counts as good when ``PdfReader`` can load it and its page
    list can be resolved. The presence of metadata is deliberately not
    required: ``PdfReader.metadata`` is empty or ``None`` for PDFs that have
    no document-information dictionary, which says nothing about corruption.

    Args:
        fullfile: Path of the file to check.

    Returns:
        True when parsing succeeds; False when the path cannot be opened
        (missing file, directory, permission problem) or parsing fails.
    """
    try:
        # open() is inside the try so that an unopenable entry is reported
        # as False instead of aborting the whole directory scan.
        with open(fullfile, 'rb') as f:
            pdf = PdfReader(f)
            # Force the page tree to be resolved while the file is still
            # open; a damaged document structure raises here.
            len(pdf.pages)
    except Exception:
        # Deliberately broad: a damaged file can make the parser raise many
        # different exception types, and all of them mean "not a usable PDF".
        return False
    return True
def search_files(dirpath: str) -> pd.DataFrame:
    """Check every regular file directly inside *dirpath* with ``check_file``.

    Progress is printed to stdout as ``OK <path>`` / ``ERROR <path>`` lines.

    Args:
        dirpath: Directory to scan (not recursive).

    Returns:
        A DataFrame with the columns ``filename``, ``fullpath`` and
        ``status`` (``'good'`` or ``'corrupted'``), one row per checked file.
        It is empty when *dirpath* is not a readable directory.
    """
    pwdpath = os.path.dirname(os.path.realpath(__file__))
    print("Running path : %s" %pwdpath)
    files = []
    # isdir() guards against a readable regular file being passed as
    # dirpath, which would otherwise make os.listdir() raise.
    if os.path.isdir(dirpath) and os.access(dirpath, os.R_OK):
        print("Path %s validation OK \n" %dirpath)
        for f in os.listdir(dirpath):
            fullfile = os.path.join(dirpath, f)
            if not os.path.isfile(fullfile):
                # os.listdir() also returns subdirectories; they are not
                # PDFs and cannot be opened as files, so skip them.
                continue
            if check_file(fullfile):
                print("OK " + fullfile + "\n################")
                files.append((f, fullfile, 'good'))
            else:
                print("ERROR " + fullfile + "\n################")
                files.append((f, fullfile, 'corrupted'))
    else:
        print("Path is not valid")
    return pd.DataFrame(files, columns=['filename', 'fullpath', 'status'])
def main(args):
    """Scan ``args.dirpath``, save the report as CSV and print a summary.

    Args:
        args: Parsed command-line namespace providing ``dirpath`` (directory
            to scan) and ``output`` (destination path of the CSV report).
    """
    report = search_files(args.dirpath)
    report.to_csv(args.output, index=False)
    print(f'Final report saved to {args.output}')

    # Number of files per status value.
    status_counts = report['status'].value_counts()
    print(status_counts)
if __name__ == '__main__':
    # Command line script for finding corrupted PDFs in a directory.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dirpath',
        type=str,
        required=True,
        help='Path to directory containing PDFs.',
    )
    parser.add_argument(
        '--output',
        type=str,
        required=True,
        help='Path to output CSV file.',
    )
    args = parser.parse_args()
    main(args)
Upvotes: 4
Reputation: 318
I think you can use the PyPDF2 module.
pip install pypdf2
The code is as follows.
from PyPDF2 import PdfFileReader
import os
def checkFile(fullfile):
    """Return True if *fullfile* can be opened and parsed as a PDF.

    The file counts as good when ``PdfFileReader`` can load it and its page
    count can be determined. Metadata is not required, because a PDF without
    a document-information dictionary is still a valid PDF.

    Args:
        fullfile: Path of the file to check.

    Returns:
        True when parsing succeeds; False when the path cannot be opened
        (missing file, directory, permission problem) or parsing fails.
    """
    # NOTE(review): PdfFileReader belongs to the legacy PyPDF2 API. On
    # releases where it has been removed this function returns False for
    # every file - confirm the installed PyPDF2 version.
    try:
        # open() is inside the try so that an unopenable entry is reported
        # as False instead of aborting the whole directory scan.
        with open(fullfile, 'rb') as f:
            pdf = PdfFileReader(f)
            # Resolving the page count walks the document structure, which
            # raises for a damaged file.
            pdf.getNumPages()
    except Exception:
        # Broad on purpose (a damaged file can raise many exception types),
        # but no longer a bare ``except:``, so KeyboardInterrupt and
        # SystemExit still propagate.
        return False
    return True
def searchFiles(dirpath):
    """Check every regular file directly inside *dirpath* with ``checkFile``.

    Prints ``OK <path>`` or ``ERROR <path>`` for each checked file, or
    ``Path is not valid`` when *dirpath* is not a readable directory.

    Args:
        dirpath: Directory to scan (not recursive).

    Returns:
        None. All results are printed to stdout.
    """
    pwdpath = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" %pwdpath )

    # isdir() guards against a readable regular file being passed as
    # dirpath, which would otherwise make os.listdir() raise.
    if not (os.path.isdir(dirpath) and os.access(dirpath, os.R_OK)):
        print("Path is not valid")
        return

    print("Path %s validation OK \n" %dirpath)
    for f in os.listdir(dirpath):
        fullfile = os.path.join(dirpath, f)
        if not os.path.isfile(fullfile):
            # os.listdir() also returns subdirectories; they are not PDFs
            # and cannot be opened as files, so skip them.
            continue
        if checkFile(fullfile):
            print("OK " + fullfile + "\n################")
        else:
            print("ERROR " + fullfile + "\n################")
if __name__ == "__main__":
    # Raw string: "\T" and "\p" are not valid escape sequences, so the plain
    # literal only worked by accident and triggers an invalid-escape warning.
    # The resulting path value is unchanged.
    searchFiles(r'C:\Temp\python')
I tried to match your coding style.
I think this code can also be used on MacOS or Linux.
Upvotes: 3