Sha
Sha

Reputation: 166

Best way to check whether a PDF file is corrupt using Python

I am trying to check whether PDF files are corrupted in a Windows environment, and I came up with the following Python code.

I would like to know whether this is the best way to check for corrupted PDF files, or whether there is an easier way.

Note: C:\Temp\python\sample-map (1).pdf is the corrupted PDF file

Here is the sample code

import os
import subprocess
import re
from subprocess import Popen, PIPE

def checkFile(fullfile):
    """Run the external ``file`` utility on *fullfile* and capture its result.

    Args:
        fullfile: Path of the file to inspect.

    Returns:
        A ``(exitcode, out, err)`` tuple: the integer exit status of ``file``
        and its raw stdout and stderr as ``bytes``. If the ``file``
        executable cannot be started, a non-zero exit code is returned with
        the OS error message in ``err``.
    """
    try:
        # shell=False: an argument list combined with shell=True only reaches
        # the program intact on Windows (on POSIX just "file" would run), and
        # it would let special characters in file names reach the shell.
        # -b, --brief : do not prepend filenames to output lines
        proc = subprocess.Popen(
            ["file", "-b", fullfile],
            stdout=PIPE,
            stderr=PIPE,
            bufsize=0,
        )
    except OSError as exc:
        # With a shell, a missing executable showed up as a non-zero exit
        # status plus a message on stderr; keep that contract for callers.
        return 127, b"", str(exc).encode("utf-8", "replace")
    out, err = proc.communicate()
    return proc.returncode, out, err

def searchFiles(dirpath):
    """Check every entry of *dirpath* with ``checkFile`` and print a verdict.

    An entry is reported as ``OK`` only when ``checkFile`` returns exit code
    0, empty stderr, and a description that starts with ``PDF`` followed by
    whitespace; otherwise it is reported as ``ERROR``. Entries that are not
    readable are reported separately.

    Args:
        dirpath: Directory whose entries are checked (not recursive).

    Returns:
        None. All results are printed to stdout.
    """
    pwdpath = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" % pwdpath)

    if not os.access(dirpath, os.R_OK):
        print("Path is not valid")
        return

    print("Path %s validation OK \n" % dirpath)
    for name in os.listdir(dirpath):
        fullfile = os.path.join(dirpath, name)
        if not os.access(fullfile, os.R_OK):
            # The placeholder used to be "$s", which made the % operator
            # raise TypeError instead of printing this message.
            print("%s : File not readable" % fullfile)
            continue

        code, out, error = checkFile(fullfile)
        # errors="replace": tool output is not guaranteed to be valid UTF-8,
        # and a decode failure must not abort the whole scan.
        description = out.decode("utf-8", errors="replace")
        looks_like_pdf = re.match(r"PDF\s", description) is not None
        if code != 0 or error or not looks_like_pdf:
            print("ERROR " + fullfile + "\n################")
        else:
            print("OK " + fullfile + "\n################")

if __name__ == "__main__":
    # Raw string: "\T" and "\p" are not valid escape sequences, so the plain
    # literal only worked by accident and triggers a warning on newer Pythons.
    # The resulting path value is unchanged.
    searchFiles(r'C:\Temp\python')

sample output :

$ "C:/Program Files (x86)/Python37-32/python.exe" c:/Users/myuser/python/check_pdf_file.py
running path : c:\Users\myuser\python
Path C:\Temp\python validation OK

OK C:\Temp\python\Induction Guide.pdf
################
ERROR C:\Temp\python\sample-map (1).pdf
################
OK C:\Temp\python\sample-map.pdf
################

Upvotes: 2

Views: 7385

Answers (2)

Daniel Barker
Daniel Barker

Reputation: 41

As of April 2023, PdfFileReader is deprecated, which causes the accepted answer's checkFile function to always return False (its bare except swallows the resulting error). Here is updated code:

import os
import argparse

import pandas as pd
from PyPDF2 import PdfReader


def check_file(fullfile):
    """Return True if *fullfile* can be parsed as a PDF with metadata.

    Args:
        fullfile: Path of the file to check.

    Returns:
        True when ``PdfReader`` reads the file and its ``metadata`` is
        non-empty; False when the file cannot be opened, cannot be parsed,
        or yields no metadata.
    """
    try:
        # open() is inside the try so that directories and unreadable or
        # missing entries are reported as bad instead of aborting the scan.
        with open(fullfile, 'rb') as f:
            pdf = PdfReader(f)
            # NOTE(review): a PDF without any document metadata is also
            # reported as corrupted by this test - confirm that is intended.
            return bool(pdf.metadata)
    except Exception:
        # Deliberately broad: any failure to open or parse counts as corrupt.
        return False


def search_files(dirpath: str) -> pd.DataFrame:
    """Classify every entry of *dirpath* with ``check_file``.

    Progress is printed to stdout for each entry.

    Args:
        dirpath: Directory whose entries are checked (not recursive).

    Returns:
        A DataFrame with columns ``filename``, ``fullpath`` and ``status``,
        where ``status`` is ``'good'`` or ``'corrupted'``. The frame is empty
        when *dirpath* is not readable.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    print(f"Running path : {script_dir}")

    rows = []
    if not os.access(dirpath, os.R_OK):
        print("Path is not valid")
    else:
        print(f"Path {dirpath} validation OK \n")
        for name in os.listdir(dirpath):
            path = os.path.join(dirpath, name)
            if check_file(path):
                label, status = "OK", 'good'
            else:
                label, status = "ERROR", 'corrupted'
            print(f"{label} {path}\n################")
            rows.append((name, path, status))

    return pd.DataFrame(rows, columns=['filename', 'fullpath', 'status'])


def main(args):
    """Scan ``args.dirpath``, write the report to ``args.output`` as CSV.

    Args:
        args: Object exposing ``dirpath`` and ``output`` attributes.
    """
    report = search_files(args.dirpath)
    report.to_csv(args.output, index=False)
    print(f'Final report saved to {args.output}')
    # Summary of how many entries fell into each status.
    status_counts = report['status'].value_counts()
    print(status_counts)


if __name__ == '__main__':
    # Command line script for finding corrupted PDFs in a directory.
    parser = argparse.ArgumentParser()
    # Both options are mandatory string paths.
    for flag, help_text in (
        ('--dirpath', 'Path to directory containing PDFs.'),
        ('--output', 'Path to output CSV file.'),
    ):
        parser.add_argument(flag, type=str, required=True, help=help_text)
    args = parser.parse_args()
    main(args)

Upvotes: 4

bigant02
bigant02

Reputation: 318

I think you can use the PyPDF2 module.

pip install pypdf2

The code is as follows.

from PyPDF2 import PdfFileReader
import os

def checkFile(fullfile):
    """Return True if *fullfile* can be parsed as a PDF with document info.

    Args:
        fullfile: Path of the file to check.

    Returns:
        True when ``PdfFileReader`` reads the file and ``getDocumentInfo()``
        is non-empty; False when the file cannot be opened, cannot be parsed,
        or yields no document info.
    """
    try:
        # open() is inside the try so that directories and unreadable or
        # missing entries are reported as bad instead of aborting the scan.
        with open(fullfile, 'rb') as f:
            # NOTE(review): PdfFileReader is deprecated in newer PyPDF2
            # releases; there the error lands in the handler below and every
            # file is reported as corrupt. Switch to PdfReader / .metadata
            # when upgrading.
            pdf = PdfFileReader(f)
            return bool(pdf.getDocumentInfo())
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate; any parse failure counts as corrupt.
        return False

def searchFiles(dirpath):
    """Print an OK/ERROR verdict from ``checkFile`` for each entry of *dirpath*.

    Args:
        dirpath: Directory whose entries are checked (not recursive).

    Returns:
        None. All results are printed to stdout.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" % script_dir)

    # Guard clause: nothing to scan when the directory is not readable.
    if not os.access(dirpath, os.R_OK):
        print("Path is not valid")
        return

    print("Path %s validation OK \n" % dirpath)
    for entry in os.listdir(dirpath):
        candidate = os.path.join(dirpath, entry)
        verdict = "OK " if checkFile(candidate) else "ERROR "
        print(verdict + candidate + "\n################")

if __name__ == "__main__":
    # Raw string: "\T" and "\p" are not valid escape sequences, so the plain
    # literal only worked by accident and triggers a warning on newer Pythons.
    # The resulting path value is unchanged.
    searchFiles(r'C:\Temp\python')

I tried to match your coding style.

I think this code can also be used on macOS or Linux.

Upvotes: 3

Related Questions