Best way to check whether a PDF file is corrupt using Python

Question

I am trying to check whether PDF files are corrupted in a Windows environment, and came up with the following Python code.

I just want to know whether this is the best way to check for corrupted PDF files, or whether there is an easier way.

Note: C:\Temp\python\sample-map (1).pdf is the corrupted PDF file

Here is the sample code

import os
import subprocess
import re
from subprocess import Popen, PIPE

def checkFile(fullfile):
    """Run the external ``file -b`` utility on *fullfile*.

    Args:
        fullfile: Path of the file to inspect.

    Returns:
        A ``(exitcode, out, err)`` tuple: the integer exit status of the
        ``file`` process and its raw stdout / stderr as ``bytes``.  If the
        ``file`` executable cannot be started at all, a non-zero exit code
        (127) is returned with the OS error message in ``err``.
    """
    # -b, --brief : do not prepend filenames to output lines
    command = ["file", "-b", fullfile]
    try:
        # shell=False: with shell=True and an argument *list*, POSIX passes
        # only "file" to the shell and drops "-b" and the path.  Passing the
        # list straight to the OS also avoids any quoting problems with
        # spaces or parentheses in the path.
        proc = subprocess.Popen(
            command,
            shell=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=0,
        )
    except OSError as exc:
        # Executable missing / not runnable: report it the way a shell would
        # (non-zero status + message) instead of raising, so callers that
        # unpack the triple keep working.
        return 127, b"", str(exc).encode("utf-8", "replace")
    out, err = proc.communicate()
    return proc.returncode, out, err

def searchFiles(dirpath):
    """Check every entry of *dirpath* with ``checkFile`` and print the verdict.

    For each readable entry, ``checkFile`` is called and the entry is
    reported as ``ERROR`` when the checker's exit code is non-zero, when it
    wrote anything to stderr, or when its output does not start with
    ``PDF`` followed by whitespace; otherwise it is reported as ``OK``.
    Unreadable entries and an unreadable *dirpath* are reported as such.

    Args:
        dirpath: Directory whose direct entries are checked (no recursion).

    Returns:
        None. All results are printed to stdout.
    """
    pwdpath = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" % pwdpath)

    if not os.access(dirpath, os.R_OK):
        print("Path is not valid")
        return

    print("Path %s validation OK \n" % dirpath)
    for name in os.listdir(dirpath):
        fullfile = os.path.join(dirpath, name)
        if not os.access(fullfile, os.R_OK):
            # Was "$s", which made the % operator raise TypeError.
            print("%s : File not readable" % fullfile)
            continue

        code, out, error = checkFile(fullfile)
        # errors="replace": tool output on Windows is not guaranteed to be
        # UTF-8, and an undecodable byte must not abort the whole scan.
        out_text = out.decode("utf-8", errors="replace")
        err_text = error.decode("utf-8", errors="replace")
        # Same test as the original negative look-ahead "^(?!PDF(\s)).*":
        # the output is acceptable only if it begins with "PDF" + whitespace.
        looks_like_pdf = re.match(r"PDF\s", out_text) is not None

        if code != 0 or err_text != "" or not looks_like_pdf:
            print("ERROR " + fullfile + "\n################")
        else:
            print("OK " + fullfile + "\n################")

if __name__ == "__main__":
    # Raw string: the backslashes are Windows path separators.  In a normal
    # literal "\T" and "\p" are invalid escape sequences, which newer Python
    # versions warn about; the resulting path value is unchanged.
    searchFiles(r'C:\Temp\python')

sample output :

$ "C:/Program Files (x86)/Python37-32/python.exe" c:/Users/myuser/python/check_pdf_file.py
running path : c:\Users\myuser\python
Path C:\Temp\python validation OK

OK C:\Temp\python\Induction Guide.pdf
################
ERROR C:\Temp\python\sample-map (1).pdf
################
OK C:\Temp\python\sample-map.pdf
################

bigant02 · Accepted Answer

I think you can use the PyPDF2 module.

pip install pypdf2

The code is as follows.

from PyPDF2 import PdfFileReader
import os

def checkFile(fullfile):
    """Return True if PyPDF2 can parse *fullfile* and it has document info.

    Args:
        fullfile: Path of the file to inspect.

    Returns:
        True when ``PdfFileReader`` reads the file and ``getDocumentInfo()``
        returns a truthy value; False when that value is falsy, when the
        file cannot be opened (missing, a directory, no permission), or when
        parsing raises any exception.
    """
    try:
        # open() is inside the try so that a subdirectory or unreadable entry
        # is reported as "not a valid PDF" instead of aborting the caller's
        # directory scan with an OSError.
        with open(fullfile, 'rb') as f:
            pdf = PdfFileReader(f)
            info = pdf.getDocumentInfo()
    except Exception:
        # Deliberately broad: any parsing failure counts as corrupt.
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return False
    # NOTE(review): a well-formed PDF that carries no document-information
    # dictionary presumably yields a falsy value here and is therefore
    # reported as invalid - confirm against the PyPDF2 version in use.
    return bool(info)

def searchFiles(dirpath):
    """Check every entry of *dirpath* with ``checkFile`` and print the verdict.

    Each direct entry of the directory is passed to ``checkFile``; a truthy
    result prints ``OK <path>``, anything else prints ``ERROR <path>``.
    If *dirpath* is not readable, ``Path is not valid`` is printed instead.

    Args:
        dirpath: Directory whose direct entries are checked (no recursion).

    Returns:
        None. All results are printed to stdout.
    """
    pwdpath = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" % pwdpath)

    if not os.access(dirpath, os.R_OK):
        print("Path is not valid")
        return

    # The "\n" escapes below had been turned into literal line breaks inside
    # the string literals, which is a syntax error.
    print("Path %s validation OK \n" % dirpath)
    for entry in os.listdir(dirpath):
        fullfile = os.path.join(dirpath, entry)
        verdict = "OK " if checkFile(fullfile) else "ERROR "
        print(verdict + fullfile + "\n################")

if __name__ == "__main__":
    # Raw string: the backslashes are Windows path separators.  In a normal
    # literal "\T" and "\p" are invalid escape sequences, which newer Python
    # versions warn about; the resulting path value is unchanged.
    searchFiles(r'C:\Temp\python')

I tried to match your coding style.

I think this code can also be used on MacOS or Linux.

Best way to check the PDF file is corrupt using python

Answers (2)

Related Questions