LoveToCode
LoveToCode

Reputation: 53

Splitting PDF document into multiple documents

I'm trying to split a PDF document into multiple documents, where each output document contains as many consecutive pages as possible while its file size stays at or below a given maximum file size.

My code currently works when run from Eclipse, but when I launch it by double-clicking the .jar file, a static method in one of my Java classes seems to crash (and I can't seem to catch an exception).

The code that isn't working is:

myListOfDocuments=mysplitter.split(document);

Somehow the JVM bails on the static method when the above line is called. The load seems to work fine, as follows: PDDocument document=PDDocument.load(aFile);

Any ideas?

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdmodel.PDDocument;

/**
 * Splits a PDF file into consecutive parts, each holding as many pages as fit
 * within a maximum file size.
 *
 * <p>Every PDFBox document opened here (the source, the per-page documents
 * produced by {@link Splitter}, each output part and each size probe) is closed
 * on both the success and the failure path, so repeated calls do not accumulate
 * open documents.
 */
public class PDFMaxSizeSplitter {

    /** Extension stripped from the source name and appended to every part. */
    private static final String PDF_EXTENSION = ".pdf";

    public static void main(String[] args) {
    }

    /**
     * Splits {@code aFile} into parts named {@code <base>Part000.pdf},
     * {@code <base>Part001.pdf}, ... written next to the source file.
     *
     * <p>The file is returned untouched (as the only list element) when it is
     * already no larger than {@code maxSize} or when it is a directory. A single
     * page that is by itself larger than {@code maxSize} is still written as a
     * part of its own, so the loop always makes progress.
     *
     * <p>An {@link IOException} while reading or writing is reported on standard
     * error and the parts completed so far are returned.
     *
     * @param aFile   the PDF file to split
     * @param maxSize the target maximum size of each part, in bytes
     * @return the files that now represent the document, in page order
     */
    public static ArrayList<File> splitTheFile(File aFile, long maxSize) {
        ArrayList<File> resultFiles = new ArrayList<File>();

        // Nothing to do for files that already fit, and directories cannot be split.
        if (aFile.length() <= maxSize || aFile.isDirectory()) {
            resultFiles.add(aFile);
            return resultFiles;
        }

        // The source document must stay open until every part has been saved,
        // so it is the outermost resource and is closed last.
        try (PDDocument document = PDDocument.load(aFile)) {
            List<PDDocument> pageDocuments = new Splitter().split(document);
            try {
                int firstPage = 0;
                int partNumber = 0;
                while (firstPage < pageDocuments.size()) {
                    int lastPage = getChunk(pageDocuments, firstPage, pageDocuments.size() - 1, maxSize);
                    File partFile = partFile(aFile, partNumber);
                    try (PDDocument part = new PDDocument()) {
                        for (int pageIndex = firstPage; pageIndex <= lastPage; pageIndex++) {
                            part.addPage(pageDocuments.get(pageIndex).getPage(0));
                        }
                        part.save(partFile);
                        resultFiles.add(partFile);
                    }
                    firstPage = lastPage + 1;
                    partNumber++;
                }
            } finally {
                // The per-page documents were never closed before; release them
                // only after all parts that reference their pages are saved.
                closeAll(pageDocuments);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return resultFiles;
    }

    /**
     * Builds the output file for one part, in the same directory as the source.
     *
     * <p>Only a trailing {@code .pdf} (any letter case) is removed from the source
     * name. A source without a parent directory yields a relative file instead of
     * a path starting with the text {@code "null"}.
     */
    private static File partFile(File source, int partNumber) {
        String baseName = source.getName();
        int extensionStart = baseName.length() - PDF_EXTENSION.length();
        if (extensionStart >= 0
                && baseName.regionMatches(true, extensionStart, PDF_EXTENSION, 0, PDF_EXTENSION.length())) {
            baseName = baseName.substring(0, extensionStart);
        }
        String partName = baseName + "Part" + String.format("%03d", partNumber) + PDF_EXTENSION;
        // new File((File) null, child) resolves to just the child path.
        return new File(source.getParentFile(), partName);
    }

    /** Closes every document, reporting (not propagating) individual failures. */
    private static void closeAll(List<PDDocument> documents) {
        for (PDDocument doc : documents) {
            try {
                doc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Binary-searches for the last page index such that pages
     * {@code firstPage..index} saved together are no larger than {@code maxSize}.
     *
     * @return an index in {@code firstPage..lastPage}; {@code firstPage} itself
     *         when even the first page alone is too large
     */
    private static int getChunk(List<PDDocument> thePages, int firstPage, int lastPage, long maxSize)
            throws IOException {
        int low = firstPage;
        int high = lastPage;
        // True once 'high' is a midpoint that was already measured as too large,
        // which lets the final check skip saving that candidate a second time.
        boolean highKnownTooLarge = false;
        while (high - low > 1) {
            int mid = low + (high - low) / 2;
            if (testSize(thePages, firstPage, mid) <= maxSize) {
                low = mid;
            } else {
                high = mid;
                highKnownTooLarge = true;
            }
        }
        if (!highKnownTooLarge && testSize(thePages, firstPage, high) <= maxSize) {
            return high;
        }
        return low;
    }

    /**
     * Measures the saved size, in bytes, of a document made of pages
     * {@code start..stop} (inclusive) without keeping the bytes in memory.
     */
    private static long testSize(List<PDDocument> thePages, int start, int stop) throws IOException {
        ByteCounter counter = new ByteCounter();
        try (PDDocument probe = new PDDocument()) {
            for (int pageIndex = start; pageIndex <= stop; pageIndex++) {
                probe.addPage(thePages.get(pageIndex).getPage(0));
            }
            probe.save(counter);
        }
        return counter.count;
    }

    /** Output sink that discards everything written to it and only counts bytes. */
    private static final class ByteCounter extends OutputStream {
        private long count;

        @Override
        public void write(int b) {
            count++;
        }

        @Override
        public void write(byte[] b, int off, int len) {
            count += len;
        }
    }
}

-----------edit--------------

It turns out the JVM was running out of memory.

Upvotes: 0

Views: 1098

Answers (1)

LoveToCode
LoveToCode

Reputation: 53

It turns out the JVM was running out of memory. I added a JVM argument to increase the available memory, and I switched to 64-bit JVM mode by passing the -d64 argument. I have also been using the temp-file-backed memory management found in PDFBox, e.g., PDDocument.load(aFile, MemoryUsageSetting.setupTempFileOnly());

With these settings, I can handle several gigabytes of files. In the code, I now first try to load the documents directly into memory; if that throws an OutOfMemoryError, I catch it and switch to a low-memory mode. In the low-memory mode I use MemoryUsageSetting.setupTempFileOnly() to avoid using too much of the heap.

Upvotes: 0

Related Questions