Reputation: 87
I have one PDF file, which contains 60 pages. The pages contain both unique and repeated invoice numbers. I'm using Apache PDFBox.
import java.io.*;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.util.*;
import java.util.regex.*;
/**
 * Extracts the text of a PDF file with Apache PDFBox and prints every
 * "Invoice No." entry found in it, one per line, in document order.
 */
public class PDFtest1 {
    /**
     * Loads {@code G:\Sales.pdf}, strips its text and prints each regex match.
     * Any exception is reported via {@code printStackTrace()}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        PDDocument pd = null;
        try {
            File input = new File("G:\\Sales.pdf");
            pd = PDDocument.load(input);
            PDFTextStripper stripper = new PDFTextStripper();
            // Text of the whole document; the matcher can read the String directly,
            // so no intermediate StringBuilder is needed.
            String text = stripper.getText(pd);
            Pattern p = Pattern.compile("Invoice No.\\s\\w\\d\\d\\d\\d\\d\\d\\d\\d\\d\\d");
            // The matcher scans the extracted text for occurrences of the pattern.
            Matcher m = p.matcher(text);
            while (m.find()) {
                // group() returns the full text of the current match.
                System.out.println(m.group());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close in finally so the document is released even when text
            // extraction or matching throws.
            if (pd != null) {
                try {
                    pd.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
I'm able to read all the invoice numbers using a Java regex. The result is as follows:
run:
Invoice No. D0000003010
Invoice No. D0000003011
Invoice No. D0000003011
Invoice No. D0000003011
Invoice No. D0000003011
Invoice No. D0000003012
Invoice No. D0000003012
Invoice No. D0000003012
Invoice No. D0000003013
Invoice No. D0000003013
Invoice No. D0000003014
Invoice No. D0000003014
Invoice No. D0000003015
Invoice No. D0000003016
I need to split the PDF according to the invoice numbers. For example, all pages with Invoice No. D0000003011 should be merged into a single PDF, and so on. How can I achieve this?
Upvotes: 4
Views: 5416
Reputation: 18861
/**
 * Splits {@code G:\Sales.pdf} into one PDF per run of consecutive pages that
 * share an invoice number.
 *
 * Each page's text is searched for "Invoice No." followed by the number. When
 * the number differs from the current one, the current output document is
 * handed to {@code saveCloseCurrent} and a new one is started. A page with no
 * match is appended to the current output document; pages with no match that
 * come before the first numbered page are skipped.
 *
 * @param args unused
 * @throws IOException if reading, importing or saving fails
 * @throws COSVisitorException propagated from {@code saveCloseCurrent}
 */
public static void main(String[] args) throws IOException, COSVisitorException
{
    File input = new File("G:\\Sales.pdf");
    // Compiled once instead of once per page. The group captures only the
    // number itself: the whitespace after "Invoice No." stays outside it so
    // it does not end up in the output file name.
    Pattern p = Pattern.compile("Invoice No.\\s(\\w\\d\\d\\d\\d\\d\\d\\d\\d\\d\\d)");
    PDDocument outputDocument = null;
    PDDocument inputDocument = PDDocument.loadNonSeq(input, null);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        String currentNo = null;
        int pageCount = inputDocument.getNumberOfPages();
        for (int page = 1; page <= pageCount; ++page)
        {
            // Restrict extraction to this single page (page numbers are 1-based here).
            stripper.setStartPage(page);
            stripper.setEndPage(page);
            String text = stripper.getText(inputDocument);
            Matcher m = p.matcher(text);
            String no = null;
            if (m.find())
            {
                no = m.group(1);
            }
            System.out.println("page: " + page + ", value: " + no);
            if (no != null && !no.equals(currentNo))
            {
                // Hand the finished document over before starting the next one;
                // clearing the local first means the finally block below never
                // closes a document that saveCloseCurrent has already dealt with.
                PDDocument finished = outputDocument;
                outputDocument = null;
                saveCloseCurrent(currentNo, finished);
                // create new document
                outputDocument = new PDDocument();
                currentNo = no;
            }
            if (no == null && currentNo == null)
            {
                System.out.println ("header page ??? " + page + " skipped");
                continue;
            }
            // append page to current document (the page list is 0-based)
            PDPage pdPage = (PDPage) inputDocument.getDocumentCatalog().getAllPages().get(page - 1);
            outputDocument.importPage(pdPage);
        }
        PDDocument last = outputDocument;
        outputDocument = null;
        saveCloseCurrent(currentNo, last);
    }
    finally
    {
        // Release whatever is still open, including when an exception
        // interrupted the loop part-way through a group of pages.
        try
        {
            if (outputDocument != null)
            {
                outputDocument.close();
            }
        }
        finally
        {
            inputDocument.close();
        }
    }
}
/**
 * Saves {@code outputDocument} as {@code <currentNo>.pdf} and closes it.
 * Does nothing when {@code currentNo} is {@code null}.
 *
 * If the target file already exists, a message is written to standard error
 * and the JVM exits with status -1 rather than overwriting the file.
 *
 * @param currentNo      invoice number used as the file name, or {@code null}
 *                       when there is nothing to save
 * @param outputDocument document to save and close; only used when
 *                       {@code currentNo} is not {@code null}
 * @throws IOException if saving or closing fails
 * @throws COSVisitorException declared because the document's save call may throw it
 */
private static void saveCloseCurrent(String currentNo, PDDocument outputDocument)
        throws IOException, COSVisitorException
{
    if (currentNo == null)
    {
        return;
    }
    try
    {
        // save document into file
        File f = new File(currentNo + ".pdf");
        if (f.exists())
        {
            System.err.println("File " + f + " exists?!");
            System.exit(-1);
        }
        outputDocument.save(f);
    }
    finally
    {
        // Close even when save() throws, so the document is not leaked.
        outputDocument.close();
    }
}
Beware:
update 19.8.2015:
Upvotes: 6