Reputation: 1549
I am trying to split pdf by its bookmarks using itext7.
Problem : if Pdf is having same bookmark in other place in the outline tree , it is over ridding and unable to split.
Sample code to reproduce the problem:
public void walkOutlines(PdfOutline outline, Map<String, PdfObject> names, PdfDocument pdfDocument,List<String>titles,List<Integer>pageNum) { //----------loop traversing all paths
for (PdfOutline child : outline.getAllChildren()){
if(child.getDestination() != null) {
prepareIndexFile(child,names,pdfDocument,titles,pageNum,list);
}
}
}
//------------Getting pageNumbers from outlines
public void prepareIndexFile(PdfOutline outline, Map<String, PdfObject> names, PdfDocument pdfDocument,List<String>titles,List<Integer>pageNum) {
String title = outline.getTitle();
PdfDestination pdfDestination = outline.getDestination();
String pdfStr = ((PdfString)pdfDestination.getPdfObject()).toUnicodeString();
PdfArray array = (PdfArray) names.get(pdfStr);
PdfObject pdfObj = array != null ? array.get(0) : null;
Integer pageNumber = pdfDocument.getPageNumber((PdfDictionary)pdfObj);
titles.add(title);
pageNum.add(pageNumber);
if(outline.getAllChildren().size() > 0) {
for (PdfOutline child : outline.getAllChildren()){
prepareIndexFile(child,names,pdfDocument,titles,pageNum);
}
}
}
public boolean splitPdf(String inputFile, final String outputFolder) {
boolean splitSuccess = true;
PdfDocument pdfDoc = null;
try {
PdfReader pdfReaderNew = new PdfReader(inputFile);
pdfDoc = new PdfDocument(pdfReaderNew);
final List<String> titles = new ArrayList<String>();
List<Integer> pageNum = new ArrayList<Integer>();
PdfNameTree destsTree = pdfDoc.getCatalog().getNameTree(PdfName.Dests);
Map<String, PdfObject> names = destsTree.getNames();//--------------------------------------Core logic for getting names
PdfOutline root = pdfDoc.getOutlines(false);//--------------------------------------Core logic for getting outlines
walkOutlines(root,names, pdfDoc, titles, pageNum,content); //------Logic to get bookmarks and pageNumbers
if (titles == null || titles.size()==0) {
splitSuccess = false;
}else { //------Proceed if it has bookmarks
for(int i=0;i<titles.size();i++) {
String title = titles.get(i);
String startPageNmStr =""+pageNum.get(i);
int startPage = Integer.parseInt(startPageNmStr);
int endPage = startPage;
if(i == titles.size() - 1) {
endPage = pdfDoc.getNumberOfPages();
}else {
int nextPage = pageNum.get(i+1);
if(nextPage > startPage) {
endPage = nextPage - 1;
}else {
endPage = nextPage;
}
}
String outFileName = outputFolder + File.separator + getFileName(title) + ".pdf";
PdfWriter pdfWriter = new PdfWriter(outFileName);
PdfDocument newDocument = new PdfDocument(pdfWriter, new DocumentProperties().setEventCountingMetaInfo(null));
pdfDoc.copyPagesTo(startPage, endPage, newDocument);
newDocument.close();
pdfWriter.close();
}
}
}catch(Exception e){
//---log
}
}
Found root cause: In PdfNameTree items.put(name.toUnicodeString(), names.get(k));
How to over come this issue?
Thanks in advance
Upvotes: 1
Views: 1135
Reputation: 12312
This part of the code:
PdfDestination pdfDestination = outline.getDestination();
String pdfStr = ((PdfString)pdfDestination.getPdfObject()).toUnicodeString();
PdfArray array = (PdfArray) names.get(pdfStr);
PdfObject pdfObj = array != null ? array.get(0) : null;
Integer pageNumber = pdfDocument.getPageNumber((PdfDictionary)pdfObj);
Does not take into account the case that the destination can be non-named and refer to a page explicitly.
So the code needs to be adapted into the following code:
PdfDestination pdfDestination = outline.getDestination();
PdfObject pdfObj = null;
if (pdfDestination.getPdfObject().isString()) {
String pdfStr = ((PdfString) pdfDestination.getPdfObject()).toUnicodeString();
PdfArray array = (PdfArray) names.get(pdfStr);
if (array != null) {
pdfObj = array.get(0);
}
} else if (pdfDestination.getPdfObject().isArray() && ((PdfArray)pdfDestination.getPdfObject()).get(0).isDictionary()) {
pdfObj = ((PdfArray)pdfDestination.getPdfObject()).get(0);
}
Integer pageNumber = pdfDocument.getPageNumber((PdfDictionary)pdfObj);
Additionally, if you want to obtain the full title names including the parent chain, you need to replace String title = outline.getTitle();
with the following piece of code:
String title = outline.getTitle();
PdfOutline parentChain = outline.getParent();
while (parentChain != null) {
title = parentChain.getTitle() + "." + title;
parentChain = parentChain.getParent();
}
As a result, I got 6 files in the output directory, with 5 files of 1 page each and one file of 4 pages.
Complete code:
public void walkOutlines(PdfOutline outline, Map<String, PdfObject> names, PdfDocument pdfDocument,
java.util.List<String>titles,java.util.List<Integer>pageNum) { //----------loop traversing all paths
for (PdfOutline child : outline.getAllChildren()){
if(child.getDestination() != null) {
prepareIndexFile(child,names,pdfDocument,titles,pageNum);
}
}
}
//------------Getting pageNumbers from outlines
public void prepareIndexFile(PdfOutline outline, Map<String, PdfObject> names, PdfDocument pdfDocument,
java.util.List<String>titles,java.util.List<Integer>pageNum) {
String title = outline.getTitle();
PdfOutline parentChain = outline.getParent();
while (parentChain != null) {
title = parentChain.getTitle() + "." + title;
parentChain = parentChain.getParent();
}
PdfDestination pdfDestination = outline.getDestination();
PdfObject pdfObj = null;
if (pdfDestination.getPdfObject().isString()) {
String pdfStr = ((PdfString) pdfDestination.getPdfObject()).toUnicodeString();
PdfArray array = (PdfArray) names.get(pdfStr);
if (array != null) {
pdfObj = array.get(0);
}
} else if (pdfDestination.getPdfObject().isArray() && ((PdfArray)pdfDestination.getPdfObject()).get(0).isDictionary()) {
pdfObj = ((PdfArray)pdfDestination.getPdfObject()).get(0);
}
Integer pageNumber = pdfDocument.getPageNumber((PdfDictionary)pdfObj);
titles.add(title);
pageNum.add(pageNumber);
if(outline.getAllChildren().size() > 0) {
for (PdfOutline child : outline.getAllChildren()){
prepareIndexFile(child,names,pdfDocument,titles,pageNum);
}
}
}
public void splitPdf(String inputFile, final String outputFolder) {
boolean splitSuccess = true;
PdfDocument pdfDoc = null;
try {
PdfReader pdfReaderNew = new PdfReader(inputFile);
pdfDoc = new PdfDocument(pdfReaderNew);
final java.util.List<String> titles = new ArrayList<String>();
java.util.List<Integer> pageNum = new ArrayList<Integer>();
PdfNameTree destsTree = pdfDoc.getCatalog().getNameTree(PdfName.Dests);
Map<String, PdfObject> names = destsTree.getNames();//--------------------------------------Core logic for getting names
PdfOutline root = pdfDoc.getOutlines(false);//--------------------------------------Core logic for getting outlines
walkOutlines(root,names, pdfDoc, titles, pageNum); //------Logic to get bookmarks and pageNumbers
if (titles == null || titles.size()==0) {
splitSuccess = false;
}else { //------Proceed if it has bookmarks
for(int i=0;i<titles.size();i++) {
String title = titles.get(i);
String startPageNmStr =""+pageNum.get(i);
int startPage = Integer.parseInt(startPageNmStr);
int endPage = startPage;
if(i == titles.size() - 1) {
endPage = pdfDoc.getNumberOfPages();
}else {
int nextPage = pageNum.get(i+1);
if(nextPage > startPage) {
endPage = nextPage - 1;
}else {
endPage = nextPage;
}
}
String outFileName = outputFolder + File.separator + title + ".pdf";
PdfWriter pdfWriter = new PdfWriter(outFileName);
PdfDocument newDocument = new PdfDocument(pdfWriter, new DocumentProperties().setEventCountingMetaInfo(null));
pdfDoc.copyPagesTo(startPage, endPage, newDocument);
newDocument.close();
pdfWriter.close();
}
}
}catch(IOException e){
System.out.println(e);
}
}
Upvotes: 1