Reputation: 1069
I am trying to extract the hyperlink information from a PDF using PDFBox, but I am unsure how to get it. This is what I have so far:
// Walk every page and print each link annotation followed by its destination.
for (Object pageObj : pages) {
    List<?> pageAnnotations = ((PDPage) pageObj).getAnnotations();
    for (Object annotationObj : pageAnnotations) {
        PDAnnotation candidate = (PDAnnotation) annotationObj;
        // Only link annotations are of interest; skip everything else.
        if (!(candidate instanceof PDAnnotationLink)) {
            continue;
        }
        PDAnnotationLink linkAnnotation = (PDAnnotationLink) candidate;
        System.out.println(linkAnnotation.toString());
        System.out.println(linkAnnotation.getDestination());
    }
}
I want to extract the URL of the hyperlink destination and the text of the hyperlink. How can one do this?
Thanks
Upvotes: 10
Views: 6255
Reputation: 1
If you need a simple link extractor, use the following snippet:
/**
 * Downloads the PDF at the given location and collects the target of every URI link annotation.
 *
 * <p>This is a best-effort extractor: failures are logged rather than propagated. A page whose
 * annotations cannot be read is skipped; if the document cannot be opened or parsed at all, an
 * empty list is returned.
 *
 * @param url location of the PDF, in any form accepted by {@link java.net.URL}
 * @return the URIs of all link annotations with a URI action, in page order; never {@code null}
 */
public List<String> extractLinks(String url) {
    List<String> links = new ArrayList<>();
    // try-with-resources closes the document and then the network stream on every path,
    // including failures while parsing; previously the stream was never closed.
    try (InputStream pdfStream = new URL(url).openStream();
         PDDocument document = PDDocument.load(pdfStream)) {
        document.getPages().forEach(page -> {
            try {
                for (PDAnnotation annotation : page.getAnnotations()) {
                    if (!(annotation instanceof PDAnnotationLink)) {
                        continue;
                    }
                    PDAnnotationLink link = (PDAnnotationLink) annotation;
                    // Internal (GoTo) links and other action types carry no URI and are ignored.
                    if (link.getAction() instanceof PDActionURI) {
                        PDActionURI uri = (PDActionURI) link.getAction();
                        links.add(uri.getURI());
                    }
                }
            } catch (IOException e) {
                // Keep going: one unreadable page should not discard links from the others.
                LOGGER.error("Error to extract links", e);
            }
        });
        return links;
    } catch (Exception e) {
        // Boundary catch: pass the exception to the logger so the cause is not lost.
        LOGGER.error("Error to extract links", e);
        return new ArrayList<>();
    }
}
Upvotes: 0
Reputation: 1
03/21/2023
I tested this today and it works, but there are a couple of setup steps you need to do first. This is how I did it.
[dependency]
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.27</version>
</dependency>
Imports:-
import java.awt.geom.Rectangle2D;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.text.PDFTextStripperByArea;
/**
 * Prints, for every URI link in a PDF, the 1-based page number, the text found inside the
 * link's rectangle, and the target URI.
 */
public class Test_newSome {

    /**
     * Opens the PDF and prints one line per URI link in the form {@code Page N:'text'=uri}.
     *
     * @param args optional; {@code args[0]} is the URL of the PDF to read. When absent, the
     *     hard-coded sample location is used.
     * @throws IOException if the PDF cannot be opened, parsed, or read
     */
    public static void main(String[] args) throws IOException {
        // File path = open the PDF file in MS Edge and copy the URL
        String location = args.length > 0
                ? args[0]
                : "file:///C:/Users/PcuserName/Downloads/Credentials%20Email%20-%20Mar21FN%20Mar21094806LN.pdf";
        URL url = new URL(location);

        // The counter must live outside the page loop; declared inside, it was reset on every
        // iteration and every link was reported as being on page 1.
        int pageNum = 0;

        // try-with-resources closes the document and both streams even when extraction fails.
        try (InputStream is = url.openStream();
             BufferedInputStream buffered = new BufferedInputStream(is);
             PDDocument doc = PDDocument.load(buffered)) {
            for (PDPage page : doc.getPages()) {
                pageNum++;
                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                List<PDAnnotation> annotations = page.getAnnotations();

                // First pass: register one text-extraction region per link annotation, keyed by
                // the annotation's index so the second pass can look the text up again.
                for (int j = 0; j < annotations.size(); j++) {
                    PDAnnotation annot = annotations.get(j);
                    if (annot instanceof PDAnnotationLink) {
                        PDAnnotationLink link = (PDAnnotationLink) annot;
                        PDRectangle rect = link.getRectangle();
                        // Reposition the link rectangle to match text space.
                        float x = rect.getLowerLeftX();
                        float y = rect.getUpperRightY();
                        float width = rect.getWidth();
                        float height = rect.getHeight();
                        // Only unrotated pages get the vertical flip; other rotations are
                        // passed through unchanged.
                        if (page.getRotation() == 0) {
                            PDRectangle pageSize = page.getMediaBox();
                            y = pageSize.getHeight() - y;
                        }
                        Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
                        stripper.addRegion("" + j, awtRect);
                    }
                }

                stripper.extractRegions(page);

                // Second pass: pair each link's extracted text with its URI action, if any.
                for (int j = 0; j < annotations.size(); j++) {
                    PDAnnotation annot = annotations.get(j);
                    if (annot instanceof PDAnnotationLink) {
                        PDAnnotationLink link = (PDAnnotationLink) annot;
                        PDAction action = link.getAction();
                        String urlText = stripper.getTextForRegion("" + j);
                        if (action instanceof PDActionURI) {
                            PDActionURI uri = (PDActionURI) action;
                            System.out.println("Page " + pageNum + ":'" + urlText.trim() + "'=" + uri.getURI());
                        }
                    }
                }
            }
        }
    }
}
Upvotes: 0
Reputation: 18936
Use this code from the PrintURLs sample code from the source code download:
// For each page: register a text region per link annotation, extract the text, then print
// every URI link as Page N:'text'=uri.
for (PDPage currentPage : doc.getPages()) {
    pageNum++;
    PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
    List<PDAnnotation> pageAnnotations = currentPage.getAnnotations();

    // Pass 1: one extraction region per link, named after the annotation's index.
    for (int idx = 0; idx < pageAnnotations.size(); idx++) {
        PDAnnotation candidate = pageAnnotations.get(idx);
        if (!(candidate instanceof PDAnnotationLink)) {
            continue;
        }
        PDRectangle linkBox = ((PDAnnotationLink) candidate).getRectangle();
        // Reposition the link rectangle to match text space.
        float left = linkBox.getLowerLeftX();
        float top = linkBox.getUpperRightY();
        float boxWidth = linkBox.getWidth();
        float boxHeight = linkBox.getHeight();
        // Only unrotated pages are flipped vertically; a rotation of 90 (or any other
        // value) leaves the coordinates untouched.
        if (currentPage.getRotation() == 0) {
            top = currentPage.getMediaBox().getHeight() - top;
        }
        areaStripper.addRegion(
                String.valueOf(idx), new Rectangle2D.Float(left, top, boxWidth, boxHeight));
    }

    areaStripper.extractRegions(currentPage);

    // Pass 2: look up the text of each link's region and print it next to the URI.
    for (int idx = 0; idx < pageAnnotations.size(); idx++) {
        PDAnnotation candidate = pageAnnotations.get(idx);
        if (!(candidate instanceof PDAnnotationLink)) {
            continue;
        }
        PDAction linkAction = ((PDAnnotationLink) candidate).getAction();
        String linkText = areaStripper.getTextForRegion(String.valueOf(idx));
        if (!(linkAction instanceof PDActionURI)) {
            continue;
        }
        PDActionURI uriAction = (PDActionURI) linkAction;
        System.out.println("Page " + pageNum + ":'" + linkText.trim() + "'=" + uriAction.getURI());
    }
}
It works in two parts: one is getting the URL, which is easy; the other is getting the URL text, which is done with a text extraction at the rectangle of the annotation.
Upvotes: 10
Reputation: 1151
We need to get both the hyperlink information and the internal links (e.g. links that jump to another page). I am using the code below:
// Report every link annotation in the document: external links are printed with their URI,
// internal links with the 1-based number of the page they jump to.
int pageNum = 0;
for (PDPage page : originalPDF.getPages()) {
    pageNum++;
    for (PDAnnotation annot : page.getAnnotations()) {
        if (!(annot instanceof PDAnnotationLink)) {
            continue;
        }
        PDAnnotationLink link = (PDAnnotationLink) annot;
        PDAction action = link.getAction();

        // External link: the action carries the URI directly.
        if (action instanceof PDActionURI) {
            System.out.println("uri link:" + ((PDActionURI) action).getURI());
            continue;
        }

        // Internal link: the destination comes from a GoTo action or, when the annotation
        // has no action at all, from the annotation's own destination entry.
        PDDestination destination;
        if (action instanceof PDActionGoTo) {
            destination = ((PDActionGoTo) action).getDestination();
        } else if (action == null) {
            destination = link.getDestination();
        } else {
            // Any other action type is not reported.
            continue;
        }

        PDPageDestination pageDestination;
        if (destination instanceof PDPageDestination) {
            pageDestination = (PDPageDestination) destination;
        } else if (destination instanceof PDNamedDestination) {
            // Named destinations are resolved to a page through the document catalog.
            pageDestination = originalPDF.getDocumentCatalog()
                    .findNamedDestinationPage((PDNamedDestination) destination);
        } else {
            // Missing or unsupported destination: skip only this annotation. The earlier
            // `break` here left the annotation loop and dropped the page's remaining links.
            continue;
        }

        if (pageDestination != null) {
            System.out.println("page move: " + (pageDestination.retrievePageNumber() + 1));
        }
    }
}
Upvotes: 3