Extract PDF comments into HTML

Question

Input PDF document with comment

I have a PDF document that contains a highlight with a comment attached to it ("my comment") (download).

Desired output

I want to convert the PDF into text, where comment is in tags, something like this:

ONE TWO THREE    
FOUR FIVE SIX SEVEN

Question

Can anyone help me implement the method:

private double getDistance(PDAnnotation ann, TextPosition firstProsition) {...}

or the method

private boolean isTextAnnotated()

to determine whether the annotation ann is located at the position of the text? If possible, it would also be nice to determine the exact position in the text that the comment refers to.

JAVA code

I am stuck on how to determine whether an annotation is related to the text currently being processed. I also do not know whether it is possible to identify the exact part of the text that is annotated.

                // Parse the input file into a COS document.
                // NOTE(review): cosDoc, pdfStripper, pdDoc and file are not declared in this
                // snippet; presumably they are declared by the enclosing code.
                PDFParser parser = new PDFParser(new FileInputStream(file));
                parser.parse();
                cosDoc = parser.getDocument();

                // Anonymous PDFTextStripper subclass that tries to mark annotated text while
                // the text is being written out.
                pdfStripper = new PDFTextStripper()
                {
                    // Annotations of the page currently being processed (assigned in startPage).
                    List la;
                    // Set in writeString when the first text of a line is found to be annotated;
                    // read in writeLineSeparator.
                    private boolean closeWithEnd;
                    // Remembers the page's annotations and flags that a new line begins,
                    // then delegates to the superclass.
                    @Override
                    protected void startPage(PDPage page) throws IOException
                    {
                        la = page.getAnnotations(); // init pages
                        startOfLine = true;
                        super.startPage(page);
                    }

                    // Flags the start of a new line, writes the separator, and writes an extra
                    // string when closeWithEnd is set.
                    @Override
                    protected void writeLineSeparator() throws IOException
                    {
                        startOfLine = true;
                        super.writeLineSeparator();
                        if(closeWithEnd) {
                            // NOTE(review): the literal is two spaces; presumably a closing tag
                            // was intended here - confirm.
                            writeString("  ");
                        }
                    }

                    // For the first string of each line, checks whether its first text position
                    // is annotated and updates closeWithEnd accordingly; the text is always
                    // written with a trailing space appended.
                    @Override
                    protected void writeString(String text, List textPositions) throws IOException
                    {
                        if (startOfLine)
                        {
                            TextPosition firstProsition = textPositions.get(0);
                            PDAnnotation ann;
                            if((ann = isTextAnnotated(firstProsition, text)) != null) {
                                // NOTE(review): the literal is two spaces; presumably an opening
                                // tag was intended here - confirm.
                                writeString("  ");
                                closeWithEnd = true;
                            } else {
                                closeWithEnd = false;
                            }
                            startOfLine = false;
                        }
                        super.writeString(text+" ", textPositions);
                    }
                    // Prints the computed distance for every annotation in la. As written it
                    // always returns null, so the annotated branch in writeString is never taken.
                    private PDAnnotation isTextAnnotated(TextPosition firstProsition, String text) {
                        for (PDAnnotation ann : la) {
                            System.out.println(text+" ------------- "+getDistance(ann, firstProsition));
                        }
                        return null;
                    }
                    // Unimplemented placeholder: returns 0.0 for every input. The line below is
                    // free text, not a Java statement, so this snippet does not compile as is.
                    private double getDistance(PDAnnotation ann, TextPosition firstProsition) {
                        TODO - how to get distance
                        return 0.0;
                    }
                    // True until the first string of the current line has been written.
                    boolean startOfLine = true;
                };

                // Extract the text of all pages using the customised stripper.
                pdDoc = new PDDocument(cosDoc);
                pdfStripper.setStartPage(0);
                pdfStripper.setEndPage(pdDoc.getNumberOfPages());
                String parsedText = pdfStripper.getText(pdDoc);

Maven dependencies

Dependencies (groupId:artifactId:version): junit:junit:3.8.1 (test scope), org.apache.pdfbox:pdfbox:1.8.10, org.apache.tika:tika-core:1.13, commons-io:commons-io:2.4, log4j:log4j:1.2.17, info.debatty:java-string-similarity:RELEASE, org.apache.opennlp:opennlp-tools:1.6.0

Amber · Accepted Answer

You can get the annotation rectangle and check whether it contains both the upper-left and lower-right corner of each text position. Because writeString receives several characters at once, you'll want to check each character individually, since the annotation may cover only a subset of them. The annotation may also wrap lines, so you will want to check at the end of the page (not at the end of each line) whether you need to close your HTML tag. Note that the rectangle you get from the annotation is in PDF space, but the coordinates you get from the TextPosition are in Java space. So when you check Rectangle.contains you'll need to translate the text position coordinates to PDF space.

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

/**
 * Text stripper that wraps every run of characters lying inside a page
 * annotation's rectangle in a start/end tag pair.
 *
 * <p>The annotation state is kept across {@code writeString} calls, so a run
 * that spans several words or lines is wrapped in a single tag pair. A tag
 * that is still open when the page ends is closed by {@link #getPageEnd()}.
 *
 * <p>NOTE(review): the tag literals in the original listing were empty
 * strings, presumably because markup was stripped when the listing was
 * published. {@code <comment>} is a placeholder - adjust to the markup you need.
 */
public class MyPDFTextStripper extends PDFTextStripper
{
    /** Written immediately before the first character covered by an annotation. */
    private static final String ANNOTATION_START_TAG = "<comment>";

    /** Written immediately after the last character covered by an annotation. */
    private static final String ANNOTATION_END_TAG = "</comment>";

    /** Page currently being processed; set in {@link #startPage(PDPage)}. */
    PDPage currentPage;

    /** Annotations of {@link #currentPage}; set in {@link #startPage(PDPage)}. */
    List<PDAnnotation> pageAnnotations;

    /** Not used by this class; kept so existing references still resolve. */
    boolean startOfLine = true;

    /**
     * Annotation whose start tag has been written but not yet closed, or
     * {@code null} when no tag is open.
     */
    private PDAnnotation openAnnotation;

    /**
     * Creates a stripper with the superclass defaults.
     *
     * @throws IOException if the superclass constructor fails
     */
    public MyPDFTextStripper() throws IOException
    {
        super();
    }

    /**
     * Records the page and its annotations and clears any tag state left over
     * from the previous page before delegating to the superclass.
     *
     * @param page the page about to be processed
     * @throws IOException if the annotations cannot be read or the superclass fails
     */
    @Override
    protected void startPage(PDPage page) throws IOException
    {
        currentPage = page;
        pageAnnotations = currentPage.getAnnotations();
        // A tag left open on the previous page was already closed by getPageEnd().
        openAnnotation = null;
        super.startPage(page);
    }

    /**
     * Writes the characters of {@code textPositions}, inserting a start tag
     * where annotated text begins and an end tag where it stops or where a
     * different annotation takes over.
     *
     * <p>An end tag for a run that finishes exactly at the end of one call is
     * written at the beginning of the next call that starts with
     * non-annotated text (or at page end), because only then is it known that
     * the run does not continue.
     *
     * @param text the text as assembled by the superclass (not used; the
     *             output is rebuilt character by character)
     * @param textPositions the positions of the individual characters
     * @throws IOException if writing fails
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException
    {
        StringBuilder newText = new StringBuilder();
        for (TextPosition textPosition : textPositions)
        {
            PDAnnotation annotation = getAnnotation(textPosition);
            // Identity comparison is sufficient: all candidates come from the
            // same pageAnnotations list, and it avoids relying on the
            // annotation name, which may be null.
            if (annotation != openAnnotation)
            {
                if (openAnnotation != null)
                {
                    newText.append(ANNOTATION_END_TAG);
                }
                if (annotation != null)
                {
                    newText.append(ANNOTATION_START_TAG);
                }
                openAnnotation = annotation;
            }
            newText.append(textPosition.getCharacter());
        }
        super.writeString(newText.toString(), textPositions);
    }

    /**
     * Finds the first annotation of the current page whose rectangle contains
     * both corners of the given character's box.
     *
     * <p>NOTE(review): every annotation of the page is considered, whatever
     * its subtype; filter here if only some kinds (e.g. highlights) should
     * count.
     *
     * @param textPosition the character to test
     * @return the first matching annotation, or {@code null} if none matches
     */
    private PDAnnotation getAnnotation(TextPosition textPosition)
    {
        float textX1 = textPosition.getX();
        // Translate the y coordinate to PDF space
        float textY1 = currentPage.findMediaBox().getHeight() - textPosition.getY();
        float textX2 = textX1 + textPosition.getWidth();
        float textY2 = textY1 + textPosition.getHeight();

        for (PDAnnotation annotation : pageAnnotations)
        {
            PDRectangle rectangle = annotation.getRectangle();
            // Skip annotations that carry no rectangle instead of failing.
            if (rectangle != null
                    && rectangle.contains(textX1, textY1)
                    && rectangle.contains(textX2, textY2))
            {
                return annotation;
            }
        }
        return null;
    }

    /**
     * Returns the page-end string, preceded by an end tag when annotated text
     * ran up to the end of the page.
     *
     * @return the string to write at the end of the page
     */
    @Override
    public String getPageEnd()
    {
        if (openAnnotation != null)
        {
            return ANNOTATION_END_TAG + super.getPageEnd();
        }
        return super.getPageEnd();
    }

    /**
     * Extracts the text of the PDF given as the single argument and prints it
     * to standard output.
     *
     * @param args {@code args[0]} is the path of the PDF file
     * @throws Exception if the file cannot be read or parsed
     */
    public static void main(String[] args) throws Exception
    {
        if (args.length < 1)
        {
            System.err.println("Usage: MyPDFTextStripper <pdf-file>");
            return;
        }

        File file = new File(args[0]);
        FileInputStream input = new FileInputStream(file);
        PDDocument pdDoc = null;
        try
        {
            PDFParser parser = new PDFParser(input);
            parser.parse();
            COSDocument cosDoc = parser.getDocument();

            MyPDFTextStripper pdfStripper = new MyPDFTextStripper();

            pdDoc = new PDDocument(cosDoc);
            // Page numbers handled by the stripper are 1-based.
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(pdDoc.getNumberOfPages());
            String parsedText = pdfStripper.getText(pdDoc);
            System.out.println(parsedText);
        }
        finally
        {
            // Release the document and the underlying stream even on failure.
            if (pdDoc != null)
            {
                pdDoc.close();
            }
            input.close();
        }
    }
}

Extract PDF comments into HTML

Answers (1)

Related Questions