Alex Torrisi
Alex Torrisi

Reputation: 97

Highlight words inside existing PDF

I need to highlight a set of words inside an existing PDF, given specific coordinates that I have already extracted. I am working with Apache PDFBox (latest version, 2.0.8). There is an example file I could use for this purpose (AddAnnotations.java on the PDFBox website), but I think this example was compiled with an older Java version, as the following import does not work:

import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationHighlight;

Can anyone help me with that? Which is the simplest way to highlight words by using this library?

Upvotes: 2

Views: 5188

Answers (4)

viewxy
viewxy

Reputation: 1

Very grateful for the answers above! I based my solution on the PDFBox answers.

I modified it a little bit so that it only highlights the specific word, and not the whole row. You have to tweak the start and end positions. I got rid of the isFound boolean and moved the rest of the code inside the if block. Then I added two variables, startPosition and endPosition. Here is the modified snippet of code:

// Character offsets of the first occurrence of the search word inside the text chunk:
// startPosition is inclusive, endPosition is exclusive.
int startPosition = string.indexOf(criteria[i])
int endPosition = startPosition + criteria[i].length()
                
// NOTE(review): the string offsets are used directly as indexes into textPositions, which
// assumes one TextPosition per character of `string` -- confirm for the PDFs being processed.
// Left edge comes from the first matched glyph, right edge from the last matched glyph.
posXInit = textPositions.get(startPosition).getXDirAdj();
posXEnd = textPositions.get(endPosition - 1).getXDirAdj() + textPositions.get(endPosition - 1).getWidth();
// Y values are subtracted from the page height to flip the vertical axis.
posYInit = textPositions.get(startPosition).getPageHeight() - textPositions.get(startPosition).getYDirAdj();
posYEnd = textPositions.get(startPosition).getPageHeight() - textPositions.get(endPosition - 1).getYDirAdj();
// Width and height are taken from the first matched glyph only.
width = textPositions.get(startPosition).getWidthDirAdj();
height = textPositions.get(startPosition).getHeightDir();

I use Groovy, so it is a tiny bit different, but here is the whole function:

/**
 * Adds a yellow highlight annotation over every occurrence of each search word found in
 * the given text chunk, instead of writing the chunk to the output.
 *
 * Every occurrence of a word inside the chunk is highlighted (not only the first one), and
 * a match is skipped when its offsets fall outside {@code textPositions}.
 *
 * NOTE(review): string offsets are used as indexes into textPositions, which assumes one
 * TextPosition per character of {@code string} -- confirm for the PDFs being processed.
 *
 * @param string        the text of the current chunk
 * @param textPositions the glyph positions belonging to {@code string}
 * @throws IOException if the page annotations cannot be read
 */
@Override
public void writeString(String string, List<TextPosition> textPositions) throws IOException {
    String[] criteria = ["Word2", "Word5"];

    for (String word : criteria) {
        // An empty search word would match at every offset and never advance.
        if (word.isEmpty()) {
            continue
        }

        int startPosition = string.indexOf(word)
        while (startPosition >= 0) {
            int endPosition = startPosition + word.length()

            // Stop if the chunk has fewer glyph positions than characters; indexing would fail.
            if (endPosition > textPositions.size()) {
                break
            }

            TextPosition first = textPositions.get(startPosition)
            TextPosition last = textPositions.get(endPosition - 1)

            // Left edge from the first matched glyph, right edge from the last matched glyph.
            float posXInit = first.getXDirAdj()
            float posXEnd = last.getXDirAdj() + last.getWidth()
            // Y values are subtracted from the page height to flip the vertical axis.
            float posYInit = first.getPageHeight() - first.getYDirAdj()
            float posYEnd = first.getPageHeight() - last.getYDirAdj()
            float height = first.getHeightDir()

            // The glyph height is reported here; the original always printed an unassigned 0.
            println(string + "X-Init = " + posXInit + "; Y-Init = " + posYInit + "; X-End = " + posXEnd + "; Y-End = " + posYEnd + "; Font-Height = " + height);

            // quadPoints is an array of x,y coordinates in Z-like order
            // (top-left, top-right, bottom-left, bottom-right) of the area to be highlighted,
            // padded by 2 units above and below.
            float[] quadPoints = [posXInit, posYEnd + height + 2, posXEnd, posYEnd + height + 2, posXInit, posYInit - 2, posXEnd, posYEnd - 2];

            // getCurrentPageNo() is 1-based while getPage() is 0-based.
            List<PDAnnotation> annotations = document.getPage(this.getCurrentPageNo() - 1).getAnnotations();
            PDAnnotationTextMarkup highlight = new PDAnnotationTextMarkup("Highlight");

            PDRectangle position = new PDRectangle();
            position.setLowerLeftX((float) posXInit);
            position.setLowerLeftY((float) posYEnd);
            position.setUpperRightX((float) posXEnd);
            position.setUpperRightY((float) (posYEnd + height));

            highlight.setRectangle(position);
            highlight.setQuadPoints(quadPoints);

            float[] components = [ (float) 1, (float) 1, (float) (100 / 255)]
            PDColor yellow = new PDColor(components, PDDeviceRGB.INSTANCE);
            highlight.setColor(yellow);
            annotations.add(highlight);

            // Continue searching after this match so repeated words are all highlighted.
            startPosition = string.indexOf(word, endPosition)
        }
    }
}

And the final document looks like this:

enter image description here

Upvotes: 0

Vivek Srivastava
Vivek Srivastava

Reputation: 37

Highlight specific words in a document using PDFclown.

package com.NLP.demo;

import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;
import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.tools.TextExtractor;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;

/**
 * Demo that highlights every case-insensitive match of a regular expression in a PDF
 * using PDF Clown text-markup annotations.
 */
public class PDFCrownDemo  {

    /**
     * Program entry point.
     *
     * <p>Declared with varargs so that the JVM recognises it as {@code main(String[])},
     * while existing zero-argument calls ({@code PDFCrownDemo.main()}) keep compiling.
     *
     * @param args ignored
     * @throws IOException if the PDF cannot be read or written
     */
    public static void main(String... args) throws IOException {
        PDFCrownDemo demo = new PDFCrownDemo();
        demo.highlighttext();
    }

    /**
     * Opens {@code src/main/resources/XXX.pdf}, adds a highlight annotation over each match
     * of the pattern {@code "Contract"} on every page, and saves the result.
     *
     * @throws IOException if the PDF cannot be read or written
     */
    public void highlighttext() throws IOException{

        org.pdfclown.files.File file = new org.pdfclown.files.File("src/main/resources/XXX.pdf");
        String textRegEx = "Contract";
        Pattern pattern = Pattern.compile(textRegEx, Pattern.CASE_INSENSITIVE);

        TextExtractor textExtractor = new TextExtractor(true, true);

        for(final Page page : file.getDocument().getPages())
        {
          // Extract the page text once; the matcher runs over its plain-string form.
          Map<Rectangle2D,List<ITextString>> textStrings = textExtractor.extract(page);
          final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings));

          // The filter feeds each regex match interval to process().
          textExtractor.filter(textStrings,new TextExtractor.IIntervalFilter()
            {
              @Override
              public boolean hasNext()
              {return matcher.find();}

              @Override
              public Interval next()
              {return new Interval(matcher.start(), matcher.end());}

              @Override
              public void process(Interval interval,ITextString match)
              {
                // Defining the highlight box of the text pattern match...
                List<Quad> highlightQuads = new ArrayList<Quad>();
                {
                  /*
                    NOTE: A text pattern match may be split across multiple contiguous lines,
                    so we have to define a distinct highlight box for each text chunk.
                  */
                  Rectangle2D textBox = null;
                  for(TextChar textChar : match.getTextChars())
                  {
                    Rectangle2D textCharBox = textChar.getBox();
                    if(textBox == null)
                    {textBox = (Rectangle2D)textCharBox.clone();}
                    else
                    {
                      // A character starting below the current box begins a new line chunk.
                      if(textCharBox.getY() > textBox.getMaxY())
                      {
                        highlightQuads.add(Quad.get(textBox));
                        textBox = (Rectangle2D)textCharBox.clone();
                      }
                      else
                      {textBox.add(textCharBox);}
                    }
                  }
                  highlightQuads.add(Quad.get(textBox));
                }
                // Highlight the text pattern match!
                new TextMarkup(page,MarkupTypeEnum.Highlight, highlightQuads);
              }

              @Override
              public void remove(
                )
              {throw new UnsupportedOperationException();}
            }
            );
        }

        // NOTE(review): the output path is the same file that was opened above; confirm that
        // PDF Clown supports overwriting its own source here, or save to a different path.
        //file.save(SerializationModeEnum.Incremental);
        file.save(new java.io.File("src/main/resources/XXX.pdf"), SerializationModeEnum.Standard);
    }

}

Upvotes: -2

Vivek Srivastava
Vivek Srivastava

Reputation: 37

Here is the code to highlight specific words inside a PDF document. Please note that this currently highlights the whole line containing the search text. Highlighting only the specific words is still in progress... Any suggestions for highlighting only the specific words on top of this code will be highly appreciated.

This script was built using Apache PDFBox 2.0.8

    import java.io.ByteArrayOutputStream;
    import java.io.File;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    import java.io.Writer;
    import java.util.List;

    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.pdmodel.common.PDRectangle;
    import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
    import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
    import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
    import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
    import org.apache.pdfbox.text.PDFTextStripper;
    import org.apache.pdfbox.text.TextPosition;

    /**
     * PDFTextStripper subclass that, instead of extracting text, adds a yellow highlight
     * annotation over every text chunk that contains one of the search words.
     *
     * <p>The whole chunk passed to {@link #writeString} is highlighted, not just the word.
     */
    public class PDFhighlightDemo extends PDFTextStripper {

        /** Words whose presence in a text chunk triggers highlighting of that chunk. */
        private static final String[] CRITERIA = {"angular", "prepared"};

        public PDFhighlightDemo()  throws IOException {
            super();
        }

        /**
         * Loads {@code Demo1.pdf}, highlights matching chunks on all pages and saves the
         * result as {@code FinalPDF.pdf}.
         *
         * @param args ignored
         * @throws IOException if the document cannot be read, processed or written
         */
        public static void main(String[] args)  throws IOException {
            PDDocument document = null;
            String fileName = "Demo1.pdf";
            try {
                document = PDDocument.load( new File(fileName) );
                PDFTextStripper stripper = new PDFhighlightDemo();
                stripper.setSortByPosition( true );

                // Page numbers in PDFTextStripper are 1-based.
                stripper.setStartPage( 1 );
                stripper.setEndPage( document.getNumberOfPages() );

                // The extracted text is not needed; writeText is only run for its callbacks.
                Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
                stripper.writeText(document, dummy);

                File file1 = new File("FinalPDF.pdf");
                document.save(file1);
            }
            finally {
                if( document != null ) {
                    document.close();
                }
            }
        }

        /**
         * Overrides the default PDFTextStripper.writeString(): nothing is written to the
         * output; a highlight annotation spanning the chunk is added to the current page
         * when the chunk contains any search word.
         *
         * @param string        the text of the current chunk
         * @param textPositions the glyph positions belonging to {@code string}
         * @throws IOException if the page annotations cannot be read
         */
        @Override
        protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
            // Nothing to place a highlight on, or no search word present.
            if (textPositions.isEmpty() || !containsAnyCriterion(string)) {
                return;
            }

            TextPosition first = textPositions.get(0);
            TextPosition last = textPositions.get(textPositions.size() - 1);

            // Left edge from the first glyph, right edge from the last glyph of the chunk.
            float posXInit = first.getXDirAdj();
            float posXEnd = last.getXDirAdj() + last.getWidth();
            // Y values are subtracted from the page height to flip the vertical axis.
            float posYInit = first.getPageHeight() - first.getYDirAdj();
            float posYEnd = first.getPageHeight() - last.getYDirAdj();
            float height = first.getHeightDir();

            // quadPoints is an array of x,y coordinates in Z-like order
            // (top-left, top-right, bottom-left, bottom-right) of the area to be highlighted,
            // padded by 2 units above and below.
            float[] quadPoints = {
                posXInit, posYEnd + height + 2,
                posXEnd, posYEnd + height + 2,
                posXInit, posYInit - 2,
                posXEnd, posYEnd - 2
            };

            // getCurrentPageNo() is 1-based while getPage() is 0-based.
            List<PDAnnotation> annotations = document.getPage(this.getCurrentPageNo() - 1).getAnnotations();
            PDAnnotationTextMarkup highlight = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);

            PDRectangle position = new PDRectangle();
            position.setLowerLeftX(posXInit);
            position.setLowerLeftY(posYEnd);
            position.setUpperRightX(posXEnd);
            position.setUpperRightY(posYEnd + height);

            highlight.setRectangle(position);
            highlight.setQuadPoints(quadPoints);

            PDColor yellow = new PDColor(new float[]{1, 1, 1 / 255F}, PDDeviceRGB.INSTANCE);
            highlight.setColor(yellow);
            annotations.add(highlight);
        }

        /** Returns true as soon as one of the search words is found in the given text. */
        private static boolean containsAnyCriterion(String text) {
            for (String criterion : CRITERIA) {
                if (text.contains(criterion)) {
                    return true;
                }
            }
            return false;
        }

    }

Upvotes: 4

Alex Torrisi
Alex Torrisi

Reputation: 97

Here is the code to highlight ALL the words inside a PDF document. Highlighting only a specific set of words can easily be done by modifying this script. Please note this is only a test, and further checks are needed for words that end at a line break as well as for words on PDF pages rotated by negative angles (e.g. -90/-180 degrees). Optimizing this script is also possible.

This script was built using Apache PDFBox 2.0.8.

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;

/**
 * PDFTextStripper subclass that records the position of every text chunk it is handed and
 * then adds a semi-transparent red highlight annotation over each recorded chunk.
 *
 * <p>Collected data lives in static fields that are reset whenever a new instance is created.
 */
public class TestAnnotatePDF extends PDFTextStripper
{
    /** One entry per token: {minx, miny, maxx, maxy, pageNo, height, width, rotation}. */
    static List<double[]> coordinates;

    /** Tokens in the order they were received; index-aligned with {@link #coordinates}. */
    static ArrayList<String> tokenStream;

    public TestAnnotatePDF() throws IOException
    {
        // Data structure containing coordinate information for each token
        coordinates = new ArrayList<>();

        // List of words extracted from the text (considering a whitespace-based tokenization)
        tokenStream = new ArrayList<>();
    }

    /**
     * Loads {@code MyDocument}, highlights every extracted token and saves the result as
     * {@code MyDocument_final.pdf}. I/O errors are printed to standard output.
     *
     * @param args ignored
     * @throws IOException declared for compatibility; I/O errors are caught and printed
     */
    public static void main(String [] args) throws IOException
    {
        File file = new File("MyDocument");

        // try-with-resources closes the document even when processing or saving fails.
        try (PDDocument document = PDDocument.load(file))
        {
            // Extended PDFTextStripper class
            PDFTextStripper stripper = new TestAnnotatePDF();

            int numberOfPages = document.getDocumentCatalog().getPages().getCount();

            // writeText invokes the overridden writeString, which fills the static lists;
            // the written text itself is discarded.
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
            stripper.writeText(document, dummy);

            // Print collected information
            System.out.println(tokenStream);
            System.out.println(tokenStream.size());
            System.out.println(coordinates.size());

            // Colour used for highlighting text
            PDColor red = new PDColor(new float[] { 1, 0, 0 }, PDDeviceRGB.INSTANCE);

            // Scan each page and highlight all the words on it
            for (int pageIndex = 0; pageIndex < numberOfPages; pageIndex++)
            {
                PDPage page = document.getPage(pageIndex);
                List<PDAnnotation> annotations = page.getAnnotations();

                double pageHeight = page.getMediaBox().getHeight();
                double pageWidth  = page.getMediaBox().getWidth();

                for (int i = 0; i < coordinates.size(); i++)
                {
                    double[] word = coordinates.get(i);

                    // Stored page numbers are 1-based; skip tokens of other pages.
                    if ((int) word[4] != pageIndex + 1)
                    {
                        continue;
                    }

                    annotations.add(buildHighlight(word, tokenStream.get(i), pageHeight, pageWidth, red));
                }
            }

            // Saving the document in a new file
            document.save(new File("MyDocument_final.pdf"));
        }
        catch (IOException e)
        {
            System.out.println(e);
        }
    }

    /**
     * Builds the highlight annotation for one recorded token.
     *
     * @param word       the token's entry from {@link #coordinates}
     * @param contents   the token text, stored as the annotation contents
     * @param pageHeight media-box height of the token's page
     * @param pageWidth  media-box width of the token's page
     * @param color      highlight colour
     * @return the configured annotation, not yet added to any page
     */
    private static PDAnnotationTextMarkup buildHighlight(double[] word, String contents,
            double pageHeight, double pageWidth, PDColor color)
    {
        int rotation = (int) word[7];
        double height = word[5];
        double minx, maxx, miny, maxy;

        if (rotation == 90)
        {
            // Page rotated by 90 degrees: x and y swap roles and the width is rescaled.
            double width = (pageHeight * word[6]) / pageWidth;
            maxx = word[1];
            minx = word[1] - height;
            miny = word[0];
            maxy = word[0] + width;
        }
        else
        {
            // TODO: the cases of -90/-180 degrees are not handled yet.
            minx = word[0];
            maxx = word[2];
            miny = pageHeight - word[1];
            maxy = pageHeight - word[3] + height;
        }

        PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
        txtMark.setColor(color);
        txtMark.setConstantOpacity((float) 0.3);

        PDRectangle position = new PDRectangle();
        position.setLowerLeftX((float) minx);
        position.setLowerLeftY((float) miny);
        position.setUpperRightX((float) maxx);
        // NOTE(review): height is added here on top of maxy, which already includes it in the
        // non-rotated branch -- kept as in the original; confirm this is intended.
        position.setUpperRightY((float) ((float) maxy + height));
        txtMark.setRectangle(position);

        float[] quads = new float[8];
        quads[0] = position.getLowerLeftX();      // x1
        quads[1] = position.getUpperRightY() - 2; // y1
        quads[2] = position.getUpperRightX();     // x2
        quads[3] = quads[1];                      // y2
        quads[4] = quads[0];                      // x3
        quads[5] = position.getLowerLeftY() - 2;  // y3
        quads[6] = quads[2];                      // x4
        quads[7] = quads[5];                      // y4
        txtMark.setQuadPoints(quads);
        txtMark.setContents(contents);
        return txtMark;
    }

    /**
     * Records the text and bounding information of the given chunk in the static lists
     * instead of writing it to the output.
     *
     * @param string        the text of the current chunk (not used; the token is rebuilt
     *                      from {@code textPositions})
     * @param textPositions the glyph positions belonging to the chunk
     * @throws IOException never thrown here; declared by the overridden method
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException
    {
        StringBuilder token = new StringBuilder();
        int lastIndex = textPositions.size() - 1;
        double minx = 0, maxx = 0, miny = 0, maxy = 0;
        double height = 0;
        double width = 0;
        int rotation = 0;

        for (int index = 0; index <= lastIndex; index++)
        {
            TextPosition text = textPositions.get(index);
            rotation = text.getRotation();

            // Track the tallest and widest glyph of the chunk.
            if (text.getHeight() > height)
            {
                height = text.getHeight();
            }
            if (text.getWidth() > width)
            {
                width = text.getWidth();
            }

            // First glyph of the current word
            if (index == 0)
            {
                minx = text.getX();
                miny = text.getY();
            }

            // Last glyph of the current word
            if (index == lastIndex)
            {
                maxx = text.getEndX();
                maxy = text.getY();
            }

            token.append(text);
        }

        tokenStream.add(token.toString());
        double[] wordCoordinates = {minx, miny, maxx, maxy, this.getCurrentPageNo(), height, width, rotation};
        coordinates.add(wordCoordinates);
    }
}

Upvotes: 7

Related Questions