Prakhar Tiwari
Prakhar Tiwari

Reputation: 9

How to get exact coordinate of each word of pdf using itextsharp?

I have implemented my own LocationTextExtractionStrategy. In some PDFs, each TextRenderInfo passed to RenderText contains a chunk of several words, while in other PDFs the text arrives one character at a time.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using iTextSharp.text.pdf.parser;

namespace PDFAnnotater
{
    /// <summary>
    /// Text extraction strategy that records the bounding rectangle of a single
    /// word (<see cref="TextToSearchFor"/>) while the page content is rendered.
    /// </summary>
    /// <remarks>
    /// Two situations are handled in <see cref="RenderText"/>:
    /// a chunk at least as long as the search text is scanned for the word as a
    /// whitespace-delimited token; a shorter chunk is treated as a fragment and
    /// consecutive fragments are stitched together until they spell the word.
    /// Every hit is appended to <see cref="myPoints"/>.
    /// NOTE(review): RectAndText, IntegerRectangle and FoundChars are declared
    /// elsewhere in the project; their usage here mirrors the original code.
    /// </remarks>
    public class TestLTES : LocationTextExtractionStrategy
    {
        // Passed through unchanged to every RectAndText this instance creates.
        int WordIndex;

        //Hold each coordinate
        public List<RectAndText> myPoints = new List<RectAndText>();

        // Fragments accumulated so far while assembling the word from short chunks.
        public string formedText = "";

        // Bottom-left / top-right corner of each accumulated fragment; only the
        // first Counter entries are meaningful.
        public Vector[] charBottomLeft;
        public Vector[] charTopRight;

        // Number of fragments currently accumulated.
        public int Counter = 0;

        //The string that we're searching for
        public string TextToSearchFor { get; set; }

        // Not assigned anywhere in this class; kept because it is public.
        public bool found = false;

        //How to compare strings
        public System.Globalization.CompareOptions CompareOptions { get; set; }

        /// <summary>
        /// Creates a strategy that searches for <paramref name="textToSearchFor"/>.
        /// </summary>
        /// <param name="textToSearchFor">The word to locate.</param>
        /// <param name="compareOptions">Options used when comparing text with the word.</param>
        public TestLTES(string textToSearchFor, System.Globalization.CompareOptions compareOptions = System.Globalization.CompareOptions.None)
        {
            TextToSearchFor = textToSearchFor;
            CompareOptions = compareOptions;
            charBottomLeft = new Vector[1];
            charTopRight = new Vector[1];
        }

        /// <summary>
        /// Creates a strategy that searches for <paramref name="textToSearchFor"/>
        /// and tags every result with <paramref name="index"/>.
        /// </summary>
        /// <param name="index">Value handed to each created RectAndText.</param>
        /// <param name="textToSearchFor">The word to locate.</param>
        /// <param name="compareOptions">Options used when comparing text with the word.</param>
        public TestLTES(int index, string textToSearchFor, System.Globalization.CompareOptions compareOptions = System.Globalization.CompareOptions.None)
            : this(textToSearchFor, compareOptions)
        {
            // Chaining to the other constructor allocates the fragment buffers;
            // previously they stayed null here, which silently disabled
            // fragment matching for instances created through this overload.
            WordIndex = index;
        }

        //Automatically called for each chunk of text in the PDF
        /// <summary>
        /// Forwards the chunk to the base strategy, then looks for the search
        /// word in it and records the word's rectangle in <see cref="myPoints"/>.
        /// </summary>
        /// <param name="renderInfo">The chunk of text being rendered.</param>
        public override void RenderText(TextRenderInfo renderInfo)
        {
            base.RenderText(renderInfo);

            // Nothing can be matched against an empty search word.
            if (string.IsNullOrEmpty(TextToSearchFor))
            {
                return;
            }

            string chunkText = renderInfo.GetText();
            if (string.IsNullOrEmpty(chunkText) || chunkText == " ")
            {
                return;
            }

            if (chunkText.Length >= TextToSearchFor.Length)
            {
                MatchWholeChunk(renderInfo, chunkText);
            }
            else
            {
                AccumulateFragments(renderInfo, chunkText);
            }
        }

        // Records the first whitespace-delimited word of the chunk that equals the search text.
        private void MatchWholeChunk(TextRenderInfo renderInfo, string chunkText)
        {
            var compareInfo = System.Globalization.CultureInfo.CurrentCulture.CompareInfo;

            // Split() cuts at every single whitespace character and keeps empty
            // entries, so each piece starts exactly one character after the
            // previous piece ends.
            int offset = 0;
            foreach (string word in chunkText.Split())
            {
                int wordStart = offset;
                offset += word.Length + 1;

                if (word.Length == 0 || compareInfo.Compare(word, TextToSearchFor, CompareOptions) != 0)
                {
                    continue;
                }

                // Use the offset of the matching word itself. A substring
                // search over the whole chunk could land inside an earlier,
                // longer word (e.g. "cat" inside "concat cat").
                // Assumes one character render info per character of GetText().
                var chars = renderInfo.GetCharacterRenderInfos().Skip(wordStart).Take(word.Length).ToList();
                if (chars.Count > 0)
                {
                    AddMatch(chars.First().GetDescentLine().GetStartPoint(),
                             chars.Last().GetAscentLine().GetEndPoint());
                }

                // As before, at most one rectangle is recorded per chunk.
                return;
            }
        }

        // Stitches short chunks together until they spell the search text.
        private void AccumulateFragments(TextRenderInfo renderInfo, string chunkText)
        {
            var compareInfo = System.Globalization.CultureInfo.CurrentCulture.CompareInfo;
            var charInfos = renderInfo.GetCharacterRenderInfos();

            int offset = 0;
            foreach (string piece in chunkText.Split())
            {
                int pieceStart = offset;
                offset += piece.Length + 1;

                // Empty entries come from leading/trailing/repeated whitespace.
                if (piece.Length == 0)
                {
                    continue;
                }

                // Bounding box of this fragment's own characters.
                var chars = charInfos.Skip(pieceStart).Take(piece.Length).ToList();
                if (chars.Count == 0)
                {
                    continue;
                }

                var bottomLeft = chars.First().GetDescentLine().GetStartPoint();
                var topRight = chars.Last().GetAscentLine().GetEndPoint();

                // Fragment already consumed according to the shared list:
                // abandon the rest of this chunk, as the original code did.
                string key = piece + bottomLeft + topRight;
                if (FoundChars.foundCharsList.Contains(key))
                {
                    return;
                }

                string current = formedText ?? "";
                if (!compareInfo.IsPrefix(TextToSearchFor, current + piece, CompareOptions))
                {
                    // The fragment does not continue the word. Drop the stale
                    // prefix so it cannot block later matches, then see
                    // whether this fragment starts the word afresh.
                    ResetAccumulation();
                    current = "";
                    if (!compareInfo.IsPrefix(TextToSearchFor, piece, CompareOptions))
                    {
                        continue;
                    }
                }

                formedText = current + piece;

                // Grow the buffers by one slot when they are full (or missing).
                if (charBottomLeft == null || Counter >= charBottomLeft.Length)
                {
                    Array.Resize(ref charBottomLeft, Counter + 1);
                }
                if (charTopRight == null || Counter >= charTopRight.Length)
                {
                    Array.Resize(ref charTopRight, Counter + 1);
                }
                charBottomLeft[Counter] = bottomLeft;
                charTopRight[Counter] = topRight;
                Counter++;
                FoundChars.foundCharsList.Add(key);

                if (compareInfo.Compare(formedText, TextToSearchFor, CompareOptions) == 0)
                {
                    var wordBottomLeft = charBottomLeft[0];
                    var wordTopRight = charTopRight[Counter - 1];

                    // Start over with fresh buffers so a later occurrence of
                    // the word can still be assembled by this instance.
                    ResetAccumulation();
                    AddMatch(wordBottomLeft, wordTopRight);
                }
            }
        }

        // Clears the partially assembled word and its per-fragment coordinates.
        private void ResetAccumulation()
        {
            formedText = "";
            Counter = 0;
            charBottomLeft = new Vector[1];
            charTopRight = new Vector[1];
        }

        // Builds the float and integer-truncated rectangles for a hit and stores them in myPoints.
        private void AddMatch(Vector bottomLeft, Vector topRight)
        {
            //Create a rectangle from it
            var rect = new iTextSharp.text.Rectangle(
                bottomLeft[Vector.I1],
                bottomLeft[Vector.I2],
                topRight[Vector.I1],
                topRight[Vector.I2]);

            IntegerRectangle TempRect = new IntegerRectangle();
            TempRect.Top = (int)Math.Truncate(rect.Top);
            TempRect.Bottom = (int)Math.Truncate(rect.Bottom);
            TempRect.Left = (int)Math.Truncate(rect.Left);
            TempRect.Right = (int)Math.Truncate(rect.Right);

            //Add this to our main collection
            this.myPoints.Add(new RectAndText(TempRect, rect, this.TextToSearchFor, WordIndex));
        }
    }
}

In some cases this works properly, while in others it does not. Is there another feasible way to get the coordinates of a particular word?

Upvotes: 0

Views: 1247

Answers (1)

Joris Schellekens
Joris Schellekens

Reputation: 9012

The best way to solve this is to look at how SimpleTextExtractionStrategy works. In this strategy, iText also processes chunks and turns them into a String.

The general workflow is:

  • get all TextRenderInfo events
  • turn them into CharacterRenderInfo events
  • sort the list of CharacterRenderInfo events in logical reading order
  • go over the list, aggregating characters into words if they are close together (this is a heuristic, iText uses 'less than the width of a single space in the given font').
  • now you have the boundaries (provided by CharacterRenderInfo.getBoundingBox) and the words

Upvotes: 1

Related Questions