Gonzo345
Gonzo345

Reputation: 1333

Extract coordinates of each separate word into a TextChunk in a pdf file

Following this actual solution I am trying to get all the words inside a TextChunk and each of its coordinates (actual page, top, bottom, left, right).

Since a TextChunk could be a phrase, a word, or anything else, I tried to do this manually, starting from the last word's rectangle and cutting it each time. I noticed that this manual method could be very buggy (I would need to account for special characters by hand, and so on), so I wondered whether iTextSharp provides an easier way to do this.

My Chunk class and my class derived from LocationTextExtractionStrategy are the following:

/// <summary>
/// A piece of text extracted from a PDF page, together with the rectangle it
/// occupies and the render information it was extracted from.
/// </summary>
public class Chunk
{
    // Size at which the font's own space width is measured when estimating
    // the rendered font size in ObtainFontSize.
    private const float ReferenceFontSize = 12f;

    /// <summary>Identifier generated when the chunk is constructed.</summary>
    public Guid Id { get; set; }

    /// <summary>Rectangle assigned to this chunk by its creator.</summary>
    public Rectangle Rect { get; set; }

    /// <summary>Render information this chunk was created from.</summary>
    public TextRenderInfo Render { get; set; }

    /// <summary>Font obtained from <see cref="Render"/> at construction time.</summary>
    public BaseFont BF { get; set; }

    /// <summary>Text of the chunk.</summary>
    public string Text { get; set; }

    /// <summary>
    /// Estimated font size, rounded to an integer; 0 when it cannot be estimated.
    /// </summary>
    public int FontSize { get; set; }

    /// <summary>
    /// Creates a chunk whose text is the full text of <paramref name="renderInfo"/>.
    /// </summary>
    /// <param name="rect">Rectangle occupied by the text.</param>
    /// <param name="renderInfo">Render information of the text; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="renderInfo"/> is null.</exception>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo)
        : this(rect, renderInfo, renderInfo == null ? null : renderInfo.GetText())
    {
    }

    /// <summary>
    /// Creates a chunk with an explicit text, e.g. a single word taken out of
    /// the text of <paramref name="renderInfo"/>.
    /// </summary>
    /// <param name="rect">Rectangle occupied by the text.</param>
    /// <param name="renderInfo">Render information the text comes from; must not be null.</param>
    /// <param name="text">Text to store in the chunk.</param>
    /// <exception cref="ArgumentNullException"><paramref name="renderInfo"/> is null.</exception>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo, string text)
    {
        // Fail with a named argument instead of a NullReferenceException
        // further down when the font is read.
        if (renderInfo == null)
            throw new ArgumentNullException("renderInfo");

        this.Rect = rect;
        this.Render = renderInfo;
        this.Text = text;
        this.Id = Guid.NewGuid();
        this.BF = renderInfo.GetFont();
        this.FontSize = ObtainFontSize();
    }

    /// <summary>
    /// Estimates the font size by scaling the reference size with the ratio
    /// between the rendered single-space width and the font's space width at
    /// the reference size.
    /// </summary>
    /// <returns>
    /// The estimated size rounded to an integer, or 0 when the font reports a
    /// non-positive width for a space.
    /// </returns>
    private int ObtainFontSize()
    {
        float referenceSpaceWidth = this.BF.GetWidthPoint(" ", ReferenceFontSize);

        // A zero width would make the division yield Infinity/NaN, which
        // Convert.ToInt32 rejects with an OverflowException.
        if (referenceSpaceWidth <= 0f)
            return 0;

        return Convert.ToInt32(this.Render.GetSingleSpaceWidth() * ReferenceFontSize / referenceSpaceWidth);
    }
}

/// <summary>
/// Location-based text extraction strategy that, on top of the base class
/// behavior, records every non-blank piece of rendered text together with
/// its bounding rectangle.
/// </summary>
public class LocationTextExtractionPersonalizada : LocationTextExtractionStrategy
{
    /// <summary>
    /// Chunks recorded by this strategy instance, in the order
    /// <see cref="RenderText"/> received them.
    /// </summary>
    public List<Chunk> ChunksInPage = new List<Chunk>();

    /// <summary>
    /// Forwards the render information to the base strategy and, when its
    /// text is not blank, stores it in <see cref="ChunksInPage"/> with the
    /// rectangle it covers.
    /// </summary>
    /// <param name="renderInfo">Render information of the text; null is ignored.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        // The null check has to come before the first dereference; testing it
        // after GetText() (as the original did) could never take effect.
        if (renderInfo == null)
            return;

        base.RenderText(renderInfo);

        if (string.IsNullOrWhiteSpace(renderInfo.GetText()))
            return;

        // Corners of the text: start of the descent line and end of the ascent line.
        // NOTE(review): treating these as bottom-left/top-right presumably
        // only holds for unrotated text - confirm if rotated text matters.
        var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
        var topRight = renderInfo.GetAscentLine().GetEndPoint();

        var rect = new Rectangle(
                           bottomLeft[Vector.I1],
                           bottomLeft[Vector.I2],
                           topRight[Vector.I1],
                           topRight[Vector.I2]);

        // Record the chunk with its coordinates.
        ChunksInPage.Add(new Chunk(rect, renderInfo));
    }
}

So once I get the file and so on, I proceed this way:

// Runs the custom extraction strategy over every page of pdfReader and then
// splits the chunks collected for that page into words.
private void ProcessContent()
{
    int pageNumber = 1;
    while (pageNumber <= pdfReader.NumberOfPages)
    {
        // A fresh strategy per page, so ChunksInPage only holds this page's chunks.
        LocationTextExtractionPersonalizada pageStrategy = new LocationTextExtractionPersonalizada();

        // Extracting the page text is what feeds the strategy its chunks.
        string pageText = PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, pageStrategy);

        // Here is where each word with its coordinates is wanted.
        List<Chunk> pageWords = ChunkRawToWord(pageStrategy.ChunksInPage);

        pageNumber++;
    }
}

/// <summary>
/// Splits every chunk into words and gives each word its own rectangle,
/// estimated from the chunk's rectangle and the font's text widths.
/// </summary>
/// <param name="chunks">Chunks to split; null entries are skipped.</param>
/// <returns>
/// The words of all chunks in order, an empty list when there is nothing to
/// split, or null when <paramref name="chunks"/> itself is null.
/// </returns>
private List<Chunk> ChunkRawToWord(IList<Chunk> chunks)
{
    // Null input keeps the original contract of returning null. An empty
    // list is no longer indexed, so it yields an empty result instead of
    // an ArgumentOutOfRangeException.
    if (chunks == null)
        return null;

    var words = new List<Chunk>();
    //Poor RegEx pattern to get the word and its trailing separators
    string pattern = @"[@&\w+]*(-*\/*\s*\:*\;*\,*\.*\(*\)*\%*\>*\<*)?";

    foreach (var chunk in chunks)
    {
        if (chunk == null || string.IsNullOrEmpty(chunk.Text))
            continue;

        var wordsInChunk = Regex.Matches(
                                          chunk.Text,
                                          pattern,
                                          RegexOptions.IgnoreCase);

        foreach (Match match in wordsInChunk)
        {
            // The pattern also produces empty and blank matches; they are not words.
            if (string.IsNullOrWhiteSpace(match.Value))
                continue;

            // Every word gets its own copy of the chunk rectangle. Sharing one
            // instance made each mutation overwrite the boxes of earlier words.
            var word = new Chunk(
                               new Rectangle(chunk.Rect),
                               chunk.Render,
                               match.Value);

            // Left edge = chunk's left edge + width of all text before the
            // match. Using match.Index also accounts for characters the
            // pattern skipped, and does not rely on list positions the way
            // words[j - 1] did.
            // NOTE(review): this is an estimate. It presumably ignores
            // character/word spacing, horizontal scaling and rotation, and
            // FontSize is rounded. GetCharacterRenderInfos() would give exact
            // per-character boxes - confirm which precision is required.
            float left = chunk.Rect.Left
                         + word.BF.GetWidthPoint(chunk.Text.Substring(0, match.Index), word.FontSize);

            word.Rect.Left = left;
            word.Rect.Right = left + word.BF.GetWidthPoint(word.Text, word.FontSize);
            words.Add(word);
        }
    }

    return words;
}

Afterwards, I wrote a comment on Mkl's solution and was told to "use getCharacterRenderInfos()", which I now do: it gives me every single character as an entry in a list of TextRenderInfo objects.

I'm sorry, but I'm starting to mix up concepts while trying to work out how to apply that solution, and it is blowing my mind.

I would really appreciate a hand here.

Upvotes: 0

Views: 1126

Answers (1)

Roger Saladrigas
Roger Saladrigas

Reputation: 356

You can use the method TextRenderInfo.GetCharacterRenderInfos() to get a collection of TextRenderInfo objects, one for each character in your chunk. Then you can regroup the individual characters into words and calculate the rectangle that contains each word using the coordinates of the first and last TextRenderInfo in that word.

In your custom text extraction strategy:

 // Single-character strings that end a word. Declared with an explicit type
 // because 'var' is not allowed on a field declaration.
 private static readonly string[] _separators = { "-", "(", ")", "/", " ", ":", ";", ",", "." };

 /// <summary>
 /// Splits the text of <paramref name="currentInfo"/> into words, character
 /// by character, and hands every word to <see cref="ProcessWord"/>.
 /// </summary>
 /// <remarks>
 /// A separator is kept as the last character of the word it ends, so
 /// consecutive separators produce words that consist of one separator only.
 /// </remarks>
 /// <param name="currentInfo">Render information of the chunk to split.</param>
 protected virtual void ParseRenderInfo(TextRenderInfo currentInfo)
 {
     var resultInfo = new List<TextRenderInfo>();
     var chars = currentInfo.GetCharacterRenderInfos();

     foreach (var charRenderInfo in chars)
     {
         resultInfo.Add(charRenderInfo);
         var currentChar = charRenderInfo.GetText();
         if (_separators.Contains(currentChar))
         {
             // End of a word: emit what was buffered and start a new one.
             ProcessWord(currentInfo, resultInfo);
             resultInfo.Clear();
         }
     }

     // Emit the trailing word; ProcessWord ignores an empty buffer.
     ProcessWord(currentInfo, resultInfo);
 }
 /// <summary>
 /// Builds one word from the buffered character render infos and appends it
 /// to _chunks. The word's location runs from the start of the first
 /// character's descent line to the end of the last character's ascent line.
 /// </summary>
 /// <param name="charChunk">Render information of the whole chunk; supplies the single-space width.</param>
 /// <param name="wordChunks">Character render infos that make up the word; nothing is added when empty.</param>
 private void ProcessWord(TextRenderInfo charChunk, List<TextRenderInfo> wordChunks)
 {
     // Nothing buffered means there is no word to emit.
     if (!wordChunks.Any())
     {
         return;
     }

     TextRenderInfo head = wordChunks.First();
     TextRenderInfo tail = wordChunks.Last();
     if (head == null || tail == null)
     {
         return;
     }

     var wordStart = head.GetDescentLine().GetStartPoint();
     var wordEnd = tail.GetAscentLine().GetEndPoint();
     var spaceWidth = charChunk.GetSingleSpaceWidth();

     // Concatenate the text of every character to get the word itself.
     string wordText = string.Concat(wordChunks.Select(info => info.GetText()));

     var wordLocation = new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(wordStart, wordEnd, spaceWidth);
     _chunks.Add(new CustomTextChunk(wordText, wordLocation));
 }

Upvotes: 2

Related Questions