Reputation: 1333
Following this actual solution I am trying to get all the words inside a TextChunk
and each of its coordinates (actual page
, top
, bottom
, left
, right
).
Since a TextChunk
could be a phrase, a word or whatever, I tried to do this manually, counting on the last word's rectangle and cutting it each time. I noticed this manual method could be so buggy (I would need to manually count on special characters and so on), so I asked myself if ITextSharp provides any easier way to perform this.
My Chunk
and LocationTextExtractionStragy
inherited classes are the following:
public class Chunk
{
public Guid Id { get; set; }
public Rectangle Rect { get; set; }
public TextRenderInfo Render { get; set; }
public BaseFont BF { get; set; }
public string Text { get; set; }
public int FontSize { get; set; }
public Chunk(Rectangle rect, TextRenderInfo renderInfo)
{
this.Rect = rect;
this.Render = renderInfo;
this.Text = Render.GetText();
Initialize();
}
public Chunk(Rectangle rect, TextRenderInfo renderInfo, string text)
{
this.Rect = rect;
this.Render = renderInfo;
this.Text = text;
Initialize();
}
private void Initialize()
{
this.Id = Guid.NewGuid();
this.BF = Render.GetFont();
this.FontSize = ObtainFontSize();
}
private int ObtainFontSize()
{
return Convert.ToInt32(this.Render.GetSingleSpaceWidth() * 12 / this.BF.GetWidthPoint(" ", 12));
}
}
public class LocationTextExtractionPersonalizada : LocationTextExtractionStrategy
{
//Save each coordinate
public List<Chunk> ChunksInPage = new List<Chunk>();
//Automatically called on each chunk on PDF
public override void RenderText(TextRenderInfo renderInfo)
{
base.RenderText(renderInfo);
if (string.IsNullOrWhiteSpace(renderInfo.GetText())
|| renderInfo == null)
return;
//Get chunk Vectors
var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
var topRight = renderInfo.GetAscentLine().GetEndPoint();
//Create Rectangle based on previous Vectors
var rect = new Rectangle(
bottomLeft[Vector.I1],
bottomLeft[Vector.I2],
topRight[Vector.I1],
topRight[Vector.I2]);
if (rect == null)
return;
//Add each chunk with its coordinates
ChunksInPage.Add(new Chunk(rect, renderInfo));
}
}
So once I get the file and so on, I proceed this way:
private void ProcessContent()
{
for (int page= 1; page <= pdfReader.NumberOfPages; page++)
{
var strategy = new LocationTextExtractionPersonalizada();
var currentPageText = PdfTextExtractor.GetTextFromPage(
pdfReader,
page,
strategy);
//Here is where I want to get each word with its coordinates
var chunksWords= ChunkRawToWord(strategy.ChunksInPage);
}
}
private List<Chunk> ChunkRawToWord(IList<Chunk> chunks)
{
if (chunks == null || chunks[0] == null)
return null;
var words = new List<Chunk>();
//Poor RegEx pattern to get the word and its wathever
string pattern = @"[@&\w+]*(-*\/*\s*\:*\;*\,*\.*\(*\)*\%*\>*\<*)?";
var something = chunks[0].Render.GetCharacterRenderInfos();
for (int i = 0; i < chunks.Count; i++)
{
var wordsInChunk = Regex.Matches(
chunks[i].Text,
pattern,
RegexOptions.IgnoreCase);
var rectangleChunk = new Rectangle(chunks[i].Rect);
for (int j = 0; j < wordsInChunk.Count; j++)
{
if (string.IsNullOrWhiteSpace(wordsInChunk[j].Value))
continue;
var word = new Chunk(
rectangleChunk,
chunks[i].Render,
wordsInChunk[j].ToString());
if (j == 0)
{
word.Rect.Right = word.BF.GetWidthPoint(word.Text, word.FontSize);
words.Add(word);
continue;
}
if (words.Count <= 0)
continue;
word.Rect.Left = words[j - 1].Rect.Right;
word.Rect.Right = words[j - 1].Rect.Right + word.BF.GetWidthPoint(word.Text, word.FontSize);
words.Add(word);
}
}
return words;
}
Afterwards, I wrote a comment on Mkl's solution, being replied with "use getCharacterRenderInfos()", which I use and I get every single character into a TextRenderInfo's List.
I'm sorry but I'm starting to mix concepts, ways to find out how to apply that solution and blowing my mind.
I would really appreciate a hand here.
Upvotes: 0
Views: 1126
Reputation: 356
You can use the method TextRenderInfo.GetCharacterRenderInfos()
to get a collection of TextRenderInfo
for each and every char in your chunk. Then you can could regroup the individual characters into words and calculate the rectangle that contains the word using the coordinates of the first and last TextRenderInfo
in that word.
In your custom text extraction strategy:
var _separators = new[] { "-", "(", ")", "/", " ", ":", ";", ",", "."};
protected virtual void ParseRenderInfo(TextRenderInfo currentInfo)
{
var resultInfo = new List<TextRenderInfo>();
var chars = currentInfo.GetCharacterRenderInfos();
foreach (var charRenderInfo in chars)
{
resultInfo.Add(charRenderInfo);
var currentChar = charRenderInfo.GetText();
if (_separators.Contains(currentChar))
{
ProcessWord(currentInfo, resultInfo);
resultInfo.Clear();
}
}
ProcessWord(currentInfo, resultInfo);
}
private void ProcessWord(TextRenderInfo charChunk, List<TextRenderInfo> wordChunks)
{
var firstRender = wordChunks.FirstOrDefault();
var lastRender = wordChunks.LastOrDefault();
if (firstRender == null || lastRender == null)
{
return;
}
var startCoords = firstRender.GetDescentLine().GetStartPoint();
var endCoords = lastRender.GetAscentLine().GetEndPoint();
var wordText = string.Join("", wordChunks.Select(x => x.GetText()));
var wordLocation = new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(startCoords, endCoords, charChunk.GetSingleSpaceWidth());
_chunks.Add(new CustomTextChunk(wordText, wordLocation));
}
Upvotes: 2