Reputation: 561
I'm trying to use PDFBox 2.0 for text extraction. I would like to get the font size of individual characters and the bounding rectangle of each character on the page. I've implemented this in PDFBox 1.6 using a PDFTextStripper:
// Parse the raw stream into a COS document (PDFBox 1.x API).
PDFParser parser = new PDFParser(is);
// A parse failure is deliberately NOT swallowed here: carrying on after a failed
// parse() would hand an unparsed document to the stripper below.
parser.parse();
COSDocument cosDoc = parser.getDocument();
PDDocument pdd = new PDDocument(cosDoc);
// Collects the characters reported by the stripper callback below.
final StringBuffer extractedText = new StringBuffer();
try{
    PDFTextStripper textStripper = new PDFTextStripper(){
        // Called for each piece of text found on a page: record it and log its position.
        @Override
        protected void processTextPosition(TextPosition text) {
            extractedText.append(text.getCharacter());
            logger.debug("text position: "+text.toString());
        }
    };
    textStripper.setSuppressDuplicateOverlappingText(false);
    // Feed every page's content stream through the stripper, one page at a time.
    for(int pageNum = 0;pageNum<pdd.getNumberOfPages();pageNum++){
        PDPage page = (PDPage) pdd.getDocumentCatalog().getAllPages().get(pageNum);
        textStripper.processStream(page, page.findResources(), page.getContents().getStream());
    }
}finally{
    // Release the document even when processing a page fails.
    pdd.close();
}
But in the 2.0 version of PDFBox, the `processStream` method has been removed.
How can I achieve the same with PDFBox 2.0?
I've tried the following:
// Attempted PDFBox 2.0 port: load the document, then push each page through a
// PDFTextStripper subclass that inspects every TextPosition.
PDDocument pdd = PDDocument.load(inputStream);
PDFTextStripper textStripper = new PDFTextStripper(){
// Callback intended to receive each piece of positioned text.
@Override
protected void processTextPosition(TextPosition text){
// Offset of this text within the externally defined PDFdocument buffer.
int pos = PDFdocument.length();
String textadded = text.getUnicode();
// Character range the text would occupy, starting at pos.
Range range = new Range(pos,pos+textadded.length());
int pagenr = this.getCurrentPageNo();
// Rectangle built from the TextPosition's x, y, width and height.
// Note: range, pagenr and rect are computed but not used further in this snippet.
Rectangle2D rect = new Rectangle2D.Float(text.getX(),text.getY(),text.getWidth(),text.getHeight());
}
};
textStripper.setSuppressDuplicateOverlappingText(false);
// Hands every page directly to processPage instead of going through getText/writeText.
// NOTE(review): presumably processPage only handles a page when the stripper's current
// page number lies inside its start/end page window, and that counter is only advanced
// by writeText/getText - which would explain why the callback never fires here.
// Confirm against the PDFBox 2.0 PDFTextStripper sources.
for(int pageNum = 0;pageNum<pdd.getNumberOfPages();pageNum++){
PDPage page = (PDPage) pdd.getDocumentCatalog().getPages().get(pageNum);
textStripper.processPage(page);
}
pdd.close();
The `processTextPosition(TextPosition text)` method does not get called.
Any suggestions would be very welcome.
Upvotes: 3
Views: 18265
Reputation: 1
Here is an implementation which uses @tilmanhausherr's suggestion(s):
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.UncheckedIOException;
import java.io.Writer;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
/**
 * A {@link PDFTextStripper} that prints position and font metrics for every
 * {@link TextPosition} it encounters, one page at a time.
 */
class PDFParserTextStripper extends PDFTextStripper
{
    /**
     * Creates a stripper bound to an already loaded document.
     *
     * @param pdd the document whose pages will be stripped; it is not closed by this class
     * @throws IOException if the superclass constructor fails
     */
    public PDFParserTextStripper(PDDocument pdd) throws IOException
    {
        super();
        document = pdd;
    }

    /**
     * Processes a single page and reports its text positions through
     * {@link #writeString(String, List)}.
     *
     * @param pageNr zero-based page index; converted to the one-based start/end page
     * @throws IOException if processing the page fails
     */
    public void stripPage(int pageNr) throws IOException
    {
        this.setStartPage(pageNr+1);
        this.setEndPage(pageNr+1);
        // Scratch writer: its contents are never read; it is closed once the page is done.
        try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream()))
        {
            writeText(document,dummy); // This call starts the parsing process and calls writeString repeatedly.
        }
    }

    /**
     * Prints one line per text position to standard output, containing the adjusted
     * x/y coordinates, font size in points, x scale, height, width of a space,
     * adjusted width and the Unicode text.
     *
     * @param string the text of the run (unused; each position is reported individually)
     * @param textPositions the positions making up the run
     * @throws IOException declared by the overridden method; not thrown here
     */
    @Override
    protected void writeString(String string,List<TextPosition> textPositions) throws IOException
    {
        for (TextPosition text : textPositions)
        {
            System.out.println("String[" + text.getXDirAdj()+","+text.getYDirAdj()+" fs="+text.getFontSizeInPt()+" xscale="+text.getXScale()+" height="+text.getHeightDir()+" space="+text.getWidthOfSpace()+" width="+text.getWidthDirAdj()+" ] "+text.getUnicode());
        }
    }

    /**
     * Loads a PDF from the given stream and strips every page in order.
     * The document is always closed; the stream itself is left for the caller to close.
     *
     * @param inputStream stream positioned at the start of a PDF
     * @throws UncheckedIOException if loading, stripping or closing the document fails
     */
    public static void extractText(InputStream inputStream)
    {
        // try-with-resources closes the document on every path, including failures.
        try (PDDocument pdd = PDDocument.load(inputStream))
        {
            PDFParserTextStripper stripper = new PDFParserTextStripper(pdd);
            stripper.setSortByPosition(true);
            int pageCount = pdd.getNumberOfPages();
            for (int i = 0; i < pageCount; i++)
            {
                stripper.stripPage(i);
            }
        }
        catch (IOException e)
        {
            // Surface the failure instead of silently dropping it; the signature
            // declares no checked exception, so wrap it and keep the cause.
            throw new UncheckedIOException("Failed to extract text from PDF", e);
        }
    }

    /**
     * Command-line entry point: extracts text positions from a hard-coded file path.
     *
     * @param args unused
     * @throws IOException if the file cannot be opened or closed
     */
    public static void main(String[] args) throws IOException
    {
        File f = new File("C:\\PathToYourPDF\\pdfFile.pdf");
        try (InputStream fis = new FileInputStream(f))
        {
            extractText(fis);
        }
    }
}
Upvotes: 0
Reputation: 561
The `DrawPrintTextLocations` example, suggested by @tilmanhausherr, provided the solution to my problem.
The parser is started using the following code (`inputStream` is the input stream from the URL of the PDF file):
// Load the PDF and strip it page by page; try-with-resources closes the document
// on every path, and a failure while closing is ignored just like any other
// IOException reaching the catch block below.
try (PDDocument pdd = PDDocument.load(inputStream)) {
    // NOTE(review): the two-argument constructor used here is not declared by the
    // PDFParserTextStripper class shown with this snippet - confirm the intended signature.
    PDFParserTextStripper stripper = new PDFParserTextStripper(PDFdocument,pdd);
    stripper.setSortByPosition(true);
    int pageIndex = 0;
    while (pageIndex < pdd.getNumberOfPages()) {
        stripper.stripPage(pageIndex);
        pageIndex++;
    }
} catch (IOException e) {
    // throw error
}
This code uses a custom subclass of `PDFTextStripper`:
/**
 * A {@link PDFTextStripper} that prints position and font metrics for every
 * {@link TextPosition} it encounters, one page at a time.
 */
class PDFParserTextStripper extends PDFTextStripper {
    /**
     * Creates a stripper without a document. {@link #stripPage(int)} cannot be used
     * until the inherited {@code document} field has been set.
     *
     * @throws IOException if the superclass constructor fails
     */
    public PDFParserTextStripper() throws IOException {
        super();
    }

    /**
     * Creates a stripper bound to an already loaded document.
     *
     * @param pdd the document whose pages will be stripped; it is not closed by this class
     * @throws IOException if the superclass constructor fails
     */
    public PDFParserTextStripper(PDDocument pdd) throws IOException {
        super();
        document = pdd;
    }

    /**
     * Processes a single page and reports its text positions through
     * {@link #writeString(String, List)}.
     *
     * @param pageNr zero-based page index; converted to the one-based start/end page
     * @throws IOException if processing the page fails
     * @throws IllegalStateException if no document has been set on this stripper
     */
    public void stripPage(int pageNr) throws IOException {
        // Fail fast with a clear message rather than passing a null document to writeText.
        if (document == null) {
            throw new IllegalStateException("No PDDocument has been set on this stripper");
        }
        this.setStartPage(pageNr+1);
        this.setEndPage(pageNr+1);
        // Scratch writer: its contents are never read; it is closed once the page is done.
        try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
            writeText(document,dummy); // This call starts the parsing process and calls writeString repeatedly.
        }
    }

    /**
     * Prints one line per text position to standard output, containing the adjusted
     * x/y coordinates, font size in points, x scale, height, width of a space,
     * adjusted width and the Unicode text.
     *
     * @param string the text of the run (unused; each position is reported individually)
     * @param textPositions the positions making up the run
     * @throws IOException declared by the overridden method; not thrown here
     */
    @Override
    protected void writeString(String string,List<TextPosition> textPositions) throws IOException {
        for (TextPosition text : textPositions) {
            System.out.println("String[" + text.getXDirAdj()+","+text.getYDirAdj()+" fs="+text.getFontSizeInPt()+" xscale="+text.getXScale()+" height="+text.getHeightDir()+" space="+text.getWidthOfSpace()+" width="+text.getWidthDirAdj()+" ] "+text.getUnicode());
        }
    }
}
Upvotes: 4