Reputation: 29
I'm extracting data from PDF to excel. In that PDF contains table also. I used Itext- pdf
to covert PDF to text & with the help of apache poi
covert text to excel. but I'm not able to retrieve the data to store in the database. I tried PDF-BOX
, ASPOSE
also Same result I'm getting. If any one knows, Please help me to solve this issue.
Here is my code
// pdf to text using itext
PdfReader reader = new PdfReader(
"C:\\Users\\mohmeds\\Desktop\\BOI_SCFS banking.pdf_page_1.pdf");
PdfReaderContentParser parser = new PdfReaderContentParser(
reader);
// PrintWriter out = new PrintWriter(new FileOutputStream(txt));
TextExtractionStrategy strategy;
String line = null;
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
strategy = parser.processContent(i,
new SimpleTextExtractionStrategy());
line = strategy.getResultantText();
}
reader.close();
// using apache poi text to excel converter
org.apache.poi.ss.usermodel.Workbook wb = new HSSFWorkbook();
CreationHelper helper = wb.getCreationHelper();
Sheet sheet = wb.createSheet("new sheet");
System.out.println("link------->" + line);
List<String> lines = IOUtils.readLines(new StringReader(line));
for (int i = 0; i < lines.size(); i++) {
String str[] = lines.get(i).split(",");
Row row = sheet.createRow((short) i);
for (int j = 0; j < str.length; j++) {
row.createCell(j).setCellValue(
helper.createRichTextString(str[j]));
}
}
FileOutputStream fileOut = new FileOutputStream(
"C:\\Users\\mohmeds\\Desktop\\someName1.xls");
wb.write(fileOut);
fileOut.close();
Upvotes: 1
Views: 5191
Reputation: 105
Your question is a little vague, but if you are looking to store data from the PDF into a database, it's likely you'll want to extract the data as CSV rather than Excel. Also, the code here eliminates the middle step of converting PDF to Text, then Text to Excel. When defining the format, choose 'csv':
package com.pdftables.examples;
import java.io.File;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.FileBody;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
public class ConvertToFile {
private static List<String> formats = Arrays.asList(new String[] { "csv", "xml", "xlsx-single", "xlsx-multiple" });
public static void main(String[] args) throws Exception {
if (args.length != 3) {
System.out.println("Command line: <API_KEY> <FORMAT> <PDF filename>");
System.exit(1);
}
final String apiKey = args[0];
final String format = args[1].toLowerCase();
final String pdfFilename = args[2];
if (!formats.contains(format)) {
System.out.println("Invalid output format: \"" + format + "\"");
System.exit(1);
}
// Avoid cookie warning with default cookie configuration
RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).build();
File inputFile = new File(pdfFilename);
if (!inputFile.canRead()) {
System.out.println("Can't read input PDF file: \"" + pdfFilename + "\"");
System.exit(1);
}
try (CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(globalConfig).build()) {
HttpPost httppost = new HttpPost("https://pdftables.com/api?format=" + format + "&key=" + apiKey);
FileBody fileBody = new FileBody(inputFile);
HttpEntity requestBody = MultipartEntityBuilder.create().addPart("f", fileBody).build();
httppost.setEntity(requestBody);
System.out.println("Sending request");
try (CloseableHttpResponse response = httpclient.execute(httppost)) {
if (response.getStatusLine().getStatusCode() != 200) {
System.out.println(response.getStatusLine());
System.exit(1);
}
HttpEntity resEntity = response.getEntity();
if (resEntity != null) {
final String outputFilename = getOutputFilename(pdfFilename, format.replaceFirst("-.*$", ""));
System.out.println("Writing output to " + outputFilename);
final File outputFile = new File(outputFilename);
FileUtils.copyToFile(resEntity.getContent(), outputFile);
} else {
System.out.println("Error: file missing from response");
System.exit(1);
}
}
}
}
private static String getOutputFilename(String pdfFilename, String suffix) {
if (pdfFilename.length() >= 5 && pdfFilename.toLowerCase().endsWith(".pdf")) {
return pdfFilename.substring(0, pdfFilename.length() - 4) + "." + suffix;
} else {
return pdfFilename + "." + suffix;
}
}
}
https://github.com/pdftables/java-pdftables-api/blob/master/pdftables.java
Upvotes: 1