Reputation: 105
I have a PDF file saved in Google Drive. I want to find a piece of text in that file (e.g. USD), pick the value next to the found text (e.g. 167.1764), and insert that value into my Google spreadsheet.
Below is the preview of my PDF File. Link to my PDF File.
Here is the code I tried; it fails to find the text and to reach the value next to it.
My code is below:
/**
 * Asker's attempt at reading a value out of a PDF stored in Google Drive.
 *
 * Looks up '08-Sep-2021.pdf' in a Drive folder, uploads its blob through
 * Drive.Files.insert with OCR options, opens the inserted file with
 * DocumentApp, logs the body text, and then searches the body for "(USD)".
 *
 * Takes no parameters and returns nothing; output goes to Logger only.
 * Needs the Advanced Drive service (the `Drive` global) to be enabled.
 */
function extractTextFromPDF() {
var drive = DriveApp;
var folders = drive.getFolderById('folderid');
var newfile = folders.getFilesByName('08-Sep-2021.pdf');
// NOTE(review): file1 is assigned only when the iterator has an entry. If no
// file matches, it stays undefined and blob.getName() below throws.
if(newfile.hasNext()){
var file1 = newfile.next().getBlob();
}
var blob = file1;
var resource = {
title: blob.getName(),
mimeType: blob.getContentType()
};
// Enable the Advanced Drive API Service
var file = Drive.Files.insert(resource, blob, {ocr: true, ocrLanguage: "en"});
// Extract Text from PDF file
var doc = DocumentApp.openById(file.id);
var text = doc.getBody().getText();
Logger.log(text);
// NOTE(review): the cleanup call is commented out, so the file created by
// Drive.Files.insert above is left in Drive after every run.
//DriveApp.getFileById(file.id).setTrashed(true);
var body = doc.getBody();
// NOTE(review): findText presumably takes a regular-expression pattern, so the
// parentheses in "(USD)" would act as a group, not literal characters - confirm.
var foundElement = body.findText("(USD)");
// NOTE(review): foundElement is never reassigned inside this loop, so once a
// match is found the condition stays true and the loop does not terminate.
while (foundElement != null) {
// Get the text object from the element
var foundText = foundElement.getElement().asText();
// Where in the element is the found text?
// (start and end are computed here but never read afterwards.)
var start = foundElement.getStartOffset();
var end = foundElement.getEndOffsetInclusive();
}
// Goal: log the value of USD, i.e. 167.1144.
// NOTE(review): foundText holds the result of asText() on the matched element
// (or undefined when nothing matched), not the number that follows "USD".
Logger.log(foundText);
}
Upvotes: 0
Views: 1258
Reputation: 1610
With the help of a regular expression (RegEx) you can extract this. I'm not the best with those patterns, so maybe somebody else can optimize them so that the split is not necessary. (Here is a link.)
The code:
/**
 * Reads the USD buying and selling rates out of '08-Sep-2021.pdf'.
 *
 * The PDF blob is uploaded through the Advanced Drive service with
 * `convert: true`, the inserted file is opened with DocumentApp, and the two
 * lines that follow "USD" in its body text are logged with console.log.
 * The temporary converted file is trashed afterwards, even when reading or
 * parsing fails.
 *
 * Takes no parameters and returns nothing.
 *
 * @throws {Error} If the PDF is not found in the folder, or if the converted
 *     text does not contain the expected lines after "USD".
 */
function extractTextFromPDF() {
  const fileName = '08-Sep-2021.pdf';
  const folder = DriveApp.getFolderById('1QVo_pxxx387WPH9Yx');
  const candidates = folder.getFilesByName(fileName);
  // Fail with a clear message instead of crashing later on an undefined blob.
  if (!candidates.hasNext()) {
    throw new Error(`File "${fileName}" was not found in the folder.`);
  }
  const blob = candidates.next().getBlob();
  const resource = {
    title: blob.getName(),
    mimeType: blob.getContentType(),
  };
  // Enable the Advanced Drive API Service
  const file = Drive.Files.insert(resource, blob, { convert: true });
  try {
    // Extract Text from PDF file
    const text = DocumentApp.openById(file.id).getBody().getText();
    Logger.log(text);

    // Runs a one-shot pattern against the text and returns its first capture
    // group, trimmed. No `g` flag: each literal is used exactly once.
    const captureLine = (pattern, label) => {
      const match = pattern.exec(text);
      if (match === null) {
        throw new Error(
          `Could not find the ${label} rate after "USD" in the converted text.`
        );
      }
      return match[1].trim();
    };

    // Buying rate: the line directly after "USD".
    const buying = captureLine(/USD\n(.*?)$/m, 'buying');
    // Selling rate: the line after the buying rate.
    const selling = captureLine(/USD\n\s*\S*\n(.*?)$/m, 'selling');
    console.log(buying);
    console.log(selling);
  } finally {
    // Remove the converted file, whether or not extraction succeeded.
    DriveApp.getFileById(file.id).setTrashed(true);
  }
}
Upvotes: 3