Reputation: 1055
I'm just facing a little problem in getting the link to the PDF file when I run a script to convert PDF files to Google docs with OCR then populate the results into a google sheets.
So far I get the name of the file created (Google Docs) and the extracted text.
what I'm trying to achieve: on the 3rd column, I'd like to get the link of the PDF file corresponding to the Google docs created
I've tried to include the variable pdf to the push array:
var pdf = document.getId();
...
output.push([name, body, pdf]);
but I get the same ID for all files. Ideally I'd like to get the full link to the PDF file and not only it's ID, to avoid having to create a column with:
=CONCATENATE("https://drive.google.com/file/d/",C2)
where C2 contains the ID of the PDF file.
Here is the code:
function extractTextOnOpen() {
var folderName = "OCR TESTS";
var sheetId = "SHEET'S ID HERE";
//Define Project folder
var folder = DriveApp.getFoldersByName(folderName).next();
var folderId = folder.getId();
//Find all PDFs in folder
var documents = folder.getFilesByType("application/pdf");
while (documents.hasNext()) {
//Convert each PDF to a Google Doc with OCR
var document = documents.next();
// Get the PDF link to insert in the sheet for reference
var pdf = document.getId();
var imageName = document.getName();
var docName = imageName.split("\.")[0];
var file = {
title: docName,
mimeType: "application/pdf"
// for images, use: "image/png"
}
Drive.Files.insert(file, document, { ocr: true });
//Store newly-created Google Doc in the same project folder
var newFile = DriveApp.getFilesByName(docName).next();
folder.addFile(newFile);
var rootFolder = DriveApp.getRootFolder();
rootFolder.removeFile(newFile);
}
//Find all Google Docs in the project folder
var docs = folder.getFilesByType("application/vnd.google-apps.document");
//Set up spreadsheet
var ss = SpreadsheetApp.openById(sheetId);
SpreadsheetApp.setActiveSpreadsheet(ss);
Logger.log('File name: ' + ss.getName());
// specify the sheet to insert the results
var sheet = ss.getSheetByName('Sheet1');
// Set up the spreadsheet to display the results
var headers = [["File Name", "Test Paper Scanned", "PDF Link"]];
sheet.getRange("A1:C").clear()
sheet.getRange("A1:C1").setValues(headers);
var output = [];
//Populate spreadsheet with OCR text
while (docs.hasNext()) {
var file = docs.next();
var docId = file.getId();
var doc = DocumentApp.openById(docId);
var name = doc.getName();
var body = doc.getBody().getText();
output.push([name, body, pdf]);
// write data to the sheet
sheet.getRange(2, 1, output.length, 3).setValues(output);
}};
Upvotes: 0
Views: 929
Reputation: 9571
You have an unnecessary loop and so you're losing your opportunity to save the PDF URLs. I've changed the order of your code to show you how this works. Essentially, all of the work happens within the first loop where you iterate through the PDFs.*
function extractTextOnOpen() {
var folderName = "OCR TESTS";
var sheetId = "SHEET'S ID HERE";
//Set up spreadsheet
var ss = SpreadsheetApp.openById(sheetId);
// specify the sheet to insert the results
var sheet = ss.getSheetByName("Sheet1");
// Set up the spreadsheet to display the results
var headers = ["File Name", "Test Paper Scanned", "PDF Link"];
sheet.getRange("A1:C").clear()
var output = [headers];
//Define Project folder
var folder = DriveApp.getFoldersByName(folderName).next();
var folderId = folder.getId();
//Find all PDFs in folder
var pdfs = folder.getFilesByType("application/pdf");
while (pdfs.hasNext()) {
//Convert each PDF to a Google Doc with OCR
var pdf = pdfs.next();
var imageName = pdf.getName();
var docName = imageName.split("\.")[0];
var file = {
title: docName,
mimeType: "application/pdf"
};
var driveFile = Drive.Files.insert(file, pdf, { ocr: true });
//Store newly-created Google Doc in the same project folder
var newFile = DriveApp.getFileById(driveFile.id);
folder.addFile(newFile);
var rootFolder = DriveApp.getRootFolder();
rootFolder.removeFile(newFile);
//Get the Google Doc data
var doc = DocumentApp.openById(driveFile.id);
var name = doc.getName();
var body = doc.getBody().getText();
output.push([name, body, pdf.getUrl()]);
}
//Print to the sheet
sheet.getRange(1, 1, output.length, output[0].length).setValues(output);
}
In the code above, note that the Drive API returns a file, so that's used in the proceeding .getFileById()
calls. You can then use either .getUrl()
or .getDownloadUrl()
.
var driveFile = Drive.Files.insert(file, pdf, { ocr: true });
//Store newly-created Google Doc in the same project folder
var newFile = DriveApp.getFileById(driveFile.id);
Also, you're using the batch .setValues()
, which is faster, but you have it in a loop. I updated the script to print to the sheet only once at the very end.
* If you really wanted to go through two loops of PDFs and then Google Docs, then you'd need to map the PDF IDs to the Google Doc IDs in that first loop.
Upvotes: 2
Reputation: 26796
Sample:
var pdf = document.getUrl();
Upvotes: 0