Reputation: 27
// Reads every file in ./pdfs and hands each one to pdfParser.pdf2json, using callbacks throughout.
// NOTE: the loop only *starts* each parse; nothing waits for the pdf2json callbacks,
// so the logging/stringify code after the loop runs before any callback has fired.
var pdfParser = require('pdf-parser')
var fs = require('fs')
var PDF_PATH = __dirname + '/pdfs'
// Filled in from inside the pdf2json callbacks
var results = []
// Names of files whose parse result had no 'pages'
var failed = []
fs.readdir(PDF_PATH, function(err, files){
if(err){
return console.log(err)
}
for(const file of files){
// NOTE: declared once per file but never read anywhere in this snippet
let the_ent = {
'name': '',
'other data': []
}
// Kick off the parse; the callback below is invoked later, once this file has been handled
pdfParser.pdf2json(PDF_PATH + '/' + file, function(error, pdf){
if(error != null){
console.log(error)
}else if(pdf['pages'] == undefined){
failed.push(file)
console.log(file +' failed')
}else{
//populate 'results' array
}
// NOTE: these two statements sit after the if/else chain, so they run for every
// file, including the error and 'failed' branches
console.log(/*pdf_data*/)
results.push(/*pdf_data*/)
})
}
// Reached as soon as the loop has issued its pdf2json calls, i.e. before the callbacks
// above have pushed anything into results or failed
console.log(results)
console.log(failed)
results = JSON.stringify(results)
//fs.writeFileSync() write results to json
})
I don't know what is wrong with me this morning, I can't work out how to write this in async; obviously the logs/writefile at the bottom fire as soon as the script executes.
I have tried wrapping everything in async functions and awaiting the readdir / PDF parsing instead of using callbacks, but clearly not correctly. I'm just trying to parse every PDF in a folder, push what I want into some arrays, and then log them once the loop finishes.
Upvotes: 1
Views: 1400
Reputation: 707238
I would promisify the async operations and use async/await. For the fs operations, use the new fs.promises API. For others, use util.promisify() to make promisified versions.
The parsePDFs function I create resolves to an object holding an array of parsed PDF results (as JSON objects) and an array of failed filenames, so you get both pieces of information back:
const util = require('util');
const pdfParser = require('pdf-parser');
// Make a promisified version of pdf2json and attach it to the parser object.
// This is a plain assignment: `const` can only declare identifiers, not
// object properties, so `const pdfParser.pdf2jsonP = ...` does not parse.
pdfParser.pdf2jsonP = util.promisify(pdfParser.pdf2json);
const fsp = require('fs').promises;
const path = require('path');
// Folder holding the PDFs to parse, located next to this script
const PDF_PATH = path.join(__dirname, 'pdfs');
/**
 * Parse every file in `dir` as a PDF, one file at a time.
 *
 * A file whose parse rejects, or whose parsed result has no `pages`, is
 * logged and recorded in `failed` rather than aborting the whole run.
 *
 * @param {string} dir - Directory containing the files to parse.
 * @returns {Promise<{results: Array, failed: string[]}>} The parsed PDFs and
 *   the names of the files that could not be parsed. The promise rejects only
 *   if the directory listing itself fails.
 */
async function parsePDFs(dir) {
    const files = await fsp.readdir(dir);
    const results = [];
    const failed = [];
    for (const file of files) {
        try {
            const pdf = await pdfParser.pdf2jsonP(path.join(dir, file));
            if (!pdf || pdf.pages === undefined) {
                // routed through the catch below so both failure kinds are handled in one place
                throw new Error("pdf.pages is empty");
            }
            results.push(pdf);
        } catch (e) {
            console.log(e);
            failed.push(file);
        }
    }
    // this will be the resolved value of the returned promise
    return {results, failed};
}
// Run the parser and report both outcomes once every file has been handled.
parsePDFs(PDF_PATH).then(data => {
    console.log("failed filenames: ", data.failed);
    console.log("json results: ", data.results);
    // do something with data.results and data.failed
}).catch(err => {
    // only reached if parsePDFs itself rejects or the handler above throws
    console.log(err);
});
Note: You declare, but never use, the variable the_ent.
Upvotes: 2
Reputation: 18909
You can use util.promisify to promisify the callback-based functions:
// Promise-returning wrappers around fs.readdir and pdfParser.pdf2json, so both can be awaited
const readdir = util.promisify(fs.readdir);
const reader = util.promisify(pdfParser.pdf2json);
Minimal demo:
const fs = require('fs');
const util = require('util');
const pdfParser = require('pdf-parser');

// Promise-returning versions of the callback-style APIs
const readdir = util.promisify(fs.readdir);
const reader = util.promisify(pdfParser.pdf2json);

const PDF_PATH = __dirname + '/pdfs';

(async () => {
  /**
   * Parse every file in PDF_PATH, one after another.
   * @returns {Promise<Array>} the parsed result for each file, in the order
   *   readdir listed them
   */
  async function processFiles() {
    const results = [];
    const files = await readdir(PDF_PATH);
    for (const file of files) {
      const pdf = await reader(PDF_PATH + '/' + file);
      results.push(pdf);
    }
    return results;
  }

  const result = await processFiles();
  console.info(result);
})().catch((err) => {
  // Without this handler, a failed directory read or parse would surface
  // as an unhandled promise rejection.
  console.error(err);
});
Upvotes: 1
Reputation: 138257
Wrap the smallest asynchronous tasks into Promises, then use async/await to combine them:
// the Promise wrapper: resolves with the parsed PDF, rejects with the parser's error
const parsePdf = file => new Promise((res, rej) =>
    pdfParser.pdf2json(file, (err, r) => err ? rej(err) : res(r))
);
// Parse every PDF in ./pdfs in series, then log what was collected.
(async function () { // start an asynchronous context
    const PDF_PATH = __dirname + '/pdfs';
    const results = [], failed = []; // prefer const over let
    // reading files in a promising way is already provided natively:
    const files = await fs.promises.readdir(PDF_PATH);
    for (const file of files) { // this is in series, in parallel would probably be faster
        let the_ent = {
            name: '',
            'other data': [], // avoid whitespaces!
        };
        try {
            const pdf = await parsePdf(PDF_PATH + '/' + file);
            if (!pdf || pdf.pages === undefined) { // prefer strict equality (===) over loose equality (==)
                failed.push(file);
                console.log(file + ' failed');
            } else {
                // populate 'results' array
            }
        } catch (error) {
            // a parser error affects only this file: log it and carry on with
            // the next one instead of abandoning the whole loop
            console.log(error);
        }
    }
    console.log(results, failed);
})().catch(err => {
    // reached when the directory listing itself fails
    console.log(err);
});
You can probably process the files in parallel too.
Upvotes: 2