Reputation: 95
I'm using axios to download a PDF file and the pdfreader library to extract its text.
This is the code that downloads the data:
// Request configuration: fetch the document as raw bytes so that a PDF
// payload is not mangled by a text decoding step.
var options = {
  method: 'get',
  url: url,
  headers: { 'User-Agent': 'PostmanRuntime/7.26.8' },
  timeout: 5000,
  responseEncoding: 'utf8',
  maxRedirects: 15,
  httpAgent: new http.Agent({ keepAlive: true }),
  responseType: 'arraybuffer'
};
let response = await axios(options);
let response_text;
// A response without a Content-Type header must not crash the check below.
const contentType = String(response.headers['content-type'] || '');
if (contentType.includes('pdf')) {
  // Hand the parser a Buffer; wrap the payload only when it is not one already.
  const pdfBuffer = Buffer.isBuffer(response.data)
    ? response.data
    : Buffer.from(response.data);
  const temp = await extract_pdf.readlines(pdfBuffer).catch(function (error) {
    console.log("readlines Error :" + error);
    return [];
  });
  // readlines may settle without a usable array; fall back to "no text"
  // instead of throwing on temp.join.
  const pages = Array.isArray(temp) ? temp : [];
  // Each element of pages is itself an array, so join() stringifies it with
  // Array.prototype.toString (nested entries end up comma-separated).
  response_text = pages.join(' ').trim();
}
}
This is the extraction code:
const pdfreader = require("pdfreader");
/**
 * Extracts the text of an in-memory PDF, grouped by page and then by line.
 *
 * Text chunks reported by the parser with the same y coordinate are merged
 * into a single line. Each page is a separate element of the result; each
 * line is a one-element array holding the merged text.
 *
 * @param {Buffer} buffer - Raw PDF bytes, passed to PdfReader#parseBuffer.
 * @param {number} [xwidth] - When set, a space ' ' is inserted whenever a
 *   chunk's x coordinate is more than xwidth away from the x coordinate
 *   recorded for its line. This helps when words look separated on the page
 *   only because of their x coordinates (there are no spaces between them).
 * @param {number} [timeoutMs=9000] - Maximum time to wait for the parser
 *   before giving up.
 * @returns {Promise<string[][][]>} Resolves with pages -> lines -> [text].
 * @throws {Error} Rejects when the parser reports or throws an error, or when
 *   it stays silent for longer than timeoutMs.
 */
async function readlines(buffer, xwidth, timeoutMs = 9000) {
  return new Promise((resolve, reject) => {
    const pages = [];
    let pageIndex = 0;
    let settled = false;

    // The parser sometimes neither answers nor reports an error; without this
    // timer the promise would never settle and the program would exit without
    // a log. Note that a timer can only fire while the event loop is free: it
    // cannot interrupt a parser that is stuck in synchronous work.
    const timer = setTimeout(() => {
      settle(reject, new Error(`Promise timed out after ${timeoutMs} ms`));
    }, timeoutMs);

    // Settles the promise exactly once and releases the timer, so a finished
    // extraction does not keep the process alive until the timeout elapses.
    function settle(finish, value) {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timer);
      finish(value);
    }

    const onItem = (err, item) => {
      if (settled) {
        return; // ignore anything the parser emits after we are done
      }

      if (err) {
        console.log("pdf reader error: " + err);
        // Fail immediately instead of leaving the caller waiting for the timeout.
        settle(reject, err instanceof Error ? err : new Error(String(err)));
        return;
      }

      if (!item) {
        // End of document: drop the y/x bookkeeping and keep only the text.
        pages.forEach((lines) => {
          lines.forEach((line) => {
            line.splice(1, 2);
          });
        });
        settle(resolve, pages);
        return;
      }

      if (item.page) {
        pageIndex = item.page - 1;
        pages[pageIndex] = [];
        return;
      }

      if (item.text) {
        // Text may arrive before any page marker; make sure the page exists.
        if (!pages[pageIndex]) {
          pages[pageIndex] = [];
        }

        let appended = false;
        let sp = "";
        pages[pageIndex].forEach((line) => {
          // line layout while parsing: [text, y, x of the line's first chunk]
          if (line[1] === item.y) {
            // NOTE(review): the gap is measured against the x of the line's
            // FIRST chunk, which is never updated - confirm this is intended.
            if (xwidth && item.x - line[2] > xwidth) {
              sp += " ";
            } else {
              sp = "";
            }
            line[0] += sp + item.text;
            appended = true;
          }
        });

        if (!appended) {
          pages[pageIndex].push([item.text, item.y, item.x]);
        }
      }
    };

    try {
      new pdfreader.PdfReader().parseBuffer(buffer, onItem);
    } catch (error) {
      // A synchronous parser failure must also release the timer.
      settle(reject, error instanceof Error ? error : new Error(String(error)));
    }
  });
}
// Public API of this module: the PDF text extraction entry point.
module.exports = { readlines };
The problem arises with this particular address: https://recomedical.fr/69c5c1b33d30582973551f5fb7b7b54c/4f3c18607c2d381808000000.pdf
The script freezes, and there is no way to interrupt the extraction or raise an error.
Thank you for your help
Upvotes: 1
Views: 588
Reputation: 95
I found the answer to my question.
pdfreader works very badly with UTF-8 documents that contain foreign characters.
I had to switch to pdf-parse, which works much better.
Upvotes: 1