Juanse
Juanse

Reputation: 95

pdfreader (Node.js) hangs while reading a remote PDF file

I'm using axios to retrieve a PDF file and the pdfreader library to extract its text.

This is the code that retrieves the data:

    // Request configuration for downloading the remote document.
    var options = {
        method: 'get',
        url: url,
        headers: { 'User-Agent': 'PostmanRuntime/7.26.8' },
        timeout: 5000,
        responseEncoding: 'utf8',
        maxRedirects: 15,
        httpAgent: new http.Agent({ keepAlive: true }),
        // Ask for the body as raw bytes so the PDF is passed on undecoded.
        responseType: 'arraybuffer'
    };

    let response = await axios(options);
    let response_text;

    // The content-type header can be absent; fall back to an empty string so
    // the check below cannot throw on undefined.
    if (String(response.headers['content-type'] || '').includes('pdf')) {
        // response.data already holds the complete body, so it is wrapped
        // directly instead of being concatenated onto an empty buffer.
        var buff = Buffer.from(response.data);

        let temp = await extract_pdf.readlines(buff).catch(function (error) { console.log("readlines Error :"+error); return []; });
        // Guard against a non-array result so join() cannot throw.
        response_text = (Array.isArray(temp) ? temp : []).join(' ').trim();
    }
    }

This is the extraction code:

const pdfreader = require("pdfreader");

/*
If the second parameter (xwidth) is set, a space ' ' is inserted whenever
text chunks are separated by more than xwidth.
This helps when words look separated on the page only because of their
x coordinates (there are no space characters between the words).

Each page is a different array element.
*/

/**
 * Extracts the text of an in-memory PDF, grouped by page and then by line.
 *
 * Text items reported by the parser are merged into the same line when they
 * share the same `y` coordinate on the current page.
 *
 * @param {Buffer} buffer - PDF content handed to `PdfReader.parseBuffer`.
 * @param {number} [xwidth] - When set, a space is inserted before a text item
 *   whose `x` is more than `xwidth` away from the `x` stored for its line.
 * @param {number} [timeoutMs=9000] - Maximum time to wait for the parser to
 *   report the end of the document.
 * @returns {Promise<Array<Array<Array<string>>>>} One element per page; each
 *   page is a list of lines and each line is a one-element array with its text.
 *   The promise rejects with an Error when the parser reports an error, throws
 *   synchronously, or does not finish within `timeoutMs`.
 */
async function readlines(buffer, xwidth, timeoutMs = 9000) {
 return new Promise((resolve, reject) => {
  const pdftxt = [];
  let pg = 0;
  let settled = false;
  let timer;

  // Settle exactly once and always release the timer, so a finished
  // extraction does not keep the event loop alive until the timeout.
  const settle = (fn, value) => {
   if (settled) { return; }
   settled = true;
   clearTimeout(timer);
   fn(value);
  };

  // The parser sometimes neither answers nor reports an error; this timer
  // makes sure the promise still settles in that case.
  // NOTE: a timer can only fire while the event loop is free; it cannot
  // interrupt a parser that blocks synchronously.
  timer = setTimeout(() => {
   settle(reject, new Error(`Promise timed out after ${timeoutMs} ms`));
  }, timeoutMs);

  try {
   new pdfreader.PdfReader().parseBuffer(buffer, (err, item) => {
    if (settled) { return; }

    if (err) {
     console.log("pdf reader error: " + err);
     // Propagate the failure instead of leaving the promise pending.
     settle(reject, err instanceof Error ? err : new Error("pdf reader error: " + err));
    } else if (!item) {
     // End of document: drop the y/x bookkeeping so only the text remains.
     pdftxt.forEach((page) => {
      page.forEach((line) => {
       line.splice(1, 2);
      });
     });
     settle(resolve, pdftxt);
    } else if (item.page) {
     pg = item.page - 1;
     pdftxt[pg] = [];
    } else if (item.text) {
     // A text item may arrive before any page item; make sure the page exists.
     if (!pdftxt[pg]) { pdftxt[pg] = []; }
     const lines = pdftxt[pg];
     let merged = false;
     let sp = "";
     lines.forEach((line) => {
      // Each line is stored as [text, y, x].
      if (line[1] === item.y) {
       if (xwidth && item.x - line[2] > xwidth) {
        sp += " ";
       } else {
        sp = "";
       }
       line[0] += sp + item.text;
       merged = true;
      }
     });
     if (!merged) {
      lines.push([item.text, item.y, item.x]);
     }
    }
   });
  } catch (err) {
   // A synchronous parser failure must also release the timer.
   settle(reject, err);
  }
 });
}

// Public API of this module.
module.exports = { readlines };

The problem arises with this particular address: https://recomedical.fr/69c5c1b33d30582973551f5fb7b7b54c/4f3c18607c2d381808000000.pdf

The script freezes and there's no way to interrupt the extraction and/or raise an error.

Thank you for your help

Upvotes: 1

Views: 588

Answers (1)

Juanse
Juanse

Reputation: 95

I found the answer to my question.

pdfreader works very badly with UTF-8 documents where there are foreign characters.

I had to switch to pdf-parse, which works much better.

Upvotes: 1

Related Questions