Ayushi Gupta
Ayushi Gupta

Reputation: 91

How to Extract data from pdf file in nodejs

I have already used 'pdf.js-extract' npm module to fetch data from pdf.

var PDFExtract = require('pdf.js-extract').PDFExtract;

var pdfExtract = new PDFExtract();
var filename="/home/aman/Downloads/sample_invoice.pdf"

pdfExtract.extract(filename , function (err, data) {
    if (err) return console.log(err);
    console.log(JSON.stringify(data));
});

But I am not getting the desired result. I want to fetch the relevant information from invoice pdf like tax, total amount paid, seller address and save the data fetched into the mongodb collection

Upvotes: 7

Views: 24202

Answers (4)

Dario Paez
Dario Paez

Reputation: 59

File readPdf.js

const readPdf = (file) => new Promise((resolve, reject) => {
      try {
          pdfExtract.extract( file, function (error, text) {
            (error) ? reject(new Error('El archivo no se pudo leer')) : resolve(text)
            return text;
          });
        // Set up the timeout
        setTimeout(function () {
          reject('Promise timed out after ' + 10000 + ' ms');
        }, 10000);
        return data;
      } catch (error) {
        return false;
      }
    });
    module.exports = { readPdf };

File xxx.js

var {readPdf}= require('readPdf');   

  readPdf(files)
      .then(response => {
       console.log(response) // this is your data is 
        }).catch(err => console.log(err));
        return response;
      });

Upvotes: -1

Fairouz Amor
Fairouz Amor

Reputation: 1

using the pdf-extract npm package (https://www.npmjs.com/package/pdf-extract) allows you to extract text from a pdf .

// Extract text from PDF files (with images)
// Installation guide: https://github.com/nisaacson/pdf-extract

var extract = (function() {

  'use strict';

  var fs = require('fs');
  var path = require('path');
  var pdfExtract = require('pdf-extract');

  var defaultOptions = {
    type: 'ocr',
    ocr_flags: [
      '-l eng',
    ]
  };

  // Execute script if not used as a module
  if (!module.parent) {

    init(process.argv[2]);
  }

  function init(filePath, options, callback) {

    callback = callback || function (error, response) {

      if (error) { return console.error(error); }

      return console.log(response);
    };

    options = options || defaultOptions;

    if (!filePath) {

      return callback(new Error('No input file (PDF) specified.'));
    }

    processFile(filePath, ocrLanguage, callback);
  }

  function processFile(filePath, ocrLanguage, callback) {

    var processor = pdfExtract(filePath, options, function (error) {

      if (error) {

        callback(error);
      }
    });

    processor.on('complete', function (data) {

      saveFile(filePath + '.txt', data.text_pages, callback);
    });

    processor.on('error', function (error) {

      callback(error);
    });
  }

  function saveFile(filePath, string, callback) {

    // Normalize file path
    filePath = path.normalize(filePath);

    try {

      callback('Saved file ' + filePath);

      // Save file
      return fs.writeFileSync(filePath, string, 'utf8');
    } catch (error) {

      callback(error);
    }
  }

  module.exports = {

    init: init
  };
}());

Upvotes: -1

sanjeev kumar
sanjeev kumar

Reputation: 71

Please refer to GitHub Repository of pdf.js-extract npm module https://github.com/ffalt/pdf.js-extract

Below file is given at example/example.js path at github link

var fs = require('fs');
var PDFExtract = require('../lib').PDFExtract;
var pdfExtract = new PDFExtract();
pdfExtract.extract('./example.pdf', {} /* options*/, function (err, data) {
    if (err) return console.log(err);
    fs.writeFileSync('./example-output.json', JSON.stringify(data, null, '\t'));
    var lines = PDFExtract.utils.pageToLines(data.pages[0], 2);
    var rows = PDFExtract.utils.extractTextRows(lines);
    var text = rows.map(function (row) {
        return row.join('');
    }).join('\n');
    fs.writeFileSync('./example-output.txt', text);
    console.log(JSON.stringify(data, null, '\t'));
});

Hope it will work for you

Upvotes: 0

Liberateur
Liberateur

Reputation: 1487

You must write a function by invoice format (fn company1, fn company2...).

Here is an example with three different functions to retrieve data in the export of the pdf.js-extract module:

// Sample invoice
let sampleInvoice =
{
  "pages":
  [
    {
      "content":
      [
        {
          "x": 348.41,
          "y": 125.59899999999993,
          "str": "Invoice Number",
          "dir": "ltr",
          "width": 61.61760000000001,
          "height": 8.8,
          "fontName": "g_d0_f2"
        },
        {
          "x": 451.935,
          "y": 125.59899999999993,
          "str": "INV-3337",
          "dir": "ltr",
          "width": 37.171200000000006,
          "height": 8.8,
          "fontName": "g_d0_f2"
        }
      ]
    }
  ]
};


// Create alerts for test functions in browser
alert(searchByPosition(sampleInvoice.pages, 450, 125));
alert(searchByPrev(sampleInvoice.pages, 'Invoice Number'));
alert(searchByFormat(sampleInvoice.pages, /INV-\d+$/));


function searchByPosition(pages,x,y)
{
    // Set position range (difference max)
    let range = 10;

    // Init x and y positions
    x = Math.floor(x/range), y = Math.floor(y/range);

    // Loop in all pages
    for(let i = 0; i < pages.length; i++)

        // Loop in all content
        for(let j = 0; j < pages[i].content.length; j++)

            // Test position x and y and if match return content
            if(Math.floor(pages[i].content[j].x/range) == x && Math.floor(pages[i].content[j].y/range) == y)

                // Return result
                return pages[i].content[j].str;

    // No results found
    return 'NotFound';
}


function searchByPrev(pages,txt)
{
    // Init txt
    txt = txt.toLowerCase();

    // Loop in all pages
    for(let i = 0; i < pages.length; i++)

        // Loop in all content
        for(let j = 0; j < pages[i].content.length; j++)

            // Test text  and if match return next content
            // (If you write j-1, you can have searchByNext function)
            if(pages[i].content[j].str.toLowerCase() == txt && pages[i].content[j+1])

                // Return result
                return pages[i].content[j+1].str;

    // No results found
    return 'NotFound';
}


function searchByFormat(pages,regex)
{
    // Loop in all pages
    for(let i = 0; i < pages.length; i++)

        // Loop in all content
        for(let j = 0; j < pages[i].content.length; j++)

            // Test regex and if match return content
            if(regex.test(pages[i].content[j].str))

                // Return result
                return pages[i].content[j].str;

    // No results found
    return 'NotFound';
}

TRY HERE : https://jsfiddle.net/dkhqzg6s/

Upvotes: 1

Related Questions