Water223
Water223

Reputation: 75

local PDF file scraping in node.js

I have uploaded a PDF via a MEAN stack web application using fs. I want to extract certain fields from the PDF and display them in the web app. I have looked at a couple of npm packages, such as pdf.js and pdf2json, but I can't figure out the documentation or the JavaScript callbacks used in the available examples. Please help!

Upvotes: 1

Views: 7934

Answers (2)

Water223
Water223

Reputation: 75

I am running the code from my server-side controller.

module.exports = (function() {
return {
    add: function(req, res) {
        var tmp_path = req.files.pdf.path;
        var target_path = './uploads/' + req.files.pdf.name;
        fs.rename(tmp_path, target_path, function(err) {
            if (err) throw err;
            // delete the temporary file, so that the explicitly set temporary upload dir does not get filled with unwanted files
            fs.unlink(tmp_path, function() {
                if (err) throw err;
            //edit here pdf parser

            res.redirect('#/');

            });
        })
    },
    show: function(req, res) {

    var pdfParser = new PDFParser();

    var _onPDFBinDataReady = function (pdf) {
      console.log('Loaded pdf:\n');

      for (var i in pdf.data.Pages) {

        var page = pdf.data.Pages[i];
        // console.log(page.Texts);
        for (var j in page.Texts) { 
          var text = page.Texts[j];
          // console.log(text.R[0].T);

        }
      }
      console.log(JSON.stringify(pdf));
    };
    // Create an error handling function
    var _onPDFBinDataError = function (error) {
      console.log(error);
    };
    pdfParser.on('pdfParser_dataReady', _.bind(_onPDFBinDataReady, this));
    // Register error handling function
    pdfParser.on('pdfParser_dataError', _.bind(_onPDFBinDataError, this));
    // Construct the file path of the pdf
    var pdfFilePath = './uploads/Invoice_template.pdf';
    // Load the pdf. When it is loaded your data ready function will be called.
    pdfParser.loadPDF(pdfFilePath);

},

//end controller

}

Upvotes: 0

user1465368
user1465368

Reputation:

I hope I can help answer your question. pdf2json can be used to parse a PDF and extract its text. There are a couple of steps that need to be taken to get it working. I have adapted the example from https://github.com/modesty/pdf2json.

The setup is to install pdf2json in the node app, and also underscore. The example page didn't explain the need to define your own callback functions. It also used self instead of this to register them. So, with the appropriate changes the code to extract all the text from the pdf will be something like this:

// Get the dependencies that have already been installed
// to ./node_modules with `npm install <dep>` in the root directory
// of your app

var _ = require('underscore'),
    PDFParser = require('pdf2json');

var pdfParser = new PDFParser();

// Create a function to handle the pdf once it has been parsed.
// In this case we cycle through all the pages, extract all the
// text blocks and print every text run to the console.
// If you do `console.log(JSON.stringify(pdf))` you will
// see how the parsed pdf is composed. Drill down into it
// to find the data you are looking for.
var _onPDFBinDataReady = function (pdf) {
  console.log('Loaded pdf:\n');
  // Pages, Texts and R are arrays, so iterate them with forEach rather
  // than for...in (which walks keys, not elements). The `|| []` keeps a
  // missing array a no-op, as it was with for...in.
  (pdf.data.Pages || []).forEach(function (page) {
    (page.Texts || []).forEach(function (text) {
      (text.R || []).forEach(function (run) {
        // pdf2json stores text URI-encoded; decode it for readable output.
        console.log(decodeURIComponent(run.T));
      });
    });
  });
};

// Create an error handling function
var _onPDFBinDataError = function (error) {
  console.log(error);
};

// Use underscore to bind the data ready function to the pdfParser
// so that when the data ready event is emitted your function will
// be called. As opposed to the example, I have used `this` instead
// of `self` since self had no meaning in this context. Neither
// handler reads `this`, so the bound value is never actually used.
pdfParser.on('pdfParser_dataReady', _.bind(_onPDFBinDataReady, this));

// Register error handling function
pdfParser.on('pdfParser_dataError', _.bind(_onPDFBinDataError, this));

// Construct the file path of the pdf
var pdfFilePath = 'test3.pdf';

// Load the pdf. When it is loaded your data ready function will be called.
pdfParser.loadPDF(pdfFilePath);

Upvotes: 2

Related Questions