Prasoon Shukla
Prasoon Shukla

Reputation: 51

Problem getting the result from tesseract.js as a PDF

I am building a simple JavaScript OCR (optical character recognition) app using tesseract.js, and I am passing { tessjs_create_pdf: "1" } to the .recognize() method to get the result in PDF format, but it's not working. Could someone please tell me what I am doing wrong?

const express  = require('express');
const app      = express();
const fs       = require('fs');
const multer   = require('multer');
const { createWorker } = require("tesseract.js");
const worker           = createWorker();

// Render views with the EJS template engine.
app.set("view engine", "ejs");

// Keep uploaded files on disk in ./uploads, named after the file name the client sent.
const storage = multer.diskStorage({
    destination(req, file, cb) {
        cb(null, './uploads');
    },
    filename(req, file, cb) {
        cb(null, file.originalname);
    },
});

// Upload middleware for a single file sent in the 'avatar' form field.
const upload = multer({ storage }).single('avatar');

// Home page: render the 'index' view.
app.get('/', (_req, res) => res.render('index'));

// POST /upload: save the uploaded file with multer, read it back from
// ./uploads, run OCR on it and send the recognized text as the response.
app.post('/upload', (req, res) => {
    // NOTE(review): the `err` passed by multer is never checked; the readFile
    // callback below shadows it with its own `err`.
    upload(req, res, err => {
        fs.readFile(`./uploads/${req.file.originalname}`, (err, data) => {
        // A read error is only logged; no response is sent on this path.
        if(err) return console.log('this is your error', err);

        (async ()=> {
            await worker.load();
            await worker.loadLanguage('eng');
            await worker.initialize('eng');
            // Only `text` is destructured from the result, so the response below is plain text.
            const { data: { text } } = await worker.recognize(data, { tessjs_create_pdf: "1"});
            res.send(text);
            // `worker` is the module-level instance, so it is terminated after this request.
            await worker.terminate();
        })();
    });
  })
})

// Use the port supplied by the environment and fall back to 3000.
// The operands were reversed before: `3000 || process.env.PORT` always
// evaluates to 3000 because 3000 is truthy, so PORT was never honoured.
const port = process.env.PORT || 3000;
app.listen(port, () => {
     console.log("server has started!!!!");
});

Upvotes: 1

Views: 5098

Answers (2)

MR. WEERASINGHE U.K.
MR. WEERASINGHE U.K.

Reputation: 131

You have to use the getPDF() function to generate the PDF file (in tesseract.js 2.1.4). Add the following code after recognizing the text and before terminating the worker, and use fs.writeFileSync to write the file to disk.

// Ask the worker for the PDF; `data` holds the PDF content that is written out below.
// Must run inside the async function, after worker.recognize(...) and before worker.terminate().
const { data } = await worker.getPDF("Tesseract OCR Result");
// Relative path: the file is created in the process's current working directory.
fs.writeFileSync("tesseract-ocr-result.pdf", Buffer.from(data));

If you want to download the generated PDF file, redirect user to another route using...

    res.redirect("/download")

...and add the following route. The file is saved in the root directory, so we can use __dirname as the path.

// GET /download: send the generated PDF to the client as a file download.
// The file name has to match the one passed to fs.writeFileSync
// ("tesseract-ocr-result.pdf"); otherwise res.download cannot find the file.
app.get("/download", (req, res) => {
    const file = `${__dirname}/tesseract-ocr-result.pdf`;
    res.download(file);
});

Your final code will look something like this. To avoid a naming conflict, I have renamed the callback parameter data to img in the fs.readFile call in the following code.

/**
 * POST /upload: store the uploaded image, run OCR on it, write the result to
 * tesseract-ocr-result.pdf and redirect the client to /download.
 *
 * Every failure path sends a response so the request never hangs. A worker is
 * created per request because a terminated worker cannot be reused: sharing
 * one module-level worker and terminating it here would only work for the
 * first upload.
 */
app.post('/upload', (req, res) => {
    upload(req, res, uploadErr => {
        // multer reports failures here; req.file is unset when no file was received.
        if (uploadErr || !req.file) {
            console.log('this is your error', uploadErr);
            return res.status(400).send('Upload failed');
        }

        fs.readFile(`./uploads/${req.file.originalname}`, (readErr, img) => {
            if (readErr) {
                console.log('this is your error', readErr);
                return res.status(500).send('Could not read the uploaded file');
            }

            (async () => {
                const worker = createWorker();
                try {
                    await worker.load();
                    await worker.loadLanguage('eng');
                    await worker.initialize('eng');
                    await worker.recognize(img);

                    // getPDF must be called after recognize and before terminate.
                    const { data } = await worker.getPDF("Tesseract OCR Result");
                    fs.writeFileSync("tesseract-ocr-result.pdf", Buffer.from(data));
                    res.redirect("/download");
                } catch (ocrErr) {
                    console.log('this is your error', ocrErr);
                    // The redirect may already have been sent if a later step failed.
                    if (!res.headersSent) {
                        res.status(500).send('OCR failed');
                    }
                } finally {
                    // Release the worker whether or not OCR succeeded.
                    await worker.terminate();
                }
            })();
        });
    });
});

// GET /download: send the PDF stored next to this script as a file download.
app.get("/download", (req, res) => {
    res.download(`${__dirname}/tesseract-ocr-result.pdf`);
});

Upvotes: 2

Sandeep chand
Sandeep chand

Reputation: 607

You can use PDFKit to build the PDF yourself.

const express = require('express');
const app = express();
const fs = require('fs');
const multer = require('multer');
const { createWorker } = require('tesseract.js');
// OCR worker; its logger callback prints whatever it is given to the console.
const worker = createWorker({
  logger(message) {
    console.log(message);
  },
});
const cors = require('cors');
const PDFDocument = require('pdfkit');
// PDF document that the upload handler draws into.
const doc = new PDFDocument();
// Send the document's output to a file on disk.
const pdfOutput = fs.createWriteStream('tesseract.js-ocr-result.pdf');
doc.pipe(pdfOutput);

// Enable CORS for every route.
app.use(cors());

// Parse JSON and URL-encoded request bodies, both with a 50mb limit.
const bodyParser = require('body-parser');
const bodyLimit = '50mb';
app.use(bodyParser.json({ limit: bodyLimit }));
app.use(bodyParser.urlencoded({ extended: true, limit: bodyLimit, parameterLimit: 1000000 }));
// multer disk storage: files are written to the images folder next to this
// script and keep the file name the client sent.
const Storage = multer.diskStorage({
  destination(req, file, callback) {
    callback(null, `${__dirname}/images`);
  },
  filename(req, file, callback) {
    callback(null, file.originalname);
  },
});

// Upload middleware for a single file sent in the 'avatar' form field.
const upload = multer({ storage: Storage }).single('avatar');
/**
 * POST /upload: store the uploaded image, run OCR on it and write a two-page
 * PDF (page 1: the image, page 2: the recognized text), then redirect the
 * client to /download.
 *
 * The PDF document and the OCR worker are created per request. A document on
 * which end() has been called and a worker that has been terminated cannot be
 * used again, so sharing module-level instances only works for one upload.
 * Every path sends a response so the request never hangs.
 */
app.post('/upload', (req, res) => {
  upload(req, res, uploadErr => {
    console.log('Request ---', req.body);
    console.log('Request file ---', req.file);

    // multer reports failures here; req.file is unset when no file was received.
    if (uploadErr || !req.file) {
      console.log(uploadErr);
      return res.status(400).send('Upload failed');
    }

    // Read from the directory multer stores into (__dirname + '/images'),
    // independent of the process's working directory.
    fs.readFile(`${__dirname}/images/${req.file.originalname}`, (readErr, image) => {
      if (readErr) {
        console.log(readErr);
        return res.status(500).send('Could not read the uploaded file');
      }

      (async () => {
        const worker = createWorker({
          logger: m => console.log(m)
        });
        try {
          await worker.load();
          await worker.loadLanguage('eng');
          await worker.initialize('eng');
          const {
            data: { text }
          } = await worker.recognize(image);

          // Write to the same absolute path the /download route serves.
          const doc = new PDFDocument();
          const output = fs.createWriteStream(`${__dirname}/tesseract.js-ocr-result.pdf`);
          // Respond only once the file is completely written.
          output.on('finish', () => res.redirect('/download'));
          output.on('error', writeErr => {
            console.log(writeErr);
            if (!res.headersSent) {
              res.status(500).send('Could not write the PDF');
            }
          });
          doc.pipe(output);

          doc.image(image, {
            fit: [250, 300],
            align: 'center',
            valign: 'center'
          });
          doc
            .addPage()
            .fontSize(25)
            .text(text);
          doc.end();
        } catch (ocrErr) {
          console.log(ocrErr);
          if (!res.headersSent) {
            res.status(500).send('OCR failed');
          }
        } finally {
          // Release the worker whether or not OCR succeeded.
          await worker.terminate();
        }
      })();
    });
  });
});
// GET /download: send the PDF stored next to this script as a file download.
app.get('/download', (req, res) => {
  res.download(`${__dirname}/tesseract.js-ocr-result.pdf`);
});
// Listen on port 5000 and log once the server is up.
const announceStart = () => {
  console.log('server Started');
};
app.listen(5000, announceStart);

Upvotes: 0

Related Questions