Reputation: 51
I am building a simple JavaScript OCR (optical character recognition) app using tesseract.js, and I am passing {tessjs_create_pdf: "1"} into the .recognize() method to get the result in PDF format, but it's not working. Could someone please let me know what mistake I am making?
const express = require('express');
const app = express();
const fs = require('fs');
const multer = require('multer');
const { createWorker } = require("tesseract.js");
// One Tesseract worker, created at startup and used by the upload handler.
const worker = createWorker();

// Views are rendered with EJS.
app.set('view engine', 'ejs');

// Uploaded files are kept on disk in ./uploads under their original name.
const storage = multer.diskStorage({
  destination(req, file, done) {
    done(null, './uploads');
  },
  filename(req, file, done) {
    done(null, file.originalname);
  },
});

// Middleware that accepts exactly one file from the "avatar" form field.
const upload = multer({ storage }).single('avatar');
// Landing page: renders the `index` view.
app.get('/', function showIndex(_request, response) {
  response.render('index');
});
/**
 * POST /upload — stores the uploaded "avatar" file, runs OCR on it and replies
 * with the recognized text.
 *
 * Responds with 400 when the upload fails or carries no file, and with 500
 * when the stored file cannot be read or recognition fails, so the request
 * never hangs and a rejected promise never goes unhandled.
 */
app.post('/upload', (req, res) => {
  upload(req, res, (uploadErr) => {
    // multer reports failures through this callback; req.file is unset when
    // no file arrived under the "avatar" field.
    if (uploadErr || !req.file) {
      console.log('this is your error', uploadErr);
      return res.status(400).send('File upload failed');
    }

    fs.readFile(`./uploads/${req.file.originalname}`, async (readErr, data) => {
      if (readErr) {
        console.log('this is your error', readErr);
        return res.status(500).send('Could not read the uploaded file');
      }

      try {
        await worker.load();
        await worker.loadLanguage('eng');
        await worker.initialize('eng');
        const { data: { text } } = await worker.recognize(data, { tessjs_create_pdf: '1' });
        res.send(text);
        // NOTE(review): `worker` is created once at module level, so
        // terminating it here presumably leaves later requests without a
        // usable worker — confirm against the tesseract.js docs.
        await worker.terminate();
      } catch (ocrErr) {
        console.log('this is your error', ocrErr);
        // Only reply if the failure happened before the text was sent.
        if (!res.headersSent) {
          res.status(500).send('OCR failed');
        }
      }
    });
  });
});
// Honor the PORT environment variable when it is set; otherwise use 3000.
// (`3000 || process.env.PORT` always evaluated to 3000 because 3000 is truthy.)
const port = process.env.PORT || 3000;
app.listen(port, () => {
  console.log("server has started!!!!");
});
Upvotes: 1
Views: 5098
Reputation: 131
You have to use the getPDF() function to generate a PDF file (in tesseract.js 2.1.4). Add the following code after recognizing the text and before terminating the worker. Use fs.writeFileSync to write the file to disk.
// Ask the worker for a PDF of the OCR result; the string argument is
// presumably the PDF title (confirm in the tesseract.js API docs).
const { data } = await worker.getPDF("Tesseract OCR Result");
// Convert the returned data to a Buffer and write it to disk synchronously.
fs.writeFileSync("tesseract-ocr-result.pdf", Buffer.from(data));
If you want to download the generated PDF file, redirect the user to another route using...
// Redirecting makes the client request /download, the route that serves the file.
res.redirect("/download")
...and add the following route to your app. The file will be saved in the root directory, so we can use __dirname as the path.
// Serves the generated PDF as a file download.
app.get("/download", (req, res) => {
  // The name must match the one passed to fs.writeFileSync when the PDF is
  // generated ("tesseract-ocr-result.pdf"), otherwise the download fails.
  const file = `${__dirname}/tesseract-ocr-result.pdf`;
  res.download(file);
});
Your final code will look something like this. To avoid a naming conflict with the data returned by getPDF(), I have renamed the parameter data to img in the fs.readFile callback in the following code.
/**
 * POST /upload — stores the uploaded "avatar" image, runs OCR on it, writes
 * the result to tesseract-ocr-result.pdf and redirects the client to
 * /download.
 *
 * Responds with 400 when the upload fails or carries no file, and with 500
 * when the stored file cannot be read or the OCR/PDF step fails, so the
 * request never hangs and a rejected promise never goes unhandled.
 */
app.post('/upload', (req, res) => {
  upload(req, res, (uploadErr) => {
    // multer reports failures through this callback; req.file is unset when
    // no file arrived under the "avatar" field.
    if (uploadErr || !req.file) {
      console.log('this is your error', uploadErr);
      return res.status(400).send('File upload failed');
    }

    fs.readFile(`./uploads/${req.file.originalname}`, async (readErr, img) => {
      if (readErr) {
        console.log('this is your error', readErr);
        return res.status(500).send('Could not read the uploaded file');
      }

      try {
        await worker.load();
        await worker.loadLanguage('eng');
        await worker.initialize('eng');
        await worker.recognize(img);
        // getPDF() must run after recognize() and before terminate().
        const { data } = await worker.getPDF('Tesseract OCR Result');
        fs.writeFileSync('tesseract-ocr-result.pdf', Buffer.from(data));
        res.redirect('/download');
        // NOTE(review): `worker` is declared outside this handler; if it is a
        // single shared instance, terminating it here presumably breaks later
        // requests — confirm against the tesseract.js docs.
        await worker.terminate();
      } catch (ocrErr) {
        console.log('this is your error', ocrErr);
        // Only reply if the failure happened before the redirect was sent.
        if (!res.headersSent) {
          res.status(500).send('OCR failed');
        }
      }
    });
  });
});
// Offers the PDF stored next to this script as a file download.
app.get('/download', (_req, res) => {
  res.download(`${__dirname}/tesseract-ocr-result.pdf`);
});
Upvotes: 2
Reputation: 607
You can use PDFKit.
const express = require('express');
const app = express();
const fs = require('fs');
const multer = require('multer');
const { createWorker } = require('tesseract.js');
// Tesseract worker whose logger callback prints every message it receives.
const worker = createWorker({
  logger(message) {
    console.log(message);
  },
});
const cors = require('cors');
const PDFDocument = require('pdfkit');
// Absolute location of the generated PDF; the /download route serves this
// same file.
const PDF_PATH = `${__dirname}/tesseract.js-ocr-result.pdf`;

app.use(cors());

const bodyParser = require('body-parser');
// Accept JSON and URL-encoded bodies of up to 50mb.
app.use(bodyParser.json({ limit: '50mb' }));
app.use(
  bodyParser.urlencoded({
    extended: true,
    limit: '50mb',
    parameterLimit: 1000000,
  })
);

// Uploaded files are stored in <script dir>/images under their original name.
const Storage = multer.diskStorage({
  destination: (req, file, callback) => {
    callback(null, `${__dirname}/images`);
  },
  filename: (req, file, callback) => {
    callback(null, file.originalname);
  },
});

// Middleware that accepts exactly one file from the "avatar" form field.
const upload = multer({
  storage: Storage,
}).single('avatar');

/**
 * Writes a two-page PDF to PDF_PATH: the uploaded image first, then the
 * recognized text.
 *
 * A new PDFDocument is created on every call because a document cannot be
 * written to again once end() has been called on it.
 *
 * @param {Buffer} image - Contents of the uploaded image file.
 * @param {string} text - Text recognized in the image.
 * @returns {Promise<void>} Resolves once the file is fully written; rejects if
 *   building the document or writing the file fails.
 */
function writeOcrPdf(image, text) {
  return new Promise((resolve, reject) => {
    const doc = new PDFDocument();
    const output = fs.createWriteStream(PDF_PATH);
    output.on('finish', resolve);
    output.on('error', reject);
    doc.pipe(output);
    doc.image(image, {
      fit: [250, 300],
      align: 'center',
      valign: 'center',
    });
    doc
      .addPage()
      .fontSize(25)
      .text(text);
    doc.end();
  });
}

/**
 * POST /upload — stores the uploaded "avatar" image, runs OCR on it, writes
 * image and text into a PDF and redirects the client to /download.
 *
 * Responds with 400 when the upload fails or carries no file, and with 500
 * when the stored file cannot be read or the OCR/PDF step fails.
 */
app.post('/upload', (req, res) => {
  upload(req, res, (uploadErr) => {
    console.log('Request ---', req.body);
    console.log('Request file ---', req.file);
    if (uploadErr || !req.file) {
      console.log(uploadErr);
      return res.status(400).send('File upload failed');
    }

    // req.file.path is where multer stored the upload, so the read no longer
    // depends on the process working directory.
    fs.readFile(req.file.path, async (readErr, image) => {
      if (readErr) {
        console.log(readErr);
        return res.status(500).send('Could not read the uploaded file');
      }

      try {
        await worker.load();
        await worker.loadLanguage('eng');
        await worker.initialize('eng');
        const {
          data: { text },
        } = await worker.recognize(image);
        await writeOcrPdf(image, text);
        // Redirect only after the PDF is complete on disk.
        res.redirect('/download');
        // NOTE(review): `worker` is created once at module level, so
        // terminating it here presumably breaks later requests — confirm
        // against the tesseract.js docs.
        await worker.terminate();
      } catch (ocrErr) {
        console.log(ocrErr);
        // Only reply if the failure happened before the redirect was sent.
        if (!res.headersSent) {
          res.status(500).send('OCR failed');
        }
      }
    });
  });
});
// Download endpoint for the PDF that lives in this script's directory.
app.get('/download', function sendPdf(_request, response) {
  const pdfPath = `${__dirname}/tesseract.js-ocr-result.pdf`;
  response.download(pdfPath);
});
// Listen on port 5000 and log a message once the server is up.
app.listen(5000, function onListening() {
  console.log('server Started');
});
Upvotes: 0