last_fix
last_fix

Reputation: 379

Convert pdf to html files using node.js and pdf.js

I want to convert pdf to html pages using pdf.js. Pdf.js does that in a browser but is it possible to get those html pages rendered by browser in backend thus converting a pdf of n pages to n number of html files. I am using node.js as backend. I have tried pdf2html and other similar npm modules, they don't work great and have issues with some pdfs. Thank you for suggestions.

Upvotes: 7

Views: 13711

Answers (1)

Jan
Jan

Reputation: 2247

Maybe I found something similar - I am working with local PDF file and browser. I made small changes in ready made viewer.js / PDF.js, it should be possible to process using both Node.js & browser.

This script include's PDF specified by argument to viewer.js Webpack and start browser.

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const chp = require('child_process');
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
datauri(pdf, (err, content, meta) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer.js');
    let wp = fs.readFileSync(viewerJSpath, 'utf-8');
    const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
    const srcPos = [wp.indexOf(pdfName)];
    srcPos.push(srcPos[0] + pdfName.length);
    let HOSTED_VIEWER_ORIGINS = wp.indexOf('HOSTED_VIEWER_ORIGINS');
    HOSTED_VIEWER_ORIGINS = wp.indexOf(']', HOSTED_VIEWER_ORIGINS);
    wp = wp.substr(0, srcPos[0]) + content +
    wp.substr(srcPos[1], HOSTED_VIEWER_ORIGINS - srcPos[1]) + ', "file://"' +
    wp.substr(HOSTED_VIEWER_ORIGINS);
    fs.writeFileSync(viewerJSpath, wp, 'utf-8');
    const c = path.join(__dirname, 'viewer.html');
    chp.execSync(c);
});

Then tried to add original width as next style parameter to renderTextLayer's appendText method and elements sort by position to TextLayerBuilder's render method next2 this.textLayerDiv.appendChild(textLayerFrag);.

All mentioned PDF.js changes on my Github it seems only web and build folders are required (except npm i -g datauri fox example).

Readability improvements ? #12512 PR

Console script / bookmarklet


Using puppeteer and slightly modified PDF.js it is possible to convert directly (works both head/less, but element sizes slightly differ)

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const puppeteer = require(path.join(process.env.APPDATA, 'npm/node_modules', 'puppeteer'));
datauri(pdf, (err, content, meta) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer');
    let wp = fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8');
    const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
    const srcPos = [wp.indexOf(pdfName)];
    srcPos.push(srcPos[0] + pdfName.length);
    let HOSTED_VIEWER_ORIGINS = wp.indexOf('HOSTED_VIEWER_ORIGINS');
    HOSTED_VIEWER_ORIGINS = wp.indexOf(']', HOSTED_VIEWER_ORIGINS);
    wp = wp.substr(0, srcPos[0]) + content +
    wp.substr(srcPos[1], HOSTED_VIEWER_ORIGINS - srcPos[1]) + ', "file://"' +
    wp.substr(HOSTED_VIEWER_ORIGINS);
    fs.writeFileSync(viewerJSpath + '.js', wp, 'utf-8');
    (async () => {
        const browser = await puppeteer.launch({
            // headless: false
        });
        const page = await browser.pages();
        const c = path.join(__dirname, 'viewer.html');
        await page[0].goto('file:///' + c);
        page[0].exposeFunction('reader', (elLists) => {
            fs.writeFileSync(path.join(__dirname, 'PDFtexts.txt'), JSON.stringify(elLists, null, 4));
            setTimeout(() => { browser.close(); }, 100);
        });
    })();
});

Fixes required for puppeteer/chromium:

const message = exception?.message; // => exception.message
page: this.pageLabel ?? this.id // => this.pageLabel || this.id

viewer.js => viewerSrc.js basic additions:

function webViewerPageRendered({
...
  if (pageNumber < PDFViewerApplication.pagesCount) {
    arguments[0].source.eventBus.dispatch("pagenumberchanged", {
      value: pageNumber + 1
    }); // generate all remaining pages
  }
}

class BaseViewer {
  constructor(options) {
    this.pageNo = []; // rendered pages array
...
  _setCurrentPageNumber(val, resetCurrentPageView = false) {
...
    if (this.pageNo.indexOf(val) < 0) {
      this.pageNo.push(val);
    }
    if (this.pagesCount - 1 <= this.pageNo.length) {
      window.reader(elLists); // sent result back 2 node.js
    }

And result looks like {PageNo:{ElNo:{data}, ...}, ...} and could be simply translated to web page or further processed.

{
    "1": {
        "0": {
            "x": 99.9871,
            "y": 98.0496,
            "w": 557.695,
            "h": 22,
            "text": "Trace-based Just-in-Time Type Specialization for Dynamic",
            "ff": "sans-serif",
            "fs": "22.2695px",
            "cssText": "left: 99.9871px; top: 98.0496px; width: 557.695px; font-size: 22.2695px; font-family: sans-serif; transform: scaleX(0.970163);"
        },
        "1": {
            "x": 327.478,
            "y": 122.793,
            "w": 102.707,
            "h": 22,
            "text": "Languages",
            "ff": "sans-serif",
            "fs": "22.2695px",
            "cssText": "left: 327.478px; top: 122.793px; width: 102.707px; font-size: 22.2695px; font-family: sans-serif; transform: scaleX(0.932262);"
        },
...
    "2": {
        "0": {
            "x": 393.677,
            "y": 90.3408,
            "w": 192.909,
            "h": 11,
            "text": "1 for (var i = 2; i < 100; ++i) {",
            "ff": "monospace",
            "fs": "11.1347px",
            "cssText": "left: 393.677px; top: 90.3408px; width: 192.909px; font-size: 11.1347px; font-family: monospace; transform: scaleX(0.875232);"
        },
        "1": {
            "x": 67.0588,
            "y": 91.7599,
            "w": 173.346,
            "h": 11,
            "text": "Hence, recording and compiling a trace",
            "ff": "sans-serif",
            "fs": "11.1347px",
            "cssText": "left: 67.0588px; top: 91.7599px; width: 173.346px; font-size: 11.1347px; font-family: sans-serif; transform: scaleX(0.895175);"
        },






Summary of changes (in original gh-pages branch):

- changes in PDF.js

  function appendText(task, geom, styles) {
...
    let left, top;
+++              , width;
...
    if (angle === 0) {
      left = tx[4];
      top = tx[5] - fontAscent;
    } else {
      left = tx[4] + fontAscent * Math.sin(angle);
      top = tx[5] - fontAscent * Math.cos(angle);
    }
+++ width = geom.width * task._viewport.transform[0];

    textDiv.style.left = `${left}px`;
    textDiv.style.top = `${top}px`;
+++ textDiv.style.width = `${width}px`;

- new nodeView.js

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const chp = require('child_process');
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const viewerJSpath = path.join(__dirname, './viewer');
const content = datauri(pdf);
let wp = fs.readFileSync(viewerJSpath, 'utf-8');
const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
const srcPos = [wp.indexOf(pdfName)];
srcPos.push(srcPos[0] + pdfName.length);
let HOSTED_VIEWER_ORIGINS = wp.indexOf('HOSTED_VIEWER_ORIGINS');
HOSTED_VIEWER_ORIGINS = wp.indexOf(']', HOSTED_VIEWER_ORIGINS);
wp = wp.substr(0, srcPos[0]) + content +
wp.substr(srcPos[1], HOSTED_VIEWER_ORIGINS - srcPos[1]) + ', "file://"' +
wp.substr(HOSTED_VIEWER_ORIGINS);
fs.writeFileSync(viewerJSpath + '.js', wp, 'utf-8');
const c = path.join(__dirname, 'viewer.ff');
chp.execSync(c);

- new openFF.bat: start node nodeView.js %1

- new pdf2sortedMergedTexts.js

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const puppeteer = require(path.join(process.env.APPDATA, 'npm/node_modules', 'puppeteer'));
datauri(pdf, (err, content, meta) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer');
    let wp = fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8');
    const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
    const srcPos = [wp.indexOf(pdfName)];
    srcPos.push(srcPos[0] + pdfName.length);
    let HOSTED_VIEWER_ORIGINS = wp.indexOf('HOSTED_VIEWER_ORIGINS');
    HOSTED_VIEWER_ORIGINS = wp.indexOf(']', HOSTED_VIEWER_ORIGINS);
    wp = wp.substr(0, srcPos[0]) + content +
    wp.substr(srcPos[1], HOSTED_VIEWER_ORIGINS - srcPos[1]) + ', "file://"' +
    wp.substr(HOSTED_VIEWER_ORIGINS);
    fs.writeFileSync(viewerJSpath + '.js', wp, 'utf-8');
    (async () => {
        const browser = await puppeteer.launch({
            // headless: false
        });
        const page = await browser.pages();
        const c = path.join(__dirname, 'viewer.html');
        await page[0].goto('file:///' + c);
        page[0].exposeFunction('reader', (elLists) => {
            fs.writeFileSync(path.join(__dirname, 'PDFtexts.txt'), JSON.stringify(elLists, null, 4));
            setTimeout(() => { browser.close(); }, 100);
        });
    })();

});

- changed viewer.js -> viewerSrc.js

function webViewerPageRendered({
... +++
  if (pageNumber < PDFViewerApplication.pagesCount) {
    arguments[0].source.eventBus.dispatch("pagenumberchanged", {
      value: pageNumber + 1
    });
  }
}
...
class BaseViewer {
  constructor(options) {
+++ this.pageNo = [];

...
  _setCurrentPageNumber(val, resetCurrentPageView = false) {
...
+++ if (this.pageNo.indexOf(val) < 0) {
+++   this.pageNo.push(val);
+++   console.log(this.pageNo);
+++ }
+++ if (this.pagesCount - 1 <= this.pageNo.length) {
+++   window.reader(elLists);
+++ }

    this._currentPageNumber = val;

  render(timeout = 0) {
...
    this.textLayerRenderTask.promise.then(() => {
      this.textLayerDiv.appendChild(textLayerFrag);
+++   this.reorder(this.textLayerDiv);

... new
  reorder(_src) {
    const src = _src.children;
    let els = [];
    const elDest = [];
    for (let j = 0; j < src.length; j++) {
        const i = src[j];
        if (i.className === 'endOfContent') continue;
        els.push({ x: parseFloat(i.style.left), y: parseFloat(i.style.top), w: parseFloat(i.style.width), h: i.offsetHeight, text: i.innerText, ff: i.style.fontFamily, fs: i.style.fontSize, cssText: i.style.cssText });
    }
    els.sort((a, b) => {
      if (Math.abs(a.y - b.y) <= 1) {
          if (Math.abs(a.x - b.x) <= 1) return 0;
          else return a.x - b.x;
      } else return a.y - b.y;
    });
    let elMin = els[0];
    for (let i = 1; i < els.length; i++) {
         if (elMin.x + elMin.w + 1 >= els[i].x &&
          Math.abs(elMin.y - els[i].y) < 1 &&
          elMin.h === els[i].h &&
          elMin.ff === els[i].ff &&
          elMin.fs === els[i].fs) {
            elMin.text += els[i].text;
            elMin.w = els[i].x + els[i].w - elMin.x;
            if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
            continue;
        }
        if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
        elMin = els[i];
    }
    if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
    els = _src;
    while (els.lastChild) els.removeChild(els.lastChild);
    const elList = [];
    if (window.elLists === undefined) window.elLists = {};
    const uqIdx = { x: [], y: [] };
    for (let i = 0; i < elDest.length; i++) {
        const o = document.createElement('DIV');
        o.innerHTML = elDest[i].text;
        o.setAttribute('style', elDest[i].cssText + 'width:' + elDest[i].w + 'px;position:absolute;');
        els.appendChild(o);
        elList.push([elDest[i].x, elDest[i].x + elDest[i].w, o, elDest[i].y, elDest[i].y + elDest[i].h, elDest[i].text]);
        if (uqIdx.x.indexOf(elDest[i].x) < 0) uqIdx.x.push(elDest[i].x);
        if (uqIdx.y.indexOf(elDest[i].y) < 0) uqIdx.y.push(elDest[i].y);
    }
    elLists[_src.parentElement.getAttribute("data-page-Number")] = Object.assign({}, elDest);
  }

- changed viewer.css

+++ input{padding:0px;border:1px solid #e0e0e0;}input:focus{background-color:red;}

Upvotes: 3

Related Questions