Efficient Text Extraction from PDFs: Implementing OCR with JavaScript and AppScript



Source code to extract text from PDF documents.


<!DOCTYPE html>
<html lang="en">
<head>
    <!-- Set the character encoding to UTF-8 -->
    <meta charset="UTF-8">
    <!-- Specify the compatibility mode for Internet Explorer -->
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <!-- Set the viewport to control the layout on mobile devices -->
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <!-- Set the title of the web page -->
    <title>PDF to Text Extractor</title>
    <!-- Include the PDF.js library -->
    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.4.120/pdf.min.js" integrity="sha512-ml/QKfG3+Yes6TwOzQb7aCNtJF4PUyha6R3w8pSTo/VJSywl7ZreYvvtUso7fKevpsI+pYVVwnu82YO0q3V6eg==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
    <!-- Page styles: centered single-column layout; result widgets stay hidden until a PDF is processed -->
    <style>
        /* Heading spans the full width with its text centered */
        h1 {
            text-align: center;
            width: 100%;
        }
        /* Upload form and result section: full-width vertical flex columns,
           with their children centered on both axes */
        .pdfwork,
        .afterupload {
            width: 100%;
            display: flex;
            flex-direction: column;
            justify-content: center;
            align-items: center;
        }
        /* Small gap above every descendant of the upload form */
        .pdfwork * {
            margin-top: 5px;
        }
        /* Hidden on load: the result section and the "Extract Another PDF" button.
           This rule must stay after the flex rule above so that display: none
           wins for .afterupload (equal specificity, later rule applies). */
        .afterupload,
        .another {
            display: none;
        }
    </style>
</head>
<body>
    <!-- Display the heading -->
    <h1>PDF To Text Extractor</h1>
    <!-- Container for the upload form and the result section.
         Class names are the hooks used by the stylesheet and the inline script;
         ids exist only to associate each <label> with its control. -->
    <div class="pdfwork">
        <!-- Reloads the page to reset all state (hidden until a PDF has been processed) -->
        <button type="button" class="another" onclick="location.reload()">Extract Another PDF</button>
        <!-- File picker, limited to PDF files -->
        <label for="pdf-file">Select PDF</label>
        <input type="file" class="selectpdf" id="pdf-file" accept="application/pdf,.pdf">
        <!-- Password for encrypted PDFs; leave empty for unprotected files -->
        <label for="pdf-password">Password :</label>
        <input type="password" class="pwd" id="pdf-password" placeholder="optional">
        <!-- Starts reading the selected file and extracting its text -->
        <button type="button" class="upload">Upload</button>
        <!-- Result section (hidden until a PDF has been processed) -->
        <div class="afterupload">
            <!-- Page selector; options are filled in by the script, and changing
                 the selection re-renders the text through afterProcess() -->
            <label for="page-select">Select Page</label>
            <select class="selectpage" id="page-select" onchange="afterProcess()"></select>
            <!-- Download link; its href is set by the script to the selected page's text -->
            <a href="" class="download" download>Download Pdf Text</a>
            <!-- Shows the extracted text of the selected page -->
            <textarea class="pdftext" aria-label="Extracted text"></textarea>
        </div>
    </div>
    <!-- JavaScript code -->
<script>
    // Point PDF.js at its web worker script (same 3.4.120 release as the library loaded in <head>)
    pdfjsLib.GlobalWorkerOptions.workerSrc = "https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.4.120/pdf.worker.min.js";

    // Look up every element the script works with in a single pass.
    // The selectors are listed in the same order as the names they are bound to:
    //   pdfinput    - PDF file input field
    //   pwd         - password input field
    //   upload      - upload button
    //   afterupload - result section
    //   select      - page selection dropdown (the first <select> in the document)
    //   download    - download link
    //   pdftext     - textarea that displays the extracted text
    let [pdfinput, pwd, upload, afterupload, select, download, pdftext] = [
        ".selectpdf",
        ".pwd",
        ".upload",
        ".afterupload",
        "select",
        ".download",
        ".pdftext",
    ].map((selector) => document.querySelector(selector));
    
    // Upload button: validate the chosen file, read it as a data URL and pass
    // the result to extractText(). The second argument tells extractText()
    // whether the user typed a password.
    upload.addEventListener('click', () => {
        const file = pdfinput.files[0]; // undefined when nothing is selected
        if (!file || file.type != "application/pdf") {
            alert("Select a valid PDF file");
            return;
        }

        const fr = new FileReader();
        // Attach both handlers before starting the read so neither outcome can be missed
        fr.onload = () => {
            extractText(fr.result, pwd.value != "");
        };
        // Without this handler a failed read would leave the page silently unchanged
        fr.onerror = () => {
            alert("Could not read the selected file");
        };
        fr.readAsDataURL(file);
    });
    
    let alltext = []; // Extracted text, one string per page (index 0 holds page 1)

    /**
     * Load a PDF with PDF.js, extract the text of every page, then fill the
     * page dropdown and show the result section.
     *
     * Each call replaces the previous result, so uploading a second file
     * without reloading the page does not leave stale pages or duplicate
     * dropdown options behind.
     *
     * @param {string} url  - Location of the PDF; the upload handler passes a data URL.
     * @param {boolean} pass - When true, the value of the password field is sent to PDF.js.
     * @returns {Promise<void>} Resolves once the UI is updated; failures are reported
     *                          with alert() instead of being thrown.
     */
    async function extractText(url, pass) {
        try {
            const source = pass ? { url: url, password: pwd.value } : url;
            const pdf = await pdfjsLib.getDocument(source).promise;

            // Collect into a local array and publish it only after every page
            // succeeded, so a failure part-way through keeps the previous result intact.
            const pageTexts = [];
            for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
                const page = await pdf.getPage(pageNumber);
                const content = await page.getTextContent();
                // PDF.js flags items that are followed by a line break with hasEOL;
                // emit a newline there so the end of one line does not fuse with
                // the start of the next.
                const text = content.items
                    .map((item) => item.str + (item.hasEOL ? "\n" : ""))
                    .join("");
                pageTexts.push(text);
            }

            alltext = pageTexts;
            // Build the whole option list and assign it once; this replaces any
            // options from an earlier upload.
            select.innerHTML = pageTexts
                .map((_, index) => `<option value="${index + 1}">${index + 1}</option>`)
                .join("");
            afterProcess(); // Display the result section
        } catch (err) {
            alert(err.message);
        }
    }
    
    /**
     * Show the text of the page chosen in the dropdown and point the download
     * link at it. Called after extraction finishes and whenever the dropdown
     * selection changes.
     */
    function afterProcess() {
        const pageNumber = Number(select.value); // option values are 1-based page numbers
        const stored = alltext[pageNumber - 1];
        // Fall back to an empty string so a missing page never renders as "undefined"
        const pageText = stored === undefined ? "" : stored;

        pdftext.value = pageText;
        download.href = "data:text/plain;charset=utf-8," + encodeURIComponent(pageText);
        // Suggest a real file name; a bare download attribute gives a generic, extension-less one
        download.download = `page-${pageNumber}.txt`;

        afterupload.style.display = "flex"; // Reveal the result section
        document.querySelector(".another").style.display = "unset"; // Reveal the "Extract Another PDF" button
    }
</script>
	</body>
</html>

This is the complete HTML and JavaScript code for a simple PDF-to-text converter web application. The code is well-commented, but here is a step-by-step explanation as well:

  1. The HTML markup defines the basic structure of the web application. It contains a head section, a body, and a script tag at the end of the body.
  2. The head section of the HTML file includes meta tags that define the character set, the Internet Explorer compatibility mode, and the viewport, along with the title of the page. It also includes a script tag that loads the PDF.js library, a JavaScript library for working with PDF files in the browser, and a style block with the page's CSS.

  3. In the body section of the HTML file, there is a div with the class pdfwork, which contains the file upload form and the result section. There is also an h1 tag that contains the title of the web application.

  4. Within the pdfwork div, there are several elements, including a button, two input fields (one for selecting the PDF file and one for entering a password if the PDF is encrypted), and a button for uploading the PDF.

  5. When the user clicks the Upload button, an event listener is triggered, which reads the uploaded file using the FileReader API and passes the result to the extractText() function.

  6. The extractText() function extracts the text from the uploaded PDF file using the pdf.js library. If the PDF file is encrypted, the password entered by the user is used to decrypt the file. The function then extracts the text from each page of the PDF and stores it in an array called alltext.

  7. Once all the text has been extracted, the select element is populated with options for each page of the PDF. The afterProcess() function is then called to display the result section of the web application.

  8. The afterProcess() function displays the extracted text for the selected page in a textarea element and also provides a download link to download the text file.

  9. Finally, the script section of the HTML file defines several variables and event listeners to handle user interactions with the web application.


More Posts For You!


5 More HTML Tricks For You

5 HTML Tricks That You Can Use To Enhance Your Web Pages

Effortlessly Convert Excel to JSON with JavaScript: Streamline Your Data Management with Sheet Js

Integrating OCR Capabilities into Web Applications with JavaScript and AppScript

Efficient Text Extraction from PDFs: Implementing OCR with JavaScript and AppScript

Build a CRUD Application Using Google Sheets as Database with HTML, CSS, and JavaScript

Integrating OpenAI with Google Sheets using AppScript for Automated Responses

Upload Images to Google Drive and Google Sheets from HTML File Input: A Complete JavaScript and AppScript Guide

Convert HTML and CSS to PDF with JavaScript Using HTML2PDF.js

Create a Functional Contact Form Using HTML, CSS, JavaScript, and Google AppScript

Build a CRUD App Using HTML, CSS, JavaScript, and IndexedDB API

Reading Data From Google Sheets to HTML Using JavaScript and Apps Script

PrevNext