Yash Singhal
Yash Singhal

Reputation: 17

Parsing text from a PDF in React.js

I'm using the pdf.js library to extract text from PDF files, but the extracted text isn't formatted correctly, with some lines ending up at the end. The PDF file usually contains a resume, and since different resumes can have varying layouts and word structures, how can I segment the parsed text into different sections like introduction, education, and experience?

Here is my code for parsing the PDF into text:

import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";



function PDFParser() {
  const [extractedText, setExtractedText] = useState("");
  const [pdfSrc, setPdfSrc] = useState(null);
  const [selectedFileName, setSelectedFileName] = useState("");
  const fileInputRef = useRef(null);

  const handleFileChange = async (event) => {
    const selectedFile = event.target.files[0];
  
    if (!selectedFile) {
      return;
    }
  
    const fileReader = new FileReader();
    fileReader.onload = async () => {
      const arrayBuffer = fileReader.result;
  
      try {
        pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
        const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;
  
        const numPages = pdf.numPages;
        let extractedText = "";
  
        for (let i = 1; i <= numPages; i++) {
          const page = await pdf.getPage(i);
          const pageText = await page.getTextContent();
          
          // Map over text items and join them with a newline character
          const pageLines = pageText.items.map((item) => item.str).join("\n");
  
          // Append the lines from this page to the extracted text
          if (extractedText !== "") {
            extractedText += "\n";
          }
          extractedText += pageLines;
        }
  
        setExtractedText(extractedText);
        setPdfSrc(URL.createObjectURL(selectedFile));
        setSelectedFileName(selectedFile.name);
      } catch (error) {
        console.error("Error parsing PDF:", error);
        
      }
    };
  
    setExtractedText("");
    fileReader.readAsArrayBuffer(selectedFile);
  };
    return (
       <div>
        <input
          type="file"
          onChange={handleFileChange}
          accept=".pdf"
          ref={fileInputRef}
          style={{ display: "none" }}
        />
        <button className="UploadButton" onClick={openFileDialog}>
          Upload PDF
        </button>
      <div className="ScrollableContainer">
            {extractedText && (
             
              <HTMLContent text={extractedText}/>
              
            )}
          </div>
          </div>

 );
}

I have tried to convert it into HTML, but pdfjs-dist does not convert it into HTML correctly.

Can someone suggest other ways I can parse the text?

Upvotes: -2

Views: 374

Answers (1)

christin babu
christin babu

Reputation: 65

import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";

function PDFParser() {
  const [extractedText, setExtractedText] = useState("");
  const [pdfSrc, setPdfSrc] = useState(null);
  const [selectedFileName, setSelectedFileName] = useState("");
  const fileInputRef = useRef(null);

  const handleFileChange = async (event) => {
    const selectedFile = event.target.files[0];
  
    if (!selectedFile) {
      return;
    }
  
    const fileReader = new FileReader();
    fileReader.onload = async () => {
      const arrayBuffer = fileReader.result;
  
      try {
        pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
        const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;
  
        const numPages = pdf.numPages;
        let extractedText = "";
  
        for (let i = 1; i <= numPages; i++) {
          const page = await pdf.getPage(i);
          const pageText = await page.getTextContent();
          
          // Map over text items and join them with a newline character
          const pageLines = pageText.items.map((item) => item.str).join("\n");
  
          // Append the lines from this page to the extracted text
          if (extractedText !== "") {
            extractedText += "\n";
          }
          extractedText += pageLines;
        }

        // Segment the extracted text into sections
        const sections = segmentText(extractedText);

        // Update state with segmented text
        setExtractedText(sections);
        setPdfSrc(URL.createObjectURL(selectedFile));
        setSelectedFileName(selectedFile.name);
      } catch (error) {
        console.error("Error parsing PDF:", error);
      }
    };
  
    setExtractedText("");
    fileReader.readAsArrayBuffer(selectedFile);
  };

  // Function to segment text into sections
  const segmentText = (text) => {
    // Split text into lines
    const lines = text.split("\n");

    // Define section keywords
    const sectionKeywords = ["education", "experience", "skills", "summary"];

    // Initialize sections object
    const sections = {};

    // Initialize current section
    let currentSection = "";

    // Iterate over lines to identify section boundaries
    lines.forEach((line) => {
      const lowerCaseLine = line.toLowerCase();

      // Check if line contains a section keyword
      const matchedKeyword = sectionKeywords.find(keyword => lowerCaseLine.includes(keyword));
      if (matchedKeyword) {
        currentSection = matchedKeyword;
        if (!sections[currentSection]) {
          sections[currentSection] = [];
        }
      } else {
        // Add line to current section
        if (currentSection !== "") {
          sections[currentSection].push(line);
        }
      }
    });

    return sections;
  };

  const openFileDialog = () => {
    if (fileInputRef.current) {
      fileInputRef.current.click();
    }
  };

  return (
    <div>
      <input
        type="file"
        onChange={handleFileChange}
        accept=".pdf"
        ref={fileInputRef}
        style={{ display: "none" }}
      />
      <button className="UploadButton" onClick={openFileDialog}>
        Upload PDF
      </button>
      <div className="ScrollableContainer">
        {Object.keys(extractedText).map((section, index) => (
          <div key={index}>
            <h2>{section.toUpperCase()}</h2>
            <ul>
              {extractedText[section].map((item, idx) => (
                <li key={idx}>{item}</li>
              ))}
            </ul>
          </div>
        ))}
      </div>
    </div>
  );
}

export default PDFParser;

Upvotes: 0

Related Questions