Parsing of text from Pdf in reactjs

Question

I'm using the pdf.js library to extract text from PDF files, but the extracted text isn't formatted correctly, with some lines ending up at the end. The PDF file usually contains a resume, and since different resumes can have varying layouts and word structures, how can I segment the parsed text into different sections like introduction, education, and experience?

Here is my code for parsing the PDF into plain text:

import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";



function PDFParser() {
  const [extractedText, setExtractedText] = useState("");
  const [pdfSrc, setPdfSrc] = useState(null);
  const [selectedFileName, setSelectedFileName] = useState("");
  const fileInputRef = useRef(null);

  /**
   * Handles the file input's change event: reads the chosen file, extracts the
   * text of every page with pdf.js, and stores the result in component state.
   *
   * Every pdf.js text item is written on its own line, and consecutive pages
   * are separated by a newline. Read and parse failures are logged to the
   * console; in that case the extracted text stays cleared.
   *
   * @param {object} event - Change event whose `target.files` holds the selection.
   */
  const handleFileChange = async (event) => {
    const selectedFile = event.target.files[0];

    // Nothing was selected, so there is nothing to parse.
    if (!selectedFile) {
      return;
    }

    const fileReader = new FileReader();

    fileReader.onload = async () => {
      const arrayBuffer = fileReader.result;

      try {
        pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
        const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;

        // Named differently from the `extractedText` state value to avoid shadowing it.
        let documentText = "";

        // Pages are requested by number, starting at 1 and ending at numPages.
        for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
          const page = await pdf.getPage(pageNumber);
          const pageText = await page.getTextContent();

          // One text item per line.
          const pageLines = pageText.items.map((item) => item.str).join("\n");

          // Separate this page from the text collected so far.
          if (documentText !== "") {
            documentText += "\n";
          }
          documentText += pageLines;
        }

        setExtractedText(documentText);
        setPdfSrc(URL.createObjectURL(selectedFile));
        setSelectedFileName(selectedFile.name);
      } catch (error) {
        console.error("Error parsing PDF:", error);
      }
    };

    // Without this handler a failed read would go completely unnoticed.
    fileReader.onerror = () => {
      console.error("Error reading file:", fileReader.error);
    };

    // Clear the previous result before the new read starts.
    setExtractedText("");
    fileReader.readAsArrayBuffer(selectedFile);
  };
    return (
       
        
        
      
            {extractedText && (
             
              
              
            )}
          
          

 );
}

I have tried converting the output to HTML, but pdfjs-dist does not convert it to HTML correctly.

Can someone suggest other ways to parse the text?

christin babu · Accepted Answer

import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";

function PDFParser() {
  const [extractedText, setExtractedText] = useState("");
  const [pdfSrc, setPdfSrc] = useState(null);
  const [selectedFileName, setSelectedFileName] = useState("");
  const fileInputRef = useRef(null);

  /**
   * Handles the file input's change event: reads the chosen file, extracts the
   * text of every page with pdf.js, splits that text into sections with
   * `segmentText`, and stores the sections in component state.
   *
   * Every pdf.js text item is written on its own line, and consecutive pages
   * are separated by a newline. Read and parse failures are logged to the
   * console; in that case the extracted text stays cleared.
   *
   * @param {object} event - Change event whose `target.files` holds the selection.
   */
  const handleFileChange = async (event) => {
    const selectedFile = event.target.files[0];

    // Nothing was selected, so there is nothing to parse.
    if (!selectedFile) {
      return;
    }

    const fileReader = new FileReader();

    fileReader.onload = async () => {
      const arrayBuffer = fileReader.result;

      try {
        pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
        const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;

        // Named differently from the `extractedText` state value to avoid shadowing it.
        let documentText = "";

        // Pages are requested by number, starting at 1 and ending at numPages.
        for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
          const page = await pdf.getPage(pageNumber);
          const pageText = await page.getTextContent();

          // One text item per line.
          const pageLines = pageText.items.map((item) => item.str).join("\n");

          // Separate this page from the text collected so far.
          if (documentText !== "") {
            documentText += "\n";
          }
          documentText += pageLines;
        }

        // Segment the extracted text into sections.
        const sections = segmentText(documentText);

        // The state now holds the sections object rather than a plain string.
        setExtractedText(sections);
        setPdfSrc(URL.createObjectURL(selectedFile));
        setSelectedFileName(selectedFile.name);
      } catch (error) {
        console.error("Error parsing PDF:", error);
      }
    };

    // Without this handler a failed read would go completely unnoticed.
    fileReader.onerror = () => {
      console.error("Error reading file:", fileReader.error);
    };

    // Clear the previous result before the new read starts.
    setExtractedText("");
    fileReader.readAsArrayBuffer(selectedFile);
  };

  /**
   * Splits extracted text into sections keyed by the keyword that opens them.
   *
   * A line is treated as a section heading when its lower-cased form contains
   * one of the known keywords; if several match, the first one in list order
   * wins. The heading line itself is not stored. Every following line is
   * appended to that section until the next heading. Lines before the first
   * heading are discarded, and a keyword that shows up again later re-opens
   * its existing section instead of replacing it.
   *
   * @param {string} text - Extracted text with lines separated by "\n".
   * @returns {Object<string, string[]>} Section keyword mapped to its lines.
   */
  const segmentText = (text) => {
    // Keywords that mark the start of a section.
    const sectionKeywords = ["education", "experience", "skills", "summary"];

    const sections = {};

    // Empty until the first heading is seen.
    let currentSection = "";

    for (const line of text.split("\n")) {
      const lowerCaseLine = line.toLowerCase();
      const matchedKeyword = sectionKeywords.find((keyword) =>
        lowerCaseLine.includes(keyword)
      );

      if (matchedKeyword) {
        // Heading line: switch sections, creating the bucket on first sight.
        currentSection = matchedKeyword;
        if (!sections[currentSection]) {
          sections[currentSection] = [];
        }
      } else if (currentSection !== "") {
        // Body line: belongs to the most recently opened section.
        sections[currentSection].push(line);
      }
    }

    return sections;
  };

  /**
   * Triggers a click on whatever element `fileInputRef` currently holds
   * (presumably the file input — confirm against the markup), and does
   * nothing while the ref is still empty.
   */
  const openFileDialog = () => {
    const inputElement = fileInputRef.current;
    if (!inputElement) {
      return;
    }
    inputElement.click();
  };

  return (
    
      
      
      
        {Object.keys(extractedText).map((section, index) => (
          
            {section.toUpperCase()}
            
              {extractedText[section].map((item, idx) => (
                {item}
              ))}
            
          
        ))}
      
    
  );
}

export default PDFParser;

Parsing of text from Pdf in reactjs

Answers (1)

Related Questions