Michael
Michael

Reputation: 345

Extract Date from Excel File and Replace specific text in a PDF file in R

I am looking to process a set of Excel files located within a specified folder (top_folder). For each Excel file found, the script performs the following tasks:

Extract Quote Date: It extracts a quote date from the Excel file.

Find Matching PDF: It searches for a corresponding PDF file that matches the naming pattern derived from the Excel file.

Modify PDF Content: Once a matching PDF file is found, the script searches for a specific pattern related to the quote date within the PDF content and replaces it with a formatted quote date obtained earlier from the Excel file.

Save Modified PDF: Finally, the modified PDF content is saved to a new file with a _modified.pdf suffix.

So far I have been able to get to step 2 only. I am stuck on modifying the PDF content. How can I repalce the quote date in the PDF file and save it? Here is my attempt so far:

library(RDCOMClient)
library(pdftools)

process_subfolders <- function(top_folder) {
  # Initialize Excel application
  xlApp <- COMCreate("Excel.Application")
  xlApp[["Visible"]] <- TRUE  # Set to TRUE for debugging purposes
  
  # List all files in the top folder
  all_files <- list.files(top_folder, recursive = TRUE, full.names = TRUE)
  
  # Filter for Excel files that start with 2 letters followed by 6 digits
  pattern <- "^.{1,2}\\d{6}.*\\.xlsm$"
  excel_files <- all_files[grep(pattern, basename(all_files), ignore.case = TRUE)]
  
  for (file_path in excel_files) {
    cat("Processing file:", file_path, "\n")
    
    xlBook <- NULL
    
    tryCatch({
      # Normalize path for Excel
      file_path <- normalizePath(file_path, winslash = "\\", mustWork = FALSE)
      
      # Open Excel workbook
      xlBook <- xlApp$Workbooks()$Open(file_path)
      
      if (is.null(xlBook)) {
        cat("Failed to open workbook:", file_path, "\n")
        next
      }
      
      #cat("Workbook opened:", file_path, "\n")
      
      # Get the date from cell C4 on the 'FURNITURE RENTAL' sheet
      xlSheet <- xlBook$Sheets("FURNITURE RENTAL")
      date_cell <- xlSheet$Range("C4")$Value()
      quote_date <- as.Date(date_cell, origin = "1899-12-30")
      
      # Format the date
      formatted_quote_date <- format(quote_date, "%A, %d %B %Y")
      
      cat("Quote date extracted:", formatted_quote_date, "\n")
      
      # Close the workbook
      xlBook$Close(FALSE)
      
      # Find the corresponding PDF file
      pdf_files <- all_files[grep("INVENTORY DOCKET.*\\.pdf$", all_files, ignore.case = TRUE)]
      
      if (length(pdf_files) == 0) {
        cat("No matching PDF files found for:", file_path, "\n")
        next
      }
      
      pdf_file <- pdf_files[1]  # Assuming the first match is the correct one
      
      cat("Processing PDF file:", pdf_file, "\n")
      
      # Extract PDF content
      pdf_text <- pdf_text(pdf_file)
      
      # Check if pdf_text is a character vector and concatenate if necessary
      if (is.character(pdf_text)) {
        pdf_text <- paste(pdf_text, collapse = "\n")
      }
      
      #  Debugging output to inspect extracted PDF text
      cat("PDF text extracted:\n", pdf_text, "\n")
      
      # Find and replace the quote date in the PDF
      old_date_pattern <- "(?i)QUOTE\\s*DATE\\s*\\b(\\w+, \\d+ \\w+ \\d{4})\\b"
      
      # Attempt to extract the quote date using the defined pattern
      extracted_date <- str_match(pdf_text, old_date_pattern)[, 2]
      
      if (is.na(extracted_date)) {
        cat("No 'QUOTE DATE:' found in PDF\n")
      } else {
        cat("Extracted quote date:", extracted_date, "\n")
      }
      
      # Replace the old quote date with the new formatted date
      modified_pdf_text <- str_replace(pdf_text, old_date_pattern, paste("QUOTE DATE: ", formatted_quote_date))
      
      cat("modified_pdf_text:", modified_pdf_text)
       
      # this is where I am stuck      
      
    })
  }
  
  xlApp$Quit()
  cat("All files processed.\n")
}

# Example usage
top_folder <- top_folder_path
process_subfolders(top_folder)

When I print modified_pdf_text, it shows the text has been updated with the new date. I just can't figure out the rest of the process.

Upvotes: 0

Views: 30

Answers (0)

Related Questions