Reputation: 345
I am looking to process a set of Excel files located within a specified folder (top_folder). For each Excel file found, the script performs the following tasks:
Extract Quote Date: It extracts a quote date from the Excel file.
Find Matching PDF: It searches for a corresponding PDF file that matches the naming pattern derived from the Excel file.
Modify PDF Content: Once a matching PDF file is found, the script searches for a specific pattern related to the quote date within the PDF content and replaces it with a formatted quote date obtained earlier from the Excel file.
Save Modified PDF: Finally, the modified PDF content is saved to a new file with a _modified.pdf suffix.
So far I have been able to get to step 2 only. I am stuck on modifying the PDF content. How can I repalce the quote date in the PDF file and save it? Here is my attempt so far:
library(RDCOMClient)
library(pdftools)
process_subfolders <- function(top_folder) {
# Initialize Excel application
xlApp <- COMCreate("Excel.Application")
xlApp[["Visible"]] <- TRUE # Set to TRUE for debugging purposes
# List all files in the top folder
all_files <- list.files(top_folder, recursive = TRUE, full.names = TRUE)
# Filter for Excel files that start with 2 letters followed by 6 digits
pattern <- "^.{1,2}\\d{6}.*\\.xlsm$"
excel_files <- all_files[grep(pattern, basename(all_files), ignore.case = TRUE)]
for (file_path in excel_files) {
cat("Processing file:", file_path, "\n")
xlBook <- NULL
tryCatch({
# Normalize path for Excel
file_path <- normalizePath(file_path, winslash = "\\", mustWork = FALSE)
# Open Excel workbook
xlBook <- xlApp$Workbooks()$Open(file_path)
if (is.null(xlBook)) {
cat("Failed to open workbook:", file_path, "\n")
next
}
#cat("Workbook opened:", file_path, "\n")
# Get the date from cell C4 on the 'FURNITURE RENTAL' sheet
xlSheet <- xlBook$Sheets("FURNITURE RENTAL")
date_cell <- xlSheet$Range("C4")$Value()
quote_date <- as.Date(date_cell, origin = "1899-12-30")
# Format the date
formatted_quote_date <- format(quote_date, "%A, %d %B %Y")
cat("Quote date extracted:", formatted_quote_date, "\n")
# Close the workbook
xlBook$Close(FALSE)
# Find the corresponding PDF file
pdf_files <- all_files[grep("INVENTORY DOCKET.*\\.pdf$", all_files, ignore.case = TRUE)]
if (length(pdf_files) == 0) {
cat("No matching PDF files found for:", file_path, "\n")
next
}
pdf_file <- pdf_files[1] # Assuming the first match is the correct one
cat("Processing PDF file:", pdf_file, "\n")
# Extract PDF content
pdf_text <- pdf_text(pdf_file)
# Check if pdf_text is a character vector and concatenate if necessary
if (is.character(pdf_text)) {
pdf_text <- paste(pdf_text, collapse = "\n")
}
# Debugging output to inspect extracted PDF text
cat("PDF text extracted:\n", pdf_text, "\n")
# Find and replace the quote date in the PDF
old_date_pattern <- "(?i)QUOTE\\s*DATE\\s*\\b(\\w+, \\d+ \\w+ \\d{4})\\b"
# Attempt to extract the quote date using the defined pattern
extracted_date <- str_match(pdf_text, old_date_pattern)[, 2]
if (is.na(extracted_date)) {
cat("No 'QUOTE DATE:' found in PDF\n")
} else {
cat("Extracted quote date:", extracted_date, "\n")
}
# Replace the old quote date with the new formatted date
modified_pdf_text <- str_replace(pdf_text, old_date_pattern, paste("QUOTE DATE: ", formatted_quote_date))
cat("modified_pdf_text:", modified_pdf_text)
# this is where I am stuck
})
}
xlApp$Quit()
cat("All files processed.\n")
}
# Example usage
top_folder <- top_folder_path
process_subfolders(top_folder)
When I print modified_pdf_text
, it shows the text has been updated with the new date. I just can't figure out the rest of the process.
Upvotes: 0
Views: 30