Reputation: 1
I have been able to run this Python code on a Linux-based OS, but when I tried to run the same code on a Windows-based OS, I got deprecation errors.
My question is: How can the code be updated to overcome the deprecation issues?
import PyPDF2
import openpyxl
def pdf_to_text(pdf_file):
    """Return the text of every page in *pdf_file*, joined in page order.

    Written against the pre-3.0 PyPDF2 API. Under PyPDF2 3.0.0 the
    ``PdfFileReader`` call below raises ``DeprecationError``.
    """
    chunks = []
    with open(pdf_file, "rb") as stream:
        # Legacy reader class; this is the call PyPDF2 3.0.0 rejects.
        reader = PyPDF2.PdfFileReader(stream)
        for index in range(reader.getNumPages()):
            chunks.append(reader.getPage(index).extractText())
    return "".join(chunks)
def save_text_to_excel(text, excel_file):
    """Write *text* to a new workbook, one line per row in the first column.

    The text is split on line-feed characters; each piece is stored in
    column 1 of the active sheet, starting at row 1, and the workbook is then
    saved to *excel_file*.
    """
    book = openpyxl.Workbook()
    worksheet = book.active
    for row_index, row_text in enumerate(text.split("\n"), start=1):
        worksheet.cell(row=row_index, column=1, value=row_text)
    book.save(excel_file)
if __name__ == "__main__":
    # Input PDF and output workbook; both paths are relative.
    pdf_file, excel_file = "PDF_File_name.pdf", "output.xlsx"
    # Extract the PDF text, then write it to the workbook.
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)
Output: "PyPDF2.errors.DeprecationError: PdfFileReader is deprecated and was removed in PyPDF2 3.0.0. Use PdfReader instead."
import PyPDF2
import openpyxl
def pdf_to_text(pdf_file):
    """Concatenate the extracted text of each page of *pdf_file*.

    Second attempt: the reader class is already ``PdfReader``, but the page
    count is still requested through ``getNumPages()``, which PyPDF2 3.0.0
    rejects with ``DeprecationError``.
    """
    pieces = []
    with open(pdf_file, "rb") as handle:
        document = PyPDF2.PdfReader(handle)
        # Legacy page-count call; this is where PyPDF2 3.0.0 raises.
        page_total = document.getNumPages()
        for number in range(page_total):
            pieces.append(document.getPage(number).extractText())
    return "".join(pieces)
def save_text_to_excel(text, excel_file):
    """Save *text* into a fresh workbook at *excel_file*.

    Each piece of *text* between line-feed characters becomes one row in
    column 1 of the active sheet, numbered from row 1.
    """
    book = openpyxl.Workbook()
    worksheet = book.active
    for row_index, row_text in enumerate(text.split("\n"), start=1):
        worksheet.cell(row=row_index, column=1, value=row_text)
    book.save(excel_file)
if __name__ == "__main__":
    # Input PDF and output workbook; both paths are relative.
    pdf_file, excel_file = "PDF_File_name.pdf", "output.xlsx"
    # Extract the PDF text, then write it to the workbook.
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)
Output: "PyPDF2.errors.DeprecationError: reader.getNumPages is deprecated and was removed in PyPDF2 3.0.0. Use len(reader.pages) instead."
reader.getNumPages() / reader.numPages ➔ len(reader.pages)
import PyPDF2
import openpyxl
def pdf_to_text(pdf_file):
    """Concatenate the extracted text of each page of *pdf_file*.

    Third attempt: the suggested replacement ``len(reader.pages)`` was
    appended to the reader object as if it were a method, which produces
    ``AttributeError: 'PdfReader' object has no attribute 'len'``.
    """
    collected = []
    with open(pdf_file, "rb") as handle:
        pdf_reader = PyPDF2.PdfReader(handle)
        # ``len`` is looked up as an attribute of the reader, so this fails
        # with AttributeError before the argument is evaluated.
        page_total = pdf_reader.len(reader.pages)
        for number in range(page_total):
            collected.append(pdf_reader.getPage(number).extractText())
    return "".join(collected)
def save_text_to_excel(text, excel_file):
    """Write *text* to a new workbook, one line per row in the first column.

    The text is split on line-feed characters; each piece is stored in
    column 1 of the active sheet, starting at row 1, and the workbook is then
    saved to *excel_file*.
    """
    book = openpyxl.Workbook()
    worksheet = book.active
    for row_index, row_text in enumerate(text.split("\n"), start=1):
        worksheet.cell(row=row_index, column=1, value=row_text)
    book.save(excel_file)
if __name__ == "__main__":
    # Input PDF and output workbook; both paths are relative.
    pdf_file, excel_file = "PDF_File_name.pdf", "output.xlsx"
    # Extract the PDF text, then write it to the workbook.
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)
Output: "AttributeError: 'PdfReader' object has no attribute 'len'"
import PyPDF2
import openpyxl
def pdf_to_text(pdf_file):
    """Concatenate the extracted text of each page of *pdf_file*.

    Fourth attempt: the page count now comes from ``len(reader.pages)``, but
    pages are still fetched with ``getPage()``, which PyPDF2 3.0.0 rejects
    with ``DeprecationError``.
    """
    fragments = []
    with open(pdf_file, "rb") as handle:
        document = PyPDF2.PdfReader(handle)
        for number in range(len(document.pages)):
            # Legacy page accessor; this is where PyPDF2 3.0.0 raises.
            fragments.append(document.getPage(number).extractText())
    return "".join(fragments)
def save_text_to_excel(text, excel_file):
    """Save *text* into a fresh workbook at *excel_file*.

    Each piece of *text* between line-feed characters becomes one row in
    column 1 of the active sheet, numbered from row 1.
    """
    book = openpyxl.Workbook()
    worksheet = book.active
    for row_index, row_text in enumerate(text.split("\n"), start=1):
        worksheet.cell(row=row_index, column=1, value=row_text)
    book.save(excel_file)
if __name__ == "__main__":
    # Input PDF and output workbook; both paths are relative.
    pdf_file, excel_file = "computers.pdf", "output.xlsx"
    # Extract the PDF text, then write it to the workbook.
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)
Output: "PyPDF2.errors.DeprecationError: reader.getPage(pageNumber) is deprecated and was removed in PyPDF2 3.0.0. Use reader.pages[page_number] instead."
Upvotes: -1
Views: 3469
Reputation: 1
Thanks, Abdul and Musabbir, for the feedback. I have updated the code as suggested, and I also used the Migration Guide to replace the deprecated elements: https://pypdf2.readthedocs.io/en/3.x/user/migration-1-to-2.html
This code now runs on Python 3.x on Windows:
import openpyxl
import PyPDF2
def pdf_to_text(pdf_file):
    """Return the text of every page in *pdf_file*, joined in page order.

    Uses the PyPDF2 3.x API: ``PdfReader``, ``reader.pages`` and
    ``page.extract_text()``.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        One string holding the extracted text of all pages, with nothing
        inserted between pages. An empty string if there are no pages.
    """
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Iterate the pages directly rather than indexing via range(len(...)),
        # and build the result with one join instead of repeated ``+=``.
        # Extraction stays inside the ``with`` block so the stream is still
        # open while the pages are being read.
        return "".join(page.extract_text() for page in pdf_reader.pages)
def save_text_to_excel(text, excel_file):
    """Write *text* to a new workbook, one line per row in the first column.

    The text is split on line-feed characters; each piece is stored in
    column 1 of the active sheet, starting at row 1, and the workbook is then
    saved to *excel_file*.
    """
    book = openpyxl.Workbook()
    worksheet = book.active
    for row_index, row_text in enumerate(text.split("\n"), start=1):
        worksheet.cell(row=row_index, column=1, value=row_text)
    book.save(excel_file)
if __name__ == "__main__":
    # Input PDF and output workbook; both paths are relative.
    pdf_file, excel_file = "PDF-file-name.pdf", "output.xlsx"
    # Extract the PDF text, then write it to the workbook.
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)
Upvotes: 0
Reputation: 1885
The methods you are using to read the PDF have been deprecated in the newer version of PyPDF2.
See the PdfFileReader class documentation for details. Here's your corrected code:
import openpyxl
from PyPDF2 import PdfFileReader
def pdf_to_text(pdf_file):
    """Return the text of every page in *pdf_file*, joined in page order.

    Ported to the PyPDF2 3.x API. ``PdfFileReader``, ``numPages`` and
    ``getPage()`` were removed in PyPDF2 3.0.0 and raise ``DeprecationError``
    there, so this uses ``PdfReader``, ``reader.pages`` and
    ``page.extract_text()`` instead.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        One string holding the extracted text of all pages, with nothing
        inserted between pages.

    Side effects:
        Prints the number of pages to stdout.
    """
    # Imported locally because the surrounding snippet only imports the
    # removed ``PdfFileReader`` name.
    from PyPDF2 import PdfReader

    with open(pdf_file, "rb") as file:
        pdf_reader = PdfReader(file)
        pages = pdf_reader.pages
        print(len(pages))
        # Extraction stays inside the ``with`` block so the stream is still
        # open while the pages are being read.
        return "".join(page.extract_text() for page in pages)
def save_text_to_excel(text, excel_file):
    """Save *text* into a fresh workbook at *excel_file*.

    Each piece of *text* between line-feed characters becomes one row in
    column 1 of the active sheet, numbered from row 1.
    """
    book = openpyxl.Workbook()
    worksheet = book.active
    for row_index, row_text in enumerate(text.split("\n"), start=1):
        worksheet.cell(row=row_index, column=1, value=row_text)
    book.save(excel_file)
if __name__ == "__main__":
    # Input PDF and output workbook; both paths are relative.
    pdf_file, excel_file = "test.pdf", "output.xlsx"
    pdf_text = pdf_to_text(pdf_file)
    # Echo the extracted text before it is written to the workbook.
    print(pdf_text)
    save_text_to_excel(pdf_text, excel_file)
Upvotes: 1