VicRam0001
VicRam0001

Reputation: 1

Update Python code - the PyPDF2 library has deprecated objects used in Python code

I have been able to run the Python code below on a Linux-based OS, but when I tried to run the same code on a Windows-based OS, I got deprecation errors.

My question is: How can the code be updated to overcome the deprecation issues?

  1. The Python code used was:
import PyPDF2
import openpyxl

def pdf_to_text(pdf_file):
    """Open the PDF at ``pdf_file`` and return text extracted from it.

    As posted, the ``return`` is inside the page loop, so at most the first
    page's text is returned, and ``None`` is returned if the loop never runs.
    """
    text = ""
    with open(pdf_file, "rb") as file:
        # This is the call reported in the output quoted below: under
        # PyPDF2 3.0.0 it raises DeprecationError (use PdfReader instead).
        pdf_reader = PyPDF2.PdfFileReader(file)
        for page_num in range(pdf_reader.getNumPages()):
            page = pdf_reader.getPage(page_num)
            text += page.extractText()
            # NOTE(review): indented under the for loop, so the function
            # returns after the first page has been processed.
            return text

def save_text_to_excel(text, excel_file):
    """Write ``text`` to a new workbook, one line per row in column 1."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    lines = text.split("\n")
    # enumerate starts at 1, so the first line goes to row 1.
    for row_num, line in enumerate(lines, 1):
        sheet.cell(row=row_num, column=1, value=line)
        # NOTE(review): inside the loop, so the workbook is saved to
        # excel_file once per line rather than once at the end.
        workbook.save(excel_file)

if __name__ == "__main__":
    # Input PDF and output workbook names used when run as a script.
    pdf_file = "PDF_File_name.pdf"
    excel_file = "output.xlsx"

# NOTE(review): these two calls are at module level, outside the guard above,
# so they would also run on import, where pdf_file and excel_file are unset.
pdf_text = pdf_to_text(pdf_file)
save_text_to_excel(pdf_text, excel_file)

Output: "PyPDF2.errors.DeprecationError: PdfFileReader is deprecated and was removed in PyPDF2 3.0.0. Use PdfReader instead."

  2. So I updated the Python code to this:
import PyPDF2
import openpyxl

def pdf_to_text(pdf_file):
    """Open the PDF at ``pdf_file`` and return text extracted from it.

    As posted, the ``return`` is inside the page loop, so at most the first
    page's text is returned, and ``None`` is returned if the loop never runs.
    """
    text = ""
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # getNumPages() is the call reported in the output quoted below:
        # under PyPDF2 3.0.0 it raises DeprecationError
        # (use len(reader.pages) instead).
        for page_num in range(pdf_reader.getNumPages()):
            page = pdf_reader.getPage(page_num)
            text += page.extractText()
            # NOTE(review): indented under the for loop, so the function
            # returns after the first page has been processed.
            return text

def save_text_to_excel(text, excel_file):
    """Write ``text`` to a new workbook, one line per row in column 1."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    lines = text.split("\n")
    # enumerate starts at 1, so the first line goes to row 1.
    for row_num, line in enumerate(lines, 1):
        sheet.cell(row=row_num, column=1, value=line)
        # NOTE(review): inside the loop, so the workbook is saved to
        # excel_file once per line rather than once at the end.
        workbook.save(excel_file)

if __name__ == "__main__":
    # Input PDF and output workbook names used when run as a script.
    pdf_file = "PDF_File_name.pdf"
    excel_file = "output.xlsx"

# NOTE(review): these two calls are at module level, outside the guard above,
# so they would also run on import, where pdf_file and excel_file are unset.
pdf_text = pdf_to_text(pdf_file)
save_text_to_excel(pdf_text, excel_file)

Output: "PyPDF2.errors.DeprecationError: reader.getNumPages is deprecated and was removed in PyPDF2 3.0.0. Use len(reader.pages) instead."

  3. I next updated the Python code based on the recommendation from https://pypdf2.readthedocs.io/en/latest/user/migration-1-to-2.html, which says to make the following change:

reader.getNumPages() / reader.numPages ➔ len(reader.pages)

import PyPDF2
import openpyxl

def pdf_to_text(pdf_file):
    """Open the PDF at ``pdf_file`` and return text extracted from it.

    As posted, the ``return`` is inside the page loop, so at most the first
    page's text is returned, and ``None`` is returned if the loop never runs.
    """
    text = ""
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # This line produces the AttributeError quoted below: ``len`` is
        # looked up as an attribute of pdf_reader instead of being called as
        # the builtin. ``reader`` is also not a name defined in this function.
        for page_num in range(pdf_reader.len(reader.pages)):
            page = pdf_reader.getPage(page_num)
            text += page.extractText()
            # NOTE(review): indented under the for loop, so the function
            # returns after the first page has been processed.
            return text

def save_text_to_excel(text, excel_file):
    """Write ``text`` to a new workbook, one line per row in column 1."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    lines = text.split("\n")
    # enumerate starts at 1, so the first line goes to row 1.
    for row_num, line in enumerate(lines, 1):
        sheet.cell(row=row_num, column=1, value=line)
        # NOTE(review): inside the loop, so the workbook is saved to
        # excel_file once per line rather than once at the end.
        workbook.save(excel_file)

if __name__ == "__main__":
    # Input PDF and output workbook names used when run as a script.
    pdf_file = "PDF_File_name.pdf"
    excel_file = "output.xlsx"

# NOTE(review): these two calls are at module level, outside the guard above,
# so they would also run on import, where pdf_file and excel_file are unset.
pdf_text = pdf_to_text(pdf_file)
save_text_to_excel(pdf_text, excel_file)

Output: "AttributeError: 'PdfReader' object has no attribute 'len'"

  4. I updated the code based on a comment by 'Abdul Aziz Barkat': "Typo: compare pdf_reader.len(reader.pages) to len(reader.pages) as stated in the deprecation message. You have to write len(pdf_reader.pages); len is a built-in function in Python."
import PyPDF2
import openpyxl

def pdf_to_text(pdf_file):
    """Open the PDF at ``pdf_file`` and return text extracted from it.

    As posted, the ``return`` is inside the page loop, so at most the first
    page's text is returned, and ``None`` is returned if the loop never runs.
    """
    text = ""
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_num in range(len(pdf_reader.pages)):
            # getPage() is the call reported in the output quoted below:
            # under PyPDF2 3.0.0 it raises DeprecationError
            # (use reader.pages[page_number] instead).
            page = pdf_reader.getPage(page_num)
            # NOTE(review): extractText is likely affected the same way
            # (extract_text is the newer name) - confirm in the migration guide.
            text += page.extractText()
            # NOTE(review): indented under the for loop, so the function
            # returns after the first page has been processed.
            return text

def save_text_to_excel(text, excel_file):
    """Write ``text`` to a new workbook, one line per row in column 1."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    lines = text.split("\n")
    # enumerate starts at 1, so the first line goes to row 1.
    for row_num, line in enumerate(lines, 1):
        sheet.cell(row=row_num, column=1, value=line)
        # NOTE(review): inside the loop, so the workbook is saved to
        # excel_file once per line rather than once at the end.
        workbook.save(excel_file)

if __name__ == "__main__":
    # Input PDF and output workbook names used when run as a script.
    pdf_file = "computers.pdf"
    excel_file = "output.xlsx"

# NOTE(review): these two calls are at module level, outside the guard above,
# so they would also run on import, where pdf_file and excel_file are unset.
pdf_text = pdf_to_text(pdf_file)
save_text_to_excel(pdf_text, excel_file)

Output: "PyPDF2.errors.DeprecationError: reader.getPage(pageNumber) is deprecated and was removed in PyPDF2 3.0.0. Use reader.pages[page_number] instead."

Upvotes: -1

Views: 3469

Answers (2)

VicRam0001
VicRam0001

Reputation: 1

Thanks, Abdul and Musabbir, for the feedback. I have updated the code as suggested, also using the migration guide to replace the deprecated elements: https://pypdf2.readthedocs.io/en/3.x/user/migration-1-to-2.html

This code now runs on Python 3.x on Windows:

import openpyxl
import PyPDF2

def pdf_to_text(pdf_file):
    """Return the text of every page in a PDF, concatenated in page order.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The extracted text of all pages joined with no separator; an empty
        string if the document has no pages.
    """
    with open(pdf_file, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Iterate the pages directly instead of indexing by page number, and
        # extract inside the with block so the underlying file is still open.
        return "".join(page.extract_text() for page in pdf_reader.pages)

def save_text_to_excel(text, excel_file):
    """Write text to a new Excel workbook, one line per row in column 1.

    Args:
        text: Text to write; it is split on newline characters.
        excel_file: Path the workbook is saved to.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # Rows are 1-based, so start the enumeration at 1.
    for row_num, line in enumerate(text.split("\n"), 1):
        sheet.cell(row=row_num, column=1, value=line)
    # Save once after every row is filled in; saving inside the loop rewrote
    # the whole file for each line without changing the final result.
    workbook.save(excel_file)

if __name__ == "__main__":
    # Input PDF and output workbook names used when run as a script.
    pdf_file = "PDF-file-name.pdf"
    excel_file = "output.xlsx"
    # Extract the text first, then write it to the workbook.
    pdf_text = pdf_to_text(pdf_file)
    save_text_to_excel(pdf_text, excel_file)

Upvotes: 0

Musabbir Arrafi
Musabbir Arrafi

Reputation: 1885

The methods you are using to read the PDF have been deprecated in the new version of PyPDF2. See the PyPDF2 reader class documentation to learn more. Here's your corrected code:

import openpyxl
# PdfFileReader raises DeprecationError in PyPDF2 3.0.0; PdfReader replaces it.
from PyPDF2 import PdfReader

def pdf_to_text(pdf_file):
    """Return the text of every page in a PDF, concatenated in page order.

    Also prints the page count before extracting.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The extracted text of all pages joined with no separator.
    """
    with open(pdf_file, "rb") as file:
        pdf_reader = PdfReader(file)
        pages = pdf_reader.pages
        # len(reader.pages) replaces the removed numPages / getNumPages().
        print(len(pages))
        # Iterating reader.pages replaces getPage(n); extract_text() is the
        # current name for extractText().
        return "".join(page.extract_text() for page in pages)

def save_text_to_excel(text, excel_file):
    """Write text to a new Excel workbook, one line per row in column 1.

    Args:
        text: Text to write; it is split on newline characters.
        excel_file: Path the workbook is saved to.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # Rows are 1-based, so start the enumeration at 1.
    for row_num, line in enumerate(text.split("\n"), 1):
        sheet.cell(row=row_num, column=1, value=line)
    # Save once after every row is filled in rather than once per line.
    workbook.save(excel_file)

if __name__ == "__main__":
    pdf_file = "test.pdf"
    excel_file = "output.xlsx"
    pdf_text = pdf_to_text(pdf_file)
    print(pdf_text)
    save_text_to_excel(pdf_text, excel_file)

Upvotes: 1

Related Questions