UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 162: invalid start byte

I am fetching data from s3 and I need to extract the text from a pdf file.

import boto3

from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

# Low-level S3 client; created here but not used anywhere else in this snippet.
s3_client = boto3.client('s3')
s3_bucket_name = 'XXXXXX'
# Resource-style S3 handle built with explicit (redacted) credentials.
s3 = boto3.resource('s3',
                    aws_access_key_id = 'XXXXXXXX',
                    aws_secret_access_key='XXXXXXX')

# Fetch the PDF object and read its entire body into memory.
obj = s3.Object(s3_bucket_name, 'XXXXXX.pdf').get()
data = obj['Body'].read()

# Text buffer that the pdfminer TextConverter writes the extracted text into.
output_string = StringIO()
# NOTE: `data` is the downloaded file content, not a filesystem path, yet it is
# passed to open() as if it were a path. This is the line named in the
# UnicodeDecodeError traceback quoted in the question.
with open(data, 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Run every page through the interpreter; text accumulates in output_string.
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

# Print everything collected in the buffer.
print(output_string.getvalue())

I'm getting this error:

with open(data, 'rb') as in_file: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 162: invalid start byte

Upvotes: 1

Views: 7282

Answers (2)

standalone_2045
standalone_2045

Reputation: 37

As the other answer says, fetch the PDF document from your S3 bucket and parse the byte stream:

import io
import gzip
import boto3
from pdfminer.high_level import extract_text

# Connect to S3 with explicit credentials (placeholders - substitute your own).
s3 = boto3.resource(
    service_name="s3",
    region_name="my_s3_region_name",
    aws_access_key_id="my_access_key_id",
    aws_secret_access_key="my_secret_access_key",
)

# Download the whole object into memory as a bytes object.
s3_object = s3.Object(bucket_name="my_s3_bucket", key="my_s3_filepath")
pdf_data = s3_object.get()["Body"].read()

# Decompress only if required: gzip streams always start with the magic bytes
# 0x1f 0x8b, whereas a plain PDF starts with b"%PDF". Calling gzip.decompress()
# on an uncompressed PDF would raise gzip.BadGzipFile.
if pdf_data[:2] == b"\x1f\x8b":
    pdf_data = gzip.decompress(pdf_data)

# Pass the byte stream to a PDF parser of your choice. io.BytesIO wraps the
# in-memory bytes in a file-like object, so nothing is written to disk.
with io.BytesIO(pdf_data) as f:
    pdf_text = extract_text(f)

Bytes array after decompression:

b'%PDF-1.4\r\n%\xa1\xb3\xc5\xd7\r\n2 0 obj\r\n<</Count 2/Kids[ 5 0 R  6 0 R ]/MediaBox[ 0 0 595 842]/Resources<<>>/Rotat'

Parsed bytes stream:

"O-2022-19 
AN ORDINANCE AMENDING 
CERTAIN PROVISIONS OF THE 
COUNTY OF ALAMEDA ADMINISTRATIVE CODE"

Upvotes: 0

Zuljin
Zuljin

Reputation: 2640

The built-in open() can only open a file on disk, given its path, but here you are passing it the bytes you read from S3 (the file's content) instead of a path. Try replacing

with open(data, 'rb') as in_file:

with an io.BytesIO object, which accepts a bytes object and wraps it in an in-memory binary stream (remember to add `import io`):

with io.BytesIO(data) as in_file:

More info here https://docs.python.org/3/library/io.html#binary-i-o

Upvotes: 2

Related Questions