Reputation: 55
I'm currently fetching PDFs from an API using requests.get. I don't want to save them to disk; I just want to extract the text from them.
# Fetch the PDF from the API; the response body is held in memory as bytes.
response_pdf = requests.get(url, auth=TokenAuth(key))
# The raw bytes are handed to convert_pdf_to_txt, which open()s its argument as a file path.
text = convert_pdf_to_txt(response_pdf.content)
Here is the code for the function convert_pdf_to_txt:
def convert_pdf_to_txt(filename):
    """Extract the text of a PDF file on disk as a lower-cased string.

    Args:
        filename: Path of the PDF file; it is opened in binary mode.

    Returns:
        The text collected from every processed page, lower-cased.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the file even when a page fails to parse.
        with open(filename, 'rb') as fp:
            # Empty page set and maxpages=0 are the original's settings;
            # presumably they select every page -- confirm against pdfminer docs.
            for page in PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            ):
                interpreter.process_page(page)
        # Read the buffer before it is closed in the finally block.
        text = retstr.getvalue()
    finally:
        # Release the converter and the buffer on success and on error alike.
        device.close()
        retstr.close()
    # NOTE(review): "\\n" is the two-character sequence backslash + "n", not a
    # newline, so real line breaks are kept -- confirm this is intended.
    return text.replace("\\n", "").lower()
I get the following error: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb5 in position 11: invalid start byte
response_pdf.content is a bytes object (class 'bytes'), and I don't know how to extract the text from it.
Any help would be highly appreciated!
Upvotes: 0
Views: 862
Reputation: 169416
You're passing a bytestring (the PDF's raw contents) to a function that treats it as a filename to open, which is what fails.
Instead, you can wrap the bytestring in io.BytesIO() and pass it in as fp:
def convert_pdf_to_txt(fp):
    """Extract the text of a PDF read from a binary stream, lower-cased.

    Args:
        fp: Binary file-like object positioned at the PDF data, for example
            an ``io.BytesIO`` or a file opened with mode ``'rb'``. It is not
            closed here; the caller keeps ownership of it.

    Returns:
        The text collected from every processed page, lower-cased.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(
        rsrcmgr, retstr, codec="utf-8", laparams=LAParams()
    )
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Empty page set and maxpages=0 are the original's settings;
        # presumably they select every page -- confirm against pdfminer docs.
        for page in PDFPage.get_pages(
            fp,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        ):
            interpreter.process_page(page)
        # Read the buffer before it is closed in the finally block.
        text = retstr.getvalue()
    finally:
        # Release the converter and the buffer even if a page fails to parse.
        device.close()
        retstr.close()
    # NOTE(review): "\\n" is the two-character sequence backslash + "n", not a
    # newline, so real line breaks are kept -- confirm this is intended.
    return text.replace("\\n", "").lower()
response_pdf = requests.get(url, auth=TokenAuth(key))
# Stop on an HTTP error instead of handing an error body to the PDF parser.
response_pdf.raise_for_status()
# Wrap the downloaded bytes in an in-memory binary stream; nothing touches disk.
pdf_stream = io.BytesIO(response_pdf.content)
text = convert_pdf_to_txt(pdf_stream)
This has the additional nicety to it that you can still use it with files:
# A file opened in binary mode is passed as the stream in place of a BytesIO;
# the with-block closes it once the text has been extracted.
with open('my_pdf', 'rb') as pdf_stream:
    text = convert_pdf_to_txt(pdf_stream)
Upvotes: 1