Reputation:
I have tried most of the solutions on the site to extract data from the image, only this script worked with the format *.tif, and gave me correct data
'''
from PIL import Image
import glob
import pytesseract
image_list = []
for filename in glob.glob(my_image):
im=Image.open(filename)
image_list.append(im)
pytesseract.pytesseract.tesseract_cmd="C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
texts = [pytesseract.image_to_string(img,lang = 'eng') for img in image_list]
'''
However, this is not working with *.png and *.jpg, I tried the following:
'''
import cv2
import numpy as np
image = cv2.imread('1.jpg')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
sharpen_kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
sharpen = cv2.filter2D(gray, -1, sharpen_kernel)
thresh = cv2.threshold(sharpen, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel, iterations=1)
result = 255 - close
'''
And like,
'''
import os
from PIL import Image
import cv2
import pytesseract
import ftfy
import uuid
filename = img
image = cv2.imread(os.path.join(filename))
gray = cv2.threshold(image, 200, 255, cv2.THRESH_BINARY)[1]
gray = cv2.resize(gray, (0, 0), fx=3, fy=3)
gray = cv2.medianBlur(gray, 9)
filename = str(uuid.uuid4())+".jpg"
cv2.imwrite(os.path.join(filename), gray)
config = ("-l eng --oem 3 --psm 11")
text = pytesseract.image_to_string(Image.open(os.path.join(filename)), config=config)
text = ftfy.fix_text(text)
text = ftfy.fix_encoding(text)
text = text.replace('-\n', '')
print(text)
'''
and such, but not given me data, how can I extract text from image like of invoice?
Upvotes: 0
Views: 552
Reputation: 80
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
print(pytesseract.image_to_string(r'D:\examplepdf2image.png'))
def escape(html):
"""Returns the given HTML with ampersands, quotes and carets encoded."""
return mark_safe(force_unicode(html).replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", '''))
this a sample code put instead of trying to print text through many different variables from this to that just try to print the image itself first. Then work on how to improve from there. One last thing is that this will let python work without errors making it easy to understand as well. The second piece of code with the def escape shows how to import an html file which you have to put your pieces of code into so you change it to your liking.
Upvotes: 1