Reputation: 37
I am trying to OCR standard forms (they are scanned both front and back).
I only want to OCR the second image on the scan (the one with the textual information). Is there a way to detect and split them, and only process the right one? Sorry if I'm missing something essential; I'm just starting off.
import pytesseract as tess
import os
from PIL import Image
import pandas as pd
import tesserocr
# Directory (folder) where the scanned images are located.
path = "/Users/oliviervandhuynslager/PycharmProjects/OCR/DC_SCANS_TEST"

# List the directory once and sort it. os.listdir returns entries in
# arbitrary order, so sorting only the name list (as before) left
# fileName[i] and fullText[i] pointing at different scans.
fileName = sorted(os.listdir(path))  # original file names, sorted
count = len(fileName)  # number of entries found in the directory

#%%
# APPEND (OCR) text from images TO LIST fullText
fullText = []  # OCR result per file, in the same order as fileName
for imageName in fileName:
    inputPath = os.path.join(path, imageName)
    # The context manager releases the file handle as soon as OCR is done.
    with Image.open(inputPath) as img:
        text = tess.image_to_string(img, lang="eng")
    fullText.append(text)
Upvotes: 0
Views: 1515
Reputation: 3328
Here is a working example for the presented images:
import cv2
import numpy as np
import pytesseract
# Tell pytesseract where the Tesseract executable is installed.
pytesseract.pytesseract.tesseract_cmd=r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Flag 0 loads the scan as a single-channel grayscale image.
img = cv2.imread("BFezy.png", 0)
if img is None:
    # cv2.imread signals a missing/unreadable file by returning None
    # instead of raising, so fail early with a clear error.
    raise FileNotFoundError("BFezy.png")

# Erode then dilate with a large kernel. Erosion takes the local minimum,
# so dark pixels spread over each 25x25 neighbourhood; presumably this
# merges the content of each scanned side into one solid dark region.
kernel = np.ones((25, 25), np.uint8)
eroded = cv2.erode(img, kernel, iterations=2)
dilated = cv2.dilate(eroded, kernel, iterations=1)

# Inverted binary threshold: pixels <= 150 become 255, brighter ones 0,
# so the dark regions become the white foreground findContours looks for.
thresholded = cv2.threshold(dilated, 150, 255, cv2.THRESH_BINARY_INV)[1]

# Pass the thresholded image (the original referenced an undefined name
# `th`). Index -2 selects the contour list on every OpenCV major version:
# 3.x returns (image, contours, hierarchy), 2.x/4.x (contours, hierarchy).
countours = cv2.findContours(thresholded, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]

# Only proceed when exactly two regions were found; the first contour is
# taken to be the side to OCR.
# NOTE(review): contour order is assumed to match the sample image - confirm.
if len(countours) == 2:
    x, y, w, h = cv2.boundingRect(countours[0])
    crop = img[y:h + y, x:w + x]  # rows are y, columns are x
    text = pytesseract.image_to_string(crop)
    print(text)
Upvotes: 2