Reputation: 405
I have an image and want to detect the text regions in it.
I tried the TiRG_RAW_20110219 project, but the results are not satisfactory. If the input image is this:
it is producing this as output:
Can anyone suggest an alternative? I want to improve the output of Tesseract by sending it only the text regions as input.
Upvotes: 38
Views: 90766
Reputation: 4592
import cv2


def captch_ex(file_name):
    img = cv2.imread(file_name)
    img_final = cv2.imread(file_name)
    img2gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    ret, mask = cv2.threshold(img2gray, 180, 255, cv2.THRESH_BINARY)
    image_final = cv2.bitwise_and(img2gray, img2gray, mask=mask)
    ret, new_img = cv2.threshold(image_final, 180, 255, cv2.THRESH_BINARY)  # for black text, use cv2.THRESH_BINARY_INV

    # Dilate to remove the noisy portions and merge characters into blobs.
    # The kernel shape controls the orientation of the dilation: a larger x
    # dilates more horizontally, a larger y more vertically.
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    dilated = cv2.dilate(new_img, kernel, iterations=9)  # more iterations mean more dilation

    # OpenCV 4.x returns (contours, hierarchy); OpenCV 3.x returned three values
    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    for contour in contours:
        # get the bounding rectangle of the contour
        x, y, w, h = cv2.boundingRect(contour)

        # skip small false positives that aren't text
        if w < 35 and h < 35:
            continue

        # draw a rectangle around the contour on the original image
        cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 255), 2)

        '''
        # you can crop the region and send it to OCR; falsely detected regions will return no text :)
        cropped = img_final[y:y + h, x:x + w]
        s = file_name + '/crop_' + str(index) + '.jpg'
        cv2.imwrite(s, cropped)
        index = index + 1
        '''

    # show the original image with the added contours
    cv2.imshow('captcha_result', img)
    cv2.waitKey()


file_name = 'your_image.jpg'
captch_ex(file_name)
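Since the goal was to improve Tesseract's output, here is a minimal sketch of the commented-out cropping idea wired into OCR. It assumes pytesseract is installed and reuses the same threshold/dilate/findContours pipeline; 'your_image.jpg' is a placeholder:
import cv2
import pytesseract

img = cv2.imread('your_image.jpg')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
ret, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY)

# merge characters into word/line blobs, then find their bounding boxes
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
dilated = cv2.dilate(thresh, kernel, iterations=9)
contours, hierarchy = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

for index, contour in enumerate(contours):
    x, y, w, h = cv2.boundingRect(contour)
    if w < 35 and h < 35:  # skip small false positives
        continue
    cropped = img[y:y + h, x:x + w]
    # falsely detected regions tend to come back as empty text
    print(index, repr(pytesseract.image_to_string(cropped)))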
Upvotes: 69
Reputation: 21203
You can use a deep learning-based text detector called EAST (Efficient and Accurate Scene Text detector). It can be used with OpenCV's dnn module, but first you need to download the trained model, frozen_east_text_detection.pb.
The following code and its comments were borrowed in their entirety from text_detection.py. Remember to pass the downloaded .pb file to cv2.dnn.readNet().
Highlights:
- The trained model is passed to cv2.dnn.readNet() as a .pb file.
- Among layerNames, the model returns only two output layers: one for the probability of a region containing text and one for the bounding box coordinates.
- An image cannot be passed to the model directly; it is first passed to cv2.dnn.blobFromImage(), where it is treated as a blob and undergoes mean subtraction, scaling, and channel swapping (more details on these here).
- The input blob is passed to net.setInput() along with the output layers.
For more code explanation, please refer here.
Code:
import cv2
import numpy as np
# non_max_suppression comes from the imutils package (pip install imutils)
from imutils.object_detection import non_max_suppression

image = cv2.imread('path_to_image')
orig = image.copy()
(H, W) = image.shape[:2]

# set the new width and height and then determine the ratio in change
# for both the width and height
(newW, newH) = (320, 320)
rW = W / float(newW)
rH = H / float(newH)

# resize the image and grab the new image dimensions
image = cv2.resize(image, (newW, newH))
(H, W) = image.shape[:2]

# define the two output layer names for the EAST detector model that
# we are interested in -- the first is the output probabilities and the
# second can be used to derive the bounding box coordinates of text
layerNames = [
    "feature_fusion/Conv_7/Sigmoid",
    "feature_fusion/concat_3"]

# load the pre-trained EAST text detector
print("[INFO] loading EAST text detector...")
net = cv2.dnn.readNet('path_containing_frozen_east_text_detection.pb')

# construct a blob from the image and then perform a forward pass of
# the model to obtain the two output layer sets
blob = cv2.dnn.blobFromImage(image, 1.0, (W, H),
    (123.68, 116.78, 103.94), swapRB=True, crop=False)
net.setInput(blob)
(scores, geometry) = net.forward(layerNames)

# grab the number of rows and columns from the scores volume, then
# initialize our set of bounding box rectangles and corresponding
# confidence scores
(numRows, numCols) = scores.shape[2:4]
rects = []
confidences = []

# loop over the number of rows
for y in range(0, numRows):
    # extract the scores (probabilities), followed by the geometrical
    # data used to derive potential bounding box coordinates that
    # surround text
    scoresData = scores[0, 0, y]
    xData0 = geometry[0, 0, y]
    xData1 = geometry[0, 1, y]
    xData2 = geometry[0, 2, y]
    xData3 = geometry[0, 3, y]
    anglesData = geometry[0, 4, y]

    for x in range(0, numCols):
        # ignore probability values below 0.75
        if scoresData[x] < 0.75:
            continue

        # compute the offset factor as our resulting feature maps will
        # be 4x smaller than the input image
        (offsetX, offsetY) = (x * 4.0, y * 4.0)

        # extract the rotation angle for the prediction and then
        # compute the sine and cosine
        angle = anglesData[x]
        cos = np.cos(angle)
        sin = np.sin(angle)

        # use the geometry volume to derive the width and height of
        # the bounding box
        h = xData0[x] + xData2[x]
        w = xData1[x] + xData3[x]

        # compute both the starting and ending (x, y)-coordinates for
        # the text prediction bounding box
        endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
        endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
        startX = int(endX - w)
        startY = int(endY - h)

        # add the bounding box coordinates and probability score to
        # our respective lists
        rects.append((startX, startY, endX, endY))
        confidences.append(scoresData[x])

# apply non-maxima suppression to suppress weak, overlapping bounding
# boxes
boxes = non_max_suppression(np.array(rects), probs=confidences)

# loop over the bounding boxes
for (startX, startY, endX, endY) in boxes:
    # scale the bounding box coordinates based on the respective
    # ratios
    startX = int(startX * rW)
    startY = int(startY * rH)
    endX = int(endX * rW)
    endY = int(endY * rH)

    # draw the bounding box on the image
    cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)

cv2.imwrite('path_to_save', orig)
Result:
Although the result is not exactly as expected, it is pretty close.
UPDATE:
To crop and save each individual bounding box as an image do the following:
# take a copy of the original image
# (note: orig already has the green rectangles drawn on it; copy it
# before the drawing loop above if you need clean crops)
image2 = orig.copy()

for i, (startX, startY, endX, endY) in enumerate(boxes):
    # scale the bounding box coordinates back to the original image size
    startX = int(startX * rW)
    startY = int(startY * rH)
    endX = int(endX * rW)
    endY = int(endY * rH)

    # crop the detected region and save it to disk
    cropped = image2[startY:endY, startX:endX]
    cv2.imwrite(r'Cropped_result\crop_img_{}.jpg'.format(i), cropped)
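One caveat: cv2.imwrite() fails silently (it just returns False) when the target directory does not exist, so create Cropped_result before running the loop above:
import os

# make sure the output directory exists; cv2.imwrite() will not create it
os.makedirs('Cropped_result', exist_ok=True)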
Upvotes: 0
Reputation: 46580
Since no one has posted a complete solution, here's an approach. Using the observation that the desired text is in white and that words are structured in a horizontal alignment, we can use color segmentation to extract and OCR the letters.
1. Perform color segmentation. We load the image, convert it to HSV format, define lower/upper ranges, and perform color segmentation using cv2.inRange() to obtain a binary mask.
2. Dilate to connect text characters. We create a horizontally shaped kernel using cv2.getStructuringElement() and then dilate using cv2.dilate() to combine individual letters into a single contour.
3. Remove non-text contours. We find contours with cv2.findContours() and filter using the aspect ratio to remove non-text contours. Since the text is in a horizontal orientation, if a contour's aspect ratio falls below a predefined threshold, we remove that non-text contour by filling it in with cv2.drawContours().
4. Perform OCR. We bitwise-and the dilated image with the initial mask to isolate only the text characters, then invert the image so the text is black on a white background. Finally, we feed the image into Pytesseract OCR.
Here's a visualization of each step:
Input image
Mask generated from color segmentation
# Load image, convert to HSV format, define lower/upper ranges, and perform
# color segmentation to create a binary mask
image = cv2.imread('1.jpg')
hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
lower = np.array([0, 0, 218])
upper = np.array([157, 54, 255])
mask = cv2.inRange(hsv, lower, upper)
Dilated image connecting text contours, with non-text contours removed using aspect ratio filtering
# Create horizontal kernel and dilate to connect text characters
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,3))
dilate = cv2.dilate(mask, kernel, iterations=5)
# Find contours and filter using aspect ratio
# Remove non-text contours by filling in the contour
cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    x,y,w,h = cv2.boundingRect(c)
    ar = w / float(h)
    if ar < 5:
        cv2.drawContours(dilate, [c], -1, (0,0,0), -1)
Bitwise-and the dilated image with the mask and invert to get the result ready for OCR
# Bitwise dilated image with mask, invert, then OCR
result = 255 - cv2.bitwise_and(dilate, mask)
data = pytesseract.image_to_string(result, lang='eng',config='--psm 6')
print(data)
Result from Pytesseract OCR using the --psm 6 configuration setting to assume a uniform block of text (look here for more configuration options):
All women become
like their mothers.
That is their tragedy.
No man does.
That's his.
OSCAR WILDE
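If --psm 6 does not fit your layout, a few other commonly used page segmentation modes (a non-exhaustive list from Tesseract's documentation) can be swapped into the config string above:
# --psm 3   fully automatic page segmentation (the default)
# --psm 6   assume a single uniform block of text
# --psm 7   treat the image as a single text line
# --psm 11  sparse text: find as much text as possible, in no particular order
data = pytesseract.image_to_string(result, lang='eng', config='--psm 11')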
Full code
import cv2
import numpy as np
import pytesseract
# point pytesseract at the Tesseract binary (Windows path shown; adjust for your install)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Load image, convert to HSV format, define lower/upper ranges, and perform
# color segmentation to create a binary mask
image = cv2.imread('1.jpg')
hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
lower = np.array([0, 0, 218])
upper = np.array([157, 54, 255])
mask = cv2.inRange(hsv, lower, upper)
# Create horizontal kernel and dilate to connect text characters
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,3))
dilate = cv2.dilate(mask, kernel, iterations=5)
# Find contours and filter using aspect ratio
# Remove non-text contours by filling in the contour
cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    x,y,w,h = cv2.boundingRect(c)
    ar = w / float(h)
    if ar < 5:
        cv2.drawContours(dilate, [c], -1, (0,0,0), -1)
# Bitwise dilated image with mask, invert, then OCR
result = 255 - cv2.bitwise_and(dilate, mask)
data = pytesseract.image_to_string(result, lang='eng',config='--psm 6')
print(data)
cv2.imshow('mask', mask)
cv2.imshow('dilate', dilate)
cv2.imshow('result', result)
cv2.waitKey()
The HSV lower/upper color range was determined using this HSV color thresholder script
import cv2
import numpy as np
def nothing(x):
    pass
# Load image
image = cv2.imread('1.jpg')
# Create a window
cv2.namedWindow('image')
# Create trackbars for color change
# Hue is from 0-179 for Opencv
cv2.createTrackbar('HMin', 'image', 0, 179, nothing)
cv2.createTrackbar('SMin', 'image', 0, 255, nothing)
cv2.createTrackbar('VMin', 'image', 0, 255, nothing)
cv2.createTrackbar('HMax', 'image', 0, 179, nothing)
cv2.createTrackbar('SMax', 'image', 0, 255, nothing)
cv2.createTrackbar('VMax', 'image', 0, 255, nothing)
# Set default value for Max HSV trackbars
cv2.setTrackbarPos('HMax', 'image', 179)
cv2.setTrackbarPos('SMax', 'image', 255)
cv2.setTrackbarPos('VMax', 'image', 255)
# Initialize HSV min/max values
hMin = sMin = vMin = hMax = sMax = vMax = 0
phMin = psMin = pvMin = phMax = psMax = pvMax = 0
while(1):
    # Get current positions of all trackbars
    hMin = cv2.getTrackbarPos('HMin', 'image')
    sMin = cv2.getTrackbarPos('SMin', 'image')
    vMin = cv2.getTrackbarPos('VMin', 'image')
    hMax = cv2.getTrackbarPos('HMax', 'image')
    sMax = cv2.getTrackbarPos('SMax', 'image')
    vMax = cv2.getTrackbarPos('VMax', 'image')

    # Set minimum and maximum HSV values to display
    lower = np.array([hMin, sMin, vMin])
    upper = np.array([hMax, sMax, vMax])

    # Convert to HSV format and color threshold
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv, lower, upper)
    result = cv2.bitwise_and(image, image, mask=mask)

    # Print if there is a change in HSV value
    if((phMin != hMin) | (psMin != sMin) | (pvMin != vMin) | (phMax != hMax) | (psMax != sMax) | (pvMax != vMax)):
        print("(hMin = %d , sMin = %d, vMin = %d), (hMax = %d , sMax = %d, vMax = %d)" % (hMin, sMin, vMin, hMax, sMax, vMax))
        phMin = hMin
        psMin = sMin
        pvMin = vMin
        phMax = hMax
        psMax = sMax
        pvMax = vMax

    # Display result image
    cv2.imshow('image', result)
    if cv2.waitKey(10) & 0xFF == ord('q'):
        break

cv2.destroyAllWindows()
Upvotes: 39
Reputation: 1365
If you don't mind getting your hands dirty, you could try to grow those text regions into one bigger rectangular region and feed it to Tesseract all at once, as in the sketch below.
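A minimal sketch of that idea, assuming you already have a list of detected (x, y, w, h) boxes (the values below are hypothetical placeholders, e.g. from cv2.boundingRect() on detected contours):
import cv2
import pytesseract

img = cv2.imread('your_image.jpg')
# hypothetical text-region boxes from any of the detectors above
boxes = [(30, 40, 120, 25), (28, 80, 150, 25), (32, 120, 100, 25)]

# grow the individual regions into one enclosing rectangle
x1 = min(x for x, y, w, h in boxes)
y1 = min(y for x, y, w, h in boxes)
x2 = max(x + w for x, y, w, h in boxes)
y2 = max(y + h for x, y, w, h in boxes)

# feed the single merged region to Tesseract all at once
merged = img[y1:y2, x1:x2]
print(pytesseract.image_to_string(merged))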
I'd also suggest thresholding the image several times and feeding each result to Tesseract separately, to see if that helps at all. You can compare the output against dictionary words to automatically determine whether a particular OCR result is good, as in the sketch below.
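And a sketch of that thresholding suggestion: binarize at several levels, OCR each separately, and keep the result that scores best against a word list (the english_words set here is a stand-in for whatever dictionary you have available):
import cv2
import pytesseract

img = cv2.imread('your_image.jpg', cv2.IMREAD_GRAYSCALE)
english_words = {'all', 'women', 'become', 'like', 'their', 'mothers'}  # placeholder dictionary

def dictionary_score(text):
    # fraction of OCR'd words that appear in the dictionary
    words = [w.strip('.,').lower() for w in text.split()]
    return sum(w in english_words for w in words) / len(words) if words else 0.0

# threshold at several levels and keep the OCR result that scores best
results = [pytesseract.image_to_string(cv2.threshold(img, t, 255, cv2.THRESH_BINARY)[1])
           for t in range(100, 220, 20)]
print(max(results, key=dictionary_score))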
Upvotes: 6