I'm trying to detect text from items, which may be rotated in various directions. I've tried using Tesseract, EasyOCR, and EAST for text detection and extraction, but I am encountering issues with rotated text. Tesseract has given me the closest results, but it still incorrectly extracts the text when it is rotated.
Is there any possible way to extract text correctly, regardless of its rotation? I've included some sample images for better understanding.
Someone suggested rotating the images and detecting text each time, but this solution is too time-consuming in my case (70 hours per run). Here is the code I used:
import os
import cv2
import pytesseract
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
# Folder that holds the images to scan
directory = 'Camera2/front'
# Tell pytesseract where the Tesseract binary lives (edit for your installation)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# Collected OCR records, one dict per image
results = []
# Keep only the JPEG files found in the directory
image_files = [name for name in os.listdir(directory) if name.endswith(('.jpeg', '.jpg'))]
def preprocess_image(image):
    """Build the grayscale and adaptively thresholded versions of a BGR image.

    Returns:
        A ``(gray, binary)`` tuple: the grayscale conversion of ``image`` and
        the result of Gaussian adaptive thresholding applied to it.
    """
    max_value = 255
    block_size = 11  # neighbourhood size passed to adaptiveThreshold
    offset = 2  # constant passed as adaptiveThreshold's C argument

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        gray,
        max_value,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )
    return gray, binary
def detect_text(image):
    """Run Tesseract OCR on the thresholded version of ``image``.

    Returns:
        A ``(found, text, gray)`` tuple where ``found`` is True when the OCR
        output contains anything other than whitespace, ``text`` is the raw
        OCR output, and ``gray`` is the grayscale image produced during
        preprocessing.
    """
    grayscale, thresholded = preprocess_image(image)
    # Page segmentation mode 3, English language, OCR engine mode 3
    ocr_config = '--psm 3 -l eng --oem 3'
    extracted = pytesseract.image_to_string(thresholded, config=ocr_config)
    found = bool(extracted.strip())
    return found, extracted, grayscale
def rotate_image(image, angle, *, expand=False):
    """Rotate ``image`` by ``angle`` degrees around its centre.

    Args:
        image: Image array; only its first two dimensions (height, width)
            are read.
        angle: Rotation angle in degrees, passed to
            ``cv2.getRotationMatrix2D``.
        expand: When False (the default) the output keeps the input's
            width and height, so content that rotates outside the original
            frame is clipped. When True the output canvas is enlarged to
            contain the whole rotated image, so nothing (e.g. text near a
            corner) is cut off.

    Returns:
        The rotated image.
    """
    (h, w) = image.shape[:2]
    center = (w / 2, h / 2)
    matrix = cv2.getRotationMatrix2D(center, angle, 1.0)

    if not expand:
        return cv2.warpAffine(image, matrix, (w, h))

    # With scale 1.0 the first row of the matrix holds cos and sin of the
    # angle; their magnitudes give the bounding box of the rotated frame.
    cos_a = abs(matrix[0, 0])
    sin_a = abs(matrix[0, 1])
    new_w = int(round(h * sin_a + w * cos_a))
    new_h = int(round(h * cos_a + w * sin_a))

    # Shift the translation so the rotated image is centred in the new canvas.
    matrix[0, 2] += new_w / 2 - center[0]
    matrix[1, 2] += new_h / 2 - center[1]
    return cv2.warpAffine(image, matrix, (new_w, new_h))
# Iterate through each file in the directory with tqdm for progress visualization
for filename in tqdm(image_files, desc="Processing images"):
    filepath = os.path.join(directory, filename)

    # cv2.imread returns None (it does not raise) when the file cannot be
    # read, so check before using the image.
    original_image = cv2.imread(filepath)
    if original_image is None:
        print(f"Could not read {filename}; skipping.")
        continue

    # Text detection result for this image
    has_text = False
    detected_text = ""
    gray_image = None

    # Try each rotation from 0 to 359 degrees and stop at the first one
    # that yields text; continuing would overwrite the hit with the result
    # of later angles and always cost 360 OCR passes per image.
    for angle in tqdm(range(0, 360)):
        rotated_image = rotate_image(original_image, angle)
        has_text, detected_text, gray_image = detect_text(rotated_image)
        if has_text:
            break

    if not has_text:
        print(f"No text detected in {filename}.")
        continue

    # Plot the original image next to the grayscale image that produced text
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    axes[0].imshow(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB))
    axes[0].set_title('Original Image')
    if gray_image is not None:
        axes[1].imshow(gray_image, cmap='gray')
        axes[1].set_title('Grayscale Image with Adjusted Thresholding')
    plt.show()
    # Close the figure so one open figure per image does not accumulate.
    plt.close(fig)

    print(f"Text detected in {filename}:")
    # Store text in results list if it's longer than 3 characters
    if len(detected_text) > 3:
        # splitext removes only the trailing extension, unlike str.replace,
        # which would also strip '.jpg' occurring elsewhere in the name.
        image_id = os.path.splitext(filename)[0]
        results.append({'ID': image_id, 'text': detected_text})

results_df = pd.DataFrame(results)
You don't need to rotate the image one degree at a time if you can work out the alignment of the text in your image. Given that your text is generally longer than a single letter, its bounding box should be a rectangle whose width is greater than its height.
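You could then:
1. Compute the bounding box `(x1, x2, y1, y2)` of the text region.
2. Use the orientation of the box's long side to determine the angle needed to make the text horizontal.
3. Rotate the image once by that angle and run OCR on the result and on its 180° flip (the box alone cannot tell you whether the text is upside down).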
You will have to calculate the bounding boxes first, but you only need to do this once, on the initial image. This should be around 180 times faster than your current approach, since each image needs two OCR passes instead of 360.
