Dai Zhang
Dai Zhang

Reputation: 11

image to text conversion using Tesseract

I am trying to load all the images in a folder and extract the text from each of them. I keep getting an error message in the second `for` loop, for example:

AttributeError: 'numpy.ndarray' object has no attribute 'read'

It seems I cannot access the list `img`. Any ideas?

# import OpenCV, Numpy, Python image library, Tesseract OCR
import os
import cv2
import numpy 
from PIL import Image
import pytesseract
import glob

# Point pytesseract at the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files (x86)/Tesseract-OCR/tesseract.exe'

# Read every .jpg image in the given folder; `img` collects the loaded images
img = []    

for i in glob.glob("C:\\Users\\daizhang\\Desktop\\Deloitte Development\\Python\\Reports\\Image\\*.jpg"):
    n= cv2.imread(i,0)   # flag 0: load the image as grayscale
    print(i)
    # what gets stored is the loaded image data, not the file path `i`
    img.append(n)


# Second pass: run OCR on each loaded image and write the text to test.txt
for j in img:
    # NOTE(review): `j` is the object returned by cv2.imread above, not a path
    # or file object; this appears to be the call that raises the reported
    # AttributeError ('numpy.ndarray' object has no attribute 'read').
    # `im` is not used afterwards.
    im = Image.open(j)
    text = pytesseract.image_to_string (j, lang='eng')
    # NOTE(review): mode "w" truncates test.txt on every iteration, so only the
    # text of the last image would remain. The path also contains a single
    # backslash in "Python\Reports" where the other separators are doubled.
    with open("C:\\Users\\daizhang\\Desktop\\Deloitte Development\\Python\Reports\\Image\\test.txt", "w") as f:
    # NOTE(review): the next line is not indented under the `with` statement,
    # which is a syntax error as written.
    f.write(text.encode('utf8'))

Upvotes: 1

Views: 2952

Answers (1)

baodev
baodev

Reputation: 37

I am on Mac OS X, but you can adapt this code to use a Windows directory path.

import os
from os import path
from glob import glob 
from pytesseract import image_to_string
from PIL import Image, ImageEnhance, ImageFilter

def enhance_img(filename, newfilename=None):
    """Clean up an image for OCR and save the result under a new name.

    The image is median-filtered, its contrast is doubled, and it is then
    converted to 1-bit black and white (Pillow mode ``'1'``).

    Args:
        filename: Path of the image to read.
        newfilename: Path to save the enhanced image to.  When omitted, the
            extension of *filename* is replaced by ``_1.png``.

    Returns:
        The path the enhanced image was written to.
    """
    if newfilename is None:
        # Pillow picks the output format from the extension, so the derived
        # name must carry one.
        newfilename = os.path.splitext(filename)[0] + '_1.png'

    # filter() returns a new, fully loaded image, so the source file can be
    # closed as soon as the filter has run.
    with Image.open(filename) as source:
        im = source.filter(ImageFilter.MedianFilter())

    enhancer = ImageEnhance.Contrast(im)
    im = enhancer.enhance(2)
    im = im.convert('1')
    im.save(newfilename)
    return newfilename

def convert_img(filename):
    """Run OCR on one image and append the recognised text to ``parsing.txt``.

    Args:
        filename: Path of the image to read.
    """
    # Close the image file as soon as the text has been extracted.
    with Image.open(filename) as image:
        text = image_to_string(image)

    # Append mode, so the text of every processed image accumulates in one
    # file; the `with` block guarantees the handle is flushed and closed.
    with open('parsing.txt', 'a') as out:
        out.write(text)

def find_ext(dir, ext):
    """Return the paths inside *dir* whose names match ``*.<ext>``.

    An empty *dir* makes the pattern relative to the current working
    directory.
    """
    pattern = path.join(dir, "*.{suffix}".format(suffix=ext))
    matches = glob(pattern)
    return matches

# The empty directory argument makes the search relative to the current
# working directory.  To scan another folder, switch into it first, e.g.
#     os.chdir("/folder/with/images")
filename = find_ext("", "png")

# Run OCR on every PNG that was found
for file in filename:
    convert_img(file)

If you want to enhance the image then include the following block and adjust the code above to loop through the new filenames.

def enhance_img(filename, newfilename=None):
    """Clean up an image for OCR and save the result under a new name.

    The image is median-filtered, its contrast is doubled, and it is then
    converted to 1-bit black and white (Pillow mode ``'1'``).

    Args:
        filename: Path of the image to read.
        newfilename: Path to save the enhanced image to.  When omitted, the
            extension of *filename* is replaced by ``_1.png``.

    Returns:
        The path the enhanced image was written to.
    """
    if newfilename is None:
        # Pillow picks the output format from the extension, so the derived
        # name must carry one.
        newfilename = os.path.splitext(filename)[0] + '_1.png'

    # filter() returns a new, fully loaded image, so the source file can be
    # closed as soon as the filter has run.
    with Image.open(filename) as source:
        im = source.filter(ImageFilter.MedianFilter())

    enhancer = ImageEnhance.Contrast(im)
    im = enhancer.enhance(2)
    im = im.convert('1')
    im.save(newfilename)
    return newfilename

for file in filename:
    # to enhance image if needed: "scan.png" is saved as "scan_1.png"
    newfilename = os.path.splitext(file)[0] + '_1.png'
    enhance_img(file, newfilename)

Upvotes: 1

Related Questions