Reputation: 11
I'm trying to run the Azure OCR API on 6000+ images. Unfortunately the code stalls after just 90 images.
Documentation:
Input: 6000+ Images (.png)
Desired Output:
Error Msg: ConnectionError: HTTPSConnectionPool(host='westcentralus.api.cognitive.microsoft.com', port=443): Max retries exceeded with url: /vision/v2.0/ocr?language=unk&detectOrientation=true (Caused by NewConnectionError(': Failed to establish a new connection: [Errno -2] Name or service not known',))
I've provided a delay of 60 seconds after every 10 images, which should ideally take care of the 20 transactions per minute quota.
import warnings
warnings.filterwarnings("ignore")

import glob
import os
import requests
import pandas as pd
import time

# Replace the value of subscription_key with your subscription key.
subscription_key = "{key}"
assert subscription_key

# Replace the value of vision_base_url (not necessary for trial version)
vision_base_url = "https://westcentralus.api.cognitive.microsoft.com/vision/v2.0/"
analyze_url = vision_base_url + "ocr"

# Initializing Source and Output Directories
source_directory = glob.glob('folder/with/6000/images/*.png')
output_directory_textFiles = 'folder/for/saving/6000/textFiles/'
output_directory_JSONFiles = 'folder/for/saving/6000/JSONFiles/'

if not os.path.exists(output_directory_textFiles):
    os.makedirs(output_directory_textFiles)
if not os.path.exists(output_directory_JSONFiles):
    os.makedirs(output_directory_JSONFiles)

# Define Function for Extracting Text
def extract_text(image_path):
    # Read the image into a byte array
    with open(image_path, "rb") as image_file:
        image_data = image_file.read()
    headers = {'Ocp-Apim-Subscription-Key': subscription_key,
               'Content-Type': 'application/octet-stream'}
    params = {'language': 'unk', 'detectOrientation': 'true'}
    response = requests.post(analyze_url, headers=headers, params=params, data=image_data)
    analysis = response.json()
    # Extract the word bounding boxes and text.
    line_infos = [region["lines"] for region in analysis["regions"]]
    word_infos = []
    for line in line_infos:
        for word_metadata in line:
            for word_info in word_metadata["words"]:
                word_infos.append(word_info)
    return word_infos

# Generating Text and JSON Files
counter = 0
for image in sorted(source_directory):
    counter += 1
    print('Processing %d %s' % (counter, image))
    word_infos = extract_text(image)
    filename = image.split('/')[-1].replace('.png', '')
    if len(word_infos) != 0:
        bboxOutput = pd.DataFrame(word_infos)
        bboxOutput[['x', 'y', 'width', 'height']] = bboxOutput['boundingBox'].str.split(',', expand=True)
        bboxOutput = bboxOutput.drop(['boundingBox'], axis=1)
        bboxOutput['text'].to_csv(r'{}/{}.txt'.format(output_directory_textFiles, filename),
                                  header=False, index=None, sep=',')
        jsonFile = bboxOutput.to_json(orient='records')
        with open(r'{}/{}.txt'.format(output_directory_JSONFiles, filename), 'w') as f:
            f.write(jsonFile)
    else:
        word_infos = pd.DataFrame(word_infos)
        word_infos.to_csv(r'{}/{}.txt'.format(output_directory_textFiles, filename),
                          header=False, index=None, sep=',')
        jsonFile = word_infos.to_json(orient='records')
        with open(r'{}/{}.txt'.format(output_directory_JSONFiles, filename), 'w') as f:
            f.write(jsonFile)
    if (counter % 10) == 0:
        time.sleep(60)
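Note that the ConnectionError above is raised before any HTTP response comes back, so a quota sleep alone will not cover transient DNS/network failures. One way to make the POST resilient to those is requests' built-in retry support; this is only a sketch, and the name `make_session` and the specific retry numbers are illustrative, not part of the original code:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(total_retries=5, backoff=2.0):
    # Build a Session whose HTTPS adapter retries failed requests with
    # exponential backoff (2 s, 4 s, 8 s, ...) instead of dying on the
    # first "Max retries exceeded" error.
    retry = Retry(
        total=total_retries,
        connect=total_retries,  # also retry connection-level failures (e.g. DNS)
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],  # throttling / server errors
        allowed_methods=["POST"],  # POST is not retried by default
    )
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session
```

With this, `extract_text` would call `session.post(analyze_url, ...)` instead of `requests.post(...)`. (On urllib3 older than 1.26, the `allowed_methods` parameter is named `method_whitelist`.)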
Upvotes: 1
Views: 712
Reputation: 2654
I suggest changing your time.sleep call to 3 or 4 seconds after each image instead of 60 seconds after every 10 images.
Although the limit for the free tier is 20 transactions per minute, the paid tier has a limit of 10 per second, so it's possible you are hitting a limit that messes up the throttling mechanism (your code might be sending the 10 images in less than a second).
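The suggestion above can be sketched as a small wrapper; the name `throttled` and the 3.5-second default are illustrative only:

```python
import time

def throttled(items, process, delay=3.5):
    # Call `process` on each item, sleeping `delay` seconds between calls
    # so requests are spread out (~17/min at 3.5 s) rather than fired in
    # a burst of 10 followed by a 60 s pause.
    results = []
    for item in items:
        results.append(process(item))
        time.sleep(delay)
    return results
```

In the question's loop this replaces the `if (counter % 10) == 0: time.sleep(60)` block with a single `time.sleep(3.5)` after each `extract_text` call.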
Hope it helps!
Upvotes: 0