Reputation: 1
I'm trying to fetch some documents from Azure and run Document Intelligence on them. The original script was taking 14 hours to run, so I tried to optimize it, but now I'm getting errors.
Error:

RuntimeWarning: coroutine 'process_all_documents' was never awaited
  chunk_documents.process_all_documents()
RuntimeWarning: Enable tracemalloc to get the object allocation traceback
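From what I can tell, the warning means a coroutine object is being created but never scheduled. A minimal illustration of the pattern (this is not my actual chunk_documents caller, just an example of what triggers the warning):

import asyncio

async def process_all_documents():
    ...

# Calling the coroutine function directly only creates a coroutine object and
# produces "RuntimeWarning: coroutine 'process_all_documents' was never awaited".
process_all_documents()

# It has to be awaited from async code, or driven with asyncio.run() from sync code:
asyncio.run(process_all_documents())

My actual script is below.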
import asyncio
import aiohttp
import csv
import config
import extract_metadata
from azure.storage.blob.aio import BlobServiceClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer.aio import DocumentAnalysisClient
from azure.core.exceptions import HttpResponseError

async def list_pdfs(container_name, max_files):
    async with BlobServiceClient.from_connection_string(config.AZURE_STORAGE_CONNECTION_STRING) as blob_service_client:
        container_client = blob_service_client.get_container_client(container_name)
        pdf_files = []
        async for blob in container_client.list_blobs():
            if blob.name.endswith(".pdf"):
                pdf_files.append(blob.name)
                if max_files and len(pdf_files) >= max_files:
                    break
        return pdf_files

async def analyze_document_from_blob(blob_url, client):
    try:
        poller = await client.begin_analyze_document_from_url(model_id="prebuilt-layout", document_url=blob_url)
        result = await poller.result()
        full_text = "\n".join([line.content for page in result.pages for line in page.lines])
        return full_text
    except HttpResponseError as e:
        print(f"Error analyzing document from URL {blob_url}: {e.message}")
        return None

def chunk_text(text, chunk_size=6000):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

async def upload_to_blob(local_file_path, container_name, blob_name):
    async with BlobServiceClient.from_connection_string(config.AZURE_STORAGE_CONNECTION_STRING) as blob_service_client:
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
        try:
            async with aiohttp.ClientSession() as session:
                async with session.put(blob_client.url, data=open(local_file_path, "rb")) as response:
                    if response.status == 201:
                        print(f"Uploaded {local_file_path} to {container_name}/{blob_name}")
                    else:
                        print(f"Failed to upload {local_file_path}. Status code: {response.status}")
        except Exception as e:
            print(f"Error uploading {local_file_path}: {e}")

async def process_document(filename, metadata_dict, document_client):
    metadata = metadata_dict.get(filename)
    if metadata is None:
        print(f"Skipping {filename} (no metadata found).")
        return None
    async with BlobServiceClient.from_connection_string(config.AZURE_STORAGE_CONNECTION_STRING) as blob_service_client:
        blob_client = blob_service_client.get_blob_client(container="content", blob=filename)
        extracted_text = await analyze_document_from_blob(blob_client.url, document_client)
        if not extracted_text:
            print(f"Skipping {filename} (analysis failed).")
            return None
        chunks = chunk_text(extracted_text)
        # Define a fixed list of metadata keys
        metadata_keys = ["idx", "PartitionKey", "RowKey", "timestamp", "title", "summary",
                         "sourcefile", "documenttypes", "organizations", "agencies", "programs", "keywords"]
        rows = [[filename, i, chunk] + [metadata.get(key, "") for key in metadata_keys] for i, chunk in enumerate(chunks)]
        print(f"{filename} processed with {len(chunks)} chunks.")
        return rows

async def process_all_documents():
    metadata_dict = extract_metadata.load_metadata_from_blob()
    pdf_files = await list_pdfs("content", None)
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    api_key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY
    async with DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) as document_client:
        semaphore = asyncio.Semaphore(10)  # Limit to 10 concurrent tasks
        tasks = [process_document(filename, metadata_dict, document_client) for filename in pdf_files]
        results = await asyncio.gather(*tasks)
    with open("document_chunks.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["filename", "chunk_id", "text"] + list(metadata_dict.keys()))
        for result in results:
            if result:
                writer.writerows(result)
    await upload_to_blob("document_chunks.csv", "metadata", "metadata_iris/document_chunks.csv")
    print("Processing completed.")

async def main():
    await process_all_documents()

if __name__ == "__main__":
    asyncio.run(main())
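For context on the optimization attempt: the idea was to cap the number of concurrent Document Intelligence calls with the semaphore, roughly like the sketch below. This is not what the code above currently does (there the semaphore is created but never acquired), the wrapper name is hypothetical, and the limit of 10 is just a guess on my part:

semaphore = asyncio.Semaphore(10)

async def process_document_limited(filename, metadata_dict, document_client):
    # Hypothetical wrapper: at most 10 documents are analyzed at the same time.
    async with semaphore:
        return await process_document(filename, metadata_dict, document_client)

# In process_all_documents() the tasks would then be built with the wrapper:
# tasks = [process_document_limited(filename, metadata_dict, document_client)
#          for filename in pdf_files]
# results = await asyncio.gather(*tasks)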
Upvotes: 0
Views: 53
Reputation: 1371
I created an Azure Storage account and an Azure AI Document Intelligence resource, and assigned the Contributor role on the storage account.
Python code:
import os
import logging
import asyncio
import aiohttp
import csv
from azure.storage.blob.aio import BlobServiceClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer.aio import DocumentAnalysisClient
from azure.core.exceptions import HttpResponseError

# Configuration - Replace these placeholders with your actual values
AZURE_STORAGE_CONNECTION_STRING = "Your-storage conn-string"
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = "Your-document-intelligence endpoint"
AZURE_DOCUMENT_INTELLIGENCE_KEY = "Your doc-intelligence endpoint key"
BLOB_CONTAINER_NAME = "sample"  # Name of the Blob Container where your PDFs are stored
OUTPUT_CONTAINER_NAME = "test2"  # Name of the Blob Container where the output will be uploaded

# Fixed metadata keys list that will be appended to each CSV row
METADATA_KEYS = ["idx", "PartitionKey", "RowKey", "timestamp", "title", "summary",
                 "sourcefile", "documenttypes", "organizations", "agencies", "programs", "keywords"]

# Function to list PDFs in the specified Azure Blob container
async def list_pdfs(container_name, max_files):
    async with BlobServiceClient.from_connection_string(AZURE_STORAGE_CONNECTION_STRING) as blob_service_client:
        container_client = blob_service_client.get_container_client(container_name)
        pdf_files = []
        async for blob in container_client.list_blobs():
            if blob.name.endswith(".pdf"):
                pdf_files.append(blob.name)
                if max_files and len(pdf_files) >= max_files:
                    break
        return pdf_files

# Function to analyze a document from a blob using Form Recognizer
async def analyze_document_from_blob(blob_url, client):
    try:
        poller = await client.begin_analyze_document_from_url(model_id="prebuilt-layout", document_url=blob_url)
        result = await poller.result()
        # Extract the full text from the document
        full_text = "\n".join([line.content for page in result.pages for line in page.lines])
        return full_text
    except HttpResponseError as e:
        print(f"Error analyzing document from URL {blob_url}: {e.message}")
        return None

# Function to split the extracted text into smaller chunks
def chunk_text(text, chunk_size=6000):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Function to upload processed file to Azure Blob storage
async def upload_to_blob(local_file_path, container_name, blob_name):
    async with BlobServiceClient.from_connection_string(AZURE_STORAGE_CONNECTION_STRING) as blob_service_client:
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
        try:
            with open(local_file_path, "rb") as file:
                # Upload and overwrite if the file already exists
                await blob_client.upload_blob(file, overwrite=True)
            print(f"Uploaded {local_file_path} to {container_name}/{blob_name}")
        except Exception as e:
            print(f"Error uploading {local_file_path}: {e}")

# Function to process a single document
async def process_document(filename, metadata_dict, document_client):
    # Retrieve metadata; if none exists, use defaults with empty strings for each key
    metadata = metadata_dict.get(filename, {key: "" for key in METADATA_KEYS})
    if filename not in metadata_dict:
        print(f"Warning: No metadata found for {filename}. Using default empty values.")
    async with BlobServiceClient.from_connection_string(AZURE_STORAGE_CONNECTION_STRING) as blob_service_client:
        blob_client = blob_service_client.get_blob_client(container=BLOB_CONTAINER_NAME, blob=filename)
        extracted_text = await analyze_document_from_blob(blob_client.url, document_client)
        if not extracted_text:
            print(f"Skipping {filename} (analysis failed).")
            return None
        chunks = chunk_text(extracted_text)
        # Build rows for CSV output using METADATA_KEYS
        rows = [
            [filename, i, chunk] + [metadata.get(key, "") for key in METADATA_KEYS]
            for i, chunk in enumerate(chunks)
        ]
        print(f"{filename} processed with {len(chunks)} chunks.")
        return rows

# Function to process all documents in the container
async def process_all_documents():
    # Dummy metadata (modify this as needed to include all expected files)
    metadata_dict = {
        "document1.pdf": {"title": "Title 1", "author": "Author 1"},
        "document2.pdf": {"title": "Title 2", "author": "Author 2"},
        "doc-intelligence.pdf": {"title": "Doc Intelligence", "author": "Your Name"}
    }
    # Get the list of PDFs from the container
    pdf_files = await list_pdfs(BLOB_CONTAINER_NAME, None)
    # Initialize Document Analysis Client
    endpoint = AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    api_key = AZURE_DOCUMENT_INTELLIGENCE_KEY
    async with DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(api_key)) as document_client:
        tasks = [process_document(filename, metadata_dict, document_client) for filename in pdf_files]
        results = await asyncio.gather(*tasks)
    # Write results to a CSV file with a header that matches our metadata fields
    with open("document_chunks.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["filename", "chunk_id", "text"] + METADATA_KEYS)
        for result in results:
            if result:
                writer.writerows(result)
    # Upload the processed CSV to Azure Blob storage
    await upload_to_blob("document_chunks.csv", OUTPUT_CONTAINER_NAME, "document_chunks.csv")
    print("Processing completed.")

# Main function to run the document processing
async def main():
    await process_all_documents()

# Run the script
if __name__ == "__main__":
    asyncio.run(main())
After running the above code, I got the following response in the console:
(.venv) C:\User
neDrive - Microsoft\Documents\stack\uploadbolb\test>python test1.py
Skipping doc-intelligence.pdf (no metadata found).
Uploaded document_chunks.csv to test2/document_chunks.csv
Processing completed.
(.venv) C:\Users
OneDrive - Microsoft\Documents\stack\uploadbolb\test>python test2.py
doc-intelligence.pdf processed with 1 chunks.
Uploaded document_chunks.csv to test2/document_chunks.csv
Processing completed.
Below is the chunk file that was created in the destination container.
Output:
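For reference, the header row that the script writes to document_chunks.csv (from the writer.writerow call above) is:

filename, chunk_id, text, idx, PartitionKey, RowKey, timestamp, title, summary, sourcefile, documenttypes, organizations, agencies, programs, keywords

and each chunk of a processed PDF becomes one row under it.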
Upvotes: -1