Niranjanp
Niranjanp

Reputation: 323

How to extract header, paragraph, table structure from pdf using azure form recognizer in python

I would like to extract the data like Header, paragraphs, tables, pagenumber, pagefooter from the pdf in the dataframe format using the azure form recognizer using python.

PFB expected output.

enter image description here

I have tried using layout model but the from the response i am not able to identify the header, paragraph or table

https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api?view=doc-intel-3.1.0&viewFallbackFrom=form-recog-3.0.0&preserve-view=true&pivots=programming-language-python

import os
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

endpoint = "<your-endpoint>"
key = "<your-key>"

def format_polygon(polygon):
    if not polygon:
        return "N/A"
    return ", ".join(["[{}, {}]".format(p.x, p.y) for p in polygon])

def analyze_layout():
    # sample form document
    formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    poller = document_analysis_client.begin_analyze_document_from_url(
            "prebuilt-layout", formUrl)
    result = poller.result()

    for idx, style in enumerate(result.styles):
        print(
            "Document contains {} content".format(
                "handwritten" if style.is_handwritten else "no handwritten"
            )
        )

    for page in result.pages:
        print("----Analyzing layout from page #{}----".format(page.page_number))
        print(
            "Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit
            )
        )

        for line_idx, line in enumerate(page.lines):
            words = line.get_words()
            print(
                "...Line # {} has word count {} and text '{}' within bounding box '{}'".format(
                    line_idx,
                    len(words),
                    line.content,
                    format_polygon(line.polygon),
                )
            )

            for word in words:
                print(
                    "......Word '{}' has a confidence of {}".format(
                        word.content, word.confidence
                    )
                )

        for selection_mark in page.selection_marks:
            print(
                "...Selection mark is '{}' within bounding box '{}' and has a confidence of {}".format(
                    selection_mark.state,
                    format_polygon(selection_mark.polygon),
                    selection_mark.confidence,
                )
            )

    for table_idx, table in enumerate(result.tables):
        print(
            "Table # {} has {} rows and {} columns".format(
                table_idx, table.row_count, table.column_count
            )
        )
        for region in table.bounding_regions:
            print(
                "Table # {} location on page: {} is {}".format(
                    table_idx,
                    region.page_number,
                    format_polygon(region.polygon),
                )
            )
        for cell in table.cells:
            print(
                "...Cell[{}][{}] has content '{}'".format(
                    cell.row_index,
                    cell.column_index,
                    cell.content,
                )
            )
            for region in cell.bounding_regions:
                print(
                    "...content on page {} is within bounding box '{}'".format(
                        region.page_number,
                        format_polygon(region.polygon),
                    )
                )

    print("----------------------------------------")


if __name__ == "__main__":
    analyze_layout()

Upvotes: 2

Views: 2348

Answers (1)

SophieG
SophieG

Reputation: 91

I realise this only answer a part of your question but it may help a bit : Here is a fonction to get the tables and store them in a list of dataframe:

def get_tables(result):
    result_dict = result.to_dict()
    all_tables = []
    for idx, atable in enumerate(result_dict["tables"]):
        l = list()
        row_count = atable["row_count"]
        column_count = atable["column_count"]
        for aval in atable["cells"]:
            l.append(aval["content"])
        df = pd.DataFrame(np.array(l).reshape(row_count, column_count))
        df.columns = df.iloc[0]
        df = df.drop(df.index[0])
        all_tables.append(df)
    return all_tables

You get the whole text content by using result.content

If anyone as a quick way to get the content paragraph per paragraph I very interested :)

Upvotes: 0

Related Questions