Reputation: 323
I would like to extract the data like Header, paragraphs, tables, pagenumber, pagefooter from the pdf in the dataframe format using the azure form recognizer using python.
PFB expected output.
I have tried using layout model but the from the response i am not able to identify the header, paragraph or table
import os
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
endpoint = "<your-endpoint>"
key = "<your-key>"
def format_polygon(polygon):
if not polygon:
return "N/A"
return ", ".join(["[{}, {}]".format(p.x, p.y) for p in polygon])
def analyze_layout():
# sample form document
formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"
document_analysis_client = DocumentAnalysisClient(
endpoint=endpoint, credential=AzureKeyCredential(key)
)
poller = document_analysis_client.begin_analyze_document_from_url(
"prebuilt-layout", formUrl)
result = poller.result()
for idx, style in enumerate(result.styles):
print(
"Document contains {} content".format(
"handwritten" if style.is_handwritten else "no handwritten"
)
)
for page in result.pages:
print("----Analyzing layout from page #{}----".format(page.page_number))
print(
"Page has width: {} and height: {}, measured with unit: {}".format(
page.width, page.height, page.unit
)
)
for line_idx, line in enumerate(page.lines):
words = line.get_words()
print(
"...Line # {} has word count {} and text '{}' within bounding box '{}'".format(
line_idx,
len(words),
line.content,
format_polygon(line.polygon),
)
)
for word in words:
print(
"......Word '{}' has a confidence of {}".format(
word.content, word.confidence
)
)
for selection_mark in page.selection_marks:
print(
"...Selection mark is '{}' within bounding box '{}' and has a confidence of {}".format(
selection_mark.state,
format_polygon(selection_mark.polygon),
selection_mark.confidence,
)
)
for table_idx, table in enumerate(result.tables):
print(
"Table # {} has {} rows and {} columns".format(
table_idx, table.row_count, table.column_count
)
)
for region in table.bounding_regions:
print(
"Table # {} location on page: {} is {}".format(
table_idx,
region.page_number,
format_polygon(region.polygon),
)
)
for cell in table.cells:
print(
"...Cell[{}][{}] has content '{}'".format(
cell.row_index,
cell.column_index,
cell.content,
)
)
for region in cell.bounding_regions:
print(
"...content on page {} is within bounding box '{}'".format(
region.page_number,
format_polygon(region.polygon),
)
)
print("----------------------------------------")
if __name__ == "__main__":
analyze_layout()
Upvotes: 2
Views: 2348
Reputation: 91
I realise this only answer a part of your question but it may help a bit : Here is a fonction to get the tables and store them in a list of dataframe:
def get_tables(result):
result_dict = result.to_dict()
all_tables = []
for idx, atable in enumerate(result_dict["tables"]):
l = list()
row_count = atable["row_count"]
column_count = atable["column_count"]
for aval in atable["cells"]:
l.append(aval["content"])
df = pd.DataFrame(np.array(l).reshape(row_count, column_count))
df.columns = df.iloc[0]
df = df.drop(df.index[0])
all_tables.append(df)
return all_tables
You get the whole text content by using result.content
If anyone as a quick way to get the content paragraph per paragraph I very interested :)
Upvotes: 0