Matteo Doni

Reputation: 11

How to sequence OCR, Image Analysis, Merge, and Embedding skills in an Azure AI Search skillset?

I am writing an Azure AI Search skillset that uses Microsoft.Skills.Vision.OcrSkill, Microsoft.Skills.Vision.ImageAnalysisSkill, Microsoft.Skills.Text.SplitSkill, and Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill, and I am not sure of the correct order in which to chain them. In my skillset I first run OcrSkill and ImageAnalysisSkill on the normalized images, then use Microsoft.Skills.Text.MergeSkill twice to merge their outputs with the document content. After that I chunk the merged text with SplitSkill, and finally I run AzureOpenAIEmbeddingSkill on each chunk. Is this logic correct? Here is my skillset definition (a small sketch of how I reasoned about the dependency order follows it):

{
  "@odata.etag": "\"mioetag\"",
  "name": "mioskillset",
  "skills": [
    {
      "@odata.type": "#Microsoft.Skills.Util.DocumentExtractionSkill",
      "name": "#1",
      "description": "It extracts text and metadata from documents and applies OCR to images",
      "context": "/document",
      "parsingMode": "default",
      "dataToExtract": "contentAndMetadata",
      "inputs": [
        {
          "name": "file_data",
          "source": "/document/file_data",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "content",
          "targetName": "extracted_text"
        },
        {
          "name": "normalized_images",
          "targetName": "extracted_normalized_images"
        }
      ],
      "configuration": {
        "imageAction": "generateNormalizedImages",
        "[email protected]": "#Int64",
        "normalizedImageMaxWidth": 2000,
        "[email protected]": "#Int64",
        "normalizedImageMaxHeight": 2000
      }
    },
    {
      "description": "Extract text (plain and structured) from image.",
      "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
      "context": "/document/extracted_normalized_images/*",
      "defaultLanguageCode": "en",
      "detectOrientation": true,
      "inputs": [
        {
          "name": "image",
          "source": "/document/extracted_normalized_images/*"
        }
      ],
      "outputs": [
        {
          "name": "text"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
      "description": "Create merged_text, which includes all the textual representation of each image inserted at the right location in the content field.",
      "context": "/document",
      "insertPreTag": " ",
      "insertPostTag": " ",
      "inputs": [
        {
          "name": "text",
          "source": "/document/content"
        },
        {
          "name": "itemsToInsert",
          "source": "/document/extracted_normalized_images/*/text"
        },
        {
          "name": "offsets",
          "source": "/document/extracted_normalized_images/*/contentOffset"
        }
      ],
      "outputs": [
        {
          "name": "mergedText",
          "targetName": "merged_text"
        },
        {
          "name": "mergedOffsets",
          "targetName": "first_mergedOffsets"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Vision.ImageAnalysisSkill",
      "context": "/document/extracted_normalized_images/*",
      "visualFeatures": [
        "tags",
        "description"
      ],
      "inputs": [
        {
          "name": "image",
          "source": "/document/extracted_normalized_images/*"
        }
      ],
      "outputs": [
        {
          "name": "adult"
        },
        {
          "name": "brands"
        },
        {
          "name": "categories"
        },
        {
          "name": "description"
        },
        {
          "name": "faces"
        },
        {
          "name": "objects"
        },
        {
          "name": "tags"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
      "description": "Create merged_text, which includes all the textual representation of each image inserted at the right location in the content field.",
      "context": "/document",
      "insertPreTag": " ",
      "insertPostTag": " ",
      "inputs": [
        {
          "name": "text",
          "source": "/document/merged_test"
        },
        {
          "name": "itemsToInsert",
          "source": "/document/extracted_normalized_images/*/description"
        },
        {
          "name": "offsets",
          "source": "/document/first_mergedOffsets"
        }
      ],
      "outputs": [
        {
          "name": "mergedText",
          "targetName": "final_merged_text"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
      "name": "#2",
      "description": "It splits the text into overlapping segments for vectorization, with maximumPageLength set for Ada and textSplitMode configured to avoid breaking pages",
      "context": "/document/merged_text",
      "defaultLanguageCode": "en",
      "textSplitMode": "pages",
      "maximumPageLength": 3000,
      "pageOverlapLength": 100,
      "maximumPagesToTake": 0,
      "unit": "characters",
      "inputs": [
        {
          "name": "text",
          "source": "/document/final_merged_text",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "textItems",
          "targetName": "pages"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
      "name": "#3",
      "description": "It vectorizes the text for semantic search",
      "context": "/document/merged_text/pages/*",
      "resourceUri": "https://mioservizio.openai.azure.com",
      "apiKey": "<redacted>",
      "deploymentId": "text-embedding-ada-002",
      "dimensions": 1536,
      "modelName": "text-embedding-ada-002",
      "inputs": [
        {
          "name": "text",
          "source": "/document/merged_text/pages/*",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "embedding",
          "targetName": "embeddingVec"
        }
      ]
    }
  ],
  "indexProjections": {
    "selectors": [
      {
        "targetIndexName": "mioindice",
        "parentKeyFieldName": "parent_id",
        "sourceContext": "/document/merged_text/pages/*",
        "mappings": [
          {
            "name": "content",
            "source": "/document/merged_text/pages/*",
            "inputs": []
          },
          {
            "name": "contentVector",
            "source": "/document/merged_text/pages/*/embeddingVec",
            "inputs": []
          },
          {
            "name": "title",
            "source": "/document/metadata_storage_name",
            "inputs": []
          },
          {
            "name": "url",
            "source": "/document/metadata_storage_path",
            "inputs": []
          },
          {
            "name": "filepath",
            "source": "/document/metadata_storage_path",
            "inputs": []
          },
          {
            "name": "timestamp",
            "source": "/document/metadata_storage_last_modified",
            "inputs": []
          },
          {
            "name": "chat_id",
            "source": "/document/chat_id",
            "inputs": []
          },
          {
            "name": "sas_token",
            "source": "/document/sas_token",
            "inputs": [ ]
          }
        ]
      }
    ],
    "parameters": {
      "projectionMode": "skipIndexingParentDocuments"
    }
  }
}
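
To double-check my reasoning about the order, I also wrote a small Python sketch (not part of the skillset; the skill names are labels of my own and the paths are simplified from the JSON above) that topologically sorts the skills by which output paths feed which input paths:

# Sanity check of the skill ordering: each skill is listed with the paths it
# reads and the paths it writes (simplified: the "/*" segments are dropped).
from graphlib import TopologicalSorter

skills = {
    "DocumentExtractionSkill": (
        ["/document/file_data"],
        ["/document/extracted_text", "/document/extracted_normalized_images"],
    ),
    "OcrSkill": (
        ["/document/extracted_normalized_images"],
        ["/document/extracted_normalized_images/text"],
    ),
    "MergeSkill_1": (
        ["/document/content", "/document/extracted_normalized_images/text"],
        ["/document/merged_text", "/document/first_mergedOffsets"],
    ),
    "ImageAnalysisSkill": (
        ["/document/extracted_normalized_images"],
        ["/document/extracted_normalized_images/description"],
    ),
    "MergeSkill_2": (
        ["/document/merged_text",
         "/document/extracted_normalized_images/description",
         "/document/first_mergedOffsets"],
        ["/document/final_merged_text"],
    ),
    "SplitSkill": (
        ["/document/final_merged_text"],
        ["/document/final_merged_text/pages"],
    ),
    "AzureOpenAIEmbeddingSkill": (
        ["/document/final_merged_text/pages"],
        ["/document/final_merged_text/pages/embeddingVec"],
    ),
}

# Map each output path to the skill that produces it; inputs with no producer
# (e.g. /document/content, which comes from document cracking) are roots.
producer = {out: name for name, (_, outs) in skills.items() for out in outs}

# A skill depends on the producers of its inputs.
graph = {
    name: {producer[path] for path in inputs if path in producer}
    for name, (inputs, _) in skills.items()
}

print(list(TopologicalSorter(graph).static_order()))
# e.g. ['DocumentExtractionSkill', 'OcrSkill', 'ImageAnalysisSkill',
#       'MergeSkill_1', 'MergeSkill_2', 'SplitSkill', 'AzureOpenAIEmbeddingSkill']

The order it prints matches the sequence I described above, so I believe the dependencies at least resolve without cycles; my question is whether this is the recommended way to chain these skills.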

I appreciate any suggestions.

Upvotes: 0

Views: 25

Answers (0)
