How do I filter chunked pdfs in Azure AI vector search on parent metadata?

I am new to using the search functionality on Azure and I am struggling a lot to make my chunks filterable. This is my situation: I have a blob with hundreds of pdfs in different folders. Each blob has a list of metadata columns and values stored in them. To be able to search them semantically, I have applied a vector search out of the box using the Azure portal and clicking the "Import and vectorize"-button. This sets up the skillset for chunking, the indexer, and creates an index. From this, I can search my data, and filter it just fine. The issue is that the metadata from the parent document is not stored in each chunk, so I cannot filter the chunks when running a vector search.

I cannot for the life of me figure out how to map the parent metadata to each chunk! Can anyone help me with this? And is there a better way of doing it? Such as filtering directly on the parent documents? I am looking for the absolute easiest implementation here.

Thanks in advance.

I have tried a few variations of indexers, splitters etc., but mainly defining the metadata columns in the index in the exact same way as "title", and then trying to map the name "metadata_prosjektnummer" to "prosjektnummer" in the indexer (I don't know if this is the correct naming convention; the metadata in the blob-pdf is simply called "prosjektnummer"). This always yields null for some reason. I click the "run" button on the indexer to debug, but it doesn't always seem to do any indexing, as I get 0/0. If this is the culprit, how would I rerun the indexer when I change the JSON?

Below are my definitions. Disregard the references to names as I have tried to anonymize. Metadata remains the same though.

Indexer:

{
  "@odata.context": "https://documentsearch.search.windows.net/$metadata#indexers/$entity",
  "@odata.etag": "...",
  "name": "name",
  "description": null,
  "dataSourceName": "datasourcename",
  "skillsetName": "skillsetname",
  "targetIndexName": "indexname",
  "disabled": null,
  "schedule": null,
  "parameters": {
    "batchSize": null,
    "maxFailedItems": null,
    "maxFailedItemsPerBatch": null,
    "base64EncodeKeys": null,
    "configuration": {
      "dataToExtract": "contentAndMetadata",
      "parsingMode": "default"
    }
  },
  "fieldMappings": [
    {
      "sourceFieldName": "metadata_storage_name",
      "targetFieldName": "title",
      "mappingFunction": null
    },
    {
      "sourceFieldName": "prosjektnummer",
      "targetFieldName": "prosjektnummer",
      "mappingFunction": null
    }
  ],
  "outputFieldMappings": [],
  "cache": null,
  "encryptionKey": null
}

Index:

{
  "@odata.context": "https://documentsearch.search.windows.net/$metadata#indexes/$entity",
  "@odata.etag": "",
  "name": "name",
  "defaultScoringProfile": null,
  "fields": [
    {
      "name": "chunk_id",
      "type": "Edm.String",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": true,
      "facetable": true,
      "key": true,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": "keyword",
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "parent_id",
      "type": "Edm.String",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": true,
      "facetable": true,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "chunk",
      "type": "Edm.String",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "title",
      "type": "Edm.String",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "prosjektnummer",
      "type": "Edm.String",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "vector",
      "type": "Collection(Edm.Single)",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": 1536,
      "vectorSearchProfile": "vectorprofilename",
      "synonymMaps": []
    }
  ],
  "scoringProfiles": [],
  "corsOptions": null,
  "suggesters": [],
  "analyzers": [],
  "normalizers": [],
  "tokenizers": [],
  "tokenFilters": [],
  "charFilters": [],
  "encryptionKey": null,
  "similarity": {
    "@odata.type": "#Microsoft.Azure.Search.BM25Similarity",
    "k1": null,
    "b": null
  },
  "semantic": null,
  "vectorSearch": {
    "algorithms": [
      {
        "name": "vector algorithm",
        "kind": "hnsw",
        "hnswParameters": {
          "metric": "cosine",
          "m": 4,
          "efConstruction": 400,
          "efSearch": 500
        },
        "exhaustiveKnnParameters": null
      }
    ],
    "profiles": [
      {
        "name": "vectorprofilename",
        "algorithm": "vectoralgorithmname",
        "vectorizer": "vector-vectorizer",
        "compression": null
      }
    ],
    "vectorizers": [
      {
        "name": "vector-vectorizer",
        "kind": "azureOpenAI",
        "azureOpenAIParameters": {
          "resourceUri": "https://documentsearch-oai.openai.azure.com",
          "deploymentId": "document-embedding",
          "apiKey": "<redacted>",
          "authIdentity": null
        },
        "customWebApiParameters": null
      }
    ],
    "compressions": []
  }
}

Skillset:

{
  "@odata.context": "https://documentsearch.search.windows.net/$metadata#skillsets/$entity",
  "@odata.etag": "",
  "name": "vector-skillset",
  "description": "Skillset to chunk documents and generate embeddings",
  "skills": [
    {
      "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
      "name": "#1",
      "description": null,
      "context": "/document/pages/*",
      "resourceUri": "https://.openai.azure.com",
      "apiKey": "<redacted>",
      "deploymentId": "tool-embedding",
      "inputs": [
        {
          "name": "text",
          "source": "/document/pages/*"
        }
      ],
      "outputs": [
        {
          "name": "embedding",
          "targetName": "vector"
        }
      ],
      "authIdentity": null
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
      "name": "#2",
      "description": "Split skill to chunk documents",
      "context": "/document",
      "defaultLanguageCode": "en",
      "textSplitMode": "pages",
      "maximumPageLength": 2000,
      "pageOverlapLength": 500,
      "maximumPagesToTake": 0,
      "inputs": [
        {
          "name": "text",
          "source": "/document/content"
        }
      ],
      "outputs": [
        {
          "name": "textItems",
          "targetName": "pages"
        }
      ]
    }
  ],
  "cognitiveServices": null,
  "knowledgeStore": null,
  "indexProjections": {
    "selectors": [
      {
        "targetIndexName": "vector-menonrag",
        "parentKeyFieldName": "parent_id",
        "sourceContext": "/document/pages/*",
        "mappings": [
          {
            "name": "chunk",
            "source": "/document/pages/*",
            "sourceContext": null,
            "inputs": []
          },
          {
            "name": "vector",
            "source": "/document/pages/*/vector",
            "sourceContext": null,
            "inputs": []
          },
          {
            "name": "title",
            "source": "/document/metadata_storage_name",
            "sourceContext": null,
            "inputs": []
          }
        ]
      }
    ],
    "parameters": {
      "projectionMode": "skipIndexingParentDocuments"
    }
  },
  "encryptionKey": null
}

Upvotes: 0

Views: 871

Answers (1)

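I don't have a sleek solution to this, but I solved it by making sure each metadata column is mapped to each chunk in the skillset's index projections, i.e. by adding one entry per metadata column to the "mappings" array of the selector: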

{
  "name": "prosjektnavn",
  "source": "/document/prosjektnavn",
  "sourceContext": null,
  "inputs": []
}

And also including all of these in the index of course.

Upvotes: 0

Related Questions