Reputation: 95
I am new to the search functionality on Azure, and I am struggling to make my chunks filterable. This is my situation: I have a blob container with hundreds of PDFs in different folders. Each blob has a set of metadata columns and values stored on it. To be able to search them semantically, I set up vector search out of the box in the Azure portal by clicking the "Import and vectorize" button. This sets up the skillset for chunking, the indexer, and the index. With this, I can search my data and filter it just fine. The issue is that the metadata from the parent document is not stored in each chunk, so I cannot filter the chunks when running a vector search.
I cannot for the life of me figure out how to map the parent metadata to each chunk! Can anyone help me with this? And is there a better way of doing it? Such as filtering directly on the parent documents? I am looking for the absolute easiest implementation here.
Thanks in advance.
I have tried a few variations of indexers, splitters, etc., but mainly I have defined the metadata columns in the index in exactly the same way as "title" and then tried to map the name "metadata_prosjektnummer" to "prosjektnummer" in the indexer (I don't know if this is the correct naming convention; the metadata on the PDF blob is simply called "prosjektnummer"). This always yields null for some reason. I click the "Run" button on the indexer to debug, but it doesn't always seem to do any indexing; I get 0/0. If this is the culprit, how would I rerun the indexer when I change the JSON?
Below are my definitions. Disregard the resource names, as I have tried to anonymize them. The metadata field names are unchanged, though.
Indexer:
{
"@odata.context": "https://documentsearch.search.windows.net/$metadata#indexers/$entity",
"@odata.etag": "...",
"name": "name",
"description": null,
"dataSourceName": "datasourcename",
"skillsetName": "skillsetname",
"targetIndexName": "indexname",
"disabled": null,
"schedule": null,
"parameters": {
"batchSize": null,
"maxFailedItems": null,
"maxFailedItemsPerBatch": null,
"base64EncodeKeys": null,
"configuration": {
"dataToExtract": "contentAndMetadata",
"parsingMode": "default"
}
},
"fieldMappings": [
{
"sourceFieldName": "metadata_storage_name",
"targetFieldName": "title",
"mappingFunction": null
},
{
"sourceFieldName": "prosjektnummer",
"targetFieldName": "prosjektnummer",
"mappingFunction": null
}
],
"outputFieldMappings": [],
"cache": null,
"encryptionKey": null
}
Index:
{
"@odata.context": "https://documentsearch.search.windows.net/$metadata#indexes/$entity",
"@odata.etag": "",
"name": "name",
"defaultScoringProfile": null,
"fields": [
{
"name": "chunk_id",
"type": "Edm.String",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": true,
"facetable": true,
"key": true,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "keyword",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "parent_id",
"type": "Edm.String",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": true,
"facetable": true,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "chunk",
"type": "Edm.String",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "title",
"type": "Edm.String",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "prosjektnummer",
"type": "Edm.String",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"synonymMaps": []
},
{
"name": "vector",
"type": "Collection(Edm.Single)",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": 1536,
"vectorSearchProfile": "vectorprofilename",
"synonymMaps": []
}
],
"scoringProfiles": [],
"corsOptions": null,
"suggesters": [],
"analyzers": [],
"normalizers": [],
"tokenizers": [],
"tokenFilters": [],
"charFilters": [],
"encryptionKey": null,
"similarity": {
"@odata.type": "#Microsoft.Azure.Search.BM25Similarity",
"k1": null,
"b": null
},
"semantic": null,
"vectorSearch": {
"algorithms": [
{
"name": "vector algorithm",
"kind": "hnsw",
"hnswParameters": {
"metric": "cosine",
"m": 4,
"efConstruction": 400,
"efSearch": 500
},
"exhaustiveKnnParameters": null
}
],
"profiles": [
{
"name": "vectorprofilename",
"algorithm": "vectoralgorithmname",
"vectorizer": "vector-vectorizer",
"compression": null
}
],
"vectorizers": [
{
"name": "vector-vectorizer",
"kind": "azureOpenAI",
"azureOpenAIParameters": {
"resourceUri": "https://documentsearch-oai.openai.azure.com",
"deploymentId": "document-embedding",
"apiKey": "<redacted>",
"authIdentity": null
},
"customWebApiParameters": null
}
],
"compressions": []
}
}
Skillset:
{
"@odata.context": "https://documentsearch.search.windows.net/$metadata#skillsets/$entity",
"@odata.etag": "",
"name": "vector-skillset",
"description": "Skillset to chunk documents and generate embeddings",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
"name": "#1",
"description": null,
"context": "/document/pages/*",
"resourceUri": "https://.openai.azure.com",
"apiKey": "<redacted>",
"deploymentId": "tool-embedding",
"inputs": [
{
"name": "text",
"source": "/document/pages/*"
}
],
"outputs": [
{
"name": "embedding",
"targetName": "vector"
}
],
"authIdentity": null
},
{
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
"name": "#2",
"description": "Split skill to chunk documents",
"context": "/document",
"defaultLanguageCode": "en",
"textSplitMode": "pages",
"maximumPageLength": 2000,
"pageOverlapLength": 500,
"maximumPagesToTake": 0,
"inputs": [
{
"name": "text",
"source": "/document/content"
}
],
"outputs": [
{
"name": "textItems",
"targetName": "pages"
}
]
}
],
"cognitiveServices": null,
"knowledgeStore": null,
"indexProjections": {
"selectors": [
{
"targetIndexName": "vector-menonrag",
"parentKeyFieldName": "parent_id",
"sourceContext": "/document/pages/*",
"mappings": [
{
"name": "chunk",
"source": "/document/pages/*",
"sourceContext": null,
"inputs": []
},
{
"name": "vector",
"source": "/document/pages/*/vector",
"sourceContext": null,
"inputs": []
},
{
"name": "title",
"source": "/document/metadata_storage_name",
"sourceContext": null,
"inputs": []
}
]
}
],
"parameters": {
"projectionMode": "skipIndexingParentDocuments"
}
},
"encryptionKey": null
}
Upvotes: 0
Views: 871
Reputation: 95
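I don't have a sleek solution to this, but I solved it by making sure each metadata column is mapped to each chunk in the skillset's index projections, i.e. by adding one entry like this per metadata column to the "mappings" array: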
{
"name": "prosjektnavn",
"source": "/document/prosjektnavn",
"sourceContext": null,
"inputs": []
}
I also added all of these fields to the index, of course.
Upvotes: 0