Reputation: 33
I have implemented a system where users can upload files via a UI file upload screen. These files are then stored in Azure Blob Storage and indexed into Azure Cognitive Search. Later, users can query this indexed content through a UI chat screen.
The process involves the following steps:
File Upload: Files are uploaded and indexed. Query Handling: When a user enters a question, Azure Cognitive Search is used to find related content. Content Processing: The question and the related content are sent to OpenAI ChatGPT for further processing.
We encounter an issue when users ask for a summary or overview of the file content (e.g., "summarize this file," "provide an overview," or "give an abstract").
The problem is that specific terms like "summarize," "overview," or "abstract" might not be present in the indexed content. As a result, Azure Cognitive Search cannot generate any related content for these types of queries, leading to challenges in processing the user's request effectively.
what I need to include in index, indexer or skillset etc for this problem.
------------------------------------
Skillset
----------------------------------
{
"@odata.context": "https://../$metadata#skillsets/$entity",
"@odata.etag": "\"0x8DC9C1A82415149\"",
"name": "skillset1719988983064",
"description": "",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Text.SplitSkill",
"name": "#1",
"description": null,
"context": "/document/content",
"defaultLanguageCode": "en",
"textSplitMode": "pages",
"maximumPageLength": 5000,
"pageOverlapLength": 0,
"maximumPagesToTake": 0,
"inputs": [
{
"name": "text",
"source": "/document/content"
}
],
"outputs": [
{
"name": "textItems",
"targetName": "pages"
}
]
},
{
"@odata.type": "#Microsoft.Skills.Text.KeyPhraseExtractionSkill",
"name": "#2",
"description": null,
"context": "/document/content/pages/*",
"defaultLanguageCode": "en",
"maxKeyPhraseCount": null,
"modelVersion": null,
"inputs": [
{
"name": "text",
"source": "/document/content/pages/*"
}
],
"outputs": [
{
"name": "keyPhrases",
"targetName": "keyphrases"
}
]
},
{
"@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
"name": "#3",
"description": "Extracts text from images",
"context": "/document/normalized_images/*",
"textExtractionAlgorithm": "Printed",
"lineEnding": "Space",
"defaultLanguageCode": "en",
"detectOrientation": true,
"inputs": [
{
"name": "image",
"source": "/document/normalized_images/*"
}
],
"outputs": [
{
"name": "text",
"targetName": "ocr_text"
}
]
}
],
"cognitiveServices": {
"@odata.type": "#Microsoft.Azure.Search.DefaultCognitiveServices",
"description": null
},
"knowledgeStore": null,
"indexProjections": null,
"encryptionKey": null
}
----------------------------------
Indexer
----------------------------
{
"@odata.context": "https:...net/$metadata#indexers/$entity",
"@odata.etag": "\"0x8DCA085E114529C\"",
"name": "indexer1720577168464",
"description": null,
"dataSourceName": "azureaidatasource",
"skillsetName": "skillset1719988983064",
"targetIndexName": "aisearchindex",
"disabled": null,
"schedule": null,
"parameters": {
"batchSize": null,
"maxFailedItems": null,
"maxFailedItemsPerBatch": null,
"base64EncodeKeys": null,
"configuration": {
"allowSkillsetToReadFileData": true,
"dataToExtract": "contentAndMetadata",
"imageAction": "generateNormalizedImages"
}
},
"fieldMappings": [],
"outputFieldMappings": [
{
"sourceFieldName": "/document/normalized_images/*/ocr_text",
"targetFieldName": "ocr_text"
},
{
"sourceFieldName": "/document/normalized_images/*/image_description",
"targetFieldName": "image_description"
},
{
"sourceFieldName": "/document/normalized_images/*/image_tags",
"targetFieldName": "image_tags"
}
],
"cache": null,
"encryptionKey": null
}
------------------------------
index
-------------------------------
{
"@odata.context": "https://../$metadata#indexes/$entity",
"@odata.etag": "\"0x8DCA08363DB521C\"",
"name": "aisearchindex",
"defaultScoringProfile": "",
"fields": [
{
"name": "id",
"type": "Edm.String",
"searchable": true,
"filterable": true,
"retrievable": true,
"stored": true,
"sortable": true,
"facetable": true,
"key": true,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "content",
"type": "Edm.String",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "standard.lucene",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "metadata_storage_content_type",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "metadata_storage_size",
"type": "Edm.Int64",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "metadata_storage_last_modified",
"type": "Edm.DateTimeOffset",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "metadata_storage_content_md5",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "metadata_storage_name",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "metadata_storage_path",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "metadata_storage_file_extension",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "metadata_content_type",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "metadata_language",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "metadata_author",
"type": "Edm.String",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "metadata_creation_date",
"type": "Edm.DateTimeOffset",
"searchable": false,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "keyphrases",
"type": "Collection(Edm.String)",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": "standard.lucene",
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "ocr_text",
"type": "Edm.String",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "image_description",
"type": "Edm.String",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
},
{
"name": "image_tags",
"type": "Collection(Edm.String)",
"searchable": true,
"filterable": false,
"retrievable": true,
"stored": true,
"sortable": false,
"facetable": false,
"key": false,
"indexAnalyzer": null,
"searchAnalyzer": null,
"analyzer": null,
"normalizer": null,
"dimensions": null,
"vectorSearchProfile": null,
"vectorEncoding": null,
"synonymMaps": []
}
],
"scoringProfiles": [],
"corsOptions": null,
"suggesters": [],
"analyzers": [],
"normalizers": [],
"tokenizers": [],
"tokenFilters": [],
"charFilters": [],
"encryptionKey": null,
"similarity": {
"@odata.type": "#Microsoft.Azure.Search.BM25Similarity",
"k1": null,
"b": null
},
"semantic": null,
"vectorSearch": null
}
Upvotes: 0
Views: 99
Reputation: 8085
You have to give some context around the search query to match with the content, at least a word should be given in the search query.
Below the sample pdf data I have used in index.
'
And the query i given in open ai chat.
summarize the document with the user details like Beneficiary name, Member ID, Employee code etc.
Output i got.
So, create a search query in such a way that it matches any word in Ai search index.
Upvotes: 0