Reputation: 5
I'm currently facing the same issue with the synonym analyzer.
Here is the behavior I'd like to achieve:
For the moment, both queries below return all the results containing either "cable" or "cordon".
Query #1 with "cable":
{
"query": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"simple_query_string": {
"query": "cable",
"analyzer": "french",
"fields": [
"label"
]
}
}
]
}
}
]
}
},
"sort": {
"_score": {
"order": "desc"
}
}
}
Query #2 with "cordon":
{
"query": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"simple_query_string": {
"query": "cordon",
"analyzer": "french",
"fields": [
"label"
]
}
}
]
}
}
]
}
},
"sort": {
"_score": {
"order": "desc"
}
}
}
The index settings:
{
"lifecycle": {
"name": "pdb-policy"
},
"routing": {
"allocation": {
"include": {
"_tier_preference": "data_content"
}
}
},
"mapping": {
"total_fields": {
"limit": "10000"
}
},
"number_of_shards": "1",
"provided_name": "products2",
"creation_date": "1719059028223",
"analysis": {
"filter": {
"alphanumeric": {
"pattern": "[^a-zA-Z0-9]",
"type": "pattern_replace",
"replacement": ""
},
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"limit1term": {
"type": "limit",
"max_token_count": "1"
},
"synonym_monodirectional": {
"type": "synonym",
"synonyms_path": "synonyms\/words.txt",
"expand": "false"
},
"french_elision": {
"type": "elision",
"articles": [
"l",
"m",
"t",
"qu",
"n",
"s",
"j",
"d",
"c",
"jusqu",
"quoiqu",
"lorsqu",
"puisqu"
]
},
"limit3term": {
"type": "limit",
"max_token_count": "3"
},
"french_stemmer": {
"name": "french",
"type": "stemmer"
}
},
"normalizer": {
"transliterator": {
"filter": [
"asciifolding",
"lowercase"
]
}
},
"analyzer": {
"space_addition": {
"filter": [
"asciifolding",
"french_elision",
"lowercase",
"synonym_monodirectional",
"french_stop"
],
"tokenizer": "standard"
},
"prefix1term": {
"filter": [
"asciifolding",
"french_elision",
"lowercase",
"synonym_monodirectional",
"french_stop",
"limit1term"
],
"tokenizer": "standard"
},
"space_removal": {
"filter": [
"asciifolding",
"french_elision",
"lowercase",
"synonym_monodirectional",
"french_stop"
],
"tokenizer": "standard"
},
"french": {
"filter": [
"synonym_monodirectional"
],
"tokenizer": "standard"
},
"prefix3term": {
"filter": [
"asciifolding",
"french_elision",
"lowercase",
"synonym_monodirectional",
"french_stop",
"limit3term"
],
"tokenizer": "standard"
},
"transliterator": {
"filter": [
"asciifolding",
"lowercase"
],
"tokenizer": "keyword"
}
}
},
"priority": "100",
"number_of_replicas": "0",
"uuid": "ePDgepPcTm-NLcaePn6f3Q",
"version": {
"created": "7170999"
}
}
The synonyms file:
cable => cordon
The analyzers seem to be correct too:
{
"tokens": [
{
"token": "cordon",
"start_offset": 0,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 0
}
]
}
{
"tokens": [
{
"token": "cordon",
"start_offset": 0,
"end_offset": 5,
"type": "SYNONYM",
"position": 0
}
]
}
Do you have any idea? Thank you for your help.
Upvotes: 0
Views: 58
Reputation: 30163
There are two issues here. The first issue is that your "french" analyzer is basically just a standard tokenizer with synonyms. It will not even lowercase tokens. See https://stackoverflow.com/a/78634671/783043 for a detailed discussion.
The second issue is that during search we need to expand "cable" into both "cable" and "cordon" instead of just "cordon". There are several ways to do it depending on the types of synonyms, but the simplest way would probably be to just add the original token to the mapping. If we combine the solutions for both issues, we get something like this:
DELETE test
PUT test
{
"mappings": {
"properties": {
"label": {
"type": "text",
"analyzer": "french_index",
"search_analyzer": "french_search"
}
}
},
"settings": {
"analysis": {
"filter": {
"french_elision": {
"type": "elision",
"articles_case": true,
"articles": [
"l", "m", "t", "qu", "n", "s",
"j", "d", "c", "jusqu", "quoiqu",
"lorsqu", "puisqu"
]
},
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"french_stemmer": {
"type": "stemmer",
"language": "light_french"
},
"synonym_monodirectional": {
"type": "synonym",
"synonyms": ["cable => cable,cordon"],
"expand": "true"
}
},
"analyzer": {
"french_search": {
"tokenizer": "standard",
"filter": [
"french_elision",
"lowercase",
"french_stop",
"synonym_monodirectional",
"french_stemmer"
]
},
"french_index": {
"tokenizer": "standard",
"filter": [
"french_elision",
"lowercase",
"french_stop",
"french_stemmer"
]
}
}
}
}
}
POST test/_bulk?refresh=true
{ "index": { "_id": "1" } }
{ "label": "This is a cable."}
{ "index": { "_id": "2" } }
{ "label": "C'est un cordon."}
POST test/_search
{
"query": {
"simple_query_string": {
"query": "cable",
"fields": [
"label"
]
}
}
}
POST test/_search
{
"query": {
"simple_query_string": {
"query": "cordon",
"fields": [
"label"
]
}
}
}
Upvotes: 0