Reputation: 1050
I am trying to implement a simple demo with Elasticsearch for Chinese text, but I have run into a problem with the relevance of the search results.
I created a new index with the mapping:
{
"tag": {
"mappings": {
"tag": {
"properties": {
"name": {
"type": "text",
"analyzer": "standard"
},
"note": {
"type": "text",
"analyzer": "standard"
},
"status": {
"type": "integer"
},
"synonyms": {
"type": "text",
"analyzer": "standard"
}
}
}
}
}
}
This is the request body, with the query "美国":
{
"query" : {
"bool" : {
"must" : {
"multi_match" : {
"query" : "美国",
"fields" : [ "name", "synonyms" ]
}
},
"filter" : {
"term" : {
"status" : 2
}
}
}
}
}
Two records, "中国" and "美国", match the query, but the record "中国" gets the higher score. The response JSON is as follows:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0.7373906,
"hits": [ {
"_index": "tag",
"_type": "tag",
"_id": "5482361185636870",
"_score": 0.7373906,
"_source": {
"status": 2,
"name": "中国",
"note": "",
"synonyms": []
}
}, {
"_index": "tag",
"_type": "tag",
"_id": "5474649504748034",
"_score": 0.53484553,
"_source": {
"status": 2,
"name": "美国",
"note": "",
"synonyms": []
}
} ]
}
}
The record of "中国" got 0.7373906 but the record of "美国" only got 0.53484553.
Here are the results with explain enabled:
{
"hits": [
{
"_shard": "[tag][0]",
"_node": "Wh9qH0bcTAaVNrsP1Aiyxg",
"_index": "tag",
"_type": "tag",
"_id": "5482361185636870",
"_score": 0.7373906,
"_source": {
"status": 2,
"name": "中国",
"note": "",
"synonyms": []
},
"_explanation": {
"value": 0.73739064,
"description": "sum of:",
"details": [
{
"value": 0.73739064,
"description": "sum of:",
"details": [
{
"value": 0.73739064,
"description": "max of:",
"details": [
{
"value": 0.73739064,
"description": "sum of:",
"details": [
{
"value": 0.73739064,
"description": "weight(name:国 in 0) [PerFieldSimilarity], result of:",
"details": [
{
"value": 0.73739064,
"description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
"details": [
{
"value": 0.6931472,
"description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
"details": [
{
"value": 1,
"description": "docFreq",
"details": []
},
{
"value": 2,
"description": "docCount",
"details": []
}
]
},
{
"value": 1.0638298,
"description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
"details": [
{
"value": 1,
"description": "termFreq=1.0",
"details": []
},
{
"value": 1.2,
"description": "parameter k1",
"details": []
},
{
"value": 0.75,
"description": "parameter b",
"details": []
},
{
"value": 3,
"description": "avgFieldLength",
"details": []
},
{
"value": 2.56,
"description": "fieldLength",
"details": []
}
]
}
]
}
]
}
]
}
]
},
{
"value": 0,
"description": "match on required clause, product of:",
"details": [
{
"value": 0,
"description": "# clause",
"details": []
},
{
"value": 1,
"description": "status:[2 TO 2], product of:",
"details": [
{
"value": 1,
"description": "boost",
"details": []
},
{
"value": 1,
"description": "queryNorm",
"details": []
}
]
}
]
}
]
},
{
"value": 0,
"description": "match on required clause, product of:",
"details": [
{
"value": 0,
"description": "# clause",
"details": []
},
{
"value": 1,
"description": "*:*, product of:",
"details": [
{
"value": 1,
"description": "boost",
"details": []
},
{
"value": 1,
"description": "queryNorm",
"details": []
}
]
}
]
}
]
}
},
{
"_shard": "[tag][4]",
"_node": "Wh9qH0bcTAaVNrsP1Aiyxg",
"_index": "tag",
"_type": "tag",
"_id": "5474649504748034",
"_score": 0.51623213,
"_source": {
"status": 2,
"name": "美国",
"note": "",
"synonyms": []
},
"_explanation": {
"value": 0.51623213,
"description": "sum of:",
"details": [
{
"value": 0.51623213,
"description": "sum of:",
"details": [
{
"value": 0.51623213,
"description": "max of:",
"details": [
{
"value": 0.51623213,
"description": "sum of:",
"details": [
{
"value": 0.25811607,
"description": "weight(name:美 in 0) [PerFieldSimilarity], result of:",
"details": [
{
"value": 0.25811607,
"description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
"details": [
{
"value": 0.2876821,
"description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
"details": [
{
"value": 1,
"description": "docFreq",
"details": []
},
{
"value": 1,
"description": "docCount",
"details": []
}
]
},
{
"value": 0.89722675,
"description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
"details": [
{
"value": 1,
"description": "termFreq=1.0",
"details": []
},
{
"value": 1.2,
"description": "parameter k1",
"details": []
},
{
"value": 0.75,
"description": "parameter b",
"details": []
},
{
"value": 2,
"description": "avgFieldLength",
"details": []
},
{
"value": 2.56,
"description": "fieldLength",
"details": []
}
]
}
]
}
]
},
{
"value": 0.25811607,
"description": "weight(name:国 in 0) [PerFieldSimilarity], result of:",
"details": [
{
"value": 0.25811607,
"description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
"details": [
{
"value": 0.2876821,
"description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
"details": [
{
"value": 1,
"description": "docFreq",
"details": []
},
{
"value": 1,
"description": "docCount",
"details": []
}
]
},
{
"value": 0.89722675,
"description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
"details": [
{
"value": 1,
"description": "termFreq=1.0",
"details": []
},
{
"value": 1.2,
"description": "parameter k1",
"details": []
},
{
"value": 0.75,
"description": "parameter b",
"details": []
},
{
"value": 2,
"description": "avgFieldLength",
"details": []
},
{
"value": 2.56,
"description": "fieldLength",
"details": []
}
]
}
]
}
]
}
]
}
]
},
{
"value": 0,
"description": "match on required clause, product of:",
"details": [
{
"value": 0,
"description": "# clause",
"details": []
},
{
"value": 1,
"description": "status:[2 TO 2], product of:",
"details": [
{
"value": 1,
"description": "boost",
"details": []
},
{
"value": 1,
"description": "queryNorm",
"details": []
}
]
}
]
}
]
},
{
"value": 0,
"description": "match on required clause, product of:",
"details": [
{
"value": 0,
"description": "# clause",
"details": []
},
{
"value": 1,
"description": "*:*, product of:",
"details": [
{
"value": 1,
"description": "boost",
"details": []
},
{
"value": 1,
"description": "queryNorm",
"details": []
}
]
}
]
}
]
}
}
]
}
Upvotes: 1
Views: 182
Reputation: 3957
It seems that your index contains only a few documents and that they fall into different shards. Each shard has its own term frequencies, and by default Elasticsearch scores with these shard-local values (in your explain output, docCount is 2 on shard 0 but only 1 on shard 4, so the idf differs). You can change this behaviour by specifying the search_type=dfs_query_then_fetch
query-string parameter, or by adding the corresponding body field like this:
{
"search_type": "dfs_query_then_fetch",
"query": {
"bool": {
"must": {
"multi_match": {
"query": "美国",
"fields": [
"name",
"synonyms"
]
}
},
"filter": {
"term": {
"status": 2
}
}
}
}
}
For more detail, take a look at this article: https://www.elastic.co/blog/understanding-query-then-fetch-vs-dfs-query-then-fetch
Upvotes: 3