LCB
LCB

Reputation: 1050

Questions about Elasticsearch relevance of search results

I am trying to implement a simple demo with Elasticsearch for Chinese text, but I am running into a problem with the relevance of the search results.

I created a new index with the mapping:

{
    "tag": {
        "mappings": {
            "tag": {
                "properties": {
                    "name": {
                        "type": "text",
                        "analyzer": "standard"
                    },
                    "note": {
                        "type": "text",
                        "analyzer": "standard"
                    },
                    "status": {
                        "type": "integer"
                    },
                    "synonyms": {
                        "type": "text",
                        "analyzer": "standard"
                    }
                }
            }
        }
    }
}

This is the request body, with the query "美国":

{
    "query" : {
         "bool" : {
             "must" : {
                 "multi_match" : {
                     "query" : "美国",
                     "fields" : [ "name", "synonyms" ]
                 }
             },
             "filter" : {
                 "term" : {
                     "status" : 2
                 }
             }
         }
     }
 }

Two records, "中国" and "美国", match the query, but the record "中国" got the higher score. The response JSON is as follows:

{
    "took": 2,
    "timed_out": false,
    "_shards": {
        "total": 5,
        "successful": 5,
        "failed": 0
    },
    "hits": {
        "total": 2,
        "max_score": 0.7373906,
        "hits": [ {
            "_index": "tag",
            "_type": "tag",
            "_id": "5482361185636870",
            "_score": 0.7373906,
            "_source": {
                "status": 2,
                "name": "中国",
                "note": "",
                "synonyms": []
            }
        }, {
            "_index": "tag",
            "_type": "tag",
            "_id": "5474649504748034",
            "_score": 0.53484553,
            "_source": {
                "status": 2,
                "name": "美国",
                "note": "",
                "synonyms": []
            }
        } ]
    }
}

The record of "中国" got 0.7373906 but the record of "美国" only got 0.53484553.

The results with explain:

{
  "hits": [
    {
      "_shard": "[tag][0]",
      "_node": "Wh9qH0bcTAaVNrsP1Aiyxg",
      "_index": "tag",
      "_type": "tag",
      "_id": "5482361185636870",
      "_score": 0.7373906,
      "_source": {
        "status": 2,
        "name": "中国",
        "note": "",
        "synonyms": []
      },
      "_explanation": {
        "value": 0.73739064,
        "description": "sum of:",
        "details": [
          {
            "value": 0.73739064,
            "description": "sum of:",
            "details": [
              {
                "value": 0.73739064,
                "description": "max of:",
                "details": [
                  {
                    "value": 0.73739064,
                    "description": "sum of:",
                    "details": [
                      {
                        "value": 0.73739064,
                        "description": "weight(name:国 in 0) [PerFieldSimilarity], result of:",
                        "details": [
                          {
                            "value": 0.73739064,
                            "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                            "details": [
                              {
                                "value": 0.6931472,
                                "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "docFreq",
                                    "details": []
                                  },
                                  {
                                    "value": 2,
                                    "description": "docCount",
                                    "details": []
                                  }
                                ]
                              },
                              {
                                "value": 1.0638298,
                                "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "termFreq=1.0",
                                    "details": []
                                  },
                                  {
                                    "value": 1.2,
                                    "description": "parameter k1",
                                    "details": []
                                  },
                                  {
                                    "value": 0.75,
                                    "description": "parameter b",
                                    "details": []
                                  },
                                  {
                                    "value": 3,
                                    "description": "avgFieldLength",
                                    "details": []
                                  },
                                  {
                                    "value": 2.56,
                                    "description": "fieldLength",
                                    "details": []
                                  }
                                ]
                              }
                            ]
                          }
                        ]
                      }
                    ]
                  }
                ]
              },
              {
                "value": 0,
                "description": "match on required clause, product of:",
                "details": [
                  {
                    "value": 0,
                    "description": "# clause",
                    "details": []
                  },
                  {
                    "value": 1,
                    "description": "status:[2 TO 2], product of:",
                    "details": [
                      {
                        "value": 1,
                        "description": "boost",
                        "details": []
                      },
                      {
                        "value": 1,
                        "description": "queryNorm",
                        "details": []
                      }
                    ]
                  }
                ]
              }
            ]
          },
          {
            "value": 0,
            "description": "match on required clause, product of:",
            "details": [
              {
                "value": 0,
                "description": "# clause",
                "details": []
              },
              {
                "value": 1,
                "description": "*:*, product of:",
                "details": [
                  {
                    "value": 1,
                    "description": "boost",
                    "details": []
                  },
                  {
                    "value": 1,
                    "description": "queryNorm",
                    "details": []
                  }
                ]
              }
            ]
          }
        ]
      }
    },
    {
      "_shard": "[tag][4]",
      "_node": "Wh9qH0bcTAaVNrsP1Aiyxg",
      "_index": "tag",
      "_type": "tag",
      "_id": "5474649504748034",
      "_score": 0.51623213,
      "_source": {
        "status": 2,
        "name": "美国",
        "note": "",
        "synonyms": []
      },
      "_explanation": {
        "value": 0.51623213,
        "description": "sum of:",
        "details": [
          {
            "value": 0.51623213,
            "description": "sum of:",
            "details": [
              {
                "value": 0.51623213,
                "description": "max of:",
                "details": [
                  {
                    "value": 0.51623213,
                    "description": "sum of:",
                    "details": [
                      {
                        "value": 0.25811607,
                        "description": "weight(name:美 in 0) [PerFieldSimilarity], result of:",
                        "details": [
                          {
                            "value": 0.25811607,
                            "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                            "details": [
                              {
                                "value": 0.2876821,
                                "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "docFreq",
                                    "details": []
                                  },
                                  {
                                    "value": 1,
                                    "description": "docCount",
                                    "details": []
                                  }
                                ]
                              },
                              {
                                "value": 0.89722675,
                                "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "termFreq=1.0",
                                    "details": []
                                  },
                                  {
                                    "value": 1.2,
                                    "description": "parameter k1",
                                    "details": []
                                  },
                                  {
                                    "value": 0.75,
                                    "description": "parameter b",
                                    "details": []
                                  },
                                  {
                                    "value": 2,
                                    "description": "avgFieldLength",
                                    "details": []
                                  },
                                  {
                                    "value": 2.56,
                                    "description": "fieldLength",
                                    "details": []
                                  }
                                ]
                              }
                            ]
                          }
                        ]
                      },
                      {
                        "value": 0.25811607,
                        "description": "weight(name:国 in 0) [PerFieldSimilarity], result of:",
                        "details": [
                          {
                            "value": 0.25811607,
                            "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                            "details": [
                              {
                                "value": 0.2876821,
                                "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "docFreq",
                                    "details": []
                                  },
                                  {
                                    "value": 1,
                                    "description": "docCount",
                                    "details": []
                                  }
                                ]
                              },
                              {
                                "value": 0.89722675,
                                "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                                "details": [
                                  {
                                    "value": 1,
                                    "description": "termFreq=1.0",
                                    "details": []
                                  },
                                  {
                                    "value": 1.2,
                                    "description": "parameter k1",
                                    "details": []
                                  },
                                  {
                                    "value": 0.75,
                                    "description": "parameter b",
                                    "details": []
                                  },
                                  {
                                    "value": 2,
                                    "description": "avgFieldLength",
                                    "details": []
                                  },
                                  {
                                    "value": 2.56,
                                    "description": "fieldLength",
                                    "details": []
                                  }
                                ]
                              }
                            ]
                          }
                        ]
                      }
                    ]
                  }
                ]
              },
              {
                "value": 0,
                "description": "match on required clause, product of:",
                "details": [
                  {
                    "value": 0,
                    "description": "# clause",
                    "details": []
                  },
                  {
                    "value": 1,
                    "description": "status:[2 TO 2], product of:",
                    "details": [
                      {
                        "value": 1,
                        "description": "boost",
                        "details": []
                      },
                      {
                        "value": 1,
                        "description": "queryNorm",
                        "details": []
                      }
                    ]
                  }
                ]
              }
            ]
          },
          {
            "value": 0,
            "description": "match on required clause, product of:",
            "details": [
              {
                "value": 0,
                "description": "# clause",
                "details": []
              },
              {
                "value": 1,
                "description": "*:*, product of:",
                "details": [
                  {
                    "value": 1,
                    "description": "boost",
                    "details": []
                  },
                  {
                    "value": 1,
                    "description": "queryNorm",
                    "details": []
                  }
                ]
              }
            ]
          }
        ]
      }
    }
  ]
}

Upvotes: 1

Views: 182

Answers (1)

Eugene
Eugene

Reputation: 3957

It seems that your index contains only a few documents and that they fall into different shards. Each shard has its own term frequencies, and by default Elasticsearch uses these local values when scoring. You can see this in your explain output: docCount is 2 on shard 0 but 1 on shard 4, so the idf values differ. You can change this behaviour by specifying the search_type=dfs_query_then_fetch query string parameter, or by adding the corresponding body field like this:

{
    "search_type": "dfs_query_then_fetch",
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": "美国",
                    "fields": [
                        "name",
                        "synonyms"
                    ]
                }
            },
            "filter": {
                "term": {
                    "status": 2
                }
            }
        }
    }
}

Take a look at this article https://www.elastic.co/blog/understanding-query-then-fetch-vs-dfs-query-then-fetch

Upvotes: 3

Related Questions