djy
djy

Reputation: 757

How to Get Distinct Token Count

I want to compute a per-document value, doc['_num_matches'], to use in a script_score. For now it is a hypothetical field: the number of should clauses that the document matches, i.e. every should clause that matches adds one to doc['_num_matches'].

For example, with the query below (should clauses for "apple", "cherry" and "banana"):

doc0: "banana"
doc0['_num_matches'] == 1

doc1: "apple apple apple apple apple apple banana"
doc1['_num_matches'] == 2

doc2: "apple banana cherry"
doc2['_num_matches'] == 3
{
    "query": {
        "function_score": {
            "query": {
                "bool": {
                    "should": [
                        {
                            "match": {
                                "content": {
                                    "query": "apple",
                                    "analyzer": "kuromoji"
                                }
                            }
                        },
                        {
                            "match": {
                                "content": {
                                    "query": "cherry",
                                    "analyzer": "kuromoji"
                                }
                            }
                        },
                        {
                            "match": {
                                "content": {
                                    "query": "banana",
                                    "analyzer": "kuromoji"
                                }
                            }
                        }
                    ],
                    "minimum_should_match": "2<80%"
                }
            },
            "functions": [
                {
                    "script_score": {
                        "script": {
                            "source": "_score * Math.log(1 + doc['_num_matches'].value)"
                        }
                    }
                }
            ],
            "boost_mode": "replace"
        }
    },
    "size": 200
}

Upvotes: 0

Views: 42

Answers (1)

G0l0s
G0l0s

Reputation: 496

You could add a token_count multi-field whose analyzer uses the unique token filter, so that the sub-field stores the number of distinct tokens in the text.

Mapping

PUT /term_count_score
{
    "mappings": {
        "properties": {
            "text": {
                "type": "text",
                "fields": {
                    "terms": {
                        "type": "token_count",
                        "analyzer": "whitespace_lowercase_trim_unique_analyzer"
                    }
                }
            }
        }
    },
    "settings": {
        "analysis": {
            "analyzer": {
                "whitespace_lowercase_trim_unique_analyzer": {
                    "tokenizer": "whitespace",
                    "filter": [
                        "lowercase",
                        "trim",
                        "unique"
                    ]
                }
            }
        }
    }
}

Your documents

PUT /term_count_score/_bulk
{"create":{"_id":1}}
{"text":"banana"}
{"create":{"_id":2}}
{"text":"apple apple apple apple apple apple banana"}
{"create":{"_id":3}}
{"text":"apple banana cherry"}

A simplified version of your query, using filter_path to limit the response to each hit's _id, _score and _source

GET /term_count_score/_search?filter_path=hits.hits._source,hits.hits._score,hits.hits._id
{
    "query": {
        "function_score": {
            "query": {
                "match_all": {}
            },
            "functions": [
                {
                    "script_score": {
                        "script": "doc['text.terms'].value"
                    }
                }
            ]
        }
    },
    "_source": "text"
}

Response

{
    "hits" : {
        "hits" : [
            {
                "_id" : "3",
                "_score" : 3.0,
                "_source" : {
                    "text" : "apple banana cherry"
                }
            },
            {
                "_id" : "2",
                "_score" : 2.0,
                "_source" : {
                    "text" : "apple apple apple apple apple apple banana"
                }
            },
            {
                "_id" : "1",
                "_score" : 1.0,
                "_source" : {
                    "text" : "banana"
                }
            }
        ]
    }
}

Upvotes: 0

Related Questions