BBloggsbott
BBloggsbott

Reputation: 388

Boosting documents with term matches in elasticsearch after cosine similarity

I am using text embeddings stored in elasticsearch to get documents similar to a query. But I noticed that in some cases, I get documents that don't have the words from the query in them with a higher score. So I want to boost the score for documents that have the words from the query. How do I do this in elasticsearch?

This is my index

{
  "mappings": {
    "properties": {
      "question_text": { "type": "text" },
      "question_vector": { "type": "dense_vector", "dims": 768 }
    }
  }
}

I tried doing this

{
  "query": {
    "script_score": {
      "query": {
        "bool": {
          "must": [
            {
              "more_like_this": {
                "fields": ["question_text"],
                "like": query_text,
                "min_term_freq": 1,
                "max_query_terms": 12,
                "minimum_should_match": "3<60%"
              }
            }
          ]
        }
      },
      "script": {
        "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
        "params": { "query_vector": query_vector }
      }
    }
  },
  "fields": ["question_text"],
  "_source": false
}

But now I only get documents that have the words in them. Is there a way to do this, but still get matches that don't have the words in them, but with lower scores?

Upvotes: 0

Views: 674

Answers (2)

BBloggsbott
BBloggsbott

Reputation: 388

{
    "query": {
        "boosting": {
            "positive": {
                "function_score": {
                    "query": {
                        "match_all": {}
                    },
                    "script_score": {
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                            "params": {"query_vector": embedding}
                        }
                    }
                }
            },
            "negative": {
                "bool": {
                    "must_not": [
                        {
                            "more_like_this": {
                                "fields": [
                                    "question_text"
                                ],
                                "like": text,
                                "min_doc_freq": 0,
                                "min_term_freq": 0,
                                "max_query_terms": 12,
                                "minimum_should_match": "3<60%"
                            }
                        }
                    ]
                }
            },
            "negative_boost": 0.8
        }
    },
    "_source": "question_text"
}

This query selects all the documents and computes cosine similarity. Then, it reduces the scores of the documents which do not have matching terms.

Upvotes: 0

Mathew
Mathew

Reputation: 584

Use a function score query.

    {
        "query": {
            "function_score": {
                "query": {
                    "bool": {
                        "must": [
                            {
                                "more_like_this": {
                                    "fields": [
                                        "question_text"
                                    ],
                                    "like": "Once upon a time",
                                    "min_doc_freq": 1,
                                    "min_term_freq": 1,
                                    "max_query_terms": 12,
                                    "minimum_should_match": "1<60%"
                                }
                            }
                        ]
                    }
                },
                "boost": 1,
                "functions": [
                    {
                        "script_score": {
                            "script": {
                                "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                                "params": {
                                    "query_vector": [
                                        -0.5,
                                        10,
                                        20
                                    ]
                                }
                            }
                        },
                        "weight": 1000
                    }
                ],
                "boost_mode": "sum"
            }
        }
    }

Explanation:

boost -> boost for the whole query

weight -> boost for cosine function

final score = query score + (weight × cosine function score), because boost_mode is "sum".

Upvotes: 1

Related Questions