Reputation: 7066

ElasticSearch - Return Unique Values

How would I get the values of all the languages from the records and make them unique.

Records

PUT items/1
{ "language" : 10 }

PUT items/2
{ "language" : 11 }

PUT items/3
{ "language" : 10 }

Query

GET items/_search
{ ... }

# => Expected Response
[10, 11]

Any help would be great.

Upvotes: 182

Answers (8)

bradvido

Reputation: 2871

If you only need a count of unique terms, Elasticsearch 1.1+ has the Cardinality Aggregation which will give you a unique count of the terms, but not the terms themselves.

Note that it is actually an approximation and accuracy may diminish with high-cardinality datasets, but it's generally pretty accurate in my testing.

You can also tune the accuracy with the precision_threshold parameter. The trade-off, or course, is memory usage.

This graph from the docs shows how a higher precision_threshold leads to much more accurate results.

Relative error vs threshold

Upvotes: 11

citynorman

Reputation: 5292

Short solution to be used in console, inspired by discussion post

POST _sql?format=txt
{
  "query": """SELECT language FROM "index-name" GROUP by language"""
}

Upvotes: 0

Abolfazl Ayoubi

Reputation: 1

aggs will work on number value by default if you want work on string filed you should enable it on fie

Upvotes: -2

Anton

Reputation: 4611

You can use the terms aggregation.

{
"size": 0,
"aggs" : {
    "langs" : {
        "terms" : { "field" : "language",  "size" : 500 }
    }
}}

The size parameter within the aggregation specifies the maximum number of terms to include in the aggregation result. If you need all results, set this to a value that is larger than the number of unique terms in your data.

A search will return something like:

{
"took" : 16,
"timed_out" : false,
"_shards" : {
  "total" : 2,
  "successful" : 2,
  "failed" : 0
},
"hits" : {
"total" : 1000000,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
  "langs" : {
    "buckets" : [ {
      "key" : "10",
      "doc_count" : 244812
    }, {
      "key" : "11",
      "doc_count" : 136794
 
    }, {
      "key" : "12",
      "doc_count" : 32312
       } ]
    }
  }
}

Upvotes: 253

AutoVit

Reputation: 83

To had to distinct by two fields (derivative_id & vehicle_type) and to sort by cheapest car. Had to nest aggs.

GET /cars/_search
{
  "size": 0,
  "aggs": {
    "distinct_by_derivative_id": {
      "terms": { 
        "field": "derivative_id"
      },
      "aggs": {
        "vehicle_type": {
          "terms": {
            "field": "vehicle_type"
          },
          "aggs": {
            "cheapest_vehicle": {
              "top_hits": {
                "sort": [
                  { "rental": { "order": "asc" } }
                ],
                "_source": { "includes": [ "manufacturer_name",
                  "rental",
                  "vehicle_type" 
                  ]
                },
                "size": 1
              }
            }
          }
        }
      }
    }
  }
}

Result:

{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 8,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "distinct_by_derivative_id" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "04",
          "doc_count" : 3,
          "vehicle_type" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "CAR",
                "doc_count" : 2,
                "cheapest_vehicle" : {
                  "hits" : {
                    "total" : {
                      "value" : 2,
                      "relation" : "eq"
                    },
                    "max_score" : null,
                    "hits" : [
                      {
                        "_index" : "cars",
                        "_type" : "_doc",
                        "_id" : "8",
                        "_score" : null,
                        "_source" : {
                          "vehicle_type" : "CAR",
                          "manufacturer_name" : "Renault",
                          "rental" : 89.99
                        },
                        "sort" : [
                          89.99
                        ]
                      }
                    ]
                  }
                }
              },
              {
                "key" : "LCV",
                "doc_count" : 1,
                "cheapest_vehicle" : {
                  "hits" : {
                    "total" : {
                      "value" : 1,
                      "relation" : "eq"
                    },
                    "max_score" : null,
                    "hits" : [
                      {
                        "_index" : "cars",
                        "_type" : "_doc",
                        "_id" : "7",
                        "_score" : null,
                        "_source" : {
                          "vehicle_type" : "LCV",
                          "manufacturer_name" : "Ford",
                          "rental" : 99.99
                        },
                        "sort" : [
                          99.99
                        ]
                      }
                    ]
                  }
                }
              }
            ]
          }
        },
        {
          "key" : "01",
          "doc_count" : 2,
          "vehicle_type" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "CAR",
                "doc_count" : 1,
                "cheapest_vehicle" : {
                  "hits" : {
                    "total" : {
                      "value" : 1,
                      "relation" : "eq"
                    },
                    "max_score" : null,
                    "hits" : [
                      {
                        "_index" : "cars",
                        "_type" : "_doc",
                        "_id" : "1",
                        "_score" : null,
                        "_source" : {
                          "vehicle_type" : "CAR",
                          "manufacturer_name" : "Ford",
                          "rental" : 599.99
                        },
                        "sort" : [
                          599.99
                        ]
                      }
                    ]
                  }
                }
              },
              {
                "key" : "LCV",
                "doc_count" : 1,
                "cheapest_vehicle" : {
                  "hits" : {
                    "total" : {
                      "value" : 1,
                      "relation" : "eq"
                    },
                    "max_score" : null,
                    "hits" : [
                      {
                        "_index" : "cars",
                        "_type" : "_doc",
                        "_id" : "2",
                        "_score" : null,
                        "_source" : {
                          "vehicle_type" : "LCV",
                          "manufacturer_name" : "Ford",
                          "rental" : 599.99
                        },
                        "sort" : [
                          599.99
                        ]
                      }
                    ]
                  }
                }
              }
            ]
          }
        },
        {
          "key" : "02",
          "doc_count" : 2,
          "vehicle_type" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "CAR",
                "doc_count" : 2,
                "cheapest_vehicle" : {
                  "hits" : {
                    "total" : {
                      "value" : 2,
                      "relation" : "eq"
                    },
                    "max_score" : null,
                    "hits" : [
                      {
                        "_index" : "cars",
                        "_type" : "_doc",
                        "_id" : "4",
                        "_score" : null,
                        "_source" : {
                          "vehicle_type" : "CAR",
                          "manufacturer_name" : "Audi",
                          "rental" : 499.99
                        },
                        "sort" : [
                          499.99
                        ]
                      }
                    ]
                  }
                }
              }
            ]
          }
        },
        {
          "key" : "03",
          "doc_count" : 1,
          "vehicle_type" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "CAR",
                "doc_count" : 1,
                "cheapest_vehicle" : {
                  "hits" : {
                    "total" : {
                      "value" : 1,
                      "relation" : "eq"
                    },
                    "max_score" : null,
                    "hits" : [
                      {
                        "_index" : "cars",
                        "_type" : "_doc",
                        "_id" : "5",
                        "_score" : null,
                        "_source" : {
                          "vehicle_type" : "CAR",
                          "manufacturer_name" : "Audi",
                          "rental" : 399.99
                        },
                        "sort" : [
                          399.99
                        ]
                      }
                    ]
                  }
                }
              }
            ]
          }
        }
      ]
    }
  }
}

Upvotes: 4

Ilarion Halushka

Reputation: 2343

If you want to get all unique values without any approximation or setting a magic number (size: 500), then use COMPOSITE AGGREGATION (ES 6.5+).

From official documentation:

"If you want to retrieve all terms or all combinations of terms in a nested terms aggregation you should use the COMPOSITE AGGREGATION which allows to paginate over all possible terms rather than setting a size greater than the cardinality of the field in the terms aggregation. The terms aggregation is meant to return the top terms and does not allow pagination."

Implementation example in JavaScript:

const ITEMS_PER_PAGE = 1000;

const body =  {
    "size": 0, // Returning only aggregation results: https://www.elastic.co/guide/en/elasticsearch/reference/current/returning-only-agg-results.html
    "aggs" : {
        "langs": {
            "composite" : {
                "size": ITEMS_PER_PAGE,
                "sources" : [
                    { "language": { "terms" : { "field": "language" } } }
                ]
            }
        }
     }
};

const uniqueLanguages = [];

while (true) {
  const result = await es.search(body);

  const currentUniqueLangs = result.aggregations.langs.buckets.map(bucket => bucket.key);

  uniqueLanguages.push(...currentUniqueLangs);

  const after = result.aggregations.langs.after_key;

  if (after) {
      // continue paginating unique items
      body.aggs.langs.composite.after = after;
  } else {
      break;
  }
}

console.log(uniqueLanguages);

Upvotes: 22

MajidJafari

Reputation: 1110

if you want to get the first document for each language field unique value, you can do this:

{
 "query": {
    "match_all": {
    }
  },
  "collapse": {
    "field": "language.keyword",
    "inner_hits": {
    "name": "latest",
      "size": 1
    }
  }
}

Upvotes: 17

MAULIK MODI

Reputation: 650

I am looking for this kind of solution for my self as well. I found reference in terms aggregation.

So, according to that following is the proper solution.

{
"aggs" : {
    "langs" : {
        "terms" : { "field" : "language",  
                    "size" : 500 }
    }
}}

But if you ran into following error:

"error": {
        "root_cause": [
            {
                "type": "illegal_argument_exception",
                "reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [fastest_method] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead."
            }
        ]}

In that case, you have to add "KEYWORD" in the request, like following:

   {
    "aggs" : {
        "langs" : {
            "terms" : { "field" : "language.keyword",  
                        "size" : 500 }
        }
    }}

Upvotes: 27

ElasticSearch - Return Unique Values

Answers (8)

Related Questions