Reputation: 7066
How would I get the values of all the languages
from the records and make them unique.
Records
PUT items/1
{ "language" : 10 }
PUT items/2
{ "language" : 11 }
PUT items/3
{ "language" : 10 }
Query
GET items/_search
{ ... }
# => Expected Response
[10, 11]
Any help would be great.
Upvotes: 182
Views: 299124
Reputation: 2871
If you only need a count of unique terms, Elasticsearch 1.1+ has the Cardinality Aggregation which will give you a unique count of the terms, but not the terms themselves.
Note that it is actually an approximation and accuracy may diminish with high-cardinality datasets, but it's generally pretty accurate in my testing.
You can also tune the accuracy with the precision_threshold
parameter. The trade-off, or course, is memory usage.
This graph from the docs shows how a higher precision_threshold
leads to much more accurate results.
Upvotes: 11
Reputation: 5292
Short solution to be used in console, inspired by discussion post
POST _sql?format=txt
{
"query": """SELECT language FROM "index-name" GROUP by language"""
}
Upvotes: 0
Reputation: 1
aggs will work on number value by default if you want work on string filed you should enable it on fie
Upvotes: -2
Reputation: 4611
You can use the terms aggregation.
{
"size": 0,
"aggs" : {
"langs" : {
"terms" : { "field" : "language", "size" : 500 }
}
}}
The size
parameter within the aggregation specifies the maximum number of terms to include in the aggregation result. If you need all results, set this to a value that is larger than the number of unique terms in your data.
A search will return something like:
{
"took" : 16,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"failed" : 0
},
"hits" : {
"total" : 1000000,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"langs" : {
"buckets" : [ {
"key" : "10",
"doc_count" : 244812
}, {
"key" : "11",
"doc_count" : 136794
}, {
"key" : "12",
"doc_count" : 32312
} ]
}
}
}
Upvotes: 253
Reputation: 83
To had to distinct by two fields (derivative_id & vehicle_type) and to sort by cheapest car. Had to nest aggs.
GET /cars/_search
{
"size": 0,
"aggs": {
"distinct_by_derivative_id": {
"terms": {
"field": "derivative_id"
},
"aggs": {
"vehicle_type": {
"terms": {
"field": "vehicle_type"
},
"aggs": {
"cheapest_vehicle": {
"top_hits": {
"sort": [
{ "rental": { "order": "asc" } }
],
"_source": { "includes": [ "manufacturer_name",
"rental",
"vehicle_type"
]
},
"size": 1
}
}
}
}
}
}
}
}
Result:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 8,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"distinct_by_derivative_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "04",
"doc_count" : 3,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 2,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "8",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Renault",
"rental" : 89.99
},
"sort" : [
89.99
]
}
]
}
}
},
{
"key" : "LCV",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "7",
"_score" : null,
"_source" : {
"vehicle_type" : "LCV",
"manufacturer_name" : "Ford",
"rental" : 99.99
},
"sort" : [
99.99
]
}
]
}
}
}
]
}
},
{
"key" : "01",
"doc_count" : 2,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "1",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Ford",
"rental" : 599.99
},
"sort" : [
599.99
]
}
]
}
}
},
{
"key" : "LCV",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "2",
"_score" : null,
"_source" : {
"vehicle_type" : "LCV",
"manufacturer_name" : "Ford",
"rental" : 599.99
},
"sort" : [
599.99
]
}
]
}
}
}
]
}
},
{
"key" : "02",
"doc_count" : 2,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 2,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "4",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Audi",
"rental" : 499.99
},
"sort" : [
499.99
]
}
]
}
}
}
]
}
},
{
"key" : "03",
"doc_count" : 1,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "5",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Audi",
"rental" : 399.99
},
"sort" : [
399.99
]
}
]
}
}
}
]
}
}
]
}
}
}
Upvotes: 4
Reputation: 2343
If you want to get all unique values without any approximation or setting a magic number (size: 500
), then use COMPOSITE AGGREGATION (ES 6.5+).
From official documentation:
"If you want to retrieve all terms or all combinations of terms in a nested terms aggregation you should use the COMPOSITE AGGREGATION which allows to paginate over all possible terms rather than setting a size greater than the cardinality of the field in the terms aggregation. The terms aggregation is meant to return the top terms and does not allow pagination."
Implementation example in JavaScript:
const ITEMS_PER_PAGE = 1000;
const body = {
"size": 0, // Returning only aggregation results: https://www.elastic.co/guide/en/elasticsearch/reference/current/returning-only-agg-results.html
"aggs" : {
"langs": {
"composite" : {
"size": ITEMS_PER_PAGE,
"sources" : [
{ "language": { "terms" : { "field": "language" } } }
]
}
}
}
};
const uniqueLanguages = [];
while (true) {
const result = await es.search(body);
const currentUniqueLangs = result.aggregations.langs.buckets.map(bucket => bucket.key);
uniqueLanguages.push(...currentUniqueLangs);
const after = result.aggregations.langs.after_key;
if (after) {
// continue paginating unique items
body.aggs.langs.composite.after = after;
} else {
break;
}
}
console.log(uniqueLanguages);
Upvotes: 22
Reputation: 1110
if you want to get the first document for each language
field unique value, you can do this:
{
"query": {
"match_all": {
}
},
"collapse": {
"field": "language.keyword",
"inner_hits": {
"name": "latest",
"size": 1
}
}
}
Upvotes: 17
Reputation: 650
I am looking for this kind of solution for my self as well. I found reference in terms aggregation.
So, according to that following is the proper solution.
{
"aggs" : {
"langs" : {
"terms" : { "field" : "language",
"size" : 500 }
}
}}
But if you ran into following error:
"error": {
"root_cause": [
{
"type": "illegal_argument_exception",
"reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [fastest_method] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead."
}
]}
In that case, you have to add "KEYWORD" in the request, like following:
{
"aggs" : {
"langs" : {
"terms" : { "field" : "language.keyword",
"size" : 500 }
}
}}
Upvotes: 27