How would I get the values of all the languages
from the records and make them unique.
Records
PUT items/1
{ "language" : 10 }
PUT items/2
{ "language" : 11 }
PUT items/3
{ "language" : 10 }
Query
GET items/_search
{ ... }
# => Expected Response
[10, 11]
Any help would be great.
This question is related to
elasticsearch
You can use the terms aggregation.
{
"size": 0,
"aggs" : {
"langs" : {
"terms" : { "field" : "language", "size" : 500 }
}
}}
The size
parameter within the aggregation specifies the maximum number of terms to include in the aggregation result. If you need all results, set this to a value that is larger than the number of unique terms in your data.
A search will return something like:
{
"took" : 16,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"failed" : 0
},
"hits" : {
"total" : 1000000,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"langs" : {
"buckets" : [ {
"key" : "10",
"doc_count" : 244812
}, {
"key" : "11",
"doc_count" : 136794
}, {
"key" : "12",
"doc_count" : 32312
} ]
}
}
}
I am looking for this kind of solution for my self as well. I found reference in terms aggregation.
So, according to that following is the proper solution.
{
"aggs" : {
"langs" : {
"terms" : { "field" : "language",
"size" : 500 }
}
}}
But if you ran into following error:
"error": {
"root_cause": [
{
"type": "illegal_argument_exception",
"reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [fastest_method] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead."
}
]}
In that case, you have to add "KEYWORD" in the request, like following:
{
"aggs" : {
"langs" : {
"terms" : { "field" : "language.keyword",
"size" : 500 }
}
}}
To had to distinct by two fields (derivative_id & vehicle_type) and to sort by cheapest car. Had to nest aggs.
GET /cars/_search
{
"size": 0,
"aggs": {
"distinct_by_derivative_id": {
"terms": {
"field": "derivative_id"
},
"aggs": {
"vehicle_type": {
"terms": {
"field": "vehicle_type"
},
"aggs": {
"cheapest_vehicle": {
"top_hits": {
"sort": [
{ "rental": { "order": "asc" } }
],
"_source": { "includes": [ "manufacturer_name",
"rental",
"vehicle_type"
]
},
"size": 1
}
}
}
}
}
}
}
}
Result:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 8,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"distinct_by_derivative_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "04",
"doc_count" : 3,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 2,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "8",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Renault",
"rental" : 89.99
},
"sort" : [
89.99
]
}
]
}
}
},
{
"key" : "LCV",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "7",
"_score" : null,
"_source" : {
"vehicle_type" : "LCV",
"manufacturer_name" : "Ford",
"rental" : 99.99
},
"sort" : [
99.99
]
}
]
}
}
}
]
}
},
{
"key" : "01",
"doc_count" : 2,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "1",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Ford",
"rental" : 599.99
},
"sort" : [
599.99
]
}
]
}
}
},
{
"key" : "LCV",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "2",
"_score" : null,
"_source" : {
"vehicle_type" : "LCV",
"manufacturer_name" : "Ford",
"rental" : 599.99
},
"sort" : [
599.99
]
}
]
}
}
}
]
}
},
{
"key" : "02",
"doc_count" : 2,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 2,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "4",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Audi",
"rental" : 499.99
},
"sort" : [
499.99
]
}
]
}
}
}
]
}
},
{
"key" : "03",
"doc_count" : 1,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "5",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Audi",
"rental" : 399.99
},
"sort" : [
399.99
]
}
]
}
}
}
]
}
}
]
}
}
}
If you want to get all unique values without any approximation or setting a magic number (size: 500
), then use COMPOSITE AGGREGATION (ES 6.5+).
From official documentation:
"If you want to retrieve all terms or all combinations of terms in a nested terms aggregation you should use the COMPOSITE AGGREGATION which allows to paginate over all possible terms rather than setting a size greater than the cardinality of the field in the terms aggregation. The terms aggregation is meant to return the top terms and does not allow pagination."
Implementation example in JavaScript:
const ITEMS_PER_PAGE = 1000;_x000D_
_x000D_
const body = {_x000D_
"size": 0, // Returning only aggregation results: https://www.elastic.co/guide/en/elasticsearch/reference/current/returning-only-agg-results.html_x000D_
"aggs" : {_x000D_
"langs": {_x000D_
"composite" : {_x000D_
"size": ITEMS_PER_PAGE,_x000D_
"sources" : [_x000D_
{ "language": { "terms" : { "field": "language" } } }_x000D_
]_x000D_
}_x000D_
}_x000D_
}_x000D_
};_x000D_
_x000D_
const uniqueLanguages = [];_x000D_
_x000D_
while (true) {_x000D_
const result = await es.search(body);_x000D_
_x000D_
const currentUniqueLangs = result.aggregations.langs.buckets.map(bucket => bucket.key);_x000D_
_x000D_
uniqueLanguages.push(...currentUniqueLangs);_x000D_
_x000D_
const after = result.aggregations.langs.after_key;_x000D_
_x000D_
if (after) {_x000D_
// continue paginating unique items_x000D_
body.aggs.langs.composite.after = after;_x000D_
} else {_x000D_
break;_x000D_
}_x000D_
}_x000D_
_x000D_
console.log(uniqueLanguages);
_x000D_
Elasticsearch 1.1+ has the Cardinality Aggregation which will give you a unique count
Note that it is actually an approximation and accuracy may diminish with high-cardinality datasets, but it's generally pretty accurate in my testing.
You can also tune the accuracy with the precision_threshold
parameter. The trade-off, or course, is memory usage.
This graph from the docs shows how a higher precision_threshold
leads to much more accurate results.
if you want to get the first document for each language
field unique value, you can do this:
{
"query": {
"match_all": {
}
},
"collapse": {
"field": "language.keyword",
"inner_hits": {
"name": "latest",
"size": 1
}
}
}
Source: Stackoverflow.com