Reputation: 130
I have been struggling for a week trying to get correct data out of an Elasticsearch nested aggregation on my index. Below are my index mapping and two sample documents that were inserted. What I want to find is:
So, as an example, for the documents inserted below (counting, per tag, the tokens whose value is "24"), the output I expect is:
{"JJ": 1, "NN": 1}
{
"_doc": {
"_meta": {},
"_source": {},
"properties": {
"originalText": {
"type": "text"
},
"testDataId": {
"type": "text"
},
"xforms": {
"type": "nested",
"properties": {
"sentence": {
"type": "nested"
},
"predicate": {
"type": "nested"
}
}
},
"corpusId": {
"type": "text"
},
"row": {
"type": "text"
},
"batchId": {
"type": "text"
},
"processor": {
"type": "text"
}
}
}
}
The sample docs inserted are as follows:
{
"_id": "28",
"_source": {
"testDataId": "5e97e9bef033448b893e485baa0fdf15",
"originalText": "Some text with the word 24",
"xforms": [{
"sentence": {
"tokens": [{
"lemma": "Some",
"index": 1,
"after": " ",
"tag": "JJ",
"value": "Some"
},
{
"lemma": "text",
"index": 2,
"after": " ",
"tag": "NN",
"value": "text"
},
{
"lemma": "with",
"index": 3,
"after": " ",
"tag": "NN",
"value": "with"
},
{
"lemma": "the",
"index": 4,
"after": "",
"tag": "CD",
"value": "the"
},
{
"lemma": "word",
"index": 5,
"after": " ",
"tag": "CC",
"value": "word"
},
{
"lemma": "24",
"index": 6,
"after": " ",
"tag": "JJ",
"value": "24"
}
],
"type": "RAW"
},
"originalSentence": "Some text with the word 24 in it",
"id": "e724611d8c024bcb8f0158b60e3df87e"
}]
}
},
{
"_id": "56",
"_source": {
"testDataId": "5e97e9bef033448b893e485baa0fad15",
"originalText": "24 word",
"xforms": [{
"sentence": {
"tokens": [{
"lemma": "24",
"index": 1,
"after": " ",
"tag": "NN",
"value": "24"
},
{
"lemma": "word",
"index": 2,
"after": " ",
"tag": "JJ",
"value": "word"
}
],
"type": "RAW"
},
"originalSentence": "24 word",
"id": "e724611d8c024bcb8f0158b60e3d123"
}]
}
}
Upvotes: 1
Views: 126
Reputation: 16895
Expanding on @Gibbs's answer: @N Kiram, you'll need to map `tokens` as `nested` too:
{
"xforms":{
"type":"nested",
"properties":{
"sentence":{
"type":"nested",
"properties":{
"tokens":{ <----
"type":"nested"
}
}
},
"predicate":{
"type":"nested"
}
}
}
}
Then and only then will your aggs yield the correct counts:
{
"aggregations":{
"xforms":{
"doc_count":8,
"inner":{
"doc_count":2,
"tag_count":{
"doc_count_error_upper_bound":0,
"sum_other_doc_count":0,
"buckets":[
{
"key":"JJ",
"doc_count":1
},
{
"key":"NN",
"doc_count":1
}
]
}
}
}
}
}
Side note: you'll have to reindex in order for the changed mapping to apply.
Upvotes: 1
Reputation: 22956
{
"aggs": {
"xforms": {
"nested": { //Nested aggregation
"path": "xforms.sentence"
},
"aggs": {
"inner": { //Counting only within the matching doc
"filter": {
"bool": {
"filter": { //Filtering docs with value=24
"terms": {
"xforms.sentence.tokens.value": [
"24"
]
}
}
}
},
"aggs" : {
"tag_count":{ //On filtered doc, doing terms aggregation on tag's keyword version as tag is of type text
"terms":{
"field":"xforms.sentence.tokens.tag.keyword"
}
}
}
}
}
}
}
}
It produces the output below:
"aggregations": {
"xforms": {
"doc_count": 2,
"inner": {
"doc_count": 2,
"tag_count": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "JJ",
"doc_count": 2
},
{
"key": "NN",
"doc_count": 2
},
{
"key": "CC",
"doc_count": 1
},
{
"key": "CD",
"doc_count": 1
}
]
}
}
}
}
Upvotes: 0