ElasticSearch aggregation by all tokens in a string field

Question

I'm using Elasticsearch 2.4 and trying to run a terms aggregation on an analyzed field of type string, which contains multiple tokens per document. The field in question is an address field called mailingAddress. For example, below is a query that looks for NY in the address field, followed by a few of the results it returns.

{
  "from": 0,
  "size": 100,
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    }
  ],
  "query": {
    "bool": {
      "must": [
        {
          "bool": {
            "must": [
              {
                "match": {
                  "customerprofile.mailingAddress": {
                    "query": "NY",
                    "fuzziness": 0,
                    "operator": "or"
                  }
                }
              },
              {
                "match": {
                  "customerprofile.companyId": {
                    "query": "999",
                    "fuzziness": 0,
                    "operator": "or"
                  }
                }
              }
            ]
          }
        }
      ]
    }
  }
}

returns

"hits":[  
   {  
      "_index":"wht_index_prod_v33_es24",
      "_type":"customerprofile",
      "_id":"2044",
      "_score":2.9787974,
      "_source":{  
         "customerId":2044,
         "companyId":2007,
         "fullName":"John Doe",
         "email":"jon@aol.com",
         "pictureURL":"john.png",
         "profilePictureContentType":"image/png",
         "phone":"(703) 999-8888",
         "mailingAddress":"100 Lake Braddock Drive
Burke, NY 22015",
         "gender":"Male",
         "emergencyContactsIds":[  

         ],
         "wantCorrespondence":false
      }
   },
   {  
      "_index":"wht_index_prod_v33_es24",
      "_type":"customerprofile",
      "_id":"2045",
      "_score":2.9787974,
      "_source":{  
         "customerId":2045,
         "companyId":2007,
         "fullName":"Jane Anderson",
         "email":"janea@touchva.net",
         "pictureURL":"JAnderson.png",
         "profilePictureContentType":"image/png",
         "phone":"(434) 111-2345",
         "mailingAddress":"PO Box 333, Boydton, NY 23917",
         "gender":"Male",
         "emergencyContactsIds":[  

         ],
         "wantCorrespondence":false
      }
   },
..
..
]

The question
When I run the aggregation on mailingAddress, I expect to see a bucket for each word in the text field. Given the results above, I would also expect to find a bucket with the key 'NY', but there isn't one. Can anyone explain why? My guess is that it has too few entries to be included.

The aggregation:

{
  "size": 0,
  "aggs": {
    "group_by_age": {
      "terms": {
        "field": "mailingAddress"
      },
      "aggs": {
        "group_by_gender": {
          "terms": {
            "field": "gender"
          }
        }
      }
    }
  }
}

Aggregation results:

{
  "took": 16,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "hits": {
    "total": 401,
    "max_score": 0,
    "hits": [

    ]
  },
  "aggregations": {
    "group_by_age": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 1041,
      "buckets": [
        {
          "key": "st",
          "doc_count": 30,
          "group_by_gender": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "female",
                "doc_count": 17
              },
              {
                "key": "male",
                "doc_count": 13
              }
            ]
          }
        },
        {
          "key": "ca",
          "doc_count": 28,
          "group_by_gender": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "female",
                "doc_count": 21
              },
              {
                "key": "male",
                "doc_count": 7
              }
            ]
          }
        },
        {
          "key": "dr",
          "doc_count": 16,
          "group_by_gender": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "female",
                "doc_count": 13
              },
              {
                "key": "male",
                "doc_count": 3
              }
            ]
          }
        },
        {
          "key": "street",
          "doc_count": 15,
          "group_by_gender": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "female",
                "doc_count": 11
              },
              {
                "key": "male",
                "doc_count": 4
              }
            ]
          }
        },
        {
          "key": "ave",
          "doc_count": 14,
          "group_by_gender": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "female",
                "doc_count": 7
              },
              {
                "key": "male",
                "doc_count": 7
              }
            ]
          }
        },
        {
          "key": "box",
          "doc_count": 11,
          "group_by_gender": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "female",
                "doc_count": 9
              },
              {
                "key": "male",
                "doc_count": 2
              }
            ]
          }
        },
        {
          "key": "fl",
          "doc_count": 11,
          "group_by_gender": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "female",
                "doc_count": 9
              },
              {
                "key": "male",
                "doc_count": 2
              }
            ]
          }
        },
        {
          "key": "va",
          "doc_count": 11,
          "group_by_gender": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "male",
                "doc_count": 6
              },
              {
                "key": "female",
                "doc_count": 5
              }
            ]
          }
        },
        {
          "key": "n",
          "doc_count": 10,
          "group_by_gender": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "female",
                "doc_count": 7
              },
              {
                "key": "male",
                "doc_count": 3
              }
            ]
          }
        },
        {
          "key": "az",
          "doc_count": 9,
          "group_by_gender": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "female",
                "doc_count": 7
              },
              {
                "key": "male",
                "doc_count": 2
              }
            ]
          }
        }
      ]
    }
  }
}

Val · Accepted Answer

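By default, the terms aggregation returns only the top 10 terms, ordered by descending doc_count. The sum_other_doc_count of 1041 in your response shows that there are more terms that did not make it into those 10 buckets. You can return more buckets by specifying a size in your aggregation, like this: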

{
  "size": 0,
  "aggs": {
    "group_by_age": {
      "terms": {
        "field": "mailingAddress",
        "size": 50                       <---- add this
      },
      "aggs": {
        "group_by_gender": {
          "terms": {
            "field": "gender"
          }
        }
      }
    }
  }
}

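The right value depends on your data: 50 may not be enough, so keep increasing the size until the bucket you are looking for appears. Also note that the bucket keys in your results are lowercased tokens (st, ca, va, ...), which is what you get when the field is analyzed, so the bucket to look for is ny rather than NY.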

ElasticSearch aggregation by all tokens in a string field

Answers (1)

Related Questions