Arel
Arel

Reputation: 3938

Aggregations in Elasticsearch

I have an elasticsearch query that returns a bunch of objects that looks like this:

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 1,
    "hits": [
      {
        "_index": "searchdb",
        "_type": "profile",
        "_id": "1825",
        "_score": 1,
        "_source": {
          "id": 1825,
          "market": "Chicago",
          "geo_location": {
            "lat": 41.1234,
            "lon": -87.5678
          },
          "hourly_values": [
            {
              "datetime": "1997-07-16T19:00:00.00+00:00",
              "seconds": 1200
            },
            {
              "datetime": "1997-07-16T19:20:00.00+00:00",
              "seconds": 1200
            },
            {
              "datetime": "1997-07-16T19:20:00.00+00:00",
              "seconds": 1200
            }
          ]
        }
      },
      {
        "_index": "searchdb",
        "_type": "profile",
        "_id": "1808",
        "_score": 1,
        "_source": {
          "id": 1808,
          "market": "Chicago",
          "geo_location": {
            "lat": 41.1234,
            "lon": -87.5678
          },
          "hourly_values": [
            {
              "datetime": "1997-07-16T19:00:00.00+00:00",
              "seconds": 900
            },
            {
              "datetime": "1997-07-16T19:20:00.00+00:00",
              "seconds": 1200
            },
            {
              "datetime": "1997-07-16T19:20:00.00+00:00",
              "seconds": 800
            }
          ]
        }
      }
    ]
  }

I want to return the same result, but with an aggregation of the seconds fields for each object returned.

My query right now looks like this:

{
    "query": {
        "filtered":{
            "filter":{
                "geo_distance":{
                    "distance":"1km",
                    "geo_location":{
                        "lat":"41.1234",
                        "lon":"-87.5678"
                    }
                }
            }
        }
    },
    "aggregations": {
        "seconds_sum": {
           "sum": {
              "field": "hourly_values.seconds"
            }
        }
    }
} 

The above just aggregates all the seconds for all objects together. I can't figure out how to aggregate just the seconds for each object, and return that aggregate with the object, so I can end up with something like this:

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 1,
    "hits": [
      {
        "_index": "searchdb",
        "_type": "profile",
        "_id": "1825",
        "_score": 1,
        "_source": {
          "id": 1825,
          "market": "Chicago",
          "geo_location": {
            "lat": 41.1234,
            "lon": -87.5678
          },
          "seconds":3600
        }
      },
      {
        "_index": "searchdb",
        "_type": "profile",
        "_id": "1808",
        "_score": 1,
        "_source": {
          "id": 1808,
          "market": "Chicago",
          "geo_location": {
            "lat": 41.1234,
            "lon": -87.5678
          },
          "seconds":2900
        }
      }
    ]
  }

Or something like that ...

Upvotes: 1

Views: 104

Answers (1)

Evaldas Buinauskas
Evaldas Buinauskas

Reputation: 14097

That's quite easy. First of all, you will need to store your hourly_values as nested objects.

You have to aggregate by unique value using terms, in this case it's probably going to be id, only then you have to sum. To sum things up:

That would be your mapping

PUT /test
{
  "mappings": {
    "data": {
      "properties": {
        "id": {
          "type": "integer"
        },
        "geo_location": {
          "type": "geo_point"
        },
        "hourly_values": {
          "type": "nested",
          "properties": {
            "datetime": {
              "type": "date"
            },
            "seconds": {
              "type": "integer"
            }
          }
        }
      }
    }
  }
}

Test data

PUT /test/data/1
{
  "id": 1825,
  "market": "Chicago",
  "geo_location": {
    "lat": 41.1234,
    "lon": -87.5678
  },
  "hourly_values": [
    {
      "datetime": "1997-07-16T19:00:00.00+00:00",
      "seconds": 1200
    },
    {
      "datetime": "1997-07-16T19:20:00.00+00:00",
      "seconds": 1200
    },
    {
      "datetime": "1997-07-16T19:20:00.00+00:00",
      "seconds": 1200
    }
  ]
}

PUT /test/data/2
{
  "id": 1808,
  "market": "Chicago",
  "geo_location": {
    "lat": 41.1234,
    "lon": -87.5678
  },
  "hourly_values": [
    {
      "datetime": "1997-07-16T19:00:00.00+00:00",
      "seconds": 900
    },
    {
      "datetime": "1997-07-16T19:20:00.00+00:00",
      "seconds": 1200
    },
    {
      "datetime": "1997-07-16T19:20:00.00+00:00",
      "seconds": 800
    }
  ]
}

And your aggregation

POST /test/_search
{
  "size": 0,
  "aggs": {
    "Ids": {
      "terms": {
        "field": "id",
        "size": 0
      },
      "aggs": {
        "Nesting": {
          "nested": {
            "path": "hourly_values"
          },
          "aggs": {
            "SumSeconds": {
              "sum": {
                "field": "hourly_values.seconds"
              }
            }
          }
        }
      }
    }
  }
}

This will bring back this result, which is what you're looking for

{
  "took": 7,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "Ids": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": 1808,
          "doc_count": 1,
          "Nesting": {
            "doc_count": 3,
            "SumSeconds": {
              "value": 2900
            }
          }
        },
        {
          "key": 1825,
          "doc_count": 1,
          "Nesting": {
            "doc_count": 3,
            "SumSeconds": {
              "value": 3600
            }
          }
        }
      ]
    }
  }
}

If you'd like to return documents next to them too, you could use Top Hits aggregation together with Nested Sum:

POST /test/_search
{
  "size": 0,
  "aggs": {
    "Ids": {
      "terms": {
        "field": "id",
        "size": 0
      },
      "aggs": {
        "Objects": {
          "top_hits": {
            "_source": ["id", "market", "geo_location"],
            "size": 1
          }
        },
        "Nesting": {
          "nested": {
            "path": "hourly_values"
          },
          "aggs": {
            "SumSeconds": {
              "sum": {
                "field": "hourly_values.seconds"
              }
            }
          }
        }
      }
    }
  }
}

And this would bring back it:

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "Ids": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": 1808,
          "doc_count": 1,
          "Nesting": {
            "doc_count": 3,
            "SumSeconds": {
              "value": 2900
            }
          },
          "Objects": {
            "hits": {
              "total": 1,
              "max_score": 1,
              "hits": [
                {
                  "_index": "test",
                  "_type": "data",
                  "_id": "2",
                  "_score": 1,
                  "_source": {
                    "market": "Chicago",
                    "geo_location": {
                      "lon": -87.5678,
                      "lat": 41.1234
                    },
                    "id": 1808
                  }
                }
              ]
            }
          }
        },
        {
          "key": 1825,
          "doc_count": 1,
          "Nesting": {
            "doc_count": 3,
            "SumSeconds": {
              "value": 3600
            }
          },
          "Objects": {
            "hits": {
              "total": 1,
              "max_score": 1,
              "hits": [
                {
                  "_index": "test",
                  "_type": "data",
                  "_id": "1",
                  "_score": 1,
                  "_source": {
                    "market": "Chicago",
                    "geo_location": {
                      "lon": -87.5678,
                      "lat": 41.1234
                    },
                    "id": 1825
                  }
                }
              ]
            }
          }
        }
      ]
    }
  }
}

Upvotes: 2

Related Questions