the industrial

ブログと言うより自分のためのメモ以外の何モノでもないです。でも読んでくださってありがとうございます。

Elasticsearch入門してます (5日目) とっとこAggregations

まごまごせんと、とりあえず1回Aggregations試そうぜ!ということで、今日はAggregations周りしかやらない。

Elasticsearchの本当にすごいところってたぶんlucene由来の全文検索なんだと思っているのだけど、そこまでたどり着いていない僕がゴイスーと思ってElasticsearchにハマりだしたのが、このAggregationsの機能。

なんというか、SQLを超えるほどの柔軟なクエリーが書けるのではないかと思ったのがElasticsearchのクエリーで、そんなElasticsearchのクエリーの真髄と言っても過言ではない(個人調べ)のが、その名の通り基本的に集計を行うためのAggregationsというもの。

Aggregations

公文はオフィシャルサイトから転記(いいのかな)しちゃうのだけど、こんな感じ。

"aggregations" : {
    "<aggregation_name>" : {
        "<aggregation_type>" : {
            <aggregation_body>
        }
        [,"meta" : {  [<meta_data_body>] } ]?
        [,"aggregations" : { [<sub_aggregation>]+ } ]?
    }
    [,"<aggregation_name_2>" : { ... } ]*
}

Aggregationsの中にさらにAggregationsを記述することが出来るのが地味に便利そう。さらに、1回のクエリーで2個以上のAggregationsを記述することができる。

さてさて、Aggregationsは基本的にBucketingMetricPipelineという3種類のタイプがあるらしい。

今日はその中でも基本っぽい?Bucketing Aggregationsの中を見ていきたい。

Index

前回作ったIndex定義の中で、nameに入っているデータがバリエーション豊かなので、nameにもTermが効くように"index": "not_analyzed"を付けておいた。

(この辺りはあとでちゃんとキャッチアップしなきゃという焦り)

{
  "mappings": {
    "type-a": {
      "dynamic": "strict",
      "_source": {
        "enabled": true
      },
      "_all": {
        "enabled": false
      },
      "_ttl": {
        "enabled": false
      },
      "properties": {
        "name": {
          "type": "string",
          "index": "not_analyzed"
        },
        "attribute": {
          "type": "string",
          "index": "not_analyzed"
        },
        "period": {
          "type": "object",
          "properties": {
            "startDate": {
              "type": "date",
              "format": "YYYY-MM-dd'T'HH:mm:ss.SSSZ"
            },
            "endDate": {
              "type": "date",
              "format": "YYYY-MM-dd'T'HH:mm:ss.SSSZ"
            }
          }
        },
        "createdDate": {
          "type": "date",
          "format": "YYYY-MM-dd'T'HH:mm:ss.SSSZ"
        },
        "updatedDate": {
          "type": "date",
          "format": "YYYY-MM-dd'T'HH:mm:ss.SSSZ"
        }
      }
    },
    "type-b": {
      "dynamic": "strict",
      "_source": {
        "enabled": true
      },
      "_all": {
        "enabled": false
      },
      "_ttl": {
        "enabled": false
      },
      "properties": {
        "name": {
          "type": "string",
          "index": "not_analyzed"
        },
        "attribute": {
          "type": "string",
          "index": "not_analyzed"
        },
        "period": {
          "type": "object",
          "properties": {
            "startDate": {
              "type": "date",
              "format": "YYYY-MM-dd'T'HH:mm:ss.SSSZ"
            },
            "endDate": {
              "type": "date",
              "format": "YYYY-MM-dd'T'HH:mm:ss.SSSZ"
            }
          }
        },
        "createdDate": {
          "type": "date",
          "format": "YYYY-MM-dd'T'HH:mm:ss.SSSZ"
        },
        "updatedDate": {
          "type": "date",
          "format": "YYYY-MM-dd'T'HH:mm:ss.SSSZ"
        }
      }
    }
  }
}

テストデータ

前回は計2万行のデータを入れてみたのだけど、2万行も要らないし、しかもデータのバリエーションとして大したものがなかったので、ちょっと作りなおしてみた。

1月は1件、2月は2件...と、それぞれの月でその月と同じ数のデータがあり、データのCreatedDateをそれぞれの月に属させている。

https://goo.gl/sYlG1m

Bucketing

Bucket Aggregations

さらにいろいろなAggregationsがあるのだけど、Filter Aggregationsから見てみる。

Filter Aggregations

Filter Aggregation

これはとてもわかりやすひ。

AggregationsもやはりSearch APIに載せるので(たぶん)、CURLでやるとこんなクエリーになる。

$ curl -XGET 'localhost:9200/index-a/_search?pretty' -d '{
  "aggregations" : {
    "my_count_aggs" : {
      "filter" : {
        "term" : { "name": "NAME10001" }
      }
    }
  }
}'

結果こうなる。

{
  "took" : 5,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 156,
    "max_score" : 1.0,
    "hits" : [ {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10005",
      "_score" : 1.0,
      "_source":{ "name" : "NAME10005", "attribute" : "ATTR10005", "period" : { "startDate": "2015-03-08T15:00:00.000Z", "endDate": "2015-03-09T14:59:59.999Z" } , "createdDate" : "2015-03-08T15:00:00.000Z", "updatedDate" : "2015-03-08T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10014",
      "_score" : 1.0,
      "_source":{ "name" : "NAME10014", "attribute" : "ATTR10014", "period" : { "startDate": "2015-05-22T15:00:00.000Z", "endDate": "2015-05-23T14:59:59.999Z" } , "createdDate" : "2015-05-22T15:00:00.000Z", "updatedDate" : "2015-05-22T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10018",
      "_score" : 1.0,
      "_source":{ "name" : "NAME10018", "attribute" : "ATTR10018", "period" : { "startDate": "2015-06-15T15:00:00.000Z", "endDate": "2015-06-16T14:59:59.999Z" } , "createdDate" : "2015-06-15T15:00:00.000Z", "updatedDate" : "2015-06-15T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10020",
      "_score" : 1.0,
      "_source":{ "name" : "NAME10020", "attribute" : "ATTR10020", "period" : { "startDate": "2015-06-29T15:00:00.000Z", "endDate": "2015-06-30T15:59:59.999Z" } , "createdDate" : "2015-06-29T15:00:00.000Z", "updatedDate" : "2015-06-29T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10029",
      "_score" : 1.0,
      "_source":{ "name" : "NAME10029", "attribute" : "ATTR10009", "period" : { "startDate": "2015-08-01T15:00:00.000Z", "endDate": "2015-08-02T14:59:59.999Z" } , "createdDate" : "2015-08-01T15:00:00.000Z", "updatedDate" : "2015-08-01T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10039",
      "_score" : 1.0,
      "_source":{ "name" : "NAME10039", "attribute" : "ATTR10019", "period" : { "startDate": "2015-09-15T15:00:00.000Z", "endDate": "2015-09-16T14:59:59.999Z" } , "createdDate" : "2015-09-15T15:00:00.000Z", "updatedDate" : "2015-09-15T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10047",
      "_score" : 1.0,
      "_source":{ "name" : "NAME10047", "attribute" : "ATTR10007", "period" : { "startDate": "2015-10-08T15:00:00.000Z", "endDate": "2015-10-09T14:59:59.999Z" } , "createdDate" : "2015-10-08T15:00:00.000Z", "updatedDate" : "2015-10-08T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10063",
      "_score" : 1.0,
      "_source":{ "name" : "NAME10063", "attribute" : "ATTR10003", "period" : { "startDate": "2015-11-15T15:00:00.000Z", "endDate": "2015-11-16T14:59:59.999Z" } , "createdDate" : "2015-11-15T15:00:00.000Z", "updatedDate" : "2015-11-15T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10067",
      "_score" : 1.0,
      "_source":{ "name" : "NAME10067", "attribute" : "ATTR10007", "period" : { "startDate": "2015-12-01T15:00:00.000Z", "endDate": "2015-12-02T14:59:59.999Z" } , "createdDate" : "2015-12-01T15:00:00.000Z", "updatedDate" : "2015-12-01T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10073",
      "_score" : 1.0,
      "_source":{ "name" : "NAME10073", "attribute" : "ATTR10013", "period" : { "startDate": "2015-12-08T15:00:00.000Z", "endDate": "2015-12-09T14:59:59.999Z" } , "createdDate" : "2015-12-08T15:00:00.000Z", "updatedDate" : "2015-12-08T15:00:00.000Z" }
    } ]
  },
  "aggregations" : {
    "my_count_aggs" : {
      "doc_count" : 1
    }
  }
}

Search APIの検索条件にヒットしたデータ(上の例では条件指定なし)と、aggregationsで指定した条件に合致するデータ件数が、"aggregations"."{指定した名前}"."doc_count"という名前で帰ってくる。

Search APIでデータを絞った後に、Aggregationsを行う場合はこんな感じか。

2014年と2015年それぞれの12月の中で、"attribute:":"ATTR10008"のデータが何件存在するかを数える例。

$ curl -XGET 'localhost:9200/_search?pretty' -d '{
  "query" : {
    "or" : {
      "filters" : [ {
        "range" : {
          "createdDate" : {
            "gte" : "2014-12-01",
            "lte" : "2015-01-01",
            "format": "yyyy-MM-dd||yyyy-MM-dd"
          }
        }
      }, {
        "range" : {
          "createdDate" : {
            "gte" : "2015-12-01",
            "lte" : "2016-01-01",
            "format": "yyyy-MM-dd||yyyy-MM-dd"
          }
        }
      } ]
    }
  },
  "aggs" : {
    "my_count_aggs" : {
      "filter" : {
        "term" : { "attribute": "ATTR10008" }
      }
    }
  }
}'

結果。

{
  "took" : 27,
  "timed_out" : false,
  "_shards" : {
    "total" : 10,
    "successful" : 10,
    "failed" : 0
  },
  "hits" : {
    "total" : 36,
    "max_score" : 0.35355338,
    "hits" : [ {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10067",
      "_score" : 0.35355338,
      "_source":{ "name" : "NAME10067", "attribute" : "ATTR10007", "period" : { "startDate": "2015-12-01T15:00:00.000Z", "endDate": "2015-12-02T14:59:59.999Z" } , "createdDate" : "2015-12-01T15:00:00.000Z", "updatedDate" : "2015-12-01T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-a",
      "_id" : "10073",
      "_score" : 0.35355338,
      "_source":{ "name" : "NAME10073", "attribute" : "ATTR10013", "period" : { "startDate": "2015-12-08T15:00:00.000Z", "endDate": "2015-12-09T14:59:59.999Z" } , "createdDate" : "2015-12-08T15:00:00.000Z", "updatedDate" : "2015-12-08T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-b",
      "_id" : "20069",
      "_score" : 0.35355338,
      "_source":{ "name" : "NAME20069", "attribute" : "ATTR10009", "period" : { "startDate": "2014-12-15T15:00:00.000Z", "endDate": "2014-12-16T14:59:59.999Z" } , "createdDate" : "2014-12-15T15:00:00.000Z", "updatedDate" : "2014-12-15T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-b",
      "_id" : "20072",
      "_score" : 0.35355338,
      "_source":{ "name" : "NAME20072", "attribute" : "ATTR10012", "period" : { "startDate": "2014-12-01T15:00:00.000Z", "endDate": "2014-12-02T14:59:59.999Z" } , "createdDate" : "2014-12-01T15:00:00.000Z", "updatedDate" : "2014-12-01T15:00:00.000Z" }
    }, {
      "_index" : "index-b",
      "_type" : "type-b",
      "_id" : "20069",
      "_score" : 0.35355338,
      "_source":{ "name" : "NAME20069", "attribute" : "ATTR10009", "period" : { "startDate": "2014-12-15T15:00:00.000Z", "endDate": "2014-12-16T14:59:59.999Z" } , "createdDate" : "2014-12-15T15:00:00.000Z", "updatedDate" : "2014-12-15T15:00:00.000Z" }
    }, {
      "_index" : "index-b",
      "_type" : "type-b",
      "_id" : "20072",
      "_score" : 0.35355338,
      "_source":{ "name" : "NAME20072", "attribute" : "ATTR10012", "period" : { "startDate": "2014-12-01T15:00:00.000Z", "endDate": "2014-12-02T14:59:59.999Z" } , "createdDate" : "2014-12-01T15:00:00.000Z", "updatedDate" : "2014-12-01T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-b",
      "_id" : "20068",
      "_score" : 0.35355338,
      "_source":{ "name" : "NAME20068", "attribute" : "ATTR10008", "period" : { "startDate": "2014-12-08T15:00:00.000Z", "endDate": "2014-12-09T14:59:59.999Z" } , "createdDate" : "2014-12-08T15:00:00.000Z", "updatedDate" : "2014-12-08T15:00:00.000Z" }
    }, {
      "_index" : "index-a",
      "_type" : "type-b",
      "_id" : "20073",
      "_score" : 0.35355338,
      "_source":{ "name" : "NAME20073", "attribute" : "ATTR10013", "period" : { "startDate": "2014-12-08T15:00:00.000Z", "endDate": "2014-12-09T14:59:59.999Z" } , "createdDate" : "2014-12-08T15:00:00.000Z", "updatedDate" : "2014-12-08T15:00:00.000Z" }
    }, {
      "_index" : "index-b",
      "_type" : "type-b",
      "_id" : "20068",
      "_score" : 0.35355338,
      "_source":{ "name" : "NAME20068", "attribute" : "ATTR10008", "period" : { "startDate": "2014-12-08T15:00:00.000Z", "endDate": "2014-12-09T14:59:59.999Z" } , "createdDate" : "2014-12-08T15:00:00.000Z", "updatedDate" : "2014-12-08T15:00:00.000Z" }
    }, {
      "_index" : "index-b",
      "_type" : "type-b",
      "_id" : "20073",
      "_score" : 0.35355338,
      "_source":{ "name" : "NAME20073", "attribute" : "ATTR10013", "period" : { "startDate": "2014-12-08T15:00:00.000Z", "endDate": "2014-12-09T14:59:59.999Z" } , "createdDate" : "2014-12-08T15:00:00.000Z", "updatedDate" : "2014-12-08T15:00:00.000Z" }
    } ]
  },
  "aggregations" : {
    "my_count_aggs" : {
      "doc_count" : 2
    }
  }
}

ワーオ!柔軟!

"aggregations""aggs"(卵達?)でも良い。

curl -XGET 'localhost:9200/index-a/_search?pretty' -d '{
  "aggs" : {
    "my_count_aggs" : {
      "filter" : {
        "term" : { "name": "NAME00001" }
      }
    }
  }
}'

これは別件かもしれないのだけど、リクエストパラメータにsearch_type=countとつけると、hitsの中身が省略される。(データ転送量が減るのが良い?)

$ curl -XGET 'localhost:9200/_search?search_type=count&pretty' -d '{
  "query" : {
    "or" : {
      "filters" : [ {
        "range" : {
          "createdDate" : {
            "gte" : "2014-12-01",
            "lte" : "2015-01-01",
            "format": "yyyy-MM-dd||yyyy-MM-dd"
          }
        }
      }, {
        "range" : {
          "createdDate" : {
            "gte" : "2015-12-01",
            "lte" : "2016-01-01",
            "format": "yyyy-MM-dd||yyyy-MM-dd"
          }
        }
      } ]
    }
  },
  "aggs" : {
    "my_count_aggs" : {
      "filter" : {
        "term" : { "attribute": "ATTR10008" }
      }
    }
  }
}'
{
  "took" : 8,
  "timed_out" : false,
  "_shards" : {
    "total" : 10,
    "successful" : 10,
    "failed" : 0
  },
  "hits" : {
    "total" : 36,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "my_count_aggs" : {
      "doc_count" : 2
    }
  }
}

Aggregationsのfiterを使えば、OR条件で指定した値ごとにカウントができる。

$ curl -XGET 'localhost:9200/index-a/_search?search_type=count&pretty' -d '{
  "aggregations" : {
    "my_count_aggs" : {
      "filter" : {
        "or" : {
          "filters" : [ {
            "term" : { "attribute" : "ATTR10001" }
          }, {
            "term" : { "attribute" : "ATTR10002" }
          }, {
            "term" : { "attribute" : "ATTR10003" }
          } ]
        }
      },
      "aggs" : {
        "by_term" : {
          "terms" : { "field" : "attribute" }
        }
      }
    }
  }
}'
{
  "took" : 7,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 156,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "my_count_aggs" : {
      "doc_count" : 24,
      "by_term" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 0,
        "buckets" : [ {
          "key" : "ATTR10001",
          "doc_count" : 8
        }, {
          "key" : "ATTR10002",
          "doc_count" : 8
        }, {
          "key" : "ATTR10003",
          "doc_count" : 8
        } ]
      }
    }
  }
}

次は、仕事でdate_histogramをネストさせてた面白いAggregationsが出来たので、そこらへんをやるべし。