From 8bb5c5cc0740d6d316fdf7623e874a8b8ba452b3 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Tue, 2 Mar 2021 16:40:35 -0500 Subject: [PATCH 01/20] [ML] adding support for composite aggs in anomaly detection --- .../ml-configuring-aggregations.asciidoc | 234 ++++++++++++++---- docs/reference/ml/ml-shared.asciidoc | 13 +- .../DateHistogramValuesSourceBuilder.java | 10 +- .../core/ml/datafeed/DatafeedConfig.java | 159 ++++++++++-- .../ml/datafeed/DatafeedJobValidator.java | 2 + .../ml/datafeed/extractor/ExtractorUtils.java | 92 ++++++- .../xpack/core/ml/job/messages/Messages.java | 8 + .../core/ml/datafeed/DatafeedConfigTests.java | 134 +++++++++- .../ml/integration/DatafeedWithAggsIT.java | 52 +++- .../action/TransportStartDatafeedAction.java | 16 ++ .../extractor/DataExtractorFactory.java | 48 +++- .../AbstractAggregationDataExtractor.java | 2 +- .../AggregatedSearchRequestBuilder.java | 24 ++ .../AggregationDataExtractorContext.java | 2 +- .../AggregationDataExtractorFactory.java | 16 ++ .../AggregationToJsonProcessor.java | 87 ++++++- .../CompositeAggregationDataExtractor.java | 177 +++++++++++++ ...positeAggregationDataExtractorContext.java | 61 +++++ ...positeAggregationDataExtractorFactory.java | 124 ++++++++++ .../RollupDataExtractorFactory.java | 17 ++ .../aggregation/AggregationTestUtils.java | 29 +++ .../AggregationToJsonProcessorTests.java | 157 ++++++++++-- 22 files changed, 1321 insertions(+), 143 deletions(-) create mode 100644 x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregatedSearchRequestBuilder.java create mode 100644 x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java create mode 100644 x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorContext.java create mode 100644 x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorFactory.java diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc index bdd59e3dde9d2..6b15fed411dd6 100644 --- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc +++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc @@ -11,51 +11,54 @@ distributes these calculations across your cluster. You can then feed this aggregated data into the {ml-features} instead of raw results, which reduces the volume of data that must be considered while detecting anomalies. -TIP: If you use a terms aggregation and the cardinality of a term is high, the -aggregation might not be effective and you might want to just use the default -search and scroll behavior. +TIP: If you use a terms aggregation and the cardinality of a term is high, you +should use composite aggregations. [discrete] [[aggs-limits-dfeeds]] == Requirements and limitations There are some limitations to using aggregations in {dfeeds}. Your aggregation -must include a `date_histogram` aggregation, which in turn must contain a `max` -aggregation on the time field. This requirement ensures that the aggregated data -is a time series and the timestamp of each bucket is the time of the last record -in the bucket. 
-
-IMPORTANT: The name of the aggregation and the name of the field that the agg
-operates on need to match, otherwise the aggregation doesn't work. For example,
-if you use a `max` aggregation on a time field called `responsetime`, the name
+must include a `date_histogram` aggregation or a top level `composite` aggregation,
+which in turn must contain a `max` aggregation on the time field.
+This requirement ensures that the aggregated data is a time series and the timestamp
+of each bucket is the time of the last record in the bucket.
+
+For `composite` aggregation support, there must be exactly one `date_histogram` value
+source. Additionally, that value source must NOT be sorted in descending order. Additional
+`composite` aggregation value sources are allowed, such as `terms`.
+
+IMPORTANT: The name of the aggregation and the name of the field that the agg
+operates on need to match, otherwise the aggregation doesn't work. For example,
+if you use a `max` aggregation on a time field called `responsetime`, the name
 of the aggregation must be also `responsetime`.
 
-You must also consider the interval of the date histogram aggregation carefully.
-The bucket span of your {anomaly-job} must be divisible by the value of the
-`calendar_interval` or `fixed_interval` in your aggregation (with no remainder).
-If you specify a `frequency` for your {dfeed}, it must also be divisible by this
-interval. {anomaly-jobs-cap} cannot use date histograms with an interval
-measured in months because the length of the month is not fixed. {dfeeds-cap}
-tolerate weeks or smaller units.
+You must also consider the interval of the `date_histogram` or `composite`
+aggregation carefully. The bucket span of your {anomaly-job} must be divisible
+by the value of the `calendar_interval` or `fixed_interval` in your aggregation
+(with no remainder). If you specify a `frequency` for your {dfeed},
+it must also be divisible by this interval. {anomaly-jobs-cap} cannot use
+`date_histogram` or `composite` aggs with an interval measured in months
+because the length of the month is not fixed. {dfeeds-cap} tolerate weeks or smaller units.
 
 TIP: As a rule of thumb, if your detectors use <<ml-metric-functions,metric>> or
-<<ml-sum-functions,sum>> analytical functions, set the date histogram
+<<ml-sum-functions,sum>> analytical functions, set the `date_histogram` or `composite`
 aggregation interval to a tenth of the bucket span. This suggestion creates
 finer, more granular time buckets, which are ideal for this type of analysis. If
 your detectors use <<ml-count-functions,count>> or <<ml-rare-functions,rare>>
 functions, set the interval to the same value as the bucket span.
 
-If your {dfeed} uses aggregations with nested `terms` aggs and
-model plot is not enabled for the {anomaly-job}, neither the **Single Metric
-Viewer** nor the **Anomaly Explorer** can plot and display an anomaly
-chart for the job. In these cases, the charts are not visible and an explanatory
+If your {dfeed} uses aggregations with nested `terms` aggs and
+model plot is not enabled for the {anomaly-job}, neither the **Single Metric
+Viewer** nor the **Anomaly Explorer** can plot and display an anomaly
+chart for the job. In these cases, the charts are not visible and an explanatory
 message is shown.
 
-When the aggregation interval of the {dfeed} and the bucket span of the job
-don't match, the values of the chart plotted in both the **Single Metric
-Viewer** and the **Anomaly Explorer** differ from the actual values of the job.
-To avoid this behavior, make sure that the aggregation interval in the {dfeed}
-configuration and the bucket span in the {anomaly-job} configuration have the
+When the aggregation interval of the {dfeed} and the bucket span of the job
+don't match, the values of the chart plotted in both the **Single Metric
+Viewer** and the **Anomaly Explorer** differ from the actual values of the job.
+To avoid this behavior, make sure that the aggregation interval in the {dfeed}
+configuration and the bucket span in the {anomaly-job} configuration have the
 same values.
 
 
@@ -86,8 +89,8 @@ PUT _ml/anomaly_detectors/farequote
 ----------------------------------
 // TEST[skip:setup:farequote_data]
 
-<1> The `airline`, `responsetime`, and `time` fields are aggregations. Only the
-aggregated fields defined in the `analysis_config` object are analyzed by the
+<1> The `airline`, `responsetime`, and `time` fields are aggregations. Only the
+aggregated fields defined in the `analysis_config` object are analyzed by the
 {anomaly-job}.
 
 NOTE: When the `summary_count_field_name` property is set to a non-null value,
@@ -134,23 +137,116 @@ PUT _ml/datafeeds/datafeed-farequote
 ----------------------------------
 // TEST[skip:setup:farequote_job]
 
-<1> The aggregations have names that match the fields that they operate on. The
+<1> The aggregations have names that match the fields that they operate on. The
 `max` aggregation is named `time` and its field also needs to be `time`.
-<2> The `term` aggregation is named `airline` and its field is also named
+<2> The `term` aggregation is named `airline` and its field is also named
 `airline`.
-<3> The `avg` aggregation is named `responsetime` and its field is also named
+<3> The `avg` aggregation is named `responsetime` and its field is also named
 `responsetime`.
 
-Your {dfeed} can contain multiple aggregations, but only the ones with names
+TIP: If you are using a `term` aggregation to gather influencer or partition
+detector field information, consider using a `composite` aggregation. It performs
+better than a `date_histogram` with a nested `term` aggregation and also includes
+ALL the values of the field instead of the top values per bucket.
+
+Your {dfeed} can contain multiple aggregations, but only the ones with names
 that match values in the job configuration are fed to the job.
 
+Here is a similar example that uses a `composite` aggregation instead of a
+`date_histogram`. First, create a job with the same configuration as above:
+
+[source,console]
+----------------------------------
+PUT _ml/anomaly_detectors/farequote-composite
+{
+  "analysis_config": {
+    "bucket_span": "60m",
+    "detectors": [{
+      "function": "mean",
+      "field_name": "responsetime",
+      "by_field_name": "airline"
+    }],
+    "summary_count_field_name": "doc_count"
+  },
+  "data_description": {
+    "time_field":"time"
+  }
+}
+----------------------------------
+// TEST[skip:setup:farequote_data]
+
+Next, create a datafeed that uses a `composite` aggregation to bucket
+the metrics based on time and terms:
+[source,console]
+----------------------------------
+PUT _ml/datafeeds/datafeed-farequote-composite
+{
+  "job_id": "farequote-composite",
+  "indices": [
+    "farequote"
+  ],
+  "aggregations": {
+    "buckets": {
+      "composite": {
+        "size": 1000, <1>
+        "sources": [
+          {
+            "time_bucket": { <2>
+              "date_histogram": {
+                "field": "time",
+                "fixed_interval": "360s",
+                "time_zone": "UTC"
+              }
+            }
+          },
+          {
+            "airline": { <3>
+              "terms": {
+                "field": "airline"
+              }
+            }
+          }
+        ]
+      },
+      "aggregations": {
+        "time": { <4>
+          "max": {
+            "field": "time"
+          }
+        },
+        "responsetime": { <5>
+          "avg": {
+            "field": "responsetime"
+          }
+        }
+      }
+    }
+  }
+}
+----------------------------------
+// TEST[skip:setup:farequote_job]
+
+<1> Provide the `size` to the composite agg to control how many resources
+are used when aggregating the data. A larger `size` means a faster datafeed but
+more cluster resources are used when searching.
+<2> The required `date_histogram` composite aggregation source. Make sure it
+is named differently from your desired time field.
+<3> Instead of using a regular `term` aggregation, add a composite
+aggregation `term` source with the name `airline`. Note that its name
+is the same as the field.
+<4> The required `max` aggregation whose name is the time field in the
+job analysis config.
+<5> The `avg` aggregation is named `responsetime` and its field is also named
+`responsetime`.
 
 [discrete]
 [[aggs-dfeeds]]
 == Nested aggregations in {dfeeds}
 
-{dfeeds-cap} support complex nested aggregations. This example uses the
-`derivative` pipeline aggregation to find the first order derivative of the
+{dfeeds-cap} support complex nested aggregations. This example uses the
+`derivative` pipeline aggregation to find the first order derivative of the
 counter `system.network.out.bytes` for each value of the field `beat.name`.
 
 [source,js]
@@ -247,8 +343,9 @@ number of unique entries for the `error` field.
 [[aggs-define-dfeeds]]
 == Defining aggregations in {dfeeds}
 
-When you define an aggregation in a {dfeed}, it must have the following form:
+When you define an aggregation in a {dfeed}, it must have one of the following forms.
 
+When using a `date_histogram` aggregation to bucket by time:
 [source,js]
 ----------------------------------
 "aggregations": {
@@ -282,26 +379,64 @@ When you define an aggregation in a {dfeed}, it must have the following form:
 ----------------------------------
 // NOTCONSOLE
 
+When using a `composite` aggregation:
+
+[source,js]
+----------------------------------
+"aggregations": {
+  "composite_agg": {
+    "sources": [
+      {
+        "date_histogram_agg": {
+          "field": "time",
+          ...settings...
+        }
+      },
+      ...other valid sources...
+    ],
+    ...composite agg settings...,
+    "aggregations": {
+      "timestamp": {
+        "max": {
+          "field": "time"
+        }
+      },
+      ...other aggregations...
+      [
+        [,"aggregations" : {
+          [<sub_aggregation>]+
+        } ]
+      ]
+    }
+  }
+}
+----------------------------------
+// NOTCONSOLE
+
 The top level aggregation must be either a
 {ref}/search-aggregations-bucket.html[bucket aggregation] containing as single
-sub-aggregation that is a `date_histogram` or the top level aggregation is the
-required `date_histogram`. There must be exactly one `date_histogram`
-aggregation. For more information, see
-{ref}/search-aggregations-bucket-datehistogram-aggregation.html[Date histogram aggregation].
+sub-aggregation that is a `date_histogram`, the top level aggregation is the
+required `date_histogram`, or the top level aggregation is the required `composite`.
+There must be exactly one `date_histogram` or `composite` aggregation.
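+
+For instance, a `composite` aggregation whose `date_histogram` source is
+configured like the following sketch is rejected, because the source is
+sorted in descending order (the field and interval values here are only
+illustrative):
+
+[source,js]
+----------------------------------
+"sources": [
+  {
+    "time_bucket": {
+      "date_histogram": {
+        "field": "time",
+        "fixed_interval": "360s",
+        "order": "desc"
+      }
+    }
+  }
+]
+----------------------------------
+// NOTCONSOLE
+
+Setting `"missing_bucket": true` on the `date_histogram` source is rejected in
+the same way; only a single, ascending `date_histogram` source is allowed.
+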
For more information, see +{ref}/search-aggregations-bucket-datehistogram-aggregation.html[Date histogram aggregation] and +{ref}/search-aggregations-bucket-composite-aggregation.html[Composite aggregation]. NOTE: The `time_zone` parameter in the date histogram aggregation must be set to `UTC`, which is the default value. -Each histogram bucket has a key, which is the bucket start time. This key cannot -be used for aggregations in {dfeeds}, however, because they need to know the -time of the latest record within a bucket. Otherwise, when you restart a -{dfeed}, it continues from the start time of the histogram bucket and possibly -fetches the same data twice. The max aggregation for the time field is therefore -necessary to provide the time of the latest record within a bucket. +Each histogram or composite bucket has a key, which is the bucket start time. +This key cannot be used for aggregations in {dfeeds}, however, because +they need to know the time of the latest record within a bucket. +Otherwise, when you restart a {dfeed}, it continues from the start time of the +histogram or composite bucket and possibly fetches the same data twice. +The max aggregation for the time field is therefore necessary to provide +the time of the latest record within a bucket. You can optionally specify a terms aggregation, which creates buckets for different values of a field. +TIP: Instead of nesting a `term` aggregation, try using `composite` aggs. + IMPORTANT: If you use a terms aggregation, by default it returns buckets for the top ten terms. Thus if the cardinality of the term is greater than 10, not all terms are analyzed. @@ -311,7 +446,7 @@ determine the cardinality of your data, you can run searches such as: [source,js] -------------------------------------------------- -GET .../_search +GET .../_search { "aggs": { "service_cardinality": { @@ -324,10 +459,11 @@ GET .../_search -------------------------------------------------- // NOTCONSOLE + By default, {es} limits the maximum number of terms returned to 10000. For high cardinality fields, the query might not run. It might return errors related to circuit breaking exceptions that indicate that the data is too large. In such -cases, do not use aggregations in your {dfeed}. For more information, see +cases, use `composite` aggregations in your {dfeed}. For more information, see {ref}/search-aggregations-bucket-terms-aggregation.html[Terms aggregation]. You can also optionally specify multiple sub-aggregations. The sub-aggregations diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc index e5db0be318861..9fdc1ef306bf6 100644 --- a/docs/reference/ml/ml-shared.asciidoc +++ b/docs/reference/ml/ml-shared.asciidoc @@ -1,7 +1,6 @@ tag::aggregations[] If set, the {dfeed} performs aggregation searches. Support for aggregations is -limited and should be used only with low cardinality data. For more information, -see +limited. For more information, see {ml-docs}/ml-configuring-aggregation.html[Aggregating data for faster performance]. end::aggregations[] @@ -156,7 +155,8 @@ tag::bucket-span[] The size of the interval that the analysis is aggregated into, typically between `5m` and `1h`. The default value is `5m`. If the {anomaly-job} uses a {dfeed} with {ml-docs}/ml-configuring-aggregation.html[aggregations], this value must be -divisible by the interval of the date histogram aggregation. For more +divisible by the interval of the `date_histogram` aggregation or the +`date_histogram` source in the `composite` aggregation. 
For more information, see {ml-docs}/ml-buckets.html[Buckets]. end::bucket-span[] @@ -824,7 +824,8 @@ for longer bucket spans, a sensible fraction of the bucket span. For example: `150s`. When `frequency` is shorter than the bucket span, interim results for the last (partial) bucket are written then eventually overwritten by the full bucket results. If the {dfeed} uses aggregations, this value must be divisible -by the interval of the date histogram aggregation. +by the interval of the `date_histogram` or the `date_histogram` source in the +`composite` aggregation. end::frequency[] tag::frequent-category-count[] @@ -1129,8 +1130,8 @@ There are three available modes: * `auto`: The chunk size is dynamically calculated. This is the default and recommended value when the {dfeed} does not use aggregations. * `manual`: Chunking is applied according to the specified `time_span`. Use this -mode when the {dfeed} uses aggregations. -* `off`: No chunking is applied. +mode when the {dfeed} uses aggregations other than `composite`. +* `off`: No chunking is applied. Recommended for `composite` aggregations. -- end::mode[] diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/DateHistogramValuesSourceBuilder.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/DateHistogramValuesSourceBuilder.java index 767e6c5f153de..129f25446fbb9 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/DateHistogramValuesSourceBuilder.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/DateHistogramValuesSourceBuilder.java @@ -212,7 +212,10 @@ public DateHistogramValuesSourceBuilder fixedInterval(DateHistogramInterval inte * {@code null} then it means that the interval is expressed as a fixed * {@link TimeValue} and may be accessed via {@link #getIntervalAsFixed()} ()}. */ public DateHistogramInterval getIntervalAsCalendar() { - return dateHistogramInterval.getAsCalendarInterval(); + if (dateHistogramInterval.getIntervalType().equals(DateIntervalWrapper.IntervalTypeEnum.CALENDAR)) { + return dateHistogramInterval.getAsCalendarInterval(); + } + return null; } /** @@ -220,7 +223,10 @@ public DateHistogramInterval getIntervalAsCalendar() { * the interval cannot be parsed as a fixed time. 
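+     * A {@code null} return means the interval is not expressed as a fixed time;
+     * see {@link #getIntervalAsCalendar()} for calendar intervals.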
*/ public DateHistogramInterval getIntervalAsFixed() { - return dateHistogramInterval.getAsFixedInterval(); + if (dateHistogramInterval.getIntervalType().equals(DateIntervalWrapper.IntervalTypeEnum.FIXED)) { + return dateHistogramInterval.getAsFixedInterval(); + } + return null; } /** diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java index 7c61375478037..47a88408c14bf 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java @@ -28,13 +28,16 @@ import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.search.aggregations.AggregationBuilder; import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeValuesSourceBuilder; +import org.elasticsearch.search.aggregations.bucket.composite.DateHistogramValuesSourceBuilder; import org.elasticsearch.search.aggregations.metrics.MaxAggregationBuilder; import org.elasticsearch.search.aggregations.support.ValuesSourceAggregationBuilder; import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.xpack.core.common.time.TimeUtils; import org.elasticsearch.xpack.core.ml.datafeed.extractor.ExtractorUtils; import org.elasticsearch.xpack.core.ml.job.config.Job; -import org.elasticsearch.xpack.core.ml.job.messages.Messages; import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper; import org.elasticsearch.xpack.core.ml.utils.MlStrings; import org.elasticsearch.xpack.core.ml.utils.QueryProvider; @@ -56,6 +59,20 @@ import java.util.concurrent.TimeUnit; import static org.elasticsearch.xpack.core.ClientHelper.assertNoAuthorizationHeader; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_AGGREGATIONS_COMPOSITE_AGG_DATE_HISTOGRAM_SORT; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_AGGREGATIONS_COMPOSITE_AGG_DATE_HISTOGRAM_SOURCE_MISSING_BUCKET; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_AGGREGATIONS_COMPOSITE_AGG_MUST_BE_TOP_LEVEL_AND_ALONE; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_AGGREGATIONS_COMPOSITE_AGG_MUST_HAVE_SINGLE_DATE_SOURCE; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_AGGREGATIONS_INTERVAL_MUST_BE_GREATER_THAN_ZERO; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_AGGREGATIONS_MAX_ONE_DATE_HISTOGRAM; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_AGGREGATIONS_REQUIRES_DATE_HISTOGRAM; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_CONFIG_AGG_BAD_FORMAT; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_CONFIG_CANNOT_USE_SCRIPT_FIELDS_WITH_AGGS; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_CONFIG_INVALID_OPTION_VALUE; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_CONFIG_QUERY_BAD_FORMAT; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.DATAFEED_DATA_HISTOGRAM_MUST_HAVE_NESTED_MAX_AGGREGATION; +import static 
org.elasticsearch.xpack.core.ml.job.messages.Messages.INVALID_ID; +import static org.elasticsearch.xpack.core.ml.job.messages.Messages.getMessage; import static org.elasticsearch.xpack.core.ml.utils.ToXContentParams.EXCLUDE_GENERATED; /** @@ -119,11 +136,19 @@ public static void validateAggregations(AggregatorFactories.Builder aggregations } Collection aggregatorFactories = aggregations.getAggregatorFactories(); if (aggregatorFactories.isEmpty()) { - throw ExceptionsHelper.badRequestException(Messages.DATAFEED_AGGREGATIONS_REQUIRES_DATE_HISTOGRAM); + throw ExceptionsHelper.badRequestException(DATAFEED_AGGREGATIONS_REQUIRES_DATE_HISTOGRAM); } + Builder.checkForOnlySingleTopLevelCompositeAggAndValidate(aggregations.getAggregatorFactories()); AggregationBuilder histogramAggregation = ExtractorUtils.getHistogramAggregation(aggregatorFactories); + if (histogramAggregation instanceof CompositeAggregationBuilder + && aggregations.getPipelineAggregatorFactories().isEmpty() == false) { + throw ExceptionsHelper.badRequestException( + "when using composite aggregations, top level pipeline aggregations are not supported" + ); + } Builder.checkNoMoreHistogramAggregations(histogramAggregation.getSubAggregations()); + Builder.checkNoMoreCompositeAggregations(histogramAggregation.getSubAggregations()); Builder.checkHistogramAggregationHasChildMaxTimeAgg(histogramAggregation); Builder.checkHistogramIntervalIsPositive(histogramAggregation); } @@ -141,7 +166,7 @@ private static ObjectParser createParser(boolean ignoreUnknownFie parser.declareString((builder, val) -> builder.setFrequency(TimeValue.parseTimeValue(val, FREQUENCY.getPreferredName())), FREQUENCY); parser.declareObject(Builder::setQueryProvider, - (p, c) -> QueryProvider.fromXContent(p, ignoreUnknownFields, Messages.DATAFEED_CONFIG_QUERY_BAD_FORMAT), + (p, c) -> QueryProvider.fromXContent(p, ignoreUnknownFields, DATAFEED_CONFIG_QUERY_BAD_FORMAT), QUERY); parser.declareObject(Builder::setAggregationsSafe, (p, c) -> AggProvider.fromXContent(p, ignoreUnknownFields), @@ -321,7 +346,7 @@ private QueryBuilder parseQuery(NamedXContentRegistry namedXContentRegistry, Lis if (exception.getCause() instanceof IllegalArgumentException) { exception = (Exception)exception.getCause(); } - throw ExceptionsHelper.badRequestException(Messages.DATAFEED_CONFIG_QUERY_BAD_FORMAT, exception); + throw ExceptionsHelper.badRequestException(DATAFEED_CONFIG_QUERY_BAD_FORMAT, exception); } } @@ -369,7 +394,7 @@ private AggregatorFactories.Builder parseAggregations(NamedXContentRegistry name if (exception.getCause() instanceof IllegalArgumentException) { exception = (Exception)exception.getCause(); } - throw ExceptionsHelper.badRequestException(Messages.DATAFEED_CONFIG_AGG_BAD_FORMAT, exception); + throw ExceptionsHelper.badRequestException(DATAFEED_CONFIG_AGG_BAD_FORMAT, exception); } } @@ -402,6 +427,21 @@ public long getHistogramIntervalMillis(NamedXContentRegistry namedXContentRegist return ExtractorUtils.getHistogramIntervalMillis(getParsedAggregations(namedXContentRegistry)); } + /** + * Indicates if the datafeed is using composite aggs. 
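+     * Composite aggs are handled by a dedicated data extractor, and chunking
+     * is disabled for them by default.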
+ * @param namedXContentRegistry XContent registry to transform the lazily parsed aggregations + * @return If the datafeed utilizes composite aggs or not + */ + public boolean hasCompositeAgg(NamedXContentRegistry namedXContentRegistry) { + if (hasAggregations() == false) { + return false; + } + AggregationBuilder maybeComposite = ExtractorUtils.getHistogramAggregation( + getParsedAggregations(namedXContentRegistry).getAggregatorFactories() + ); + return maybeComposite instanceof CompositeAggregationBuilder; + } + /** * @return {@code true} when there are non-empty aggregations, {@code false} otherwise */ @@ -552,9 +592,13 @@ private static ChunkingConfig defaultChunkingConfig(@Nullable AggProvider aggPro if (aggProvider == null || aggProvider.getParsedAggs() == null) { return ChunkingConfig.newAuto(); } else { - long histogramIntervalMillis = ExtractorUtils.getHistogramIntervalMillis(aggProvider.getParsedAggs()); + AggregationBuilder histogram = ExtractorUtils.getHistogramAggregation(aggProvider.getParsedAggs().getAggregatorFactories()); + if (histogram instanceof CompositeAggregationBuilder) { + return ChunkingConfig.newOff(); + } + long histogramIntervalMillis = ExtractorUtils.getHistogramIntervalMillis(histogram); if (histogramIntervalMillis <= 0) { - throw ExceptionsHelper.badRequestException(Messages.DATAFEED_AGGREGATIONS_INTERVAL_MUST_BE_GREATER_THAN_ZERO); + throw ExceptionsHelper.badRequestException(DATAFEED_AGGREGATIONS_INTERVAL_MUST_BE_GREATER_THAN_ZERO); } return ChunkingConfig.newManual(TimeValue.timeValueMillis(DEFAULT_AGGREGATION_CHUNKING_BUCKETS * histogramIntervalMillis)); } @@ -781,8 +825,7 @@ public Builder setScriptFields(List scriptField public Builder setScrollSize(int scrollSize) { if (scrollSize < 0) { - String msg = Messages.getMessage(Messages.DATAFEED_CONFIG_INVALID_OPTION_VALUE, - DatafeedConfig.SCROLL_SIZE.getPreferredName(), scrollSize); + String msg = getMessage(DATAFEED_CONFIG_INVALID_OPTION_VALUE, DatafeedConfig.SCROLL_SIZE.getPreferredName(), scrollSize); throw ExceptionsHelper.badRequestException(msg); } this.scrollSize = scrollSize; @@ -803,8 +846,11 @@ public Builder setMaxEmptySearches(int maxEmptySearches) { if (maxEmptySearches == -1) { this.maxEmptySearches = null; } else if (maxEmptySearches <= 0) { - String msg = Messages.getMessage(Messages.DATAFEED_CONFIG_INVALID_OPTION_VALUE, - DatafeedConfig.MAX_EMPTY_SEARCHES.getPreferredName(), maxEmptySearches); + String msg = getMessage( + DATAFEED_CONFIG_INVALID_OPTION_VALUE, + DatafeedConfig.MAX_EMPTY_SEARCHES.getPreferredName(), + maxEmptySearches + ); throw ExceptionsHelper.badRequestException(msg); } else { this.maxEmptySearches = maxEmptySearches; @@ -831,7 +877,7 @@ public DatafeedConfig build() { ExceptionsHelper.requireNonNull(id, ID.getPreferredName()); ExceptionsHelper.requireNonNull(jobId, Job.ID.getPreferredName()); if (MlStrings.isValidId(id) == false) { - throw ExceptionsHelper.badRequestException(Messages.getMessage(Messages.INVALID_ID, ID.getPreferredName(), id)); + throw ExceptionsHelper.badRequestException(getMessage(INVALID_ID, ID.getPreferredName(), id)); } if (indices == null || indices.isEmpty() || indices.contains(null) || indices.contains("")) { throw invalidOptionValue(INDICES.getPreferredName(), indices); @@ -854,15 +900,14 @@ void validateScriptFields() { return; } if (scriptFields != null && scriptFields.isEmpty() == false) { - throw ExceptionsHelper.badRequestException( - Messages.getMessage(Messages.DATAFEED_CONFIG_CANNOT_USE_SCRIPT_FIELDS_WITH_AGGS)); + throw 
ExceptionsHelper.badRequestException(getMessage(DATAFEED_CONFIG_CANNOT_USE_SCRIPT_FIELDS_WITH_AGGS));
         }
     }
 
     private static void checkNoMoreHistogramAggregations(Collection<AggregationBuilder> aggregations) {
         for (AggregationBuilder agg : aggregations) {
             if (ExtractorUtils.isHistogram(agg)) {
-                throw ExceptionsHelper.badRequestException(Messages.DATAFEED_AGGREGATIONS_MAX_ONE_DATE_HISTOGRAM);
+                throw ExceptionsHelper.badRequestException(DATAFEED_AGGREGATIONS_MAX_ONE_DATE_HISTOGRAM);
             }
             checkNoMoreHistogramAggregations(agg.getSubAggregations());
         }
     }
@@ -873,6 +918,13 @@ static void checkHistogramAggregationHasChildMaxTimeAgg(AggregationBuilder histo
         if (histogramAggregation instanceof ValuesSourceAggregationBuilder) {
             timeField = ((ValuesSourceAggregationBuilder) histogramAggregation).field();
         }
+        if (histogramAggregation instanceof CompositeAggregationBuilder) {
+            DateHistogramValuesSourceBuilder valueSource = ExtractorUtils.getDateHistogramValuesSource(
+                (CompositeAggregationBuilder) histogramAggregation
+            );
+            assert valueSource != null : "unexpected null date_histogram composite agg source";
+            timeField = valueSource.field();
+        }
 
         for (AggregationBuilder agg : histogramAggregation.getSubAggregations()) {
             if (agg instanceof MaxAggregationBuilder) {
@@ -883,17 +935,86 @@ static void checkHistogramAggregationHasChildMaxTimeAgg(AggregationBuilder histo
             }
         }
 
-        throw ExceptionsHelper.badRequestException(
-            Messages.getMessage(Messages.DATAFEED_DATA_HISTOGRAM_MUST_HAVE_NESTED_MAX_AGGREGATION, timeField));
+        throw ExceptionsHelper.badRequestException(getMessage(DATAFEED_DATA_HISTOGRAM_MUST_HAVE_NESTED_MAX_AGGREGATION, timeField));
     }
 
     private static void checkHistogramIntervalIsPositive(AggregationBuilder histogramAggregation) {
         long interval = ExtractorUtils.getHistogramIntervalMillis(histogramAggregation);
         if (interval <= 0) {
-            throw ExceptionsHelper.badRequestException(Messages.DATAFEED_AGGREGATIONS_INTERVAL_MUST_BE_GREATER_THAN_ZERO);
+            throw ExceptionsHelper.badRequestException(DATAFEED_AGGREGATIONS_INTERVAL_MUST_BE_GREATER_THAN_ZERO);
+        }
+    }
+
+    static void validateCompositeAggregationSources(CompositeAggregationBuilder histogramAggregation) {
+        boolean hasDateValueSource = false;
+        DateHistogramValuesSourceBuilder foundBuilder = null;
+        for (CompositeValuesSourceBuilder<?> valueSource : histogramAggregation.sources()) {
+            if (valueSource instanceof DateHistogramValuesSourceBuilder) {
+                if (hasDateValueSource) {
+                    throw ExceptionsHelper.badRequestException(
+                        getMessage(
+                            DATAFEED_AGGREGATIONS_COMPOSITE_AGG_MUST_HAVE_SINGLE_DATE_SOURCE,
+                            histogramAggregation.getName()
+                        )
+                    );
+                }
+                hasDateValueSource = true;
+                foundBuilder = (DateHistogramValuesSourceBuilder) valueSource;
+            }
+        }
+        if (foundBuilder == null) {
+            throw ExceptionsHelper.badRequestException(
+                getMessage(
+                    DATAFEED_AGGREGATIONS_COMPOSITE_AGG_MUST_HAVE_SINGLE_DATE_SOURCE,
+                    histogramAggregation.getName()
+                )
+            );
+        }
+        if (foundBuilder.missingBucket()) {
+            throw ExceptionsHelper.badRequestException(
+                getMessage(
+                    DATAFEED_AGGREGATIONS_COMPOSITE_AGG_DATE_HISTOGRAM_SOURCE_MISSING_BUCKET,
+                    histogramAggregation.getName(),
+                    foundBuilder.name()
+                )
+            );
+        }
+        if (foundBuilder.order() != SortOrder.ASC) {
+            throw ExceptionsHelper.badRequestException(
+                getMessage(
+                    DATAFEED_AGGREGATIONS_COMPOSITE_AGG_DATE_HISTOGRAM_SORT,
+                    histogramAggregation.getName(),
+                    foundBuilder.name()
+                )
+            );
         }
     }
 
+    private static void checkForOnlySingleTopLevelCompositeAggAndValidate(Collection<AggregationBuilder> aggregationBuilders) {
+        Optional<AggregationBuilder> maybeComposite = aggregationBuilders.stream()
+            .filter(agg -> agg instanceof CompositeAggregationBuilder)
+            .findFirst();
+        if (maybeComposite.isEmpty() == false) {
+            CompositeAggregationBuilder composite = (CompositeAggregationBuilder) maybeComposite.get();
+            if (aggregationBuilders.size() > 1) {
+                throw ExceptionsHelper.badRequestException(
+                    getMessage(DATAFEED_AGGREGATIONS_COMPOSITE_AGG_MUST_BE_TOP_LEVEL_AND_ALONE, composite.getName())
+                );
+            }
+            validateCompositeAggregationSources(composite);
+        }
+    }
+
+    private static void checkNoMoreCompositeAggregations(Collection<AggregationBuilder> aggregations) {
+        for (AggregationBuilder agg : aggregations) {
+            if (agg instanceof CompositeAggregationBuilder) {
+                throw ExceptionsHelper.badRequestException(
+                    getMessage(DATAFEED_AGGREGATIONS_COMPOSITE_AGG_MUST_BE_TOP_LEVEL_AND_ALONE, agg.getName())
+                );
+            }
+            checkNoMoreCompositeAggregations(agg.getSubAggregations());
+        }
+    }
 
     private void setDefaultChunkingConfig() {
         if (chunkingConfig == null) {
             chunkingConfig = defaultChunkingConfig(aggProvider);
@@ -907,7 +1028,7 @@ private void setDefaultQueryDelay() {
     }
 
     private static ElasticsearchException invalidOptionValue(String fieldName, Object value) {
-        String msg = Messages.getMessage(Messages.DATAFEED_CONFIG_INVALID_OPTION_VALUE, fieldName, value);
+        String msg = getMessage(DATAFEED_CONFIG_INVALID_OPTION_VALUE, fieldName, value);
         throw ExceptionsHelper.badRequestException(msg);
     }
 }
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedJobValidator.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedJobValidator.java
index 191d740f170c2..c833374ae2d13 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedJobValidator.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedJobValidator.java
@@ -24,12 +24,14 @@ private DatafeedJobValidator() {}
      * Validates a datafeedConfig in relation to the job it refers to
      * @param datafeedConfig the datafeed config
      * @param job the job
+     * @param xContentRegistry the named xcontent registry for parsing datafeed aggs
      */
     public static void validate(DatafeedConfig datafeedConfig, Job job, NamedXContentRegistry xContentRegistry) {
         AnalysisConfig analysisConfig = job.getAnalysisConfig();
         if (analysisConfig.getLatency() != null && analysisConfig.getLatency().seconds() > 0) {
             throw ExceptionsHelper.badRequestException(Messages.getMessage(Messages.DATAFEED_DOES_NOT_SUPPORT_JOB_WITH_LATENCY));
         }
+        // TODO should we validate that the aggs define the fields requested in the analysis config?
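+        // Aggregated datafeeds also require a summary count field and an interval compatible with the job's bucket span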
        if (datafeedConfig.hasAggregations()) {
             checkSummaryCountFieldNameIsSet(analysisConfig);
             checkValidHistogramInterval(datafeedConfig, analysisConfig, xContentRegistry);
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/extractor/ExtractorUtils.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/extractor/ExtractorUtils.java
index 4a21faac4ca4c..90e88c09824fe 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/extractor/ExtractorUtils.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/extractor/ExtractorUtils.java
@@ -14,11 +14,16 @@ import org.elasticsearch.index.query.RangeQueryBuilder;
 import org.elasticsearch.search.aggregations.AggregationBuilder;
 import org.elasticsearch.search.aggregations.AggregatorFactories;
+import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregationBuilder;
+import org.elasticsearch.search.aggregations.bucket.composite.CompositeValuesSourceBuilder;
+import org.elasticsearch.search.aggregations.bucket.composite.DateHistogramValuesSourceBuilder;
 import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramAggregationBuilder;
+import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;
 import org.elasticsearch.search.aggregations.bucket.histogram.HistogramAggregationBuilder;
 import org.elasticsearch.xpack.core.ml.job.messages.Messages;
 import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
 
+import java.time.ZoneId;
 import java.time.ZoneOffset;
 import java.util.Collection;
 import java.util.concurrent.TimeUnit;
@@ -75,7 +80,22 @@ public static AggregationBuilder getHistogramAggregation(Collection<AggregationBuilder>
+    public static DateHistogramValuesSourceBuilder getDateHistogramValuesSource(CompositeAggregationBuilder compositeAggregationBuilder) {
+        for (CompositeValuesSourceBuilder<?> valuesSourceBuilder : compositeAggregationBuilder.sources()) {
+            if (valuesSourceBuilder instanceof DateHistogramValuesSourceBuilder) {
+                return (DateHistogramValuesSourceBuilder)valuesSourceBuilder;
+            }
+        }
+        return null;
+    }
 
     /**
@@ -91,7 +111,13 @@ public static long getHistogramIntervalMillis(AggregationBuilder histogramAggreg
         if (histogramAggregation instanceof HistogramAggregationBuilder) {
             return (long) ((HistogramAggregationBuilder) histogramAggregation).interval();
         } else if (histogramAggregation instanceof DateHistogramAggregationBuilder) {
-            return validateAndGetDateHistogramInterval((DateHistogramAggregationBuilder) histogramAggregation);
+            return validateAndGetDateHistogramInterval(
+                DateHistogramAggOrValueSource.fromAgg((DateHistogramAggregationBuilder) histogramAggregation)
+            );
+        } else if (histogramAggregation instanceof CompositeAggregationBuilder) {
+            return validateAndGetDateHistogramInterval(
+                DateHistogramAggOrValueSource.fromCompositeAgg((CompositeAggregationBuilder)histogramAggregation)
+            );
         } else {
             throw new IllegalStateException("Invalid histogram aggregation [" + histogramAggregation.getName() + "]");
         }
     }
@@ -101,7 +127,7 @@ public static long getHistogramIntervalMillis(AggregationBuilder histogramAggreg
      * Returns the date histogram interval as epoch millis if valid, or throws
      * an {@link ElasticsearchException} with the validation error
      */
-    private static long validateAndGetDateHistogramInterval(DateHistogramAggregationBuilder dateHistogram) {
+    private static long validateAndGetDateHistogramInterval(DateHistogramAggOrValueSource dateHistogram) {
         if (dateHistogram.timeZone() != null && dateHistogram.timeZone().normalized().equals(ZoneOffset.UTC) == false) {
             throw ExceptionsHelper.badRequestException("ML requires date_histogram.time_zone to be UTC");
} @@ -116,7 +142,7 @@ private static long validateAndGetDateHistogramInterval(DateHistogramAggregation } else if (dateHistogram.interval() != 0) { return dateHistogram.interval(); } else { - throw new IllegalArgumentException("Must specify an interval for DateHistogram"); + throw new IllegalArgumentException("Must specify an interval for date_histogram"); } } @@ -148,7 +174,7 @@ public static long validateAndGetCalendarInterval(String calendarInterval) { throw ExceptionsHelper.badRequestException("Unexpected dateTimeUnit [" + dateTimeUnit + "]"); } } else { - interval = TimeValue.parseTimeValue(calendarInterval, "date_histogram.interval"); + interval = TimeValue.parseTimeValue(calendarInterval, "date_histogram.calendar_interval"); } if (interval.days() > 7) { throw ExceptionsHelper.badRequestException(invalidDateHistogramCalendarIntervalMessage(calendarInterval)); @@ -162,4 +188,60 @@ private static String invalidDateHistogramCalendarIntervalMessage(String interva "variable lengths of periods greater than a week"); } + private static class DateHistogramAggOrValueSource { + + static DateHistogramAggOrValueSource fromAgg(DateHistogramAggregationBuilder agg) { + return new DateHistogramAggOrValueSource(agg, null); + } + + static DateHistogramAggOrValueSource fromCompositeAgg(CompositeAggregationBuilder compositeAggregationBuilder) { + DateHistogramValuesSourceBuilder valuesSourceBuilder = getDateHistogramValuesSource(compositeAggregationBuilder); + assert valuesSourceBuilder != null : "unexpected missing date histogram value source for composite agg [" + + compositeAggregationBuilder.getName() + + "]"; + return new DateHistogramAggOrValueSource(null, valuesSourceBuilder); + } + + private final DateHistogramAggregationBuilder agg; + private final DateHistogramValuesSourceBuilder sourceBuilder; + + private DateHistogramAggOrValueSource(DateHistogramAggregationBuilder agg, DateHistogramValuesSourceBuilder sourceBuilder) { + assert agg != null || sourceBuilder != null; + this.agg = agg; + this.sourceBuilder = sourceBuilder; + } + + private ZoneId timeZone() { + return agg != null ? + agg.timeZone() : + sourceBuilder.timeZone(); + } + + private DateHistogramInterval getFixedInterval() { + return agg != null ? + agg.getFixedInterval() : + sourceBuilder.getIntervalAsFixed(); + } + + private DateHistogramInterval getCalendarInterval() { + return agg != null ? + agg.getCalendarInterval() : + sourceBuilder.getIntervalAsCalendar(); + } + + @Deprecated + private DateHistogramInterval dateHistogramInterval() { + return agg != null ? + agg.dateHistogramInterval() : + sourceBuilder.dateHistogramInterval(); + } + + @Deprecated + private long interval() { + return agg != null ? 
+ agg.interval() : + sourceBuilder.interval(); + } + } + } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/messages/Messages.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/messages/Messages.java index 401e6660f8917..294d7560ef1a3 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/messages/Messages.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/messages/Messages.java @@ -50,6 +50,14 @@ public final class Messages { public static final String DATAFEED_MISSING_MAX_AGGREGATION_FOR_TIME_FIELD = "Missing max aggregation for time_field [{0}]"; public static final String DATAFEED_FREQUENCY_MUST_BE_MULTIPLE_OF_AGGREGATIONS_INTERVAL = "Datafeed frequency [{0}] must be a multiple of the aggregation interval [{1}]"; + public static final String DATAFEED_AGGREGATIONS_COMPOSITE_AGG_MUST_HAVE_SINGLE_DATE_SOURCE = + "Composite aggregation [{0}] must have exactly one date_histogram source"; + public static final String DATAFEED_AGGREGATIONS_COMPOSITE_AGG_DATE_HISTOGRAM_SOURCE_MISSING_BUCKET = + "Datafeed composite aggregation [{0}] date_histogram [{1}] source does not support missing_buckets"; + public static final String DATAFEED_AGGREGATIONS_COMPOSITE_AGG_DATE_HISTOGRAM_SORT = + "Datafeed composite aggregation [{0}] date_histogram [{1}] must be sorted in ascending order"; + public static final String DATAFEED_AGGREGATIONS_COMPOSITE_AGG_MUST_BE_TOP_LEVEL_AND_ALONE = + "Composite aggregation [{0}] must be the only composite agg and should be the only top level aggregation"; public static final String DATAFEED_ID_ALREADY_TAKEN = "A datafeed with id [{0}] already exists"; public static final String DATAFEED_NEEDS_REMOTE_CLUSTER_SEARCH = "Datafeed [{0}] is configured with a remote index pattern(s) {1}" + " but the current node [{2}] is not allowed to connect to remote clusters." 
+ diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfigTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfigTests.java index 00c0b62adce1d..e32d3c1f343ab 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfigTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfigTests.java @@ -35,9 +35,13 @@ import org.elasticsearch.index.query.TermQueryBuilder; import org.elasticsearch.script.Script; import org.elasticsearch.search.SearchModule; +import org.elasticsearch.search.aggregations.AggregationBuilder; import org.elasticsearch.search.aggregations.AggregationBuilders; import org.elasticsearch.search.aggregations.AggregatorFactories; import org.elasticsearch.search.aggregations.PipelineAggregatorBuilders; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.composite.DateHistogramValuesSourceBuilder; +import org.elasticsearch.search.aggregations.bucket.composite.TermsValuesSourceBuilder; import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramAggregationBuilder; import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval; import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder; @@ -58,6 +62,7 @@ import java.time.ZoneId; import java.time.ZoneOffset; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -121,8 +126,19 @@ private static DatafeedConfig.Builder createRandomizedDatafeedConfigBuilder(Stri aggHistogramInterval = aggHistogramInterval> bucketSpanMillis ? bucketSpanMillis : aggHistogramInterval; aggHistogramInterval = aggHistogramInterval <= 0 ? 1 : aggHistogramInterval; MaxAggregationBuilder maxTime = AggregationBuilders.max("time").field("time"); - aggs.addAggregator(AggregationBuilders.dateHistogram("buckets") - .fixedInterval(new DateHistogramInterval(aggHistogramInterval + "ms")).subAggregation(maxTime).field("time")); + AggregationBuilder topAgg = randomBoolean() ? 
+ AggregationBuilders.dateHistogram("buckets") + .field("time") + .fixedInterval(new DateHistogramInterval(aggHistogramInterval + "ms")) : + AggregationBuilders.composite( + "buckets", + Collections.singletonList( + new DateHistogramValuesSourceBuilder("time") + .field("time") + .fixedInterval(new DateHistogramInterval(aggHistogramInterval + "ms")) + ) + ); + aggs.addAggregator(topAgg.subAggregation(maxTime)); builder.setParsedAggregations(aggs); } if (randomBoolean()) { @@ -659,6 +675,63 @@ public void testCheckHistogramAggregationHasChildMaxTimeAgg() { assertThat(e.getMessage(), containsString("Date histogram must have nested max aggregation for time_field [max_time]")); } + public void testValidateCompositeAggValueSources() { + + { // must have exactly one date value source + CompositeAggregationBuilder aggregationBuilder = AggregationBuilders.composite( + "buckets", + Arrays.asList(new TermsValuesSourceBuilder("foo").field("bar")) + ); + ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, + () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilder)); + assertThat(ex.getMessage(), containsString("must have exactly one date_histogram source")); + + CompositeAggregationBuilder aggregationBuilderWithMoreDateHisto = AggregationBuilders.composite( + "buckets", + Arrays.asList( + new TermsValuesSourceBuilder("foo").field("bar"), + new DateHistogramValuesSourceBuilder("date1").field("time").fixedInterval(DateHistogramInterval.days(1)), + new DateHistogramValuesSourceBuilder("date2").field("time").fixedInterval(DateHistogramInterval.days(1)) + ) + ); + ex = expectThrows(ElasticsearchStatusException.class, + () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilderWithMoreDateHisto)); + assertThat(ex.getMessage(), containsString("must have exactly one date_histogram source")); + } + + { // date histo with missing bucket + CompositeAggregationBuilder aggregationBuilder = AggregationBuilders.composite( + "buckets", + Arrays.asList( + new TermsValuesSourceBuilder("foo").field("bar"), + new DateHistogramValuesSourceBuilder("date1") + .field("time") + .fixedInterval(DateHistogramInterval.days(1)) + .missingBucket(true) + ) + ); + ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, + () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilder)); + assertThat(ex.getMessage(), containsString("does not support missing_buckets")); + } + + { // date histo ordered descended + CompositeAggregationBuilder aggregationBuilder = AggregationBuilders.composite( + "buckets", + Arrays.asList( + new TermsValuesSourceBuilder("foo").field("bar"), + new DateHistogramValuesSourceBuilder("date1") + .field("time") + .fixedInterval(DateHistogramInterval.days(1)) + .order("desc") + ) + ); + ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, + () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilder)); + assertThat(ex.getMessage(), containsString("must be sorted in ascending order")); + } + } + public void testValidateAggregations_GivenMulitpleHistogramAggs() { DateHistogramAggregationBuilder nestedDateHistogram = AggregationBuilders.dateHistogram("nested_time"); AvgAggregationBuilder avg = AggregationBuilders.avg("avg").subAggregation(nestedDateHistogram); @@ -713,8 +786,10 @@ public void testDefaultFrequency_GivenNoAggregations() { assertEquals(TimeValue.timeValueHours(1), datafeed.defaultFrequency(TimeValue.timeValueHours(48), 
xContentRegistry())); } - public void testDefaultFrequency_GivenAggregationsWithHistogramInterval_1_Second() { - DatafeedConfig datafeed = createDatafeedWithDateHistogram("1s"); + public void testDefaultFrequency_GivenAggregationsWithHistogramOrCompositeInterval_1_Second() { + DatafeedConfig datafeed = randomBoolean() ? + createDatafeedWithDateHistogram("1s") : + createDatafeedWithCompositeAgg("1s"); assertEquals(TimeValue.timeValueMinutes(1), datafeed.defaultFrequency(TimeValue.timeValueSeconds(60), xContentRegistry())); assertEquals(TimeValue.timeValueMinutes(1), datafeed.defaultFrequency(TimeValue.timeValueSeconds(90), xContentRegistry())); @@ -726,8 +801,10 @@ public void testDefaultFrequency_GivenAggregationsWithHistogramInterval_1_Second assertEquals(TimeValue.timeValueHours(1), datafeed.defaultFrequency(TimeValue.timeValueHours(13), xContentRegistry())); } - public void testDefaultFrequency_GivenAggregationsWithHistogramInterval_1_Minute() { - DatafeedConfig datafeed = createDatafeedWithDateHistogram("1m"); + public void testDefaultFrequency_GivenAggregationsWithHistogramOrCompositeInterval_1_Minute() { + DatafeedConfig datafeed = randomBoolean() ? + createDatafeedWithDateHistogram("1m") : + createDatafeedWithCompositeAgg("1m"); assertEquals(TimeValue.timeValueMinutes(1), datafeed.defaultFrequency(TimeValue.timeValueSeconds(60), xContentRegistry())); assertEquals(TimeValue.timeValueMinutes(1), datafeed.defaultFrequency(TimeValue.timeValueSeconds(90), xContentRegistry())); @@ -745,9 +822,10 @@ public void testDefaultFrequency_GivenAggregationsWithHistogramInterval_1_Minute assertEquals(TimeValue.timeValueHours(1), datafeed.defaultFrequency(TimeValue.timeValueHours(72), xContentRegistry())); } - public void testDefaultFrequency_GivenAggregationsWithHistogramInterval_10_Minutes() { - DatafeedConfig datafeed = createDatafeedWithDateHistogram("10m"); - + public void testDefaultFrequency_GivenAggregationsWithHistogramOrCompositeInterval_10_Minutes() { + DatafeedConfig datafeed = randomBoolean() ? + createDatafeedWithDateHistogram("10m") : + createDatafeedWithCompositeAgg("10m"); assertEquals(TimeValue.timeValueMinutes(10), datafeed.defaultFrequency(TimeValue.timeValueMinutes(10), xContentRegistry())); assertEquals(TimeValue.timeValueMinutes(10), datafeed.defaultFrequency(TimeValue.timeValueMinutes(20), xContentRegistry())); assertEquals(TimeValue.timeValueMinutes(10), datafeed.defaultFrequency(TimeValue.timeValueMinutes(30), xContentRegistry())); @@ -755,9 +833,10 @@ public void testDefaultFrequency_GivenAggregationsWithHistogramInterval_10_Minut assertEquals(TimeValue.timeValueHours(1), datafeed.defaultFrequency(TimeValue.timeValueMinutes(13 * 60), xContentRegistry())); } - public void testDefaultFrequency_GivenAggregationsWithHistogramInterval_1_Hour() { - DatafeedConfig datafeed = createDatafeedWithDateHistogram("1h"); - + public void testDefaultFrequency_GivenAggregationsWithHistogramOrCompositeInterval_1_Hour() { + DatafeedConfig datafeed = randomBoolean() ? 
+ createDatafeedWithDateHistogram("1h") : + createDatafeedWithCompositeAgg("1h"); assertEquals(TimeValue.timeValueHours(1), datafeed.defaultFrequency(TimeValue.timeValueHours(1), xContentRegistry())); assertEquals(TimeValue.timeValueHours(1), datafeed.defaultFrequency(TimeValue.timeValueSeconds(3601), xContentRegistry())); assertEquals(TimeValue.timeValueHours(1), datafeed.defaultFrequency(TimeValue.timeValueHours(2), xContentRegistry())); @@ -873,6 +952,24 @@ public static String randomValidDatafeedId() { return generator.ofCodePointsLength(random(), 10, 10); } + private static DatafeedConfig createDatafeedWithCompositeAgg(String interval) { + MaxAggregationBuilder maxTime = AggregationBuilders.max("time").field("time"); + DateHistogramValuesSourceBuilder sourceBuilder = new DateHistogramValuesSourceBuilder("time"); + sourceBuilder.field("time"); + if (interval != null) { + if (DateHistogramAggregationBuilder.DATE_FIELD_UNITS.get(interval) != null) { + sourceBuilder.calendarInterval(new DateHistogramInterval(interval)); + } else { + sourceBuilder.fixedInterval(new DateHistogramInterval(interval)); + } + } + CompositeAggregationBuilder composite = AggregationBuilders.composite( + "buckets", + Arrays.asList(sourceBuilder) + ).subAggregation(maxTime); + return createDatafeedWithComposite(composite); + } + private static DatafeedConfig createDatafeedWithDateHistogram(String interval) { MaxAggregationBuilder maxTime = AggregationBuilders.max("time").field("time"); DateHistogramAggregationBuilder dateHistogram = AggregationBuilders.dateHistogram("buckets").subAggregation(maxTime).field("time"); @@ -908,6 +1005,19 @@ private static DatafeedConfig createDatafeedWithDateHistogram(DateHistogramAggre return createDatafeedBuilderWithDateHistogram(dateHistogram).build(); } + private static DatafeedConfig.Builder createDatafeedBuilderWithComposite(CompositeAggregationBuilder compositeAggregationBuilder) { + DatafeedConfig.Builder builder = new DatafeedConfig.Builder("datafeed1", "job1"); + builder.setIndices(Collections.singletonList("myIndex")); + AggregatorFactories.Builder aggs = new AggregatorFactories.Builder().addAggregator(compositeAggregationBuilder); + DatafeedConfig.validateAggregations(aggs); + builder.setParsedAggregations(aggs); + return builder; + } + + private static DatafeedConfig createDatafeedWithComposite(CompositeAggregationBuilder dateHistogram) { + return createDatafeedBuilderWithComposite(dateHistogram).build(); + } + @Override protected DatafeedConfig mutateInstance(DatafeedConfig instance) throws IOException { DatafeedConfig.Builder builder = new DatafeedConfig.Builder(instance); diff --git a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedWithAggsIT.java b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedWithAggsIT.java index a6ef700fa3e30..813855412814c 100644 --- a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedWithAggsIT.java +++ b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedWithAggsIT.java @@ -13,6 +13,8 @@ import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.search.aggregations.AggregationBuilders; import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.bucket.composite.DateHistogramValuesSourceBuilder; +import 
org.elasticsearch.search.aggregations.bucket.composite.TermsValuesSourceBuilder; import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval; import org.elasticsearch.xpack.core.ml.action.GetBucketsAction; import org.elasticsearch.xpack.core.ml.action.GetDatafeedsStatsAction; @@ -26,6 +28,7 @@ import org.elasticsearch.xpack.core.ml.job.results.Bucket; import org.junit.After; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.concurrent.TimeUnit; @@ -42,14 +45,41 @@ public void cleanup(){ } public void testRealtime() throws Exception { - String dataIndex = "datafeed-with-aggs-rt-data"; + AggregatorFactories.Builder aggs = new AggregatorFactories.Builder(); + aggs.addAggregator(AggregationBuilders.dateHistogram("time").field("time") + .fixedInterval(new DateHistogramInterval("1000ms")) + .subAggregation(AggregationBuilders.max("time").field("time"))); + testDfWithAggs( + aggs, + new Detector.Builder("count", null), + "datafeed-with-aggs-rt-job", + "datafeed-with-aggs-rt-data" + ); + } + + public void testRealtimeComposite() throws Exception { + AggregatorFactories.Builder aggs = new AggregatorFactories.Builder(); + aggs.addAggregator(AggregationBuilders.composite("buckets", + Arrays.asList( + new DateHistogramValuesSourceBuilder("time").field("time").fixedInterval(new DateHistogramInterval("1000ms")), + new TermsValuesSourceBuilder("field").field("field") + )) + .size(1000) + .subAggregation(AggregationBuilders.max("time").field("time"))); + testDfWithAggs( + aggs, + new Detector.Builder("count", null).setByFieldName("field"), + "datafeed-with-composite-aggs-rt-job", + "datafeed-with-composite-aggs-rt-data" + ); + } + + private void testDfWithAggs(AggregatorFactories.Builder aggs, Detector.Builder detector, String jobId, String dfId) throws Exception { // A job with a bucket_span of 2s - String jobId = "datafeed-with-aggs-rt-job"; DataDescription.Builder dataDescription = new DataDescription.Builder(); - Detector.Builder d = new Detector.Builder("count", null); - AnalysisConfig.Builder analysisConfig = new AnalysisConfig.Builder(Collections.singletonList(d.build())); + AnalysisConfig.Builder analysisConfig = new AnalysisConfig.Builder(Collections.singletonList(detector.build())); analysisConfig.setBucketSpan(TimeValue.timeValueSeconds(2)); analysisConfig.setSummaryCountFieldName("doc_count"); @@ -64,12 +94,8 @@ public void testRealtime() throws Exception { DatafeedConfig.Builder datafeedBuilder = new DatafeedConfig.Builder(datafeedId, jobId); datafeedBuilder.setQueryDelay(TimeValue.timeValueMillis(100)); datafeedBuilder.setFrequency(TimeValue.timeValueSeconds(1)); - datafeedBuilder.setIndices(Collections.singletonList(dataIndex)); + datafeedBuilder.setIndices(Collections.singletonList(dfId)); - AggregatorFactories.Builder aggs = new AggregatorFactories.Builder(); - aggs.addAggregator(AggregationBuilders.dateHistogram("time").field("time") - .fixedInterval(new DateHistogramInterval("1000ms")) - .subAggregation(AggregationBuilders.max("time").field("time"))); datafeedBuilder.setParsedAggregations(aggs); DatafeedConfig datafeed = datafeedBuilder.build(); @@ -82,8 +108,8 @@ public void testRealtime() throws Exception { openJob(jobId); // Now let's index the data - client().admin().indices().prepareCreate(dataIndex) - .setMapping("time", "type=date") + client().admin().indices().prepareCreate(dfId) + .setMapping("time", "type=date", "field", "type=keyword") .get(); // Index a doc per second from a minute ago to a minute later @@ 
-93,8 +119,8 @@ public void testRealtime() throws Exception { long curTime = aMinuteAgo; BulkRequestBuilder bulkRequestBuilder = client().prepareBulk(); while (curTime < aMinuteLater) { - IndexRequest indexRequest = new IndexRequest(dataIndex); - indexRequest.source("time", curTime); + IndexRequest indexRequest = new IndexRequest(dfId); + indexRequest.source("time", curTime, "field", randomFrom("foo", "bar", "baz")); bulkRequestBuilder.add(indexRequest); curTime += TimeValue.timeValueSeconds(1).millis(); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDatafeedAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDatafeedAction.java index 3127680551eb2..271aba0191b42 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDatafeedAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDatafeedAction.java @@ -88,6 +88,7 @@ To ensure that a subsequent stop datafeed call will see that same task status (a */ public class TransportStartDatafeedAction extends TransportMasterNodeAction { + private static final Version COMPOSITE_AGG_SUPPORT = Version.V_7_13_0; private static final Logger logger = LogManager.getLogger(TransportStartDatafeedAction.class); private final Client client; @@ -251,6 +252,21 @@ public void onFailure(Exception e) { params.setJobId(datafeedConfig.getJobId()); params.setIndicesOptions(datafeedConfig.getIndicesOptions()); datafeedConfigHolder.set(datafeedConfig); + if (datafeedConfig.hasCompositeAgg(xContentRegistry)) { + if (state.nodes() + .mastersFirstStream() + .filter(MachineLearning::isMlNode) + .map(DiscoveryNode::getVersion) + .anyMatch(COMPOSITE_AGG_SUPPORT::after)) { + listener.onFailure(ExceptionsHelper.badRequestException( + "cannot start datafeed [{}] as [{}] requires all machine learning nodes to be at least version [{}]", + datafeedConfig.getId(), + "composite aggs", + COMPOSITE_AGG_SUPPORT + )); + return; + } + } jobConfigProvider.getJob(datafeedConfig.getJobId(), jobListener); } catch (Exception e) { listener.onFailure(e); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java index 86a67d7a94dd7..17ed908890fd9 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java @@ -8,6 +8,7 @@ import org.elasticsearch.ResourceNotFoundException; import org.elasticsearch.action.ActionListener; +import org.elasticsearch.action.support.IndicesOptions; import org.elasticsearch.client.Client; import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.index.IndexNotFoundException; @@ -19,7 +20,9 @@ import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper; import org.elasticsearch.xpack.core.rollup.action.GetRollupIndexCapsAction; import org.elasticsearch.xpack.ml.datafeed.DatafeedTimingStatsReporter; +import org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregatedSearchRequestBuilder; import org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregationDataExtractorFactory; +import org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.CompositeAggregationDataExtractorFactory; import 
org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.RollupDataExtractorFactory; import org.elasticsearch.xpack.ml.datafeed.extractor.chunked.ChunkedDataExtractorFactory; import org.elasticsearch.xpack.ml.datafeed.extractor.scroll.ScrollDataExtractorFactory; @@ -36,6 +39,8 @@ static void create(Client client, NamedXContentRegistry xContentRegistry, DatafeedTimingStatsReporter timingStatsReporter, ActionListener listener) { + boolean hasAggs = datafeed.hasAggregations(); + boolean isComposite = hasAggs && datafeed.hasCompositeAgg(xContentRegistry); ActionListener factoryHandler = ActionListener.wrap( factory -> listener.onResponse(datafeed.getChunkingConfig().isEnabled() ? new ChunkedDataExtractorFactory(client, datafeed, job, xContentRegistry, factory, timingStatsReporter) : factory) @@ -44,20 +49,39 @@ static void create(Client client, ActionListener getRollupIndexCapsActionHandler = ActionListener.wrap( response -> { + if (response.getJobs().isEmpty() == false && datafeed.hasAggregations() == false) { + listener.onFailure(new IllegalArgumentException("Aggregations are required when using Rollup indices")); + return; + } + if (datafeed.hasAggregations() == false) { + ScrollDataExtractorFactory.create(client, datafeed, job, xContentRegistry, timingStatsReporter, factoryHandler); + return; + } + if (isComposite) { + String[] indices = datafeed.getIndices().toArray(new String[0]); + IndicesOptions indicesOptions = datafeed.getIndicesOptions(); + AggregatedSearchRequestBuilder aggregatedSearchRequestBuilder = response.getJobs().isEmpty() ? + AggregationDataExtractorFactory.requestBuilder(client, indices, indicesOptions) : + RollupDataExtractorFactory.requestBuilder(client, indices, indicesOptions); + factoryHandler.onResponse( + new CompositeAggregationDataExtractorFactory( + client, + datafeed, + job, + xContentRegistry, + timingStatsReporter, + aggregatedSearchRequestBuilder + ) + ); + return; + } + if (response.getJobs().isEmpty()) { // This means no rollup indexes are in the config - if (datafeed.hasAggregations()) { - factoryHandler.onResponse( - new AggregationDataExtractorFactory(client, datafeed, job, xContentRegistry, timingStatsReporter)); - } else { - ScrollDataExtractorFactory.create(client, datafeed, job, xContentRegistry, timingStatsReporter, factoryHandler); - } + factoryHandler.onResponse( + new AggregationDataExtractorFactory(client, datafeed, job, xContentRegistry, timingStatsReporter)); } else { - if (datafeed.hasAggregations()) { // Rollup indexes require aggregations - RollupDataExtractorFactory.create( - client, datafeed, job, response.getJobs(), xContentRegistry, timingStatsReporter, factoryHandler); - } else { - listener.onFailure(new IllegalArgumentException("Aggregations are required when using Rollup indices")); - } + RollupDataExtractorFactory.create( + client, datafeed, job, response.getJobs(), xContentRegistry, timingStatsReporter, factoryHandler); } }, e -> { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java index de2c273ea4d15..b25c814a4a954 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java @@ -120,7 +120,7 @@ private 
Aggregations search() {
     private void initAggregationProcessor(Aggregations aggs) throws IOException {
         aggregationToJsonProcessor = new AggregationToJsonProcessor(context.timeField, context.fields, context.includeDocCount,
-            context.start);
+            context.start, null);
         aggregationToJsonProcessor.process(aggs);
     }
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregatedSearchRequestBuilder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregatedSearchRequestBuilder.java
new file mode 100644
index 0000000000000..bc120586ed884
--- /dev/null
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregatedSearchRequestBuilder.java
@@ -0,0 +1,24 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.datafeed.extractor.aggregation;
+
+import org.elasticsearch.action.ActionRequestBuilder;
+import org.elasticsearch.action.search.SearchRequest;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.search.builder.SearchSourceBuilder;
+
+
+/**
+ * This is used when building search actions for aggregated data.
+ *
+ * Implementations can be found for regular searches and rollup searches.
+ */
+@FunctionalInterface
+public interface AggregatedSearchRequestBuilder {
+    ActionRequestBuilder<SearchRequest, SearchResponse> build(SearchSourceBuilder searchSourceBuilder);
+}
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationDataExtractorContext.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationDataExtractorContext.java
index 97796f90f9107..bfcf4a8cbd98b 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationDataExtractorContext.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationDataExtractorContext.java
@@ -36,7 +36,7 @@ class AggregationDataExtractorContext {
         this.jobId = Objects.requireNonNull(jobId);
         this.timeField = Objects.requireNonNull(timeField);
         this.fields = Objects.requireNonNull(fields);
-        this.indices = indices.toArray(new String[indices.size()]);
+        this.indices = indices.toArray(new String[0]);
         this.query = Objects.requireNonNull(query);
         this.aggs = Objects.requireNonNull(aggs);
         this.start = start;
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationDataExtractorFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationDataExtractorFactory.java
index d4f7764e9852b..c4a6d5a866e44 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationDataExtractorFactory.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationDataExtractorFactory.java
@@ -6,6 +6,9 @@
  */
 package org.elasticsearch.xpack.ml.datafeed.extractor.aggregation;
 
+import org.elasticsearch.action.search.SearchAction;
+import org.elasticsearch.action.search.SearchRequestBuilder;
+import org.elasticsearch.action.support.IndicesOptions;
 import org.elasticsearch.client.Client;
 import
org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.xpack.core.ml.datafeed.DatafeedConfig; @@ -25,6 +28,19 @@ public class AggregationDataExtractorFactory implements DataExtractorFactory { private final NamedXContentRegistry xContentRegistry; private final DatafeedTimingStatsReporter timingStatsReporter; + public static AggregatedSearchRequestBuilder requestBuilder( + Client client, + String[] indices, + IndicesOptions indicesOptions + ) { + return (searchSourceBuilder) -> + new SearchRequestBuilder(client, SearchAction.INSTANCE) + .setSource(searchSourceBuilder) + .setIndicesOptions(indicesOptions) + .setAllowPartialSearchResults(false) + .setIndices(indices); + } + public AggregationDataExtractorFactory( Client client, DatafeedConfig datafeedConfig, diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java index c900890f2d2f2..8c6a62e6c0f3b 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java @@ -8,13 +8,16 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; import org.elasticsearch.common.Nullable; +import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.json.JsonXContent; import org.elasticsearch.search.aggregations.Aggregation; import org.elasticsearch.search.aggregations.Aggregations; import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation; import org.elasticsearch.search.aggregations.bucket.SingleBucketAggregation; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregation; import org.elasticsearch.search.aggregations.bucket.histogram.Histogram; import org.elasticsearch.search.aggregations.metrics.GeoCentroid; import org.elasticsearch.search.aggregations.metrics.Max; @@ -28,6 +31,7 @@ import java.io.OutputStream; import java.time.ZonedDateTime; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashMap; @@ -55,6 +59,7 @@ class AggregationToJsonProcessor { private long keyValueWrittenCount; private final SortedMap>> docsByBucketTimestamp; private final long startTime; + private final String compositeAggDateValueSourceName; /** * Constructs a processor that processes aggregations into JSON @@ -63,8 +68,13 @@ class AggregationToJsonProcessor { * @param fields the fields to convert into JSON * @param includeDocCount whether to include the doc_count * @param startTime buckets with a timestamp before this time are discarded + * @param compositeAggDateValueSourceName the value source for the date_histogram source in the composite agg, if it exists */ - AggregationToJsonProcessor(String timeField, Set fields, boolean includeDocCount, long startTime) { + AggregationToJsonProcessor(String timeField, + Set fields, + boolean includeDocCount, + long startTime, + @Nullable String compositeAggDateValueSourceName) { this.timeField = Objects.requireNonNull(timeField); this.fields = Objects.requireNonNull(fields); this.includeDocCount = includeDocCount; @@ 
-72,6 +82,7 @@ class AggregationToJsonProcessor {
         docsByBucketTimestamp = new TreeMap<>();
         keyValueWrittenCount = 0;
         this.startTime = startTime;
+        this.compositeAggDateValueSourceName = compositeAggDateValueSourceName;
     }
 
     public void process(Aggregations aggs) throws IOException {
@@ -149,6 +160,9 @@ private void processAggs(long docCount, List<Aggregation> aggregations) throws I
         MultiBucketsAggregation bucketAgg = bucketAggregations.get(0);
         if (bucketAgg instanceof Histogram) {
             processDateHistogram((Histogram) bucketAgg);
+        } else if (bucketAgg instanceof CompositeAggregation && compositeAggDateValueSourceName != null) {
+            // This indicates that our composite agg contains our date histogram bucketing via one of its sources
+            processCompositeAgg((CompositeAggregation) bucketAgg);
         } else {
             // Ignore bucket aggregations that don't contain a field we
             // are interested in. This avoids problems with pipeline bucket
@@ -185,8 +199,8 @@ private void processAggs(long docCount, List<Aggregation> aggregations) throws I
 
     private void processDateHistogram(Histogram agg) throws IOException {
         if (keyValuePairs.containsKey(timeField)) {
-            throw new IllegalArgumentException("More than one Date histogram cannot be used in the aggregation. " +
-                "[" + agg.getName() + "] is another instance of a Date histogram");
+            throw new IllegalArgumentException("More than one composite or date_histogram cannot be used in the aggregation. " +
+                "[" + agg.getName() + "] is another instance of a composite or date_histogram aggregation");
         }
 
         // buckets are ordered by time, once we get to a bucket past the
@@ -210,6 +224,60 @@ private void processDateHistogram(Histogram agg) throws IOException {
         }
     }
 
+    private void processCompositeAgg(CompositeAggregation agg) throws IOException {
+        if (keyValuePairs.containsKey(timeField)) {
+            throw new IllegalArgumentException("More than one composite or date_histogram cannot be used in the aggregation. " +
+                "[" + agg.getName() + "] is another instance of a composite or date_histogram aggregation");
+        }
+        // Shouldn't ever happen
+        if (compositeAggDateValueSourceName == null) {
+            throw new IllegalArgumentException("attempted to process composite agg ["
+                + agg.getName()
+                + "] but does not contain date_histogram value source");
+        }
+
+        // the date_histogram value source should order the buckets in ascending order,
+        // so skip forward until we are within the start time
+        boolean checkBucketTime = true;
+        for (CompositeAggregation.Bucket bucket : agg.getBuckets()) {
+            if (checkBucketTime) {
+                long bucketTime = toHistogramKeyToEpoch(bucket.getKey().get(compositeAggDateValueSourceName));
+                if (bucketTime < startTime) {
+                    LOGGER.debug(() -> new ParameterizedMessage("Skipping bucket at [{}], startTime is [{}]", bucketTime, startTime));
+                    continue;
+                } else {
+                    checkBucketTime = false;
+                }
+            }
+
+            Collection<String> addedFields = processCompositeAggBucketKeys(bucket.getKey());
+            List<Aggregation> childAggs = bucket.getAggregations().asList();
+            processAggs(bucket.getDocCount(), childAggs);
+            keyValuePairs.remove(timeField);
+            for (String fieldName : addedFields) {
+                keyValuePairs.remove(fieldName);
+            }
+        }
+    }
+
+    /**
+     * It is possible that the key values in a composite agg bucket contain field values we care about.
+     * Make sure that if they do, they get processed.
+     * @param bucketKeys the composite agg bucket keys
+     * @return The field names we added to the key value pairs
+     */
+    private Collection<String> processCompositeAggBucketKeys(Map<String, Object> bucketKeys) {
+        List<String> addedFieldValues = new ArrayList<>();
+        for (Map.Entry<String, Object> bucketKey : bucketKeys.entrySet()) {
+            if (bucketKey.getKey().equals(compositeAggDateValueSourceName) == false && fields.contains(bucketKey.getKey())) {
+                // TODO any validations or processing???
+                keyValuePairs.put(bucketKey.getKey(), bucketKey.getValue());
+                addedFieldValues.add(bucketKey.getKey());
+            }
+        }
+        return addedFieldValues;
+    }
+
     /*
      * Date Histograms have a {@link ZonedDateTime} object as the key,
      * Histograms have either a Double or Long.
@@ -239,6 +307,10 @@ boolean bucketAggContainsRequiredAgg(MultiBucketsAggregation aggregation) { if (fields.contains(aggregation.getName())) { return true; } + if (aggregation instanceof CompositeAggregation + && Sets.haveNonEmptyIntersection(((CompositeAggregation) aggregation).afterKey().keySet(), fields)) { + return true; + } if (aggregation.getBuckets().isEmpty()) { return false; @@ -265,12 +337,17 @@ boolean bucketAggContainsRequiredAgg(MultiBucketsAggregation aggregation) { private void processBucket(MultiBucketsAggregation bucketAgg, boolean addField) throws IOException { for (MultiBucketsAggregation.Bucket bucket : bucketAgg.getBuckets()) { + List addedFields = new ArrayList<>(); if (addField) { + addedFields.add(bucketAgg.getName()); keyValuePairs.put(bucketAgg.getName(), bucket.getKey()); } + if (bucket instanceof CompositeAggregation.Bucket) { + addedFields.addAll(processCompositeAggBucketKeys(((CompositeAggregation.Bucket)bucket).getKey())); + } processAggs(bucket.getDocCount(), asList(bucket.getAggregations())); - if (addField) { - keyValuePairs.remove(bucketAgg.getName()); + for (String fieldName : addedFields) { + keyValuePairs.remove(fieldName); } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java new file mode 100644 index 0000000000000..3d8c9dd662ef9 --- /dev/null +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java @@ -0,0 +1,177 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ +package org.elasticsearch.xpack.ml.datafeed.extractor.aggregation; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; +import org.elasticsearch.action.ActionRequestBuilder; +import org.elasticsearch.action.search.SearchRequest; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.client.Client; +import org.elasticsearch.search.aggregations.Aggregations; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregation; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregationBuilder; +import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.xpack.core.ClientHelper; +import org.elasticsearch.xpack.core.ml.datafeed.extractor.DataExtractor; +import org.elasticsearch.xpack.core.ml.datafeed.extractor.ExtractorUtils; +import org.elasticsearch.xpack.ml.datafeed.DatafeedTimingStatsReporter; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Objects; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicReference; + +/** + * An implementation that extracts data from elasticsearch using search with composite aggregations on a client. + * The first time {@link #next()} is called, the search is executed. All the aggregated buckets from the composite agg are then + * returned. 
Subsequent calls to {@link #next()} execute additional searches, moving forward the composite agg `afterKey`.
+ *
+ * It's like scroll, but with aggs.
+ *
+ * It is not thread-safe, we reuse underlying objects without synchronization (like a pre-constructed composite agg object)
+ */
+class CompositeAggregationDataExtractor implements DataExtractor {
+
+    private static final Logger LOGGER = LogManager.getLogger(CompositeAggregationDataExtractor.class);
+
+    private final AtomicReference<Map<String, Object>> afterKey = new AtomicReference<>(null);
+    private final CompositeAggregationBuilder compositeAggregationBuilder;
+    private final Client client;
+    private final CompositeAggregationDataExtractorContext context;
+    private final DatafeedTimingStatsReporter timingStatsReporter;
+    private final AggregatedSearchRequestBuilder requestBuilder;
+    private boolean hasNext;
+    private boolean isCancelled;
+
+    CompositeAggregationDataExtractor(
+        CompositeAggregationBuilder compositeAggregationBuilder,
+        Client client,
+        CompositeAggregationDataExtractorContext dataExtractorContext,
+        DatafeedTimingStatsReporter timingStatsReporter,
+        AggregatedSearchRequestBuilder requestBuilder
+    ) {
+        this.compositeAggregationBuilder = compositeAggregationBuilder;
+        this.client = Objects.requireNonNull(client);
+        this.context = Objects.requireNonNull(dataExtractorContext);
+        this.timingStatsReporter = Objects.requireNonNull(timingStatsReporter);
+        this.requestBuilder = Objects.requireNonNull(requestBuilder);
+        this.hasNext = true;
+    }
+
+    @Override
+    public boolean hasNext() {
+        return hasNext;
+    }
+
+    @Override
+    public boolean isCancelled() {
+        return isCancelled;
+    }
+
+    @Override
+    public void cancel() {
+        LOGGER.debug(() -> new ParameterizedMessage("[{}] Data extractor received cancel request", context.jobId));
+        isCancelled = true;
+        hasNext = false;
+    }
+
+    @Override
+    public long getEndTime() {
+        return context.end;
+    }
+
+    @Override
+    public Optional<InputStream> next() throws IOException {
+        if (hasNext() == false) {
+            throw new NoSuchElementException();
+        }
+
+        Aggregations aggs = search();
+        if (aggs == null) {
+            LOGGER.trace(() -> new ParameterizedMessage("[{}] extraction finished", context.jobId));
+            hasNext = false;
+            afterKey.set(null);
+            return Optional.empty();
+        }
+        return Optional.of(processAggs(aggs));
+    }
+
+    private Aggregations search() {
+        long histogramSearchStartTime = Math.max(
+            0,
+            context.start - ExtractorUtils.getHistogramIntervalMillis(compositeAggregationBuilder)
+        );
+        LOGGER.trace(
+            () -> new ParameterizedMessage(
+                "[{}] Executing composite aggregated search from [{}] to [{}]",
+                context.jobId,
+                histogramSearchStartTime,
+                context.end
+            )
+        );
+        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder()
+            .size(0)
+            .query(ExtractorUtils.wrapInTimeRangeQuery(context.query, context.timeField, histogramSearchStartTime, context.end));
+
+        if (context.runtimeMappings.isEmpty() == false) {
+            searchSourceBuilder.runtimeMappings(context.runtimeMappings);
+        }
+        if (afterKey.get() != null) {
+            compositeAggregationBuilder.aggregateAfter(afterKey.get());
+        }
+        searchSourceBuilder.aggregation(compositeAggregationBuilder);
+        ActionRequestBuilder<SearchRequest, SearchResponse> searchRequest = requestBuilder.build(searchSourceBuilder);
+        SearchResponse searchResponse = executeSearchRequest(searchRequest);
+        LOGGER.trace(() -> new ParameterizedMessage("[{}] Search composite response was obtained", context.jobId));
+        timingStatsReporter.reportSearchDuration(searchResponse.getTook());
+        Aggregations aggregations = searchResponse.getAggregations();
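+        // Paging note: a null aggregations response, a missing composite agg, or an empty bucket list
+        // below all mean the search has paged past the last composite bucket; next() then clears
+        // afterKey and reports extraction as finished. Otherwise afterKey is advanced so that the
+        // following search request continues where this page ended.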
+ if (aggregations == null) { + return null; + } + CompositeAggregation compositeAgg = aggregations.get(compositeAggregationBuilder.getName()); + if (compositeAgg == null || compositeAgg.getBuckets().isEmpty()) { + return null; + } + afterKey.set(compositeAgg.afterKey()); + + return aggregations; + } + + protected SearchResponse executeSearchRequest(ActionRequestBuilder searchRequestBuilder) { + return ClientHelper.executeWithHeaders(context.headers, ClientHelper.ML_ORIGIN, client, searchRequestBuilder::get); + } + + private InputStream processAggs(Aggregations aggs) throws IOException { + AggregationToJsonProcessor aggregationToJsonProcessor = new AggregationToJsonProcessor( + context.timeField, + context.fields, + context.includeDocCount, + context.start, + context.compositeAggDateHistogramGroupSourceName + ); + LOGGER.trace(() -> new ParameterizedMessage( + "[{}] got [{}] composite buckets", + context.jobId, + ((CompositeAggregation)aggs.get(compositeAggregationBuilder.getName())).getBuckets().size() + )); + aggregationToJsonProcessor.process(aggs); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + boolean moreToWrite = aggregationToJsonProcessor.writeDocs(Integer.MAX_VALUE, outputStream); + while (moreToWrite) { + moreToWrite = aggregationToJsonProcessor.writeDocs(Integer.MAX_VALUE, outputStream); + } + return new ByteArrayInputStream(outputStream.toByteArray()); + } + +} diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorContext.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorContext.java new file mode 100644 index 0000000000000..9650fd949526b --- /dev/null +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorContext.java @@ -0,0 +1,61 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ +package org.elasticsearch.xpack.ml.datafeed.extractor.aggregation; + +import org.elasticsearch.action.support.IndicesOptions; +import org.elasticsearch.index.query.QueryBuilder; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregationBuilder; + +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +class CompositeAggregationDataExtractorContext { + + final String jobId; + final String timeField; + final Set fields; + final String[] indices; + final QueryBuilder query; + final CompositeAggregationBuilder compositeAggregationBuilder; + final long start; + final long end; + final boolean includeDocCount; + final Map headers; + final IndicesOptions indicesOptions; + final Map runtimeMappings; + final String compositeAggDateHistogramGroupSourceName; + + CompositeAggregationDataExtractorContext(String jobId, + String timeField, + Set fields, + List indices, + QueryBuilder query, + CompositeAggregationBuilder compositeAggregationBuilder, + String compositeAggDateHistogramGroupSourceName, + long start, + long end, + boolean includeDocCount, + Map headers, + IndicesOptions indicesOptions, + Map runtimeMappings) { + this.jobId = Objects.requireNonNull(jobId); + this.timeField = Objects.requireNonNull(timeField); + this.fields = Objects.requireNonNull(fields); + this.indices = indices.toArray(new String[0]); + this.query = Objects.requireNonNull(query); + this.compositeAggregationBuilder = Objects.requireNonNull(compositeAggregationBuilder); + this.compositeAggDateHistogramGroupSourceName = Objects.requireNonNull(compositeAggDateHistogramGroupSourceName); + this.start = start; + this.end = end; + this.includeDocCount = includeDocCount; + this.headers = headers; + this.indicesOptions = Objects.requireNonNull(indicesOptions); + this.runtimeMappings = Objects.requireNonNull(runtimeMappings); + } +} diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorFactory.java new file mode 100644 index 0000000000000..be423ec6cdfc1 --- /dev/null +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorFactory.java @@ -0,0 +1,124 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ +package org.elasticsearch.xpack.ml.datafeed.extractor.aggregation; + +import org.elasticsearch.client.Client; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; +import org.elasticsearch.index.query.QueryBuilder; +import org.elasticsearch.search.aggregations.AggregationBuilder; +import org.elasticsearch.search.aggregations.PipelineAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeValuesSourceBuilder; +import org.elasticsearch.search.aggregations.bucket.composite.DateHistogramValuesSourceBuilder; +import org.elasticsearch.xpack.core.ml.datafeed.DatafeedConfig; +import org.elasticsearch.xpack.core.ml.datafeed.extractor.DataExtractor; +import org.elasticsearch.xpack.core.ml.datafeed.extractor.ExtractorUtils; +import org.elasticsearch.xpack.core.ml.job.config.Job; +import org.elasticsearch.xpack.core.ml.utils.Intervals; +import org.elasticsearch.xpack.ml.datafeed.DatafeedTimingStatsReporter; +import org.elasticsearch.xpack.ml.datafeed.extractor.DataExtractorFactory; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Objects; + +public class CompositeAggregationDataExtractorFactory implements DataExtractorFactory { + + private final Client client; + private final DatafeedConfig datafeedConfig; + private final Job job; + private final DatafeedTimingStatsReporter timingStatsReporter; + private final String compositeAggName; + private final Collection subAggs; + private final Collection subPipelineAggs; + private final String dateHistogramGroupSourceName; + private final AggregatedSearchRequestBuilder requestBuilder; + private final int numBuckets; + private final List> compositeValuesSourceBuilders; + private final QueryBuilder parsedQuery; + + public CompositeAggregationDataExtractorFactory( + Client client, + DatafeedConfig datafeedConfig, + Job job, + NamedXContentRegistry xContentRegistry, + DatafeedTimingStatsReporter timingStatsReporter, + AggregatedSearchRequestBuilder requestBuilder + ) { + this.client = Objects.requireNonNull(client); + this.datafeedConfig = Objects.requireNonNull(datafeedConfig); + this.job = Objects.requireNonNull(job); + this.timingStatsReporter = Objects.requireNonNull(timingStatsReporter); + this.parsedQuery = datafeedConfig.getParsedQuery(xContentRegistry); + AggregationBuilder aggregationBuilder = ExtractorUtils.getHistogramAggregation( + datafeedConfig.getParsedAggregations(xContentRegistry).getAggregatorFactories() + ); + if (aggregationBuilder instanceof CompositeAggregationBuilder == false) { + throw new IllegalArgumentException( + "top level aggregation must be a composite agg [" + + aggregationBuilder.getName() + + "] is a [" + + aggregationBuilder.getType() + + "]" + ); + } + CompositeAggregationBuilder compositeAggregationBuilder = (CompositeAggregationBuilder) aggregationBuilder; + this.numBuckets = compositeAggregationBuilder.size(); + this.compositeAggName = compositeAggregationBuilder.getName(); + this.subAggs = compositeAggregationBuilder.getSubAggregations(); + this.subPipelineAggs = compositeAggregationBuilder.getPipelineAggregations(); + // We want to make sure our date_histogram source is first. 
This way we order at the top level by the timestamp + this.compositeValuesSourceBuilders = new ArrayList<>(compositeAggregationBuilder.sources().size()); + List> others = new ArrayList<>(compositeAggregationBuilder.sources().size() - 1); + String dateHistoGroupName = null; + for (CompositeValuesSourceBuilder sourceBuilder : compositeAggregationBuilder.sources()) { + if (sourceBuilder instanceof DateHistogramValuesSourceBuilder) { + this.compositeValuesSourceBuilders.add(sourceBuilder); + dateHistoGroupName = sourceBuilder.name(); + } else { + others.add(sourceBuilder); + } + } + this.dateHistogramGroupSourceName = dateHistoGroupName; + this.compositeValuesSourceBuilders.addAll(others); + this.requestBuilder = requestBuilder; + } + + @Override + public DataExtractor newExtractor(long start, long end) { + CompositeAggregationBuilder compositeAggregationBuilder = new CompositeAggregationBuilder( + compositeAggName, + compositeValuesSourceBuilders + ); + compositeAggregationBuilder.size(numBuckets); + subAggs.forEach(compositeAggregationBuilder::subAggregation); + subPipelineAggs.forEach(compositeAggregationBuilder::subAggregation); + long histogramInterval = ExtractorUtils.getHistogramIntervalMillis(compositeAggregationBuilder); + CompositeAggregationDataExtractorContext dataExtractorContext = new CompositeAggregationDataExtractorContext( + job.getId(), + job.getDataDescription().getTimeField(), + job.getAnalysisConfig().analysisFields(), + datafeedConfig.getIndices(), + parsedQuery, + compositeAggregationBuilder, + this.dateHistogramGroupSourceName, + Intervals.alignToCeil(start, histogramInterval), + Intervals.alignToFloor(end, histogramInterval), + job.getAnalysisConfig().getSummaryCountFieldName().equals(DatafeedConfig.DOC_COUNT), + datafeedConfig.getHeaders(), + datafeedConfig.getIndicesOptions(), + datafeedConfig.getRuntimeMappings()); + return new CompositeAggregationDataExtractor( + compositeAggregationBuilder, + client, + dataExtractorContext, + timingStatsReporter, + requestBuilder + ); + } +} diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactory.java index 6a6ee90ee071f..0aa5f1dee2b14 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactory.java @@ -8,6 +8,8 @@ import org.elasticsearch.ElasticsearchStatusException; import org.elasticsearch.action.ActionListener; +import org.elasticsearch.action.search.SearchRequest; +import org.elasticsearch.action.support.IndicesOptions; import org.elasticsearch.client.Client; import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.search.aggregations.AggregationBuilder; @@ -21,6 +23,7 @@ import org.elasticsearch.xpack.core.ml.utils.Intervals; import org.elasticsearch.xpack.core.rollup.action.RollableIndexCaps; import org.elasticsearch.xpack.core.rollup.action.RollupJobCaps.RollupFieldCaps; +import org.elasticsearch.xpack.core.rollup.action.RollupSearchAction; import org.elasticsearch.xpack.core.rollup.job.DateHistogramGroupConfig; import org.elasticsearch.xpack.ml.datafeed.DatafeedTimingStatsReporter; import org.elasticsearch.xpack.ml.datafeed.extractor.DataExtractorFactory; @@ -62,6 +65,20 @@ private 
RollupDataExtractorFactory( this.timingStatsReporter = Objects.requireNonNull(timingStatsReporter); } + public static AggregatedSearchRequestBuilder requestBuilder( + Client client, + String[] indices, + IndicesOptions indicesOptions + ) { + return (searchSourceBuilder) -> { + SearchRequest searchRequest = new SearchRequest().indices(indices) + .indicesOptions(indicesOptions) + .allowPartialSearchResults(false) + .source(searchSourceBuilder); + return new RollupSearchAction.RequestBuilder(client, searchRequest); + }; + } + @Override public DataExtractor newExtractor(long start, long end) { long histogramInterval = datafeedConfig.getHistogramIntervalMillis(xContentRegistry); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationTestUtils.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationTestUtils.java index a19985e3aa1e6..5e49232ee8d1d 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationTestUtils.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationTestUtils.java @@ -6,10 +6,12 @@ */ package org.elasticsearch.xpack.ml.datafeed.extractor.aggregation; +import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.geo.GeoPoint; import org.elasticsearch.search.aggregations.Aggregation; import org.elasticsearch.search.aggregations.Aggregations; import org.elasticsearch.search.aggregations.bucket.SingleBucketAggregation; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregation; import org.elasticsearch.search.aggregations.bucket.histogram.Histogram; import org.elasticsearch.search.aggregations.bucket.terms.StringTerms; import org.elasticsearch.search.aggregations.bucket.terms.Terms; @@ -41,6 +43,24 @@ static Histogram.Bucket createHistogramBucket(long timestamp, long docCount, Lis return bucket; } + static CompositeAggregation.Bucket createCompositeBucket(long timestamp, + String dateValueSource, + long docCount, + List subAggregations, + List> termValues) { + CompositeAggregation.Bucket bucket = mock(CompositeAggregation.Bucket.class); + when(bucket.getDocCount()).thenReturn(docCount); + Aggregations aggs = createAggs(subAggregations); + when(bucket.getAggregations()).thenReturn(aggs); + Map bucketKey = new HashMap<>(); + bucketKey.put(dateValueSource, timestamp); + for (Tuple termValue : termValues) { + bucketKey.put(termValue.v1(), termValue.v2()); + } + when(bucket.getKey()).thenReturn(bucketKey); + return bucket; + } + static SingleBucketAggregation createSingleBucketAgg(String name, long docCount, List subAggregations) { SingleBucketAggregation singleBucketAggregation = mock(SingleBucketAggregation.class); when(singleBucketAggregation.getName()).thenReturn(name); @@ -65,6 +85,15 @@ static Histogram createHistogramAggregation(String name, List return histogram; } + @SuppressWarnings("unchecked") + static CompositeAggregation createCompositeAggregation(String name, List buckets) { + CompositeAggregation compositeAggregation = mock(CompositeAggregation.class); + when((List)compositeAggregation.getBuckets()).thenReturn(buckets); + when(compositeAggregation.getName()).thenReturn(name); + return compositeAggregation; + + } + static Max createMax(String name, double value) { Max max = mock(Max.class); when(max.getName()).thenReturn(name); diff --git 
a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessorTests.java index 474545339ad27..0f2b672d0fe13 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessorTests.java @@ -6,14 +6,17 @@ */ package org.elasticsearch.xpack.ml.datafeed.extractor.aggregation; +import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.search.aggregations.Aggregation; import org.elasticsearch.search.aggregations.Aggregations; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregation; import org.elasticsearch.search.aggregations.bucket.histogram.Histogram; import org.elasticsearch.search.aggregations.bucket.terms.StringTerms; import org.elasticsearch.search.aggregations.bucket.terms.Terms; import org.elasticsearch.search.aggregations.metrics.Max; import org.elasticsearch.test.ESTestCase; +import org.junit.Before; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -28,6 +31,8 @@ import static org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregationTestUtils.Term; import static org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregationTestUtils.createAggs; +import static org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregationTestUtils.createCompositeAggregation; +import static org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregationTestUtils.createCompositeBucket; import static org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregationTestUtils.createGeoCentroid; import static org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregationTestUtils.createHistogramAggregation; import static org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregationTestUtils.createHistogramBucket; @@ -47,25 +52,52 @@ public class AggregationToJsonProcessorTests extends ESTestCase { private String timeField = "time"; private boolean includeDocCount = true; private long startTime = 0; + private String compositeAggValueSource = "time"; - public void testProcessGivenMultipleDateHistograms() { - List nestedHistogramBuckets = Arrays.asList( + @Before + public void setValues() { + compositeAggValueSource = "time"; + } + + public void testProcessGivenMultipleDateHistogramsOrComposite() { + Aggregation nestedBucket; + if (randomBoolean()) { + List nestedHistogramBuckets = Arrays.asList( createHistogramBucket(1000L, 3, Collections.singletonList(createMax("metric1", 1200))), createHistogramBucket(2000L, 5, Collections.singletonList(createMax("metric1", 2800))) - ); - Histogram histogram = createHistogramAggregation("buckets", nestedHistogramBuckets); + ); + nestedBucket = createHistogramAggregation("buckets", nestedHistogramBuckets); + } else { + List nestedCompositebuckets = Arrays.asList( + createCompositeBucket( + 1000L, + "time", + 3, + Collections.singletonList(createMax("metric1", 1200)), + Collections.emptyList() + ), + createCompositeBucket( + 2000L, + "time", + 5, + Collections.singletonList(createMax("metric1", 2800)), + Collections.emptyList() + ) + ); + nestedBucket = createCompositeAggregation("buckets", nestedCompositebuckets); + } List histogramBuckets = 
Arrays.asList( - createHistogramBucket(1000L, 3, Arrays.asList(createMax("time", 1000L), histogram)) + createHistogramBucket(1000L, 3, Arrays.asList(createMax("time", 1000L), nestedBucket)) ); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> aggToString(Sets.newHashSet("my_field"), histogramBuckets)); - assertThat(e.getMessage(), containsString("More than one Date histogram cannot be used in the aggregation. " + - "[buckets] is another instance of a Date histogram")); + assertThat(e.getMessage(), containsString("More than one composite or date_histogram cannot be used in the aggregation." + + " [buckets] is another instance of a composite or date_histogram aggregation")); } - public void testProcessGivenMaxTimeIsMissing() throws IOException { + public void testProcessGivenMaxTimeIsMissing() { List histogramBuckets = Arrays.asList( createHistogramBucket(1000L, 3), createHistogramBucket(2000L, 5) @@ -74,17 +106,37 @@ public void testProcessGivenMaxTimeIsMissing() throws IOException { IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> aggToString(Collections.emptySet(), histogramBuckets)); assertThat(e.getMessage(), containsString("Missing max aggregation for time_field [time]")); + + List compositeBuckets = Arrays.asList( + createCompositeBucket(1000L, "time",3, Collections.emptyList(), Collections.emptyList()), + createCompositeBucket(2000L, "time",5, Collections.emptyList(), Collections.emptyList()) + ); + + e = expectThrows(IllegalArgumentException.class, + () -> aggToStringComposite(Collections.emptySet(), compositeBuckets)); + assertThat(e.getMessage(), containsString("Missing max aggregation for time_field [time]")); } - public void testProcessGivenNonMaxTimeAgg() throws IOException { + public void testProcessGivenNonMaxTimeAgg() { + List aggs = Collections.singletonList(createTerms("time", new Term("a", 1), new Term("b", 2))); List histogramBuckets = Arrays.asList( - createHistogramBucket(1000L, 3, Collections.singletonList(createTerms("time", new Term("a", 1), new Term("b", 2)))), - createHistogramBucket(2000L, 5, Collections.singletonList(createTerms("time", new Term("a", 1), new Term("b", 2)))) + createHistogramBucket(1000L, 3, aggs), + createHistogramBucket(2000L, 5, aggs) ); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> aggToString(Collections.emptySet(), histogramBuckets)); assertThat(e.getMessage(), containsString("Missing max aggregation for time_field [time]")); + + + List compositeBuckets = Arrays.asList( + createCompositeBucket(1000L, "time", 3, aggs, Collections.emptyList()), + createCompositeBucket(2000L, "time",5, aggs, Collections.emptyList()) + ); + + e = expectThrows(IllegalArgumentException.class, + () -> aggToStringComposite(Collections.emptySet(), compositeBuckets)); + assertThat(e.getMessage(), containsString("Missing max aggregation for time_field [time]")); } public void testProcessGivenHistogramOnly() throws IOException { @@ -113,6 +165,63 @@ public void testProcessGivenHistogramOnlyAndNoDocCount() throws IOException { assertThat(keyValuePairsWritten, equalTo(2L)); } + + public void testProcessGivenCompositeOnly() throws IOException { + compositeAggValueSource = "timestamp"; + List compositeBuckets = Arrays.asList( + createCompositeBucket(1000L, "timestamp", 3, Collections.singletonList(createMax("timestamp", 1200)), Collections.emptyList()), + createCompositeBucket(2000L, "timestamp", 5, Collections.singletonList(createMax("timestamp", 2800)), Collections.emptyList()) 
+ ); + + timeField = "timestamp"; + String json = aggToStringComposite(Collections.emptySet(), compositeBuckets); + + assertThat(json, equalTo("{\"timestamp\":1200,\"doc_count\":3} {\"timestamp\":2800,\"doc_count\":5}")); + assertThat(keyValuePairsWritten, equalTo(4L)); + } + + public void testProcessGivenCompositeOnlyAndNoDocCount() throws IOException { + List compositeBuckets = Arrays.asList( + createCompositeBucket(1000L, "time", 3, Collections.singletonList(createMax("time", 1000)), Collections.emptyList()), + createCompositeBucket(2000L, "time", 5, Collections.singletonList(createMax("time", 2000)), Collections.emptyList()) + ); + + includeDocCount = false; + String json = aggToStringComposite(Collections.emptySet(), compositeBuckets); + + assertThat(json, equalTo("{\"time\":1000} {\"time\":2000}")); + assertThat(keyValuePairsWritten, equalTo(2L)); + } + + public void testProcessGivenCompositeWithDocAndTerms() throws IOException { + compositeAggValueSource = "timestamp"; + List compositeBuckets = Arrays.asList( + createCompositeBucket(1000L, + "timestamp", + 3, + Collections.singletonList(createMax("timestamp", 1200)), + Arrays.asList(Tuple.tuple("foo", "value1"), Tuple.tuple("bar", "value1")) + ), + createCompositeBucket(2000L, + "timestamp", + 5, + Collections.singletonList(createMax("timestamp", 2800)), + Arrays.asList(Tuple.tuple("foo", "value2"), Tuple.tuple("bar", "value2")) + ) + ); + + timeField = "timestamp"; + String json = aggToStringComposite(Sets.newHashSet("foo", "bar"), compositeBuckets); + + assertThat(json, + equalTo( + "{\"bar\":\"value1\",\"foo\":\"value1\",\"timestamp\":1200,\"doc_count\":3}" + + " {\"bar\":\"value2\",\"foo\":\"value2\",\"timestamp\":2800,\"doc_count\":5}" + ) + ); + assertThat(keyValuePairsWritten, equalTo(8L)); + } + public void testProcessGivenTopLevelAggIsNotHistogram() throws IOException { List histogramABuckets = Arrays.asList( @@ -265,7 +374,7 @@ public void testProcessGivenMultipleSingleMetricPerSingleTermsPerHistogram() thr "{\"time\":4000,\"my_field\":\"b\",\"my_value\":421.0,\"my_value2\":422.0}")); } - public void testProcessGivenUnsupportedAggregationUnderHistogram() throws IOException { + public void testProcessGivenUnsupportedAggregationUnderHistogram() { Histogram.Bucket histogramBucket = createHistogramBucket(1000L, 2); Aggregation anotherHistogram = mock(Aggregation.class); when(anotherHistogram.getName()).thenReturn("nested-agg"); @@ -277,7 +386,7 @@ public void testProcessGivenUnsupportedAggregationUnderHistogram() throws IOExce assertThat(e.getMessage(), containsString("Unsupported aggregation type [nested-agg]")); } - public void testProcessGivenMultipleBucketAggregations() throws IOException { + public void testProcessGivenMultipleBucketAggregations() { Histogram.Bucket histogramBucket = createHistogramBucket(1000L, 2); Terms terms1 = mock(Terms.class); when(terms1.getName()).thenReturn("terms_1"); @@ -355,7 +464,7 @@ public void testProcessGivenSinglePercentilesPerHistogram() throws IOException { "{\"time\":4000,\"my_field\":4.0,\"doc_count\":14}")); } - public void testProcessGivenMultiplePercentilesPerHistogram() throws IOException { + public void testProcessGivenMultiplePercentilesPerHistogram() { List histogramBuckets = Arrays.asList( createHistogramBucket(1000L, 4, Arrays.asList( createMax("time", 1000), createPercentiles("my_field", 1.0))), @@ -373,10 +482,10 @@ public void testProcessGivenMultiplePercentilesPerHistogram() throws IOException } @SuppressWarnings("unchecked") - public void 
testBucketAggContainsRequiredAgg() throws IOException { + public void testBucketAggContainsRequiredAgg() { Set fields = new HashSet<>(); fields.add("foo"); - AggregationToJsonProcessor processor = new AggregationToJsonProcessor("time", fields, false, 0L); + AggregationToJsonProcessor processor = new AggregationToJsonProcessor("time", fields, false, 0L, null); Terms termsAgg = mock(Terms.class); when(termsAgg.getBuckets()).thenReturn(Collections.emptyList()); @@ -459,7 +568,7 @@ public void testSingleBucketAgg() throws IOException { " {\"time\":2000,\"field2\":1.0,\"field1\":7.0,\"doc_count\":7}")); } - public void testSingleBucketAgg_failureWithSubMultiBucket() throws IOException { + public void testSingleBucketAgg_failureWithSubMultiBucket() { List histogramBuckets = Collections.singletonList( createHistogramBucket(1000L, 4, Arrays.asList( @@ -500,10 +609,22 @@ private String aggToString(Set fields, List buckets) t private String aggToString(Set fields, Aggregations aggregations) throws IOException { ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - AggregationToJsonProcessor processor = new AggregationToJsonProcessor(timeField, fields, includeDocCount, startTime); + AggregationToJsonProcessor processor = new AggregationToJsonProcessor( + timeField, + fields, + includeDocCount, + startTime, + compositeAggValueSource + ); processor.process(aggregations); processor.writeDocs(10000, outputStream); keyValuePairsWritten = processor.getKeyValueCount(); return outputStream.toString(StandardCharsets.UTF_8.name()); } + + private String aggToStringComposite(Set fields, List buckets) throws IOException { + CompositeAggregation compositeAggregation = createCompositeAggregation("buckets", buckets); + return aggToString(fields, createAggs(Collections.singletonList(compositeAggregation))); + } + } From dfa7abcaa39d6144341e21ff8631088df0783bbb Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Thu, 4 Mar 2021 10:58:49 -0500 Subject: [PATCH 02/20] updating docs --- .../ml-configuring-aggregations.asciidoc | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc index 6b15fed411dd6..8020721a5984c 100644 --- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc +++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc @@ -11,8 +11,8 @@ distributes these calculations across your cluster. You can then feed this aggregated data into the {ml-features} instead of raw results, which reduces the volume of data that must be considered while detecting anomalies. -TIP: If you use a terms aggregation and the cardinality of a term is high, you -should use composite aggregations. +TIP: If you use a terms aggregation and the cardinality of a term is high, +use composite aggregations instead. [discrete] [[aggs-limits-dfeeds]] @@ -25,7 +25,7 @@ This requirement ensures that the aggregated data is a time series and the times of each bucket is the time of the last record in the bucket. For `composite` aggregation support, there must be exactly one `date_histogram` value -source. Additionally, that value source must NOT be sorted in descending order. Additional +source. That value source must not be sorted in descending order. Additional `composite` aggregation value sources are allowed, such as `terms`. 
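For example, a `composite` aggregation that satisfies these requirements combines a
single ascending `date_histogram` value source with an additional `terms` value
source, plus the required `max` aggregation on the time field (the field names here
are illustrative):

[source,js]
----------------------------------
"aggregations": {
  "buckets": {
    "composite": {
      "size": 1000,
      "sources": [
        { "time_bucket": { "date_histogram": { "field": "time", "fixed_interval": "360s" } } },
        { "airline": { "terms": { "field": "airline" } } }
      ]
    },
    "aggregations": {
      "time": { "max": { "field": "time" } }
    }
  }
}
----------------------------------
// NOTCONSOLE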
IMPORTANT: The name of the aggregation and the name of the field that the agg
@@ -145,14 +145,14 @@ PUT _ml/datafeeds/datafeed-farequote
 `responsetime`.
 
 TIP: If you are utilizing a `term` aggregation to gather influencer or partition
-detector field information, consider using a `composite` aggregation. It will perform
-better than a `date_histogram` with a nested `term` aggregation and will also include
-ALL the values of the field instead of the top values per bucket.
+detector field information, consider using a `composite` aggregation. It performs
+better than a `date_histogram` with a nested `term` aggregation and also includes
+all the values of the field instead of the top values per bucket.
 
 Your {dfeed} can contain multiple aggregations, but only the ones with names that
 match values in the job configuration are fed to the job.
 
-Here is a similar example utilizing a `composite` aggregation instead of a
+Here is an example utilizing a `composite` aggregation instead of a
 `date_histogram`, assuming the same job configuration as above.
 
 [source,console]
----------------------------------
PUT _ml/anomaly_detectors/farequote-composite
----------------------------------
// TEST[skip:setup:farequote_data]
-Here is a datafeed utilizing a `composite` aggregation to bucket
-the metrics based on time and terms.
+This is an example of a datafeed utilizing a `composite` aggregation to bucket
+the metrics based on time and terms:
 
 [source,console]
----------------------------------
@@ -237,7 +237,7 @@ is named differently than your desired time field.
 aggregation `term` source with the name `airline` works. Note its name
 is the same as the field.
 <4> The required `max` aggregation whose name is the time field in the
-job analysis config
+job analysis config.
 <5> The `avg` aggregation is named `responsetime` and its field is also named
 `responsetime`.
@@ -345,7 +345,7 @@ number of unique entries for the `error` field.
 When you define an aggregation in a {dfeed}, it must have one of the following forms:
 
-When using a `date_histogram` aggregation to bucket by time
+When using a `date_histogram` aggregation to bucket by time:
 [source,js]
----------------------------------
"aggregations": {
@@ -379,7 +379,7 @@
----------------------------------
// NOTCONSOLE
 
-When using a `composite` aggregation
+When using a `composite` aggregation:
 
 [source,js]
----------------------------------
@@ -416,7 +416,7 @@
 The top level aggregation must be either a {ref}/search-aggregations-bucket.html[bucket
 aggregation] containing a single sub-aggregation that is a `date_histogram`, the top
 level aggregation is the
-required `date_histogram`, or the top leve aggregation is the required `composite`.
+required `date_histogram`, or the top level aggregation is the required `composite`.
 There must be exactly one `date_histogram` or `composite` aggregation. For more information, see
 {ref}/search-aggregations-bucket-datehistogram-aggregation.html[Date histogram aggregation] and
 {ref}/search-aggregations-bucket-composite-aggregation.html[Composite aggregation].
@@ -435,7 +435,7 @@ the time of the latest record within a bucket.
 
 You can optionally specify a terms aggregation, which creates buckets for
 different values of a field.
 
-TIP: Instead of nesting a `term` aggregation, try using `composite` aggs.
+TIP: Instead of nesting a `term` aggregation, use `composite` aggs.
 
 IMPORTANT: If you use a terms aggregation, by default it returns buckets for
 the top ten terms.
Thus if the cardinality of the term is greater than 10, not From dd69d162f611ed33e95aee30bb3c269b3d9cefd6 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Thu, 4 Mar 2021 11:07:12 -0500 Subject: [PATCH 03/20] fixing docs --- .../ml/anomaly-detection/ml-configuring-aggregations.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc index 8020721a5984c..9d9914af823b1 100644 --- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc +++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc @@ -182,6 +182,7 @@ the metrics based on time and terms: [source,console] ---------------------------------- +PUT _ml/datafeeds/datafeed-farequote-composite { "job_id": "farequote-composite", "indices": [ From d9e7c6fcdbb98876e3106d2b8cc2f65c61d4a886 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Fri, 5 Mar 2021 16:05:07 -0500 Subject: [PATCH 04/20] addressing PR comments --- .../core/ml/datafeed/DatafeedConfig.java | 2 + .../core/ml/datafeed/DatafeedConfigTests.java | 102 +++++++++--------- .../extractor/DataExtractorFactory.java | 23 ++-- .../AggregationToJsonProcessor.java | 18 ++-- .../CompositeAggregationDataExtractor.java | 13 ++- .../chunked/ChunkedDataExtractor.java | 8 +- .../writer/AbstractDataToProcessWriter.java | 13 ++- .../writer/CsvDataToProcessWriterTests.java | 43 +++++++- .../writer/JsonDataToProcessWriterTests.java | 41 ++++++- 9 files changed, 175 insertions(+), 88 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java index 47a88408c14bf..4d012c6037ac0 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java @@ -594,6 +594,8 @@ private static ChunkingConfig defaultChunkingConfig(@Nullable AggProvider aggPro } else { AggregationBuilder histogram = ExtractorUtils.getHistogramAggregation(aggProvider.getParsedAggs().getAggregatorFactories()); if (histogram instanceof CompositeAggregationBuilder) { + // TODO we need to come up with a good auto chunking heuristic for composite + // Or composite needs to be faster return ChunkingConfig.newOff(); } long histogramIntervalMillis = ExtractorUtils.getHistogramIntervalMillis(histogram); diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfigTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfigTests.java index e32d3c1f343ab..902ddfa0c7aaa 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfigTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfigTests.java @@ -675,61 +675,57 @@ public void testCheckHistogramAggregationHasChildMaxTimeAgg() { assertThat(e.getMessage(), containsString("Date histogram must have nested max aggregation for time_field [max_time]")); } - public void testValidateCompositeAggValueSources() { - - { // must have exactly one date value source - CompositeAggregationBuilder aggregationBuilder = AggregationBuilders.composite( - "buckets", - 
Arrays.asList(new TermsValuesSourceBuilder("foo").field("bar")) - ); - ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, - () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilder)); - assertThat(ex.getMessage(), containsString("must have exactly one date_histogram source")); - - CompositeAggregationBuilder aggregationBuilderWithMoreDateHisto = AggregationBuilders.composite( - "buckets", - Arrays.asList( - new TermsValuesSourceBuilder("foo").field("bar"), - new DateHistogramValuesSourceBuilder("date1").field("time").fixedInterval(DateHistogramInterval.days(1)), - new DateHistogramValuesSourceBuilder("date2").field("time").fixedInterval(DateHistogramInterval.days(1)) - ) - ); - ex = expectThrows(ElasticsearchStatusException.class, - () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilderWithMoreDateHisto)); - assertThat(ex.getMessage(), containsString("must have exactly one date_histogram source")); - } + public void testValidateCompositeAggValueSources_MustHaveExactlyOneDateValue() { + CompositeAggregationBuilder aggregationBuilder = AggregationBuilders.composite( + "buckets", + Arrays.asList(new TermsValuesSourceBuilder("foo").field("bar")) + ); + ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, + () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilder)); + assertThat(ex.getMessage(), containsString("must have exactly one date_histogram source")); - { // date histo with missing bucket - CompositeAggregationBuilder aggregationBuilder = AggregationBuilders.composite( - "buckets", - Arrays.asList( - new TermsValuesSourceBuilder("foo").field("bar"), - new DateHistogramValuesSourceBuilder("date1") - .field("time") - .fixedInterval(DateHistogramInterval.days(1)) - .missingBucket(true) - ) - ); - ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, - () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilder)); - assertThat(ex.getMessage(), containsString("does not support missing_buckets")); - } + CompositeAggregationBuilder aggregationBuilderWithMoreDateHisto = AggregationBuilders.composite( + "buckets", + Arrays.asList( + new TermsValuesSourceBuilder("foo").field("bar"), + new DateHistogramValuesSourceBuilder("date1").field("time").fixedInterval(DateHistogramInterval.days(1)), + new DateHistogramValuesSourceBuilder("date2").field("time").fixedInterval(DateHistogramInterval.days(1)) + ) + ); + ex = expectThrows(ElasticsearchStatusException.class, + () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilderWithMoreDateHisto)); + assertThat(ex.getMessage(), containsString("must have exactly one date_histogram source")); + } + public void testValidateCompositeAggValueSources_DateHistoWithMissingBucket() { + CompositeAggregationBuilder aggregationBuilder = AggregationBuilders.composite( + "buckets", + Arrays.asList( + new TermsValuesSourceBuilder("foo").field("bar"), + new DateHistogramValuesSourceBuilder("date1") + .field("time") + .fixedInterval(DateHistogramInterval.days(1)) + .missingBucket(true) + ) + ); + ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, + () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilder)); + assertThat(ex.getMessage(), containsString("does not support missing_buckets")); + } - { // date histo ordered descended - CompositeAggregationBuilder aggregationBuilder = 
AggregationBuilders.composite( - "buckets", - Arrays.asList( - new TermsValuesSourceBuilder("foo").field("bar"), - new DateHistogramValuesSourceBuilder("date1") - .field("time") - .fixedInterval(DateHistogramInterval.days(1)) - .order("desc") - ) - ); - ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, - () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilder)); - assertThat(ex.getMessage(), containsString("must be sorted in ascending order")); - } + public void testValidateCompositeAggValueSources_DateHistoBadOrder() { + CompositeAggregationBuilder aggregationBuilder = AggregationBuilders.composite( + "buckets", + Arrays.asList( + new TermsValuesSourceBuilder("foo").field("bar"), + new DateHistogramValuesSourceBuilder("date1") + .field("time") + .fixedInterval(DateHistogramInterval.days(1)) + .order("desc") + ) + ); + ElasticsearchStatusException ex = expectThrows(ElasticsearchStatusException.class, + () -> DatafeedConfig.Builder.validateCompositeAggregationSources(aggregationBuilder)); + assertThat(ex.getMessage(), containsString("must be sorted in ascending order")); } public void testValidateAggregations_GivenMulitpleHistogramAggs() { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java index 17ed908890fd9..d853834574d2e 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java @@ -39,8 +39,8 @@ static void create(Client client, NamedXContentRegistry xContentRegistry, DatafeedTimingStatsReporter timingStatsReporter, ActionListener listener) { - boolean hasAggs = datafeed.hasAggregations(); - boolean isComposite = hasAggs && datafeed.hasCompositeAgg(xContentRegistry); + final boolean hasAggs = datafeed.hasAggregations(); + final boolean isComposite = hasAggs && datafeed.hasCompositeAgg(xContentRegistry); ActionListener factoryHandler = ActionListener.wrap( factory -> listener.onResponse(datafeed.getChunkingConfig().isEnabled() ? new ChunkedDataExtractorFactory(client, datafeed, job, xContentRegistry, factory, timingStatsReporter) : factory) @@ -49,20 +49,21 @@ static void create(Client client, ActionListener getRollupIndexCapsActionHandler = ActionListener.wrap( response -> { - if (response.getJobs().isEmpty() == false && datafeed.hasAggregations() == false) { + final boolean hasRollup = response.getJobs().isEmpty() == false; + if (hasRollup && hasAggs == false) { listener.onFailure(new IllegalArgumentException("Aggregations are required when using Rollup indices")); return; } - if (datafeed.hasAggregations() == false) { + if (hasAggs == false) { ScrollDataExtractorFactory.create(client, datafeed, job, xContentRegistry, timingStatsReporter, factoryHandler); return; } if (isComposite) { String[] indices = datafeed.getIndices().toArray(new String[0]); IndicesOptions indicesOptions = datafeed.getIndicesOptions(); - AggregatedSearchRequestBuilder aggregatedSearchRequestBuilder = response.getJobs().isEmpty() ? - AggregationDataExtractorFactory.requestBuilder(client, indices, indicesOptions) : - RollupDataExtractorFactory.requestBuilder(client, indices, indicesOptions); + AggregatedSearchRequestBuilder aggregatedSearchRequestBuilder = hasRollup ? 
+ RollupDataExtractorFactory.requestBuilder(client, indices, indicesOptions) : + AggregationDataExtractorFactory.requestBuilder(client, indices, indicesOptions); factoryHandler.onResponse( new CompositeAggregationDataExtractorFactory( client, @@ -76,12 +77,12 @@ static void create(Client client, return; } - if (response.getJobs().isEmpty()) { // This means no rollup indexes are in the config - factoryHandler.onResponse( - new AggregationDataExtractorFactory(client, datafeed, job, xContentRegistry, timingStatsReporter)); - } else { + if (hasRollup) { RollupDataExtractorFactory.create( client, datafeed, job, response.getJobs(), xContentRegistry, timingStatsReporter, factoryHandler); + } else { + factoryHandler.onResponse( + new AggregationDataExtractorFactory(client, datafeed, job, xContentRegistry, timingStatsReporter)); } }, e -> { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java index 8c6a62e6c0f3b..e755cf0eb1169 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java @@ -160,7 +160,7 @@ private void processAggs(long docCount, List aggregations) throws I MultiBucketsAggregation bucketAgg = bucketAggregations.get(0); if (bucketAgg instanceof Histogram) { processDateHistogram((Histogram) bucketAgg); - } else if (bucketAgg instanceof CompositeAggregation && compositeAggDateValueSourceName != null) { + } else if (bucketAgg instanceof CompositeAggregation) { // This indicates that our composite agg contains our date histogram bucketing via one of its sources processCompositeAgg((CompositeAggregation) bucketAgg); } else { @@ -236,12 +236,14 @@ private void processCompositeAgg(CompositeAggregation agg) throws IOException { + "] but does not contain date_histogram value source"); } - // the date_histogram value source should order the buckets in ascending order - // So, skip forward until we are within the start time + // Composite aggs have multiple items in the bucket. It is possible that within the current + // date_histogram interval, there are still unprocessed terms, so we shouldn't skip forward past those buckets + // Instead, we skip according to the `max(timeField)` agg. 
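+        // Illustration (hypothetical values): with a 1000ms date_histogram source and extraction
+        // restarting at startTime = 1500, a bucket keyed at 1000 can still contain terms whose
+        // max(timeField) is, say, 1800 >= startTime; skipping on the bucket key alone would lose them.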
boolean checkBucketTime = true; for (CompositeAggregation.Bucket bucket : agg.getBuckets()) { if (checkBucketTime) { - long bucketTime = toHistogramKeyToEpoch(bucket.getKey().get(compositeAggDateValueSourceName)); + + long bucketTime = getTimeFieldValue(bucket.getAggregations().get(timeField)); if (bucketTime < startTime) { LOGGER.debug(() -> new ParameterizedMessage("Skipping bucket at [{}], startTime is [{}]", bucketTime, startTime)); continue; @@ -294,13 +296,15 @@ private long toHistogramKeyToEpoch(Object key) { } } - private void processTimeField(Aggregation agg) { + private long getTimeFieldValue(Aggregation agg) { if (agg instanceof Max == false) { throw new IllegalArgumentException(Messages.getMessage(Messages.DATAFEED_MISSING_MAX_AGGREGATION_FOR_TIME_FIELD, timeField)); } + return (long) ((Max) agg).value(); + } - long timestamp = (long) ((Max) agg).value(); - keyValuePairs.put(timeField, timestamp); + private void processTimeField(Aggregation agg) { + keyValuePairs.put(timeField, getTimeFieldValue(agg)); } boolean bucketAggContainsRequiredAgg(MultiBucketsAggregation aggregation) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java index 3d8c9dd662ef9..c6958b683480e 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java @@ -30,7 +30,6 @@ import java.util.NoSuchElementException; import java.util.Objects; import java.util.Optional; -import java.util.concurrent.atomic.AtomicReference; /** * An implementation that extracts data from elasticsearch using search with composite aggregations on a client. 
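The hunks below rework how the extractor stores the composite `after_key`. For context, a
minimal sketch of the paging contract it builds on (`runSearch` and `process` are assumed
helpers, not part of this patch):

    static void pageThrough(CompositeAggregationBuilder compositeAgg) {
        Map<String, Object> afterKey = null;
        while (true) {
            if (afterKey != null) {
                compositeAgg.aggregateAfter(afterKey);           // resume from the previous page
            }
            CompositeAggregation page = runSearch(compositeAgg); // assumed: executes the search
            if (page == null || page.getBuckets().isEmpty()) {
                break;                                           // no pages left
            }
            afterKey = page.afterKey();                          // cursor for the next request
            process(page.getBuckets());                          // assumed: consumes the buckets
        }
    }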
@@ -45,7 +44,7 @@ class CompositeAggregationDataExtractor implements DataExtractor { private static final Logger LOGGER = LogManager.getLogger(CompositeAggregationDataExtractor.class); - private final AtomicReference> afterKey = new AtomicReference<>(null); + private volatile Map afterKey = null; private final CompositeAggregationBuilder compositeAggregationBuilder; private final Client client; private final CompositeAggregationDataExtractorContext context; @@ -61,7 +60,7 @@ class CompositeAggregationDataExtractor implements DataExtractor { DatafeedTimingStatsReporter timingStatsReporter, AggregatedSearchRequestBuilder requestBuilder ) { - this.compositeAggregationBuilder = compositeAggregationBuilder; + this.compositeAggregationBuilder = Objects.requireNonNull(compositeAggregationBuilder); this.client = Objects.requireNonNull(client); this.context = Objects.requireNonNull(dataExtractorContext); this.timingStatsReporter = Objects.requireNonNull(timingStatsReporter); @@ -101,7 +100,7 @@ public Optional next() throws IOException { if (aggs == null) { LOGGER.trace(() -> new ParameterizedMessage("[{}] extraction finished", context.jobId)); hasNext = false; - afterKey.set(null); + afterKey = null; return Optional.empty(); } return Optional.of(processAggs(aggs)); @@ -127,8 +126,8 @@ private Aggregations search() { if (context.runtimeMappings.isEmpty() == false) { searchSourceBuilder.runtimeMappings(context.runtimeMappings); } - if (afterKey.get() != null) { - compositeAggregationBuilder.aggregateAfter(afterKey.get()); + if (afterKey != null) { + compositeAggregationBuilder.aggregateAfter(afterKey); } searchSourceBuilder.aggregation(compositeAggregationBuilder); ActionRequestBuilder searchRequest = requestBuilder.build(searchSourceBuilder); @@ -143,7 +142,7 @@ private Aggregations search() { if (compositeAgg == null || compositeAgg.getBuckets().isEmpty()) { return null; } - afterKey.set(compositeAgg.afterKey()); + afterKey = compositeAgg.afterKey(); return aggregations; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/chunked/ChunkedDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/chunked/ChunkedDataExtractor.java index 61263a48e77ee..0b675b171acba 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/chunked/ChunkedDataExtractor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/chunked/ChunkedDataExtractor.java @@ -265,9 +265,9 @@ private RollupSearchAction.RequestBuilder rollupRangeSearchRequest() { private class ScrolledDataSummary implements DataSummary { - private long earliestTime; - private long latestTime; - private long totalHits; + private final long earliestTime; + private final long latestTime; + private final long totalHits; private ScrolledDataSummary(long earliestTime, long latestTime, long totalHits) { this.earliestTime = earliestTime; @@ -309,7 +309,7 @@ public boolean hasData() { } } - private class AggregatedDataSummary implements DataSummary { + private static class AggregatedDataSummary implements DataSummary { private final double earliestTime; private final double latestTime; diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/AbstractDataToProcessWriter.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/AbstractDataToProcessWriter.java index 9dc261abffb32..ede0c278ac083 100644 --- 
a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/AbstractDataToProcessWriter.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/AbstractDataToProcessWriter.java @@ -33,6 +33,8 @@ import java.util.Set; import java.util.function.BiConsumer; +import static org.elasticsearch.xpack.core.ml.utils.Intervals.alignToFloor; + public abstract class AbstractDataToProcessWriter implements DataToProcessWriter { private static final int TIME_FIELD_OUT_INDEX = 0; @@ -48,7 +50,8 @@ public abstract class AbstractDataToProcessWriter implements DataToProcessWriter private final Logger logger; private final DateTransformer dateTransformer; - private long latencySeconds; + private final long bucketSpanMs; + private final long latencySeconds; protected Map inFieldIndexes; protected List inputOutputMap; @@ -68,6 +71,7 @@ protected AbstractDataToProcessWriter(boolean includeControlField, boolean inclu this.dataCountsReporter = Objects.requireNonNull(dataCountsReporter); this.logger = Objects.requireNonNull(logger); this.latencySeconds = analysisConfig.getLatency() == null ? 0 : analysisConfig.getLatency().seconds(); + this.bucketSpanMs = analysisConfig.getBucketSpan().getMillis(); Date date = dataCountsReporter.getLatestRecordTime(); latestEpochMsThisUpload = 0; @@ -178,9 +182,12 @@ protected boolean transformTimeAndWrite(String[] record, long numberOfFieldsRead } record[TIME_FIELD_OUT_INDEX] = Long.toString(epochMs / MS_IN_SECOND); + long latestBucketFloor = alignToFloor(latestEpochMs, bucketSpanMs); // Records have epoch seconds timestamp so compare for out of order in seconds - if (epochMs / MS_IN_SECOND < latestEpochMs / MS_IN_SECOND - latencySeconds) { + // We care only about records that are older than the current bucket according to our latest timestamp + // The native side handles random order within the same bucket without issue + if (epochMs / MS_IN_SECOND < latestBucketFloor / MS_IN_SECOND - latencySeconds) { // out of order dataCountsReporter.reportOutOfOrderRecord(numberOfFieldsRead); @@ -325,7 +332,7 @@ protected abstract boolean checkForMissingFields(Collection inputFields, /** * Input and output array indexes map */ - protected class InputOutputMap { + protected static class InputOutputMap { int inputIndex; int outputIndex; diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/CsvDataToProcessWriterTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/CsvDataToProcessWriterTests.java index 760f1bdd99812..aa817da721917 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/CsvDataToProcessWriterTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/CsvDataToProcessWriterTests.java @@ -86,7 +86,9 @@ public Void answer(InvocationOnMock invocation) throws Throwable { dataDescription.setTimeFormat(DataDescription.EPOCH); Detector detector = new Detector.Builder("metric", "value").build(); - analysisConfig = new AnalysisConfig.Builder(Collections.singletonList(detector)).build(); + analysisConfig = new AnalysisConfig.Builder(Collections.singletonList(detector)) + .setBucketSpan(TimeValue.timeValueSeconds(1)) + .build(); } public void testWrite_GivenTimeFormatIsEpochAndDataIsValid() throws IOException { @@ -198,6 +200,7 @@ public void testWrite_GivenTimeFormatIsEpochAndSomeTimestampsWithinLatencySomeOu AnalysisConfig.Builder 
builder = new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build())); builder.setLatency(TimeValue.timeValueSeconds(2)); + builder.setBucketSpan(TimeValue.timeValueSeconds(1)); analysisConfig = builder.build(); StringBuilder input = new StringBuilder(); @@ -228,6 +231,44 @@ public void testWrite_GivenTimeFormatIsEpochAndSomeTimestampsWithinLatencySomeOu verify(dataCountsReporter).finishReporting(); } + public void testWrite_GivenTimeFormatIsEpochAndSomeTimestampsOutOfOrderWithinBucketSpan() throws Exception { + AnalysisConfig.Builder builder = + new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build())); + builder.setBucketSpan(TimeValue.timeValueSeconds(10)); + analysisConfig = builder.build(); + + StringBuilder input = new StringBuilder(); + input.append("time,metric,value\n"); + input.append("4,foo,4.0\n"); + input.append("5,foo,5.0\n"); + input.append("3,foo,3.0\n"); + input.append("4,bar,4.0\n"); + input.append("2,bar,2.0\n"); + input.append("12,bar,12.0\n"); + input.append("2,bar,2.0\n"); + input.append("\0"); + InputStream inputStream = createInputStream(input.toString()); + CsvDataToProcessWriter writer = createWriter(); + writer.writeHeader(); + writer.write(inputStream, null, null, (r, e) -> {}); + verify(dataCountsReporter, times(1)).startNewIncrementalCount(); + + List expectedRecords = new ArrayList<>(); + // The final field is the control field + expectedRecords.add(new String[] { "time", "value", "." }); + expectedRecords.add(new String[] { "4", "4.0", "" }); + expectedRecords.add(new String[] { "5", "5.0", "" }); + expectedRecords.add(new String[] { "3", "3.0", "" }); + expectedRecords.add(new String[] { "4", "4.0", "" }); + expectedRecords.add(new String[] { "2", "2.0", "" }); + expectedRecords.add(new String[] { "12", "12.0", "" }); + assertWrittenRecordsEqualTo(expectedRecords); + + verify(dataCountsReporter, times(1)).reportOutOfOrderRecord(2); + verify(dataCountsReporter, never()).reportLatestTimeIncrementalStats(anyLong()); + verify(dataCountsReporter).finishReporting(); + } + public void testWrite_NullByte() throws IOException { AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build())); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/JsonDataToProcessWriterTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/JsonDataToProcessWriterTests.java index 229f7f260a634..9d35755707ffb 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/JsonDataToProcessWriterTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/JsonDataToProcessWriterTests.java @@ -87,7 +87,9 @@ public Void answer(InvocationOnMock invocation) throws Throwable { dataDescription.setTimeFormat(DataDescription.EPOCH); Detector detector = new Detector.Builder("metric", "value").build(); - analysisConfig = new AnalysisConfig.Builder(Collections.singletonList(detector)).build(); + analysisConfig = new AnalysisConfig.Builder(Collections.singletonList(detector)) + .setBucketSpan(TimeValue.timeValueSeconds(1)) + .build(); } public void testWrite_GivenTimeFormatIsEpochAndDataIsValid() throws Exception { @@ -168,10 +170,45 @@ public void testWrite_GivenTimeFormatIsEpochAndTimestampsAreOutOfOrder() throws verify(dataCountsReporter).finishReporting(); } + public 
void testWrite_GivenTimeFormatIsEpochAndSomeTimestampsOutOfOrderWithinBucketSpan() throws Exception { + AnalysisConfig.Builder builder = + new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build())); + builder.setBucketSpan(TimeValue.timeValueSeconds(10)); + analysisConfig = builder.build(); + + StringBuilder input = new StringBuilder(); + input.append("{\"time\":\"4\", \"metric\":\"foo\", \"value\":\"4.0\"}"); + input.append("{\"time\":\"5\", \"metric\":\"foo\", \"value\":\"5.0\"}"); + input.append("{\"time\":\"3\", \"metric\":\"bar\", \"value\":\"3.0\"}"); + input.append("{\"time\":\"4\", \"metric\":\"bar\", \"value\":\"4.0\"}"); + input.append("{\"time\":\"2\", \"metric\":\"bar\", \"value\":\"2.0\"}"); + input.append("{\"time\":\"12\", \"metric\":\"bar\", \"value\":\"12.0\"}"); + input.append("{\"time\":\"2\", \"metric\":\"bar\", \"value\":\"2.0\"}"); + InputStream inputStream = createInputStream(input.toString()); + JsonDataToProcessWriter writer = createWriter(); + writer.writeHeader(); + writer.write(inputStream, null, XContentType.JSON, (r, e) -> {}); + + List expectedRecords = new ArrayList<>(); + // The final field is the control field + expectedRecords.add(new String[]{"time", "value", "."}); + expectedRecords.add(new String[]{"4", "4.0", ""}); + expectedRecords.add(new String[]{"5", "5.0", ""}); + expectedRecords.add(new String[]{"3", "3.0", ""}); + expectedRecords.add(new String[]{"4", "4.0", ""}); + expectedRecords.add(new String[]{"2", "2.0", ""}); + expectedRecords.add(new String[]{"12", "12.0", ""}); + assertWrittenRecordsEqualTo(expectedRecords); + + verify(dataCountsReporter, times(1)).reportOutOfOrderRecord(2); + verify(dataCountsReporter, never()).reportLatestTimeIncrementalStats(anyLong()); + verify(dataCountsReporter).finishReporting(); + } + public void testWrite_GivenTimeFormatIsEpochAndSomeTimestampsWithinLatencySomeOutOfOrder() throws Exception { AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build())); - builder.setLatency(TimeValue.timeValueSeconds(2)); + builder.setBucketSpan(TimeValue.timeValueSeconds(1)).setLatency(TimeValue.timeValueSeconds(2)); analysisConfig = builder.build(); StringBuilder input = new StringBuilder(); From 3fc41599faef7927d550007bdde4618c679a9e38 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Tue, 16 Mar 2021 11:22:19 -0400 Subject: [PATCH 05/20] fixing bugs and addressing PR comments --- docs/reference/ml/ml-shared.asciidoc | 13 ++-- .../core/ml/datafeed/ChunkingConfig.java | 4 ++ .../core/ml/datafeed/DatafeedConfig.java | 3 +- .../xpack/ml/datafeed/DatafeedJob.java | 2 +- .../extractor/DataExtractorFactory.java | 27 +++++--- .../AggregatedSearchRequestBuilder.java | 1 - .../AggregationToJsonProcessor.java | 42 +++++++++-- .../CompositeAggregationDataExtractor.java | 38 +++++++--- .../RollupDataExtractorFactory.java | 7 -- .../extractor/DataExtractorFactoryTests.java | 38 ++++++++++ .../RollupDataExtractorFactoryTests.java | 69 ------------------- 11 files changed, 132 insertions(+), 112 deletions(-) delete mode 100644 x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactoryTests.java diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc index 9fdc1ef306bf6..e5db0be318861 100644 --- a/docs/reference/ml/ml-shared.asciidoc +++ b/docs/reference/ml/ml-shared.asciidoc @@ -1,6 +1,7 
@@ tag::aggregations[] If set, the {dfeed} performs aggregation searches. Support for aggregations is -limited. For more information, see +limited and should be used only with low cardinality data. For more information, +see {ml-docs}/ml-configuring-aggregation.html[Aggregating data for faster performance]. end::aggregations[] @@ -155,8 +156,7 @@ tag::bucket-span[] The size of the interval that the analysis is aggregated into, typically between `5m` and `1h`. The default value is `5m`. If the {anomaly-job} uses a {dfeed} with {ml-docs}/ml-configuring-aggregation.html[aggregations], this value must be -divisible by the interval of the `date_histogram` aggregation or the -`date_histogram` source in the `composite` aggregation. For more +divisible by the interval of the date histogram aggregation. For more information, see {ml-docs}/ml-buckets.html[Buckets]. end::bucket-span[] @@ -824,8 +824,7 @@ for longer bucket spans, a sensible fraction of the bucket span. For example: `150s`. When `frequency` is shorter than the bucket span, interim results for the last (partial) bucket are written then eventually overwritten by the full bucket results. If the {dfeed} uses aggregations, this value must be divisible -by the interval of the `date_histogram` or the `date_histogram` source in the -`composite` aggregation. +by the interval of the date histogram aggregation. end::frequency[] tag::frequent-category-count[] @@ -1130,8 +1129,8 @@ There are three available modes: * `auto`: The chunk size is dynamically calculated. This is the default and recommended value when the {dfeed} does not use aggregations. * `manual`: Chunking is applied according to the specified `time_span`. Use this -mode when the {dfeed} uses aggregations other than `composite`. -* `off`: No chunking is applied. Recommended for `composite` aggregations. +mode when the {dfeed} uses aggregations. +* `off`: No chunking is applied. 
-- end::mode[] diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/ChunkingConfig.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/ChunkingConfig.java index cf213c0bd6a4a..97690008247cb 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/ChunkingConfig.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/ChunkingConfig.java @@ -86,6 +86,10 @@ public boolean isEnabled() { return mode != Mode.OFF; } + public boolean isManual() { + return mode == Mode.MANUAL; + } + Mode getMode() { return mode; } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java index 4d012c6037ac0..b25a774cca822 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java @@ -594,8 +594,7 @@ private static ChunkingConfig defaultChunkingConfig(@Nullable AggProvider aggPro } else { AggregationBuilder histogram = ExtractorUtils.getHistogramAggregation(aggProvider.getParsedAggs().getAggregatorFactories()); if (histogram instanceof CompositeAggregationBuilder) { - // TODO we need to come up with a good auto chunking heuristic for composite - // Or composite needs to be faster + // Allow composite aggs to handle the underlying chunking and searching return ChunkingConfig.newOff(); } long histogramIntervalMillis = ExtractorUtils.getHistogramIntervalMillis(histogram); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/DatafeedJob.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/DatafeedJob.java index 780f8e5adc935..5fe9ba565327e 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/DatafeedJob.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/DatafeedJob.java @@ -354,7 +354,7 @@ private void run(long start, long end, FlushJobAction.Request flushRequest) thro if (isIsolated) { return; } - LOGGER.error(new ParameterizedMessage("[{}] error while posting data", jobId), e); + LOGGER.error(new ParameterizedMessage("[{}] error while posting data", jobId), e); // a conflict exception means the job state is not open any more. // we should therefore stop the datafeed. 
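For reference, a compressed sketch of the default chunking decision in `DatafeedConfig`
after this patch; the non-composite branch and its bucket multiplier are illustrative
simplifications, not the exact production logic:

    static ChunkingConfig defaultChunking(AggregationBuilder histogram) {
        if (histogram instanceof CompositeAggregationBuilder) {
            // composite paging already walks the data page by page,
            // so the chunked extractor is not layered on top of it
            return ChunkingConfig.newOff();
        }
        // otherwise chunk over many histogram buckets at a time (multiplier illustrative)
        long intervalMillis = ExtractorUtils.getHistogramIntervalMillis(histogram);
        return ChunkingConfig.newManual(TimeValue.timeValueMillis(intervalMillis * 1000));
    }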
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java index d853834574d2e..3fc779bd36b5f 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactory.java @@ -58,22 +58,31 @@ static void create(Client client, ScrollDataExtractorFactory.create(client, datafeed, job, xContentRegistry, timingStatsReporter, factoryHandler); return; } + if (hasRollup && datafeed.getRuntimeMappings().isEmpty() == false) { + // TODO Rollup V2 will support runtime fields + listener.onFailure(new IllegalArgumentException("The datafeed has runtime_mappings defined, " + + "runtime fields are not supported in rollup searches")); + return; + } if (isComposite) { String[] indices = datafeed.getIndices().toArray(new String[0]); IndicesOptions indicesOptions = datafeed.getIndicesOptions(); AggregatedSearchRequestBuilder aggregatedSearchRequestBuilder = hasRollup ? RollupDataExtractorFactory.requestBuilder(client, indices, indicesOptions) : AggregationDataExtractorFactory.requestBuilder(client, indices, indicesOptions); - factoryHandler.onResponse( - new CompositeAggregationDataExtractorFactory( - client, - datafeed, - job, - xContentRegistry, - timingStatsReporter, - aggregatedSearchRequestBuilder - ) + final DataExtractorFactory dataExtractorFactory = new CompositeAggregationDataExtractorFactory( + client, + datafeed, + job, + xContentRegistry, + timingStatsReporter, + aggregatedSearchRequestBuilder ); + if (datafeed.getChunkingConfig().isManual()) { + factoryHandler.onResponse(dataExtractorFactory); + } else { + listener.onResponse(dataExtractorFactory); + } return; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregatedSearchRequestBuilder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregatedSearchRequestBuilder.java index bc120586ed884..a6c6c3fb27a4f 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregatedSearchRequestBuilder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregatedSearchRequestBuilder.java @@ -18,7 +18,6 @@ * * Implementations can be found for regular searches and rollup searches. 
*/ -@FunctionalInterface public interface AggregatedSearchRequestBuilder { ActionRequestBuilder build(SearchSourceBuilder searchSourceBuilder); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java index e755cf0eb1169..d607090fa7979 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java @@ -41,6 +41,7 @@ import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; +import java.util.function.Predicate; import java.util.stream.Collectors; /** @@ -243,7 +244,7 @@ private void processCompositeAgg(CompositeAggregation agg) throws IOException { for (CompositeAggregation.Bucket bucket : agg.getBuckets()) { if (checkBucketTime) { - long bucketTime = getTimeFieldValue(bucket.getAggregations().get(timeField)); + long bucketTime = toHistogramKeyToEpoch(bucket.getKey().get(compositeAggDateValueSourceName)); if (bucketTime < startTime) { LOGGER.debug(() -> new ParameterizedMessage("Skipping bucket at [{}], startTime is [{}]", bucketTime, startTime)); continue; @@ -296,15 +297,13 @@ private long toHistogramKeyToEpoch(Object key) { } } - private long getTimeFieldValue(Aggregation agg) { + private void processTimeField(Aggregation agg) { if (agg instanceof Max == false) { throw new IllegalArgumentException(Messages.getMessage(Messages.DATAFEED_MISSING_MAX_AGGREGATION_FOR_TIME_FIELD, timeField)); } - return (long) ((Max) agg).value(); - } - private void processTimeField(Aggregation agg) { - keyValuePairs.put(timeField, getTimeFieldValue(agg)); + long timestamp = (long) ((Max) agg).value(); + keyValuePairs.put(timeField, timestamp); } boolean bucketAggContainsRequiredAgg(MultiBucketsAggregation aggregation) { @@ -456,6 +455,37 @@ boolean writeDocs(int batchSize, OutputStream outputStream) throws IOException { return docsByBucketTimestamp.isEmpty() == false; } + /** + * This writes ALL the documents stored within the processor object unless indicated otherwise by the `shouldCancel` predicate + * + * This returns `true` if it is safe to cancel the overall process as the current `date_histogram` bucket has finished. 
+     * @param shouldCancel determines if a given timestamp indicates that the processing stream should be cancelled
+     * @param outputStream where to write the aggregated data
+     * @return true if it is acceptable for the caller to close the process and cancel the stream
+     * @throws IOException if there is a parsing exception
+     */
+    boolean writeAllDocsCancellable(Predicate<Long> shouldCancel, OutputStream outputStream) throws IOException {
+        if (docsByBucketTimestamp.isEmpty()) {
+            return true;
+        }
+
+        try (XContentBuilder jsonBuilder = new XContentBuilder(JsonXContent.jsonXContent, outputStream)) {
+            Iterator<Map.Entry<Long, List<Map<String, Object>>>> iterator = docsByBucketTimestamp.entrySet().iterator();
+            while (iterator.hasNext()) {
+                Map.Entry<Long, List<Map<String, Object>>> entry = iterator.next();
+                if (shouldCancel.test(entry.getKey())) {
+                    return true;
+                }
+                for (Map<String, Object> map : entry.getValue()) {
+                    writeJsonObject(jsonBuilder, map);
+                }
+                iterator.remove();
+            }
+        }
+
+        return false;
+    }
+
     private void writeJsonObject(XContentBuilder jsonBuilder, Map<String, Object> record) throws IOException {
         jsonBuilder.startObject();
         for (Map.Entry<String, Object> keyValue : record.entrySet()) {
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java
index c6958b683480e..8fc0e208b0a7a 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java
@@ -50,6 +50,7 @@ class CompositeAggregationDataExtractor implements DataExtractor {
     private final CompositeAggregationDataExtractorContext context;
     private final DatafeedTimingStatsReporter timingStatsReporter;
     private final AggregatedSearchRequestBuilder requestBuilder;
+    private final long interval;

     private boolean hasNext;
     private boolean isCancelled;
@@ -65,6 +66,7 @@ class CompositeAggregationDataExtractor implements DataExtractor {
         this.context = Objects.requireNonNull(dataExtractorContext);
         this.timingStatsReporter = Objects.requireNonNull(timingStatsReporter);
         this.requestBuilder = Objects.requireNonNull(requestBuilder);
+        this.interval = ExtractorUtils.getHistogramIntervalMillis(compositeAggregationBuilder);
         this.hasNext = true;
     }

@@ -82,7 +84,6 @@ public boolean isCancelled() {
     public void cancel() {
         LOGGER.debug(() -> new ParameterizedMessage("[{}] Data extractor received cancel request", context.jobId));
         isCancelled = true;
-        hasNext = false;
     }

     @Override
@@ -107,21 +108,22 @@ public Optional<InputStream> next() throws IOException {
     }

     private Aggregations search() {
-        long histogramSearchStartTime = Math.max(
-            0,
-            context.start - ExtractorUtils.getHistogramIntervalMillis(compositeAggregationBuilder)
-        );
+        // Compared to the normal aggregation implementation, this search does not request the previous bucket's data.
+        // For composite aggs, since they page through results, the previous page's results are not available while processing the current page.
+        // Aggregations like derivative cannot work within composite aggs, for now.
+        // Also, it doesn't make sense to have a derivative when grouping by time AND by some other criteria.
+
         LOGGER.trace(
             () -> new ParameterizedMessage(
                 "[{}] Executing composite aggregated search from [{}] to [{}]",
                 context.jobId,
-                histogramSearchStartTime,
+                context.start,
                 context.end
             )
         );
         SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder()
             .size(0)
-            .query(ExtractorUtils.wrapInTimeRangeQuery(context.query, context.timeField, histogramSearchStartTime, context.end));
+            .query(ExtractorUtils.wrapInTimeRangeQuery(context.query, context.timeField, context.start, context.end));

         if (context.runtimeMappings.isEmpty() == false) {
             searchSourceBuilder.runtimeMappings(context.runtimeMappings);
@@ -166,9 +168,25 @@ private InputStream processAggs(Aggregations aggs) throws IOException {
             ));
         aggregationToJsonProcessor.process(aggs);
         ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
-        boolean moreToWrite = aggregationToJsonProcessor.writeDocs(Integer.MAX_VALUE, outputStream);
-        while (moreToWrite) {
-            moreToWrite = aggregationToJsonProcessor.writeDocs(Integer.MAX_VALUE, outputStream);
+        final boolean hasAfterKey = afterKey != null && (afterKey.get(context.compositeAggDateHistogramGroupSourceName) instanceof Long);
+        final Long nextBucket = hasAfterKey ? (Long) afterKey.get(context.compositeAggDateHistogramGroupSourceName) + interval : null;
+        boolean cancellable = aggregationToJsonProcessor.writeAllDocsCancellable(
+            timestamp -> {
+                if (isCancelled) {
+                    // If we have not processed a single composite agg page yet and we are cancelled,
+                    // we should not process anything
+                    if (hasAfterKey == false) {
+                        return true;
+                    }
+                    // If the current timestamp is part of the next bucket given our current afterKey, the process can be cancelled
+                    // as it has passed into the next date_histogram bucket in the composite aggregation paging
+                    return timestamp >= nextBucket;
+                }
+                return false;
+            }, outputStream);
+        // If extraction was cancelled and it is now safe to stop, indicate that there are no more buckets to process.
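+        // Illustration (hypothetical values): if the stored afterKey holds time = 3000 and the interval
+        // is 1000, then nextBucket = 4000; on cancel, any queued doc with timestamp >= 4000 belongs to a
+        // date_histogram bucket that was never started, so stopping before writing it is safe.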
+        if (isCancelled && cancellable) {
+            hasNext = false;
+        }
         return new ByteArrayInputStream(outputStream.toByteArray());
     }

diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactory.java
index 0aa5f1dee2b14..35c66f83128d4 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactory.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactory.java
@@ -106,13 +106,6 @@ public static void create(Client client,
                               DatafeedTimingStatsReporter timingStatsReporter,
                               ActionListener<DataExtractorFactory> listener) {

-        if (datafeed.getRuntimeMappings().isEmpty() == false) {
-            // TODO Rollup V2 will support runtime fields
-            listener.onFailure(new IllegalArgumentException("The datafeed has runtime_mappings defined, "
-                + "runtime fields are not supported in rollup searches"));
-            return;
-        }
-
         final AggregationBuilder datafeedHistogramAggregation = getHistogramAggregation(
             datafeed.getParsedAggregations(xContentRegistry).getAggregatorFactories());
         if ((datafeedHistogramAggregation instanceof DateHistogramAggregationBuilder) == false) {
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactoryTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactoryTests.java
index 32b1a71ccfe75..a868b0447df01 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactoryTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/DataExtractorFactoryTests.java
@@ -209,6 +209,44 @@ public void testCreateDataExtractorFactoryGivenDefaultAggregationWithAutoChunk()
             client, datafeedConfig.build(), jobBuilder.build(new Date()), xContentRegistry(), timingStatsReporter, listener);
     }

+    public void testCreateDataExtractorFactoryGivenRollupAndValidAggregationAndRuntimeFields() {
+        givenAggregatableRollup("myField", "max", 5, "termField");
+        DataDescription.Builder dataDescription = new DataDescription.Builder();
+        dataDescription.setTimeField("time");
+        Job.Builder jobBuilder = DatafeedManagerTests.createDatafeedJob();
+        jobBuilder.setDataDescription(dataDescription);
+        DatafeedConfig.Builder datafeedConfig = DatafeedManagerTests.createDatafeedConfig("datafeed1", "foo");
+        datafeedConfig.setChunkingConfig(ChunkingConfig.newOff());
+        MaxAggregationBuilder maxTime = AggregationBuilders.max("time").field("time");
+        MaxAggregationBuilder myField = AggregationBuilders.max("myField").field("myField");
+        TermsAggregationBuilder myTerm = AggregationBuilders.terms("termAgg").field("termField").subAggregation(myField);
+        Map<String, Object> settings = new HashMap<>();
+        settings.put("type", "keyword");
+        settings.put("script", "");
+        Map<String, Object> field = new HashMap<>();
+        field.put("runtime_field_bar", settings);
+        datafeedConfig.setParsedAggregations(AggregatorFactories.builder().addAggregator(
+            AggregationBuilders.dateHistogram("time")
+                .fixedInterval(new DateHistogramInterval("600000ms"))
+                .subAggregation(maxTime)
+                .subAggregation(myTerm)
+                .field("time")))
+            .setRuntimeMappings(field);
+        ActionListener<DataExtractorFactory> listener = ActionListener.wrap(
+            dataExtractorFactory -> fail(),
+            e -> {
+                assertThat(
+                    e.getMessage(),
+                    equalTo("The datafeed has runtime_mappings defined, runtime fields are 
not supported in rollup searches") + ); + assertThat(e, instanceOf(IllegalArgumentException.class)); + } + ); + DataExtractorFactory.create( + client, datafeedConfig.build(), jobBuilder.build(new Date()), xContentRegistry(), timingStatsReporter, listener); + } + + public void testCreateDataExtractorFactoryGivenRollupAndValidAggregation() { givenAggregatableRollup("myField", "max", 5, "termField"); DataDescription.Builder dataDescription = new DataDescription.Builder(); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactoryTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactoryTests.java deleted file mode 100644 index 2c6ae2b5a6a63..0000000000000 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/RollupDataExtractorFactoryTests.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0; you may not use this file except in compliance with the Elastic License - * 2.0. - */ - -package org.elasticsearch.xpack.ml.datafeed.extractor.aggregation; - -import org.elasticsearch.action.ActionListener; -import org.elasticsearch.client.Client; -import org.elasticsearch.test.ESTestCase; -import org.elasticsearch.xpack.core.ml.datafeed.DatafeedConfig; -import org.elasticsearch.xpack.core.ml.job.config.AnalysisConfig; -import org.elasticsearch.xpack.core.ml.job.config.DataDescription; -import org.elasticsearch.xpack.core.ml.job.config.Detector; -import org.elasticsearch.xpack.core.ml.job.config.Job; -import org.elasticsearch.xpack.ml.datafeed.DatafeedTimingStatsReporter; -import org.elasticsearch.xpack.ml.datafeed.extractor.DataExtractorFactory; - -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.atomic.AtomicReference; - -import static org.hamcrest.Matchers.equalTo; -import static org.hamcrest.Matchers.instanceOf; -import static org.mockito.Mockito.mock; - -public class RollupDataExtractorFactoryTests extends ESTestCase { - - public void testCreateWithRuntimeFields() { - String jobId = "foojob"; - - Detector.Builder detectorBuilder = new Detector.Builder(); - detectorBuilder.setFunction("sum"); - detectorBuilder.setFieldName("value"); - AnalysisConfig.Builder analysisConfig = new AnalysisConfig.Builder(Collections.singletonList(detectorBuilder.build())); - Job.Builder jobBuilder = new Job.Builder(jobId); - jobBuilder.setDataDescription(new DataDescription.Builder()); - jobBuilder.setAnalysisConfig(analysisConfig); - - DatafeedConfig.Builder datafeedConfigBuilder = new DatafeedConfig.Builder("foo-feed", jobId); - datafeedConfigBuilder.setIndices(Collections.singletonList("my_index")); - - Map settings = new HashMap<>(); - settings.put("type", "keyword"); - settings.put("script", ""); - Map field = new HashMap<>(); - field.put("runtime_field_bar", settings); - datafeedConfigBuilder.setRuntimeMappings(field); - - AtomicReference exceptionRef = new AtomicReference<>(); - ActionListener listener = ActionListener.wrap( - r -> fail("unexpected response"), - exceptionRef::set - ); - - RollupDataExtractorFactory.create(mock(Client.class), datafeedConfigBuilder.build(), jobBuilder.build(new Date()), - Collections.emptyMap(), xContentRegistry(), mock(DatafeedTimingStatsReporter.class), listener); - - 
assertNotNull(exceptionRef.get());
-        Exception e = exceptionRef.get();
-        assertThat(e, instanceOf(IllegalArgumentException.class));
-        assertThat(e.getMessage(), equalTo("The datafeed has runtime_mappings defined, " +
-            "runtime fields are not supported in rollup searches"));
-    }
-}

From 4fc54920d61662431c5268f933cad03fdfed0bdd Mon Sep 17 00:00:00 2001
From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com>
Date: Tue, 16 Mar 2021 13:56:25 -0400
Subject: [PATCH 06/20] adding docs around composite agg limitations

---
 .../ml-configuring-aggregations.asciidoc      | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc
index 9d9914af823b1..3214d6454997d 100644
--- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc
+++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc
@@ -149,6 +149,11 @@ detector field information, consider using a `composite` aggregation. It perform
 better than a `date_histogram` with a nested `term` aggregation and also includes
 all the values of the field instead of the top values per bucket.

+NOTE: `composite` aggregations may not perform as well as a scroll-based or
+`date_histogram` based {dfeed}. `composite` aggregations are optimized
+for queries that are either `match_all` or `range` filters. Other types of
+queries may cause the `composite` aggregation to be inefficient.
+
 Your {dfeed} can contain multiple aggregations, but only the ones with names
 that match values in the job configuration are fed to the job.

@@ -436,7 +441,8 @@ the time of the latest record within a bucket. You can optionally specify a
 terms aggregation, which creates buckets for different values of a field.

-TIP: Instead of nesting a `term` aggregation, use `composite` aggs.
+TIP: Instead of nesting a `term` aggregation, use `composite` aggs when you
+have many terms and a simple range query.

 IMPORTANT: If you use a terms aggregation, by default it returns buckets for
 the top ten terms.
Thus if the cardinality of the term is greater than 10, not From 673460c15b324c76fda20aaff215d39f2333264b Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Wed, 17 Mar 2021 09:36:59 -0400 Subject: [PATCH 07/20] adding integration test --- .../xpack/ml/integration/DatafeedJobsIT.java | 127 +++++++++++++++++- .../action/TransportStopDatafeedAction.java | 2 +- 2 files changed, 126 insertions(+), 3 deletions(-) diff --git a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java index d24b8bc44e41d..947263c4ac842 100644 --- a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java +++ b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java @@ -12,10 +12,14 @@ import org.elasticsearch.action.admin.cluster.node.hotthreads.NodesHotThreadsResponse; import org.elasticsearch.action.support.master.AcknowledgedResponse; import org.elasticsearch.common.CheckedRunnable; -import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.util.concurrent.ConcurrentCollections; import org.elasticsearch.common.util.concurrent.ConcurrentMapLong; +import org.elasticsearch.search.aggregations.AggregationBuilders; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.bucket.composite.DateHistogramValuesSourceBuilder; +import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval; +import org.elasticsearch.xpack.core.ml.MlTasks; import org.elasticsearch.xpack.core.ml.action.DeleteDatafeedAction; import org.elasticsearch.xpack.core.ml.action.GetDatafeedsStatsAction; import org.elasticsearch.xpack.core.ml.action.GetJobsStatsAction; @@ -32,9 +36,10 @@ import org.elasticsearch.xpack.core.ml.job.config.Job; import org.elasticsearch.xpack.core.ml.job.config.JobState; import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.DataCounts; +import org.elasticsearch.xpack.core.ml.job.results.Bucket; +import org.hamcrest.CoreMatchers; import org.hamcrest.Matcher; import org.junit.After; -import org.junit.Before; import java.time.Duration; import java.time.Instant; @@ -54,6 +59,8 @@ import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.oneOf; public class DatafeedJobsIT extends MlNativeAutodetectIntegTestCase { @@ -287,6 +294,122 @@ public void testDatafeedTimingStats_QueryDelayUpdated_TimingStatsNotReset() thro assertDatafeedStats(datafeedId, DatafeedState.STOPPED, job.getId(), greaterThan(0L)); } + public void testStopAndRestartCompositeDatafeed() throws Exception { + String indexName = "stop-restart-data"; + client().admin().indices().prepareCreate("stop-restart-data") + .setMapping("time", "type=date") + .get(); + long numDocs = randomIntBetween(32, 2048); + long now = System.currentTimeMillis(); + long oneWeekAgo = now - 604800000; + long twoWeeksAgo = oneWeekAgo - 604800000; + indexDocs(logger, indexName, numDocs, twoWeeksAgo, oneWeekAgo); + long numDocs2 = randomIntBetween(32, 2048); + indexDocs(logger, indexName, numDocs2, 
oneWeekAgo, now); + client().admin().cluster().prepareHealth(indexName).setWaitForYellowStatus().get(); + + String scrollJobId = "stop-restart-scroll"; + Job.Builder scrollJob = createScheduledJob(scrollJobId); + registerJob(scrollJob); + putJob(scrollJob); + openJob(scrollJobId); + assertBusy(() -> assertEquals(getJobStats(scrollJobId).get(0).getState(), JobState.OPENED)); + + DatafeedConfig datafeedConfig = createDatafeedBuilder(scrollJobId+ "-datafeed", scrollJobId, Collections.singletonList(indexName)) + .setChunkingConfig(ChunkingConfig.newManual(new TimeValue(1, TimeUnit.SECONDS))) + .build(); + registerDatafeed(datafeedConfig); + putDatafeed(datafeedConfig); + startDatafeed(datafeedConfig.getId(), 0L, null); + + // Wait until we have processed data + assertBusy(() -> assertThat(getDataCounts(scrollJobId).getProcessedRecordCount(), greaterThan(0L))); + stopDatafeed(datafeedConfig.getId()); + assertBusy(() -> assertThat(getJobStats(scrollJobId).get(0).getState(), is(oneOf(JobState.CLOSED, JobState.OPENED)))); + // If we are not OPENED, then we are closed and shouldn't restart as the datafeed finished running through the data + if (getJobStats(scrollJobId).get(0).getState().equals(JobState.OPENED)) { + updateDatafeed(new DatafeedUpdate.Builder().setId(datafeedConfig.getId()).setChunkingConfig(ChunkingConfig.newAuto()).build()); + startDatafeed( + datafeedConfig.getId(), + randomLongBetween(0, getDataCounts(scrollJobId).getLatestRecordTimeStamp().getTime()), + now + ); + waitUntilJobIsClosed(scrollJobId); + } + + assertBusy(() -> { + DataCounts dataCounts = getDataCounts(scrollJobId); + assertThat(dataCounts.getProcessedRecordCount(), equalTo(numDocs + numDocs2)); + assertThat(dataCounts.getOutOfOrderTimeStampCount(), equalTo(0L)); + }, 60, TimeUnit.SECONDS); + + String compositeJobId = "stop-restart-composite"; + Job.Builder compositeJob = createScheduledJob(compositeJobId); + compositeJob.setAnalysisConfig( + new AnalysisConfig.Builder(compositeJob.getAnalysisConfig()).setSummaryCountFieldName("doc_count") + ); + registerJob(compositeJob); + putJob(compositeJob); + openJob(compositeJobId); + assertBusy(() -> assertEquals(getJobStats(compositeJobId).get(0).getState(), JobState.OPENED)); + + AggregatorFactories.Builder aggs = new AggregatorFactories.Builder(); + aggs.addAggregator( + AggregationBuilders.composite( + "buckets", + Collections.singletonList( + new DateHistogramValuesSourceBuilder("timebucket") + .fixedInterval(new DateHistogramInterval("1h")) + .field("time") + ) + ).subAggregation(AggregationBuilders.max("time").field("time")) + ); + DatafeedConfig compositeDatafeedConfig = createDatafeedBuilder( + compositeJobId + "-datafeed", + compositeJobId, + Collections.singletonList(indexName)) + .setParsedAggregations(aggs) + .setFrequency(TimeValue.timeValueHours(1)) + .setChunkingConfig(ChunkingConfig.newManual(TimeValue.timeValueHours(1))) + .build(); + registerDatafeed(compositeDatafeedConfig); + putDatafeed(compositeDatafeedConfig); + startDatafeed(compositeDatafeedConfig.getId(), 0L, null); + + // Wait until we have processed data + assertBusy(() -> assertThat(getDataCounts(compositeJobId).getProcessedRecordCount(), greaterThan(0L))); + stopDatafeed(compositeDatafeedConfig.getId()); + assertBusy(() -> + assertThat(getJobStats(compositeJobId).get(0).getState(), is(oneOf(JobState.CLOSED, JobState.OPENED))) + ); + // If we are not OPENED, then we are closed and shouldn't restart as the datafeed finished running through the data + if 
(getJobStats(compositeJobId).get(0).getState().equals(JobState.OPENED)) { + updateDatafeed(new DatafeedUpdate.Builder() + .setId(compositeDatafeedConfig.getId()) + .setChunkingConfig(ChunkingConfig.newAuto()) + .build()); + startDatafeed( + compositeDatafeedConfig.getId(), + randomLongBetween(0, getDataCounts(compositeJobId).getLatestRecordTimeStamp().getTime()), + now + ); + waitUntilJobIsClosed(compositeJobId); + } + + List scrollBuckets = getBuckets(scrollJobId); + List compositeBuckets = getBuckets(compositeJobId); + for (int i = 0; i < scrollBuckets.size(); i++) { + Bucket scrollBucket = scrollBuckets.get(i); + Bucket compositeBucket = compositeBuckets.get(i); + assertThat( + "composite bucket [" + compositeBucket.getTimestamp() +"] [" + compositeBucket.getEventCount() +"] does not equal" + + "scroll bucket [" + scrollBucket.getTimestamp() + "] [" + scrollBucket.getEventCount() + "]", + compositeBucket.getEventCount(), + equalTo(scrollBucket.getEventCount()) + ); + } + } + private void assertDatafeedStats(String datafeedId, DatafeedState state, String jobId, Matcher searchCountMatcher) { GetDatafeedsStatsAction.Request request = new GetDatafeedsStatsAction.Request(datafeedId); GetDatafeedsStatsAction.Response response = client().execute(GetDatafeedsStatsAction.INSTANCE, request).actionGet(); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java index 4ea7423acdfc4..ec9e82ae45cdf 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java @@ -186,7 +186,7 @@ private void normalStopDatafeed(Task task, StopDatafeedAction.Request request, A } } - request.setNodes(executorNodes.toArray(new String[executorNodes.size()])); + request.setNodes(executorNodes.toArray(new String[0])); // wait for started and stopping datafeeds // Map datafeedId -> datafeed task Id. 
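The `toArray(new String[0])` cleanup in the final hunk of this patch swaps a
pre-sized array for a zero-length one. As a minimal, self-contained sketch of
why that idiom is generally preferred (the node IDs below are hypothetical,
standing in for the `executorNodes` set in the patch):

[source,java]
----
import java.util.Set;

public class ToArrayIdiom {
    public static void main(String[] args) {
        // Hypothetical executor node IDs.
        Set<String> executorNodes = Set.of("node-1", "node-2");

        // Passing a zero-length array lets the collection allocate a result
        // array of exactly the right size internally; on modern JVMs this is
        // at least as fast as pre-sizing, and it avoids pairing a separate
        // size() call with the copy.
        String[] nodes = executorNodes.toArray(new String[0]);

        System.out.println(String.join(",", nodes));
    }
}
----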
From b0d6ba86b6aed29df287ecc0d4cac0e53b18a3b2 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Wed, 17 Mar 2021 12:35:08 -0400 Subject: [PATCH 08/20] unifying agg to json processor methods --- .../AbstractAggregationDataExtractor.java | 5 ++- .../AggregationToJsonProcessor.java | 42 ++----------------- .../CompositeAggregationDataExtractor.java | 2 +- .../AggregationToJsonProcessorTests.java | 2 +- 4 files changed, 9 insertions(+), 42 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java index b25c814a4a954..7fb08066e9539 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java @@ -54,7 +54,7 @@ abstract class AbstractAggregationDataExtractor isCancelled, outputStream); return new ByteArrayInputStream(outputStream.toByteArray()); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java index d607090fa7979..5b9fb1d0f999f 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessor.java @@ -417,48 +417,14 @@ private void queueDocToWrite(Map doc, long docCount) { } } - /** - * Write the aggregated documents one bucket at a time until {@code batchSize} - * key-value pairs have been written. Buckets are written in their entirety and - * the check on {@code batchSize} run after the bucket has been written so more - * than {@code batchSize} key-value pairs could be written. - * The function should be called repeatedly until it returns false, at that point - * there are no more documents to write. - * - * @param batchSize The number of key-value pairs to write. - * @return True if there are any more documents to write after the call. - * False if there are no documents to write. - * @throws IOException If an error occurs serialising the JSON - */ - boolean writeDocs(int batchSize, OutputStream outputStream) throws IOException { - - if (docsByBucketTimestamp.isEmpty()) { - return false; - } - - try (XContentBuilder jsonBuilder = new XContentBuilder(JsonXContent.jsonXContent, outputStream)) { - long previousWrittenCount = keyValueWrittenCount; - Iterator>>> iterator = docsByBucketTimestamp.entrySet().iterator(); - while (iterator.hasNext()) { - Map.Entry>> entry = iterator.next(); - for (Map map : entry.getValue()) { - writeJsonObject(jsonBuilder, map); - } - iterator.remove(); - - if (keyValueWrittenCount - previousWrittenCount >= batchSize) { - break; - } - } - } - - return docsByBucketTimestamp.isEmpty() == false; - } - /** * This writes ALL the documents stored within the processor object unless indicated otherwise by the `shouldCancel` predicate * * This returns `true` if it is safe to cancel the overall process as the current `date_histogram` bucket has finished. + * + * For a simple `date_histogram` this is guaranteed. 
But, for a `composite` agg, it is possible that the current page is in the + * middle of a bucket. If you are writing with `composite` aggs, don't cancel the processing until this method returns true. + * * @param shouldCancel determines if a given timestamp indicates that the processing stream should be cancelled * @param outputStream where to write the aggregated data * @return true if it is acceptable for the caller to close the process and cancel the stream diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java index 8fc0e208b0a7a..4f333a24c082a 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java @@ -51,8 +51,8 @@ class CompositeAggregationDataExtractor implements DataExtractor { private final DatafeedTimingStatsReporter timingStatsReporter; private final AggregatedSearchRequestBuilder requestBuilder; private final long interval; + private volatile boolean isCancelled; private boolean hasNext; - private boolean isCancelled; CompositeAggregationDataExtractor( CompositeAggregationBuilder compositeAggregationBuilder, diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessorTests.java index 0f2b672d0fe13..7e2284f684bf4 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationToJsonProcessorTests.java @@ -617,7 +617,7 @@ private String aggToString(Set fields, Aggregations aggregations) throws compositeAggValueSource ); processor.process(aggregations); - processor.writeDocs(10000, outputStream); + processor.writeAllDocsCancellable(_timestamp -> false, outputStream); keyValuePairsWritten = processor.getKeyValueCount(); return outputStream.toString(StandardCharsets.UTF_8.name()); } From 767bf044705c7028b7d36cc20ec7db129beaca5b Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Wed, 17 Mar 2021 14:43:10 -0400 Subject: [PATCH 09/20] removing unused batching parameter --- .../AbstractAggregationDataExtractor.java | 9 ---- .../AggregationDataExtractorTests.java | 47 ++++--------------- 2 files changed, 9 insertions(+), 47 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java index 7fb08066e9539..5047d6deaee51 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java @@ -41,15 +41,6 @@ abstract class AbstractAggregationDataExtractor histogramBuckets = new ArrayList<>(buckets); - long timestamp = 1000; - for 
(int i = 0; i < buckets; i++) { - histogramBuckets.add(createHistogramBucket(timestamp, 3, Arrays.asList(createMax("time", timestamp), - createTerms("airline", new Term("c", 4, "responsetime", 31.0), new Term("b", 3, "responsetime", 32.0))))); - timestamp += 1000L; - } - - TestDataExtractor extractor = new TestDataExtractor(1000L, timestamp + 1); - - SearchResponse response = createSearchResponse("time", histogramBuckets); - extractor.setNextResponse(response); - - assertThat(extractor.hasNext(), is(true)); - assertThat(countMatches('{', asString(extractor.next().get())), equalTo(250L)); - assertThat(extractor.hasNext(), is(true)); - assertThat(countMatches('{', asString(extractor.next().get())), equalTo(250L)); - assertThat(extractor.hasNext(), is(true)); - assertThat(countMatches('{', asString(extractor.next().get())), equalTo(250L)); - assertThat(extractor.hasNext(), is(true)); - assertThat(countMatches('{', asString(extractor.next().get())), equalTo(250L)); - assertThat(extractor.hasNext(), is(true)); - assertThat(countMatches('{', asString(extractor.next().get())), equalTo(200L)); - assertThat(extractor.hasNext(), is(false)); - - assertThat(capturedSearchRequests.size(), equalTo(1)); - } - public void testExtractionGivenResponseHasNullAggs() throws IOException { TestDataExtractor extractor = new TestDataExtractor(1000L, 2000L); @@ -246,13 +214,16 @@ public void testExtractionGivenCancelHalfWay() throws IOException { extractor.setNextResponse(response); assertThat(extractor.hasNext(), is(true)); - assertThat(countMatches('{', asString(extractor.next().get())), equalTo(250L)); - assertThat(extractor.hasNext(), is(true)); - assertThat(countMatches('{', asString(extractor.next().get())), equalTo(250L)); - assertThat(extractor.hasNext(), is(true)); - + assertThat(countMatches('{', asString(extractor.next().get())), equalTo(2400L)); + histogramBuckets = new ArrayList<>(buckets); + for (int i = 0; i < buckets; i++) { + histogramBuckets.add(createHistogramBucket(timestamp, 3, Arrays.asList(createMax("time", timestamp), + createTerms("airline", new Term("c", 4, "responsetime", 31.0), new Term("b", 3, "responsetime", 32.0))))); + timestamp += 1000L; + } + response = createSearchResponse("time", histogramBuckets); + extractor.setNextResponse(response); extractor.cancel(); - assertThat(extractor.hasNext(), is(false)); assertThat(extractor.isCancelled(), is(true)); From 4640b5637df9f9cd42ced557fd543c49d069ab01 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Wed, 17 Mar 2021 16:56:00 -0400 Subject: [PATCH 10/20] fixing composite agg cancelling logic --- .../CompositeAggregationDataExtractor.java | 19 +- .../aggregation/AggregationTestUtils.java | 9 + ...ompositeAggregationDataExtractorTests.java | 397 ++++++++++++++++++ 3 files changed, 419 insertions(+), 6 deletions(-) create mode 100644 x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorTests.java diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java index 4f333a24c082a..e5e27c6283614 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java +++ 
b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java @@ -20,6 +20,7 @@ import org.elasticsearch.xpack.core.ClientHelper; import org.elasticsearch.xpack.core.ml.datafeed.extractor.DataExtractor; import org.elasticsearch.xpack.core.ml.datafeed.extractor.ExtractorUtils; +import org.elasticsearch.xpack.core.ml.utils.Intervals; import org.elasticsearch.xpack.ml.datafeed.DatafeedTimingStatsReporter; import java.io.ByteArrayInputStream; @@ -52,6 +53,7 @@ class CompositeAggregationDataExtractor implements DataExtractor { private final AggregatedSearchRequestBuilder requestBuilder; private final long interval; private volatile boolean isCancelled; + private volatile long nextBucketOnCancel; private boolean hasNext; CompositeAggregationDataExtractor( @@ -144,8 +146,6 @@ private Aggregations search() { if (compositeAgg == null || compositeAgg.getBuckets().isEmpty()) { return null; } - afterKey = compositeAgg.afterKey(); - return aggregations; } @@ -169,7 +169,6 @@ private InputStream processAggs(Aggregations aggs) throws IOException { aggregationToJsonProcessor.process(aggs); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final boolean hasAfterKey = afterKey != null && (afterKey.get(context.compositeAggDateHistogramGroupSourceName) instanceof Long); - final Long nextBucket = hasAfterKey ? (Long)afterKey.get(context.compositeAggDateHistogramGroupSourceName) + interval : null; boolean cancellable = aggregationToJsonProcessor.writeAllDocsCancellable( timestamp -> { if (isCancelled) { @@ -178,9 +177,13 @@ private InputStream processAggs(Aggregations aggs) throws IOException { if (hasAfterKey == false) { return true; } - // If the current timestamp is part of the next bucket given our current afterKey, the process can be cancelled - // as it has passed into the next date_histogram bucket in the composite aggregation paging - return timestamp >= nextBucket; + if (nextBucketOnCancel == 0L) { + // If we have been cancelled, record the bucket above our latest timestamp + // This indicates when we have completed the current bucket of this timestamp and thus will move to the next + // date_histogram bucket + nextBucketOnCancel = Intervals.alignToCeil(timestamp, interval); + } + return timestamp >= nextBucketOnCancel; } return false; }, outputStream); @@ -188,6 +191,10 @@ private InputStream processAggs(Aggregations aggs) throws IOException { if (isCancelled && cancellable) { hasNext = false; } + // Only set the after key once we have processed the search, allows us to cancel on the first page + CompositeAggregation compositeAgg = aggs.get(compositeAggregationBuilder.getName()); + afterKey = compositeAgg.afterKey(); + return new ByteArrayInputStream(outputStream.toByteArray()); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationTestUtils.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationTestUtils.java index 5e49232ee8d1d..b809ac19cae18 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationTestUtils.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AggregationTestUtils.java @@ -15,6 +15,7 @@ import org.elasticsearch.search.aggregations.bucket.histogram.Histogram; import org.elasticsearch.search.aggregations.bucket.terms.StringTerms; import 
org.elasticsearch.search.aggregations.bucket.terms.Terms; +import org.elasticsearch.search.aggregations.metrics.Avg; import org.elasticsearch.search.aggregations.metrics.GeoCentroid; import org.elasticsearch.search.aggregations.metrics.NumericMetricsAggregation; import org.elasticsearch.search.aggregations.metrics.Max; @@ -102,6 +103,14 @@ static Max createMax(String name, double value) { return max; } + static Avg createAvg(String name, double value) { + Avg avg = mock(Avg.class); + when(avg.getName()).thenReturn(name); + when(avg.value()).thenReturn(value); + when(avg.getValue()).thenReturn(value); + return avg; + } + static GeoCentroid createGeoCentroid(String name, long count, double lat, double lon) { GeoCentroid centroid = mock(GeoCentroid.class); when(centroid.count()).thenReturn(count); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorTests.java new file mode 100644 index 0000000000000..d53048db54977 --- /dev/null +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractorTests.java @@ -0,0 +1,397 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ +package org.elasticsearch.xpack.ml.datafeed.extractor.aggregation; + +import org.elasticsearch.action.ActionRequestBuilder; +import org.elasticsearch.action.search.SearchAction; +import org.elasticsearch.action.search.SearchPhaseExecutionException; +import org.elasticsearch.action.search.SearchRequest; +import org.elasticsearch.action.search.SearchRequestBuilder; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.action.search.ShardSearchFailure; +import org.elasticsearch.client.Client; +import org.elasticsearch.common.collect.MapBuilder; +import org.elasticsearch.common.collect.Tuple; +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.index.query.QueryBuilder; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.rest.RestStatus; +import org.elasticsearch.search.aggregations.AggregationBuilders; +import org.elasticsearch.search.aggregations.Aggregations; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregation; +import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.composite.DateHistogramValuesSourceBuilder; +import org.elasticsearch.search.aggregations.bucket.composite.TermsValuesSourceBuilder; +import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.core.ml.datafeed.DatafeedTimingStats; +import org.elasticsearch.xpack.ml.datafeed.DatafeedTimingStatsReporter; +import org.elasticsearch.xpack.ml.datafeed.DatafeedTimingStatsReporter.DatafeedTimingStatsPersister; +import org.junit.Before; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import 
java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregationTestUtils.createAvg; +import static org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregationTestUtils.createCompositeBucket; +import static org.elasticsearch.xpack.ml.datafeed.extractor.aggregation.AggregationTestUtils.createMax; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.stringContainsInOrder; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class CompositeAggregationDataExtractorTests extends ESTestCase { + + private Client testClient; + private List capturedSearchRequests; + private String jobId; + private String timeField; + private Set fields; + private List indices; + private QueryBuilder query; + private DatafeedTimingStatsReporter timingStatsReporter; + private CompositeAggregationBuilder compositeAggregationBuilder; + private AggregatedSearchRequestBuilder aggregatedSearchRequestBuilder; + private Map runtimeMappings; + + private class TestDataExtractor extends CompositeAggregationDataExtractor { + + private SearchResponse nextResponse; + private SearchPhaseExecutionException ex; + + TestDataExtractor(long start, long end) { + super(compositeAggregationBuilder, testClient, createContext(start, end), timingStatsReporter, aggregatedSearchRequestBuilder); + } + + @Override + protected SearchResponse executeSearchRequest(ActionRequestBuilder searchRequestBuilder) { + capturedSearchRequests.add(searchRequestBuilder.request()); + if (ex != null) { + throw ex; + } + return nextResponse; + } + + void setNextResponse(SearchResponse searchResponse) { + nextResponse = searchResponse; + } + + void setNextResponseToError(SearchPhaseExecutionException ex) { + this.ex = ex; + } + } + + @Before + public void setUpTests() { + testClient = mock(Client.class); + capturedSearchRequests = new ArrayList<>(); + jobId = "test-job"; + timeField = "time"; + fields = new HashSet<>(); + fields.addAll(Arrays.asList("time", "airline", "responsetime")); + indices = Arrays.asList("index-1", "index-2"); + query = QueryBuilders.matchAllQuery(); + compositeAggregationBuilder = AggregationBuilders.composite( + "buckets", + Arrays.asList( + new DateHistogramValuesSourceBuilder("time_bucket") + .field("time") + .fixedInterval(new DateHistogramInterval("1000ms")), + new TermsValuesSourceBuilder("airline").field("airline"))) + .size(10) + .subAggregation(AggregationBuilders.max("time").field("time")) + .subAggregation(AggregationBuilders.avg("responsetime").field("responsetime")); + runtimeMappings = Collections.emptyMap(); + timingStatsReporter = new DatafeedTimingStatsReporter(new DatafeedTimingStats(jobId), mock(DatafeedTimingStatsPersister.class)); + aggregatedSearchRequestBuilder = (searchSourceBuilder) -> new SearchRequestBuilder(testClient, SearchAction.INSTANCE) + .setSource(searchSourceBuilder) + .setAllowPartialSearchResults(false) + .setIndices(indices.toArray(String[]::new)); + } + + public void testExtraction() throws IOException { + List compositeBucket = Arrays.asList( + createCompositeBucket( + 1000L, + "time_bucket", + 1, + Arrays.asList(createMax("time", 1999), createAvg("responsetime", 11.0)), + Collections.singletonList(Tuple.tuple("airline", "a")) + ), + createCompositeBucket( 
+ 1000L, + "time_bucket", + 2, + Arrays.asList(createMax("time", 1999), createAvg("responsetime", 12.0)), + Collections.singletonList(Tuple.tuple("airline", "b")) + ), + createCompositeBucket( + 2000L, + "time_bucket", + 0, + Collections.emptyList(), + Collections.emptyList() + ), + createCompositeBucket( + 3000L, + "time_bucket", + 4, + Arrays.asList(createMax("time", 3999), createAvg("responsetime", 31.0)), + Collections.singletonList(Tuple.tuple("airline", "c")) + ), + createCompositeBucket( + 3000L, + "time_bucket", + 3, + Arrays.asList(createMax("time", 3999), createAvg("responsetime", 32.0)), + Collections.singletonList(Tuple.tuple("airline", "b")) + )); + + TestDataExtractor extractor = new TestDataExtractor(1000L, 4000L); + + SearchResponse response = createSearchResponse("buckets", + compositeBucket, + MapBuilder.newMapBuilder() + .put("time_bucket", 4000L) + .put("airline", "d") + .map() + ); + extractor.setNextResponse(response); + + assertThat(extractor.hasNext(), is(true)); + Optional stream = extractor.next(); + assertThat(stream.isPresent(), is(true)); + String expectedStream = "{\"airline\":\"a\",\"time\":1999,\"responsetime\":11.0,\"doc_count\":1} " + + "{\"airline\":\"b\",\"time\":1999,\"responsetime\":12.0,\"doc_count\":2} " + + "{\"airline\":\"c\",\"time\":3999,\"responsetime\":31.0,\"doc_count\":4} " + + "{\"airline\":\"b\",\"time\":3999,\"responsetime\":32.0,\"doc_count\":3}"; + assertThat(asString(stream.get()), equalTo(expectedStream)); + assertThat(capturedSearchRequests.size(), equalTo(1)); + + String searchRequest = capturedSearchRequests.get(0).toString().replaceAll("\\s", ""); + assertThat(searchRequest, containsString("\"size\":0")); + assertThat(searchRequest, containsString("\"query\":{\"bool\":{\"filter\":[{\"match_all\":{\"boost\":1.0}}," + + "{\"range\":{\"time\":{\"from\":1000,\"to\":4000,\"include_lower\":true,\"include_upper\":false," + + "\"format\":\"epoch_millis\",\"boost\":1.0}}}]")); + assertThat(searchRequest, + stringContainsInOrder(Arrays.asList("aggregations", "composite", "time", "terms", "airline", "avg", "responsetime"))); + } + + public void testExtractionGivenResponseHasNullAggs() throws IOException { + TestDataExtractor extractor = new TestDataExtractor(1000L, 2000L); + + SearchResponse response = createSearchResponse(null); + extractor.setNextResponse(response); + + assertThat(extractor.hasNext(), is(true)); + assertThat(extractor.next().isPresent(), is(false)); + assertThat(extractor.hasNext(), is(false)); + + assertThat(capturedSearchRequests.size(), equalTo(1)); + } + + public void testExtractionGivenResponseHasEmptyAggs() throws IOException { + TestDataExtractor extractor = new TestDataExtractor(1000L, 2000L); + Aggregations emptyAggs = AggregationTestUtils.createAggs(Collections.emptyList()); + SearchResponse response = createSearchResponse(emptyAggs); + extractor.setNextResponse(response); + + assertThat(extractor.hasNext(), is(true)); + assertThat(extractor.next().isPresent(), is(false)); + assertThat(extractor.hasNext(), is(false)); + + assertThat(capturedSearchRequests.size(), equalTo(1)); + } + + public void testExtractionGivenCancelBeforeNext() { + TestDataExtractor extractor = new TestDataExtractor(1000L, 4000L); + SearchResponse response = createSearchResponse("time", Collections.emptyList(), Collections.emptyMap()); + extractor.setNextResponse(response); + + extractor.cancel(); + // Composite aggs should be true because we need to make sure the first search has occurred or not + assertThat(extractor.hasNext(), is(true)); + 
} + + public void testExtractionCancelOnFirstPage() throws IOException { + int numBuckets = 10; + List buckets = new ArrayList<>(numBuckets); + long timestamp = 1000; + for (int i = 0; i < numBuckets; i++) { + buckets.add( + createCompositeBucket( + timestamp, + "time_bucket", + 3, + Arrays.asList(createMax("time", randomLongBetween(timestamp, timestamp + 1000)), createAvg("responsetime", 32.0)), + Collections.singletonList(Tuple.tuple("airline", "c")) + ) + ); + } + + TestDataExtractor extractor = new TestDataExtractor(1000L, timestamp + 1000 + 1); + + SearchResponse response = createSearchResponse( + "buckets", + buckets, + MapBuilder.newMapBuilder() + .put("time_bucket", 1000L) + .put("airline", "d") + .map() + ); + extractor.setNextResponse(response); + extractor.cancel(); + // We should have next right now as we have not yet determined if we have handled a page or not + assertThat(extractor.hasNext(), is(true)); + // Should be empty + assertThat(countMatches('{', asString(extractor.next().get())), equalTo(0L)); + // Determined that we were on the first page and ended + assertThat(extractor.hasNext(), is(false)); + } + + public void testExtractionGivenCancelHalfWay() throws IOException { + int numBuckets = 10; + List buckets = new ArrayList<>(numBuckets); + long timestamp = 1000; + for (int i = 0; i < numBuckets; i++) { + buckets.add( + createCompositeBucket( + timestamp, + "time_bucket", + 3, + Arrays.asList(createMax("time", randomLongBetween(timestamp, timestamp + 1000)), createAvg("responsetime", 32.0)), + Collections.singletonList(Tuple.tuple("airline", "c")) + ) + ); + } + + TestDataExtractor extractor = new TestDataExtractor(1000L, timestamp + 1000 + 1); + + SearchResponse response = createSearchResponse( + "buckets", + buckets, + MapBuilder.newMapBuilder() + .put("time_bucket", 1000L) + .put("airline", "d") + .map() + ); + extractor.setNextResponse(response); + + assertThat(extractor.hasNext(), is(true)); + assertThat(countMatches('{', asString(extractor.next().get())), equalTo(10L)); + buckets = new ArrayList<>(numBuckets); + for (int i = 0; i < 6; i++) { + buckets.add( + createCompositeBucket( + timestamp, + "time_bucket", + 3, + Arrays.asList(createMax("time", randomLongBetween(timestamp, timestamp + 1000)), createAvg("responsetime", 32.0)), + Collections.singletonList(Tuple.tuple("airline", "c")) + ) + ); + } + timestamp += 1000; + for (int i = 0; i < 4; i++) { + buckets.add( + createCompositeBucket( + timestamp, + "time_bucket", + 3, + Arrays.asList(createMax("time", randomLongBetween(timestamp, timestamp + 1000)), + createAvg("responsetime", 32.0)), + Collections.singletonList(Tuple.tuple("airline", "c")) + ) + ); + } + response = createSearchResponse("buckets", + buckets, + MapBuilder.newMapBuilder() + .put("time_bucket", 3000L) + .put("airline", "a") + .map()); + extractor.setNextResponse(response); + extractor.cancel(); + assertThat(extractor.hasNext(), is(true)); + assertThat(extractor.isCancelled(), is(true)); + // Only the docs in the previous bucket before cancelling + assertThat(countMatches('{', asString(extractor.next().get())), equalTo(6L)); + + // Once we have handled the 6 remaining in that time bucket, we shouldn't finish the page and the extractor should end + assertThat(extractor.hasNext(), is(false)); + assertThat(capturedSearchRequests.size(), equalTo(2)); + } + + public void testExtractionGivenSearchResponseHasError() { + TestDataExtractor extractor = new TestDataExtractor(1000L, 2000L); + extractor.setNextResponseToError(new 
SearchPhaseExecutionException("phase 1", "boom", ShardSearchFailure.EMPTY_ARRAY)); + + assertThat(extractor.hasNext(), is(true)); + expectThrows(SearchPhaseExecutionException.class, extractor::next); + } + + private CompositeAggregationDataExtractorContext createContext(long start, long end) { + return new CompositeAggregationDataExtractorContext( + jobId, + timeField, + fields, + indices, + query, + compositeAggregationBuilder, + "time_bucket", + start, + end, + true, + Collections.emptyMap(), + SearchRequest.DEFAULT_INDICES_OPTIONS, + runtimeMappings); + } + + @SuppressWarnings("unchecked") + private SearchResponse createSearchResponse(String aggName, List buckets, Map afterKey) { + CompositeAggregation compositeAggregation = mock(CompositeAggregation.class); + when(compositeAggregation.getName()).thenReturn(aggName); + when(compositeAggregation.afterKey()).thenReturn(afterKey); + when((List)compositeAggregation.getBuckets()).thenReturn(buckets); + + Aggregations searchAggs = AggregationTestUtils.createAggs(Collections.singletonList(compositeAggregation)); + return createSearchResponse(searchAggs); + } + + private SearchResponse createSearchResponse(Aggregations aggregations) { + SearchResponse searchResponse = mock(SearchResponse.class); + when(searchResponse.status()).thenReturn(RestStatus.OK); + when(searchResponse.getScrollId()).thenReturn(randomAlphaOfLength(1000)); + when(searchResponse.getAggregations()).thenReturn(aggregations); + when(searchResponse.getTook()).thenReturn(TimeValue.timeValueMillis(randomNonNegativeLong())); + return searchResponse; + } + + private static String asString(InputStream inputStream) throws IOException { + try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { + return reader.lines().collect(Collectors.joining("\n")); + } + } + + private static long countMatches(char c, String text) { + return text.chars().filter(current -> current == c).count(); + } +} From df2c7d3735eaa7ec6e826e13e4e16e8832b87776 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Thu, 18 Mar 2021 11:58:46 -0400 Subject: [PATCH 11/20] refreshing index on datafeed stop and adding logging --- .../xpack/ml/integration/DatafeedJobsIT.java | 49 ++++++++++++++++--- .../action/TransportStopDatafeedAction.java | 40 ++++++++++++++- .../xpack/ml/datafeed/DatafeedJob.java | 5 +- .../AbstractAggregationDataExtractor.java | 4 +- .../CompositeAggregationDataExtractor.java | 14 ++++++ 5 files changed, 100 insertions(+), 12 deletions(-) diff --git a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java index 947263c4ac842..9c90f3e8ccdd3 100644 --- a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java +++ b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java @@ -6,20 +6,26 @@ */ package org.elasticsearch.xpack.ml.integration; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ResourceNotFoundException; import org.elasticsearch.Version; import org.elasticsearch.action.admin.cluster.node.hotthreads.NodeHotThreads; import org.elasticsearch.action.admin.cluster.node.hotthreads.NodesHotThreadsResponse; +import 
org.elasticsearch.action.search.SearchRequest; import org.elasticsearch.action.support.master.AcknowledgedResponse; import org.elasticsearch.common.CheckedRunnable; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.util.concurrent.ConcurrentCollections; import org.elasticsearch.common.util.concurrent.ConcurrentMapLong; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.search.SearchHits; import org.elasticsearch.search.aggregations.AggregationBuilders; import org.elasticsearch.search.aggregations.AggregatorFactories; import org.elasticsearch.search.aggregations.bucket.composite.DateHistogramValuesSourceBuilder; import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval; -import org.elasticsearch.xpack.core.ml.MlTasks; +import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.xpack.core.ml.action.DeleteDatafeedAction; import org.elasticsearch.xpack.core.ml.action.GetDatafeedsStatsAction; import org.elasticsearch.xpack.core.ml.action.GetJobsStatsAction; @@ -37,7 +43,6 @@ import org.elasticsearch.xpack.core.ml.job.config.JobState; import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.DataCounts; import org.elasticsearch.xpack.core.ml.job.results.Bucket; -import org.hamcrest.CoreMatchers; import org.hamcrest.Matcher; import org.junit.After; @@ -66,6 +71,12 @@ public class DatafeedJobsIT extends MlNativeAutodetectIntegTestCase { @After public void cleanup() { + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings(Settings.builder() + .putNull("logger.org.elasticsearch.xpack.ml.datafeed") + .build()).get(); cleanUp(); } @@ -295,6 +306,12 @@ public void testDatafeedTimingStats_QueryDelayUpdated_TimingStatsNotReset() thro } public void testStopAndRestartCompositeDatafeed() throws Exception { + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings(Settings.builder() + .put("logger.org.elasticsearch.xpack.ml.datafeed", "TRACE") + .build()).get(); String indexName = "stop-restart-data"; client().admin().indices().prepareCreate("stop-restart-data") .setMapping("time", "type=date") @@ -401,12 +418,28 @@ public void testStopAndRestartCompositeDatafeed() throws Exception { for (int i = 0; i < scrollBuckets.size(); i++) { Bucket scrollBucket = scrollBuckets.get(i); Bucket compositeBucket = compositeBuckets.get(i); - assertThat( - "composite bucket [" + compositeBucket.getTimestamp() +"] [" + compositeBucket.getEventCount() +"] does not equal" - + "scroll bucket [" + scrollBucket.getTimestamp() + "] [" + scrollBucket.getEventCount() + "]", - compositeBucket.getEventCount(), - equalTo(scrollBucket.getEventCount()) - ); + try { + assertThat( + "composite bucket [" + compositeBucket.getTimestamp() + "] [" + compositeBucket.getEventCount() + "] does not equal" + + " scroll bucket [" + scrollBucket.getTimestamp() + "] [" + scrollBucket.getEventCount() + "]", + compositeBucket.getEventCount(), + equalTo(scrollBucket.getEventCount()) + ); + } catch (AssertionError ae) { + String originalMessage = ae.getMessage(); + try { + SearchSourceBuilder builder = new SearchSourceBuilder().query(QueryBuilders.rangeQuery("time") + .gte(scrollBucket.getTimestamp().getTime()) + .lte(scrollBucket.getTimestamp().getTime() + TimeValue.timeValueHours(1).getMillis())) + .size(10_000); + SearchHits hits = client().search(new SearchRequest() + .indices(indexName) + 
.source(builder)).actionGet().getHits(); + fail("Hits: " + Strings.arrayToDelimitedString(hits.getHits(), "\n") + " \n failure: " + originalMessage); + } catch (ElasticsearchException ee) { + fail("could not search indices for better info. Original failure: " + originalMessage); + } + } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java index c8e54f45c02c3..d41a05b9e58bd 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java @@ -8,6 +8,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ResourceNotFoundException; import org.elasticsearch.action.ActionListener; @@ -16,6 +17,8 @@ import org.elasticsearch.action.TaskOperationFailure; import org.elasticsearch.action.support.ActionFilters; import org.elasticsearch.action.support.tasks.TransportTasksAction; +import org.elasticsearch.client.Client; +import org.elasticsearch.client.OriginSettingClient; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.cluster.service.ClusterService; @@ -34,6 +37,7 @@ import org.elasticsearch.xpack.core.ml.action.StopDatafeedAction; import org.elasticsearch.xpack.core.ml.datafeed.DatafeedState; import org.elasticsearch.xpack.core.ml.job.messages.Messages; +import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex; import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper; import org.elasticsearch.xpack.ml.MachineLearning; import org.elasticsearch.xpack.ml.datafeed.persistence.DatafeedConfigProvider; @@ -49,6 +53,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.elasticsearch.xpack.core.ClientHelper.ML_ORIGIN; + public class TransportStopDatafeedAction extends TransportTasksAction { @@ -58,17 +64,20 @@ public class TransportStopDatafeedAction extends TransportTasksAction startedDatafeeds, List stoppingDatafeeds) { final Set executorNodes = new HashSet<>(); + final List startedDatafeedsJobs = new ArrayList<>(); for (String datafeedId : startedDatafeeds) { PersistentTasksCustomMetadata.PersistentTask datafeedTask = MlTasks.getDatafeedTask(datafeedId, tasks); if (datafeedTask == null) { @@ -171,6 +181,7 @@ private void normalStopDatafeed(Task task, StopDatafeedAction.Request request, A assert datafeedTask != null : msg; logger.error(msg); } else if (PersistentTasksClusterService.needsReassignment(datafeedTask.getAssignment(), nodes) == false) { + startedDatafeedsJobs.add(((StartDatafeedAction.DatafeedParams) datafeedTask.getParams()).getJobId()); executorNodes.add(datafeedTask.getExecutorNode()); } else { // This is the easy case - the datafeed is not currently assigned to a valid node, @@ -196,7 +207,32 @@ private void normalStopDatafeed(Task task, StopDatafeedAction.Request request, A .collect(Collectors.toList()); ActionListener finalListener = ActionListener.wrap( - r -> waitForDatafeedStopped(allDataFeedsToWaitFor, request, r, listener), + r -> waitForDatafeedStopped(allDataFeedsToWaitFor, request, r, ActionListener.wrap( + finished -> { + if (startedDatafeedsJobs.isEmpty()) { + 
listener.onResponse(finished); + return; + } + client.admin().indices().prepareRefresh(startedDatafeedsJobs + .stream() + .map(AnomalyDetectorsIndex::jobResultsAliasedName) + .toArray(String[]::new)) + .execute(ActionListener.wrap( + _unused -> listener.onResponse(finished), + ex -> { + logger.info( + () -> new ParameterizedMessage( + "failed to refresh job [{}] results indices when stopping datafeeds [{}]", + startedDatafeedsJobs, + startedDatafeeds + ), + ex); + listener.onResponse(finished); + } + )); + }, + listener::onFailure + )), e -> { if (ExceptionsHelper.unwrapCause(e) instanceof FailedNodeException) { // A node has dropped out of the cluster since we started executing the requests. diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/DatafeedJob.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/DatafeedJob.java index 5fe9ba565327e..9e06afb7ea84f 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/DatafeedJob.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/DatafeedJob.java @@ -345,7 +345,10 @@ private void run(long start, long end, FlushJobAction.Request flushRequest) thro DataCounts counts; try (InputStream in = extractedData.get()) { counts = postData(in, XContentType.JSON); - LOGGER.trace("[{}] Processed another {} records", jobId, counts.getProcessedRecordCount()); + LOGGER.trace(() -> new ParameterizedMessage("[{}] Processed another {} records with latest timestamp [{}]", + jobId, + counts.getProcessedRecordCount(), + counts.getLatestRecordTimeStamp())); timingStatsReporter.reportDataCounts(counts); } catch (Exception e) { if (e instanceof InterruptedException) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java index 5047d6deaee51..36907a2b096c0 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/AbstractAggregationDataExtractor.java @@ -159,7 +159,9 @@ private InputStream processNextBatch() throws IOException { outputStream.reset(); // We can cancel immediately as we process whole date_histogram buckets at a time - hasNext = aggregationToJsonProcessor.writeAllDocsCancellable(_timestamp -> isCancelled, outputStream); + aggregationToJsonProcessor.writeAllDocsCancellable(_timestamp -> isCancelled, outputStream); + // We process the whole search. 
So, if we are chunking or not, we have nothing more to process given the current query + hasNext = false; return new ByteArrayInputStream(outputStream.toByteArray()); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java index e5e27c6283614..52b6d404a1469 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/extractor/aggregation/CompositeAggregationDataExtractor.java @@ -182,6 +182,12 @@ private InputStream processAggs(Aggregations aggs) throws IOException { // This indicates when we have completed the current bucket of this timestamp and thus will move to the next // date_histogram bucket nextBucketOnCancel = Intervals.alignToCeil(timestamp, interval); + LOGGER.debug(() -> new ParameterizedMessage( + "[{}] set future timestamp cancel to [{}] via timestamp [{}]", + context.jobId, + nextBucketOnCancel, + timestamp + )); } return timestamp >= nextBucketOnCancel; } @@ -189,6 +195,14 @@ private InputStream processAggs(Aggregations aggs) throws IOException { }, outputStream); // If the process is canceled and cancelable, then we can indicate that there are no more buckets to process. if (isCancelled && cancellable) { + LOGGER.debug( + () -> new ParameterizedMessage( + "[{}] cancelled before bucket [{}] on date_histogram page [{}]", + context.jobId, + nextBucketOnCancel, + hasAfterKey ? afterKey.get(context.compositeAggDateHistogramGroupSourceName) : "__null__" + ) + ); hasNext = false; } // Only set the after key once we have processed the search, allows us to cancel on the first page From 055460e37c7c1b78a66ef92c7445bda8e3a911a6 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Tue, 23 Mar 2021 15:11:57 -0400 Subject: [PATCH 12/20] marking composite aggs as experimental --- .../ml-configuring-aggregations.asciidoc | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc index 3214d6454997d..649c37d8201d6 100644 --- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc +++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc @@ -11,29 +11,28 @@ distributes these calculations across your cluster. You can then feed this aggregated data into the {ml-features} instead of raw results, which reduces the volume of data that must be considered while detecting anomalies. -TIP: If you use a terms aggregation and the cardinality of a term is high, -use composite aggregations instead. +TIP: If you use a terms aggregation and the cardinality of a term is high but +still significantly less than your total number of documents, use composite +aggregations +experimental:[support for composite aggregations inside datafeeds is currently experimental]. [discrete] [[aggs-limits-dfeeds]] == Requirements and limitations -There are some limitations to using aggregations in {dfeeds}. Your aggregation -must include a `date_histogram` aggregation or a top level `composite` aggregation, +There are some limitations to using aggregations in {dfeeds}. 
+ +Your aggregation must include a `date_histogram` aggregation or a top level `composite` aggregation, which in turn must contain a `max` aggregation on the time field. This requirement ensures that the aggregated data is a time series and the timestamp of each bucket is the time of the last record in the bucket. -For `composite` aggregation support, there must be exactly one `date_histogram` value -source. That value source must not be sorted in descending order. Additional -`composite` aggregation value sources are allowed, such as `terms`. - IMPORTANT: The name of the aggregation and the name of the field that the agg operates on need to match, otherwise the aggregation doesn't work. For example, if you use a `max` aggregation on a time field called `responsetime`, the name of the aggregation must be also `responsetime`. -You must also consider the interval of the `date_histogram` or `composite` +You must consider the interval of the `date_histogram` or `composite` aggregation carefully. The bucket span of your {anomaly-job} must be divisible by the value of the `calendar_interval` or `fixed_interval` in your aggregation (with no remainder). If you specify a `frequency` for your {dfeed}, @@ -61,10 +60,12 @@ To avoid this behavior, make sure that the aggregation interval in the {dfeed} configuration and the bucket span in the {anomaly-job} configuration have the same values. +Your {dfeed} can contain multiple aggregations, but only the ones with names +that match values in the job configuration are fed to the job. [discrete] -[[aggs-include-jobs]] -== Including aggregations in {anomaly-jobs} +[[aggs-using-date-histogram]] +=== Including aggregations in {anomaly-jobs} When you create or update an {anomaly-job}, you can include the names of aggregations, for example: @@ -149,14 +150,21 @@ detector field information, consider using a `composite` aggregation. It perform better than a `date_histogram` with a nested `term` aggregation and also includes all the values of the field instead of the top values per bucket. -NOTE: `composite` aggregations may not be as performant as a scrolling or +[discrete] +[[aggs-using-composite]] +=== Using composite aggregations in {anomaly-jobs} + +experimental::[] + +For `composite` aggregation support, there must be exactly one `date_histogram` value +source. That value source must not be sorted in descending order. Additional +`composite` aggregation value sources are allowed, such as `terms`. + +NOTE: `composite` aggregations may not be as performant as a scrolling or `date_histogram` oriented {dfeed}. `composite` aggregations are optimized for queries that are either `match_all` or `range` filters. Other types of queries may cause the `composite` aggregation to be ineffecient. -Your {dfeed} can contain multiple aggregations, but only the ones with names -that match values in the job configuration are fed to the job. - Here is an example utilizing a `composite` aggregation instead of a `date_histogram`. @@ -255,6 +263,9 @@ job analysis config. `derivative` pipeline aggregation to find the first order derivative of the counter `system.network.out.bytes` for each value of the field `beat.name`. +NOTE: `derivative` or other pipeline aggregations may not work within `composite` +aggregations. + [source,js] ---------------------------------- "aggregations": { @@ -441,12 +452,10 @@ the time of the latest record within a bucket. You can optionally specify a terms aggregation, which creates buckets for different values of a field. 
-TIP: Instead of nesting a `term` aggregation, use `composite` aggs when you -have many terms and a simple range query. - IMPORTANT: If you use a terms aggregation, by default it returns buckets for the top ten terms. Thus if the cardinality of the term is greater than 10, not -all terms are analyzed. +all terms are analyzed. In this case, consider using `composite` aggs +experimental:[composite agg support in datafeeds is currently experimental] You can change this behavior by setting the `size` parameter. To determine the cardinality of your data, you can run searches such as: From 0ff5f47293d8c6a31424c4dcc24ab6e9cef1abaa Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Wed, 24 Mar 2021 08:41:43 -0400 Subject: [PATCH 13/20] Apply suggestions from code review Co-authored-by: Lisa Cawley --- .../ml-configuring-aggregations.asciidoc | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc index 649c37d8201d6..83a49a0c86ccf 100644 --- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc +++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc @@ -12,9 +12,9 @@ aggregated data into the {ml-features} instead of raw results, which reduces the volume of data that must be considered while detecting anomalies. TIP: If you use a terms aggregation and the cardinality of a term is high but -still significantly less than your total number of documents, use composite -aggregations -experimental:[support for composite aggregations inside datafeeds is currently experimental]. +still significantly less than your total number of documents, use <> +experimental:[Support for composite aggregations inside datafeeds is currently experimental]. [discrete] [[aggs-limits-dfeeds]] @@ -27,7 +27,7 @@ which in turn must contain a `max` aggregation on the time field. This requirement ensures that the aggregated data is a time series and the timestamp of each bucket is the time of the last record in the bucket. -IMPORTANT: The name of the aggregation and the name of the field that the agg +IMPORTANT: The name of the aggregation and the name of the field that it operates on need to match, otherwise the aggregation doesn't work. For example, if you use a `max` aggregation on a time field called `responsetime`, the name of the aggregation must be also `responsetime`. @@ -38,7 +38,7 @@ by the value of the `calendar_interval` or `fixed_interval` in your aggregation (with no remainder). If you specify a `frequency` for your {dfeed}, it must also be divisible by this interval. {anomaly-jobs-cap} cannot use `date_histogram` or `composite` aggs with an interval measured in months -because the length of the month is not fixed. {dfeeds-cap} tolerate weeks or smaller units. +because the length of the month is not fixed; they can use weeks or smaller units. TIP: As a rule of thumb, if your detectors use <> or <> analytical functions, set the `date_histogram` or `composite` @@ -146,7 +146,7 @@ PUT _ml/datafeeds/datafeed-farequote `responsetime`. TIP: If you are utilizing a `term` aggregation to gather influencer or partition -detector field information, consider using a `composite` aggregation. It performs +field information, consider using a `composite` aggregation. 
It performs better than a `date_histogram` with a nested `term` aggregation and also includes all the values of the field instead of the top values per bucket. @@ -160,12 +160,12 @@ For `composite` aggregation support, there must be exactly one `date_histogram` source. That value source must not be sorted in descending order. Additional `composite` aggregation value sources are allowed, such as `terms`. -NOTE: `composite` aggregations may not be as performant as a scrolling or -`date_histogram` oriented {dfeed}. `composite` aggregations are optimized +NOTE: A {dfeed} that uses composite aggregations may not be as performant as datafeeds that use scrolling or +date histogram aggregations. Composite aggregations are optimized for queries that are either `match_all` or `range` filters. Other types of queries may cause the `composite` aggregation to be ineffecient. -Here is an example utilizing a `composite` aggregation instead of a +Here is an example that uses a `composite` aggregation instead of a `date_histogram`. Assuming the same job configuration as above. @@ -243,7 +243,7 @@ PUT _ml/datafeeds/datafeed-farequote-composite // TEST[skip:setup:farequote_job] <1> Provide the `size` to the composite agg to control how many resources -are used when aggregated the data. A larger `size` means a faster datafeed but +are used when aggregating the data. A larger `size` means a faster datafeed but more cluster resources are utilized when searching. <2> The required `date_histogram` composite aggregation source. Make sure it is named differently than your desired time field. @@ -264,7 +264,7 @@ job analysis config. counter `system.network.out.bytes` for each value of the field `beat.name`. NOTE: `derivative` or other pipeline aggregations may not work within `composite` -aggregations. +aggregations. See <> [source,js] ---------------------------------- @@ -430,8 +430,8 @@ When using a `composite` aggregation: ---------------------------------- // NOTCONSOLE -The top level aggregation must be either a -{ref}/search-aggregations-bucket.html[bucket aggregation] containing as single +The top level aggregation must be one of the following: +* A {ref}/search-aggregations-bucket.html[bucket aggregation] containing a single sub-aggregation that is a `date_histogram`, the top level aggregation is the required `date_histogram`, or the top level aggregation is the required `composite`. There must be exactly one `date_histogram` or `composite` aggregation. For more information, see @@ -455,7 +455,7 @@ different values of a field. IMPORTANT: If you use a terms aggregation, by default it returns buckets for the top ten terms. Thus if the cardinality of the term is greater than 10, not all terms are analyzed. In this case, consider using `composite` aggs -experimental:[composite agg support in datafeeds is currently experimental] +experimental:[Support for composite aggregations inside datafeeds is currently experimental]. You can change this behavior by setting the `size` parameter. 
To determine the cardinality of your data, you can run searches such as: From a2489986d505ff7f3a21de34b10c70a42a347eb0 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Wed, 24 Mar 2021 08:58:17 -0400 Subject: [PATCH 14/20] fixing anchor --- .../aggregations/bucket/composite-aggregation.asciidoc | 1 + .../anomaly-detection/ml-configuring-aggregations.asciidoc | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/reference/aggregations/bucket/composite-aggregation.asciidoc b/docs/reference/aggregations/bucket/composite-aggregation.asciidoc index 87d3fcb379312..851a58443f0f7 100644 --- a/docs/reference/aggregations/bucket/composite-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/composite-aggregation.asciidoc @@ -848,6 +848,7 @@ GET /_search -------------------------------------------------- // TESTRESPONSE[s/\.\.\.//] +[[_pipeline_aggregations]] ==== Pipeline aggregations The composite agg is not currently compatible with pipeline aggregations, nor does it make sense in most cases. diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc index 83a49a0c86ccf..ba0428efab507 100644 --- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc +++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc @@ -12,8 +12,8 @@ aggregated data into the {ml-features} instead of raw results, which reduces the volume of data that must be considered while detecting anomalies. TIP: If you use a terms aggregation and the cardinality of a term is high but -still significantly less than your total number of documents, use <> +still significantly less than your total number of documents, +use <> experimental:[Support for composite aggregations inside datafeeds is currently experimental]. [discrete] @@ -264,7 +264,7 @@ job analysis config. counter `system.network.out.bytes` for each value of the field `beat.name`. NOTE: `derivative` or other pipeline aggregations may not work within `composite` -aggregations. See <> +aggregations. See <> [source,js] ---------------------------------- From 5152978260251426f389da0a78ebd3599f94aee3 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Wed, 24 Mar 2021 09:07:54 -0400 Subject: [PATCH 15/20] fixing anchor again --- .../aggregations/bucket/composite-aggregation.asciidoc | 2 +- .../ml/anomaly-detection/ml-configuring-aggregations.asciidoc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/reference/aggregations/bucket/composite-aggregation.asciidoc b/docs/reference/aggregations/bucket/composite-aggregation.asciidoc index 851a58443f0f7..16a5250ab0269 100644 --- a/docs/reference/aggregations/bucket/composite-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/composite-aggregation.asciidoc @@ -848,7 +848,7 @@ GET /_search -------------------------------------------------- // TESTRESPONSE[s/\.\.\.//] -[[_pipeline_aggregations]] +[[search-aggregations-bucket-composite-aggregation-pipeline-aggregations]] ==== Pipeline aggregations The composite agg is not currently compatible with pipeline aggregations, nor does it make sense in most cases. 
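As background for the section this anchor points at: pipeline aggregations such as `derivative` consume one ordered series of sibling buckets, which a `date_histogram` returns in a single response and a paged `composite` aggregation does not. Below is a minimal sketch of the supported pattern, built with the Elasticsearch Java aggregation builders; the field names `time` and `responsetime`, the five-minute interval, and the class name are illustrative assumptions, not values taken from this PR.

[source,java]
----------------------------------
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.PipelineAggregatorBuilders;
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;

public class PipelineUnderHistogramSketch {

    // A derivative can walk date_histogram buckets because they arrive as one
    // fixed, ordered series. Composite buckets are returned page by page via
    // after-keys, so a pipeline aggregation has no complete sibling sequence
    // to operate on there.
    static DateHistogramAggregationBuilder histogramWithDerivative() {
        return AggregationBuilders.dateHistogram("buckets")
            .field("time")
            .fixedInterval(DateHistogramInterval.minutes(5))
            .subAggregation(AggregationBuilders.avg("responsetime").field("responsetime"))
            .subAggregation(PipelineAggregatorBuilders.derivative("responsetime_deriv", "responsetime"));
    }
}
----------------------------------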
diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc index ba0428efab507..b9ecb1f234f4d 100644 --- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc +++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc @@ -264,7 +264,7 @@ job analysis config. counter `system.network.out.bytes` for each value of the field `beat.name`. NOTE: `derivative` or other pipeline aggregations may not work within `composite` -aggregations. See <> +aggregations. See <> [source,js] ---------------------------------- From 1878a19729ab63d8c88f47a58f6d6621f832b996 Mon Sep 17 00:00:00 2001 From: lcawl Date: Wed, 24 Mar 2021 14:55:23 -0700 Subject: [PATCH 16/20] [DOCS] Fixes broken links --- .../ml-configuring-aggregations.asciidoc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc index b9ecb1f234f4d..311048e091eb0 100644 --- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc +++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc @@ -12,8 +12,8 @@ aggregated data into the {ml-features} instead of raw results, which reduces the volume of data that must be considered while detecting anomalies. TIP: If you use a terms aggregation and the cardinality of a term is high but -still significantly less than your total number of documents, -use <> +still significantly less than your total number of documents, use +{ref}/search-aggregations-bucket-composite-aggregation.html[composite aggregations] experimental:[Support for composite aggregations inside datafeeds is currently experimental]. [discrete] @@ -37,7 +37,7 @@ aggregation carefully. The bucket span of your {anomaly-job} must be divisible by the value of the `calendar_interval` or `fixed_interval` in your aggregation (with no remainder). If you specify a `frequency` for your {dfeed}, it must also be divisible by this interval. {anomaly-jobs-cap} cannot use -`date_histogram` or `composite` aggs with an interval measured in months +`date_histogram` or `composite` aggregations with an interval measured in months because the length of the month is not fixed; they can use weeks or smaller units. TIP: As a rule of thumb, if your detectors use <> or @@ -145,7 +145,7 @@ PUT _ml/datafeeds/datafeed-farequote <3> The `avg` aggregation is named `responsetime` and its field is also named `responsetime`. -TIP: If you are utilizing a `term` aggregation to gather influencer or partition +TIP: If you are using a `term` aggregation to gather influencer or partition field information, consider using a `composite` aggregation. It performs better than a `date_histogram` with a nested `term` aggregation and also includes all the values of the field instead of the top values per bucket. @@ -190,7 +190,7 @@ PUT _ml/anomaly_detectors/farequote-composite ---------------------------------- // TEST[skip:setup:farequote_data] -This is an example of a datafeed utilizing a `composite` aggregation to bucket +This is an example of a datafeed that uses a `composite` aggregation to bucket the metrics based on time and terms: [source,console] @@ -244,10 +244,10 @@ PUT _ml/datafeeds/datafeed-farequote-composite <1> Provide the `size` to the composite agg to control how many resources are used when aggregating the data. 
A larger `size` means a faster datafeed but
-more cluster resources are utilized when searching.
+more cluster resources are used when searching.
 <2> The required `date_histogram` composite aggregation source. Make sure it is
 named differently than your desired time field.
-<3> Instead of utilizing a regular `term` aggregation, adding a composite
+<3> Instead of using a regular `term` aggregation, adding a composite
 aggregation `term` source with the name `airline` works. Note its name is the
 same as the field.
 <4> The required `max` aggregation whose name is the time field in the
 job analysis config.
@@ -264,7 +264,8 @@ job analysis config.
 counter `system.network.out.bytes` for each value of the field `beat.name`.
 
 NOTE: `derivative` or other pipeline aggregations may not work within `composite`
-aggregations. See <>
+aggregations. See
+{ref}/search-aggregations-bucket-composite-aggregation-pipeline-aggregations.html [composite aggregations and pipeline aggregations].
 
 [source,js]
 ----------------------------------
@@ -454,7 +455,7 @@ different values of a field.
 
 IMPORTANT: If you use a terms aggregation, by default it returns buckets for
 the top ten terms. Thus if the cardinality of the term is greater than 10, not
-all terms are analyzed. In this case, consider using `composite` aggs
+all terms are analyzed. In this case, consider using `composite` aggregations
 experimental:[Support for composite aggregations inside datafeeds is currently experimental].
 
 You can change this behavior by setting the `size` parameter. To

From cc6e470dbf6886ccc40f585d685054e986d053d6 Mon Sep 17 00:00:00 2001
From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com>
Date: Fri, 26 Mar 2021 08:04:20 -0400
Subject: [PATCH 17/20] fixing docs

---
 .../ml-configuring-aggregations.asciidoc      | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc
index 311048e091eb0..6402e21c71f87 100644
--- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc
+++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc
@@ -431,11 +431,13 @@ When using a `composite` aggregation:
 ----------------------------------
 // NOTCONSOLE
 
-The top level aggregation must be one of the following:
+The top level aggregation must be exclusively one of the following:
 * A {ref}/search-aggregations-bucket.html[bucket aggregation] containing a single
-sub-aggregation that is a `date_histogram`, the top level aggregation is the
-required `date_histogram`, or the top level aggregation is the required `composite`.
-There must be exactly one `date_histogram` or `composite` aggregation. For more information, see
+sub-aggregation that is a `date_histogram`
+* A top level aggregation that is a `date_histogram`
+* A top level aggregation that is a `composite` aggregation.
+
+There must be exactly one `date_histogram` or `composite` aggregation. For more information, see
 {ref}/search-aggregations-bucket-datehistogram-aggregation.html[Date histogram aggregation] and
 {ref}/search-aggregations-bucket-composite-aggregation.html[Composite aggregation].
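The bullet list in the hunk above admits three shapes for the top level aggregation of a {dfeed}. A rough sketch of the last two shapes, using the Elasticsearch Java aggregation builders; the names `buckets`, `time_bucket`, and `airline`, the `time` field, the size of 1000, and the five-minute interval are illustrative assumptions, not values taken from this PR.

[source,java]
----------------------------------
import java.util.List;

import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.composite.DateHistogramValuesSourceBuilder;
import org.elasticsearch.search.aggregations.bucket.composite.TermsValuesSourceBuilder;
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;

public class TopLevelAggShapesSketch {

    // A top level date_histogram, carrying the required max aggregation
    // named after the time field it aggregates.
    static DateHistogramAggregationBuilder topLevelDateHistogram() {
        return AggregationBuilders.dateHistogram("buckets")
            .field("time")
            .fixedInterval(DateHistogramInterval.minutes(5))
            .subAggregation(AggregationBuilders.max("time").field("time"));
    }

    // A top level composite aggregation with exactly one date_histogram value
    // source (left in its default ascending order) plus an optional terms
    // source, again carrying the max aggregation on the time field.
    static CompositeAggregationBuilder topLevelComposite() {
        return AggregationBuilders.composite(
            "buckets",
            List.of(
                new DateHistogramValuesSourceBuilder("time_bucket")
                    .field("time")
                    .fixedInterval(DateHistogramInterval.minutes(5)),
                new TermsValuesSourceBuilder("airline").field("airline")
            )
        ).size(1000).subAggregation(AggregationBuilders.max("time").field("time"));
    }
}
----------------------------------

Note that in both shapes the `max` sub-aggregation is named after the time field, matching the naming rule the documentation describes, and the `date_histogram` value source is deliberately named differently from the time field.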
From 825d4a79195bf1495f2b2c20f568734884f4d79a Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Fri, 26 Mar 2021 09:14:45 -0400 Subject: [PATCH 18/20] Update docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: István Zoltán Szabó --- .../ml/anomaly-detection/ml-configuring-aggregations.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc index 6402e21c71f87..6d260351a8fcd 100644 --- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc +++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc @@ -265,7 +265,7 @@ counter `system.network.out.bytes` for each value of the field `beat.name`. NOTE: `derivative` or other pipeline aggregations may not work within `composite` aggregations. See -{ref}/search-aggregations-bucket-composite-aggregation-pipeline-aggregations.html [composite aggregations and pipeline aggregations]. +{ref}/search-aggregations-bucket-composite-aggregation-pipeline-aggregations.html[composite aggregations and pipeline aggregations]. [source,js] ---------------------------------- From 94b68333eb6de7da4cc1bfddb7e8aa1aa3521581 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Fri, 26 Mar 2021 10:07:49 -0400 Subject: [PATCH 19/20] Update docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: István Zoltán Szabó --- .../ml/anomaly-detection/ml-configuring-aggregations.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc index 6d260351a8fcd..155b97d0f36a5 100644 --- a/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc +++ b/docs/reference/ml/anomaly-detection/ml-configuring-aggregations.asciidoc @@ -265,7 +265,7 @@ counter `system.network.out.bytes` for each value of the field `beat.name`. NOTE: `derivative` or other pipeline aggregations may not work within `composite` aggregations. See -{ref}/search-aggregations-bucket-composite-aggregation-pipeline-aggregations.html[composite aggregations and pipeline aggregations]. +{ref}/search-aggregations-bucket-composite-aggregation.html#search-aggregations-bucket-composite-aggregation-pipeline-aggregations[composite aggregations and pipeline aggregations]. 
[source,js] ---------------------------------- From 47f3d40c35ae625a17856cd170d1aefbe4c7967f Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Tue, 30 Mar 2021 07:29:26 -0400 Subject: [PATCH 20/20] addressing PR comment --- .../xpack/core/ml/datafeed/DatafeedConfig.java | 3 +-- .../core/ml/datafeed/extractor/ExtractorUtils.java | 12 +++++------- .../xpack/ml/integration/DatafeedJobsIT.java | 2 ++ .../xpack/ml/action/TransportStopDatafeedAction.java | 2 +- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java index 760a29047b9e0..51647375cf42f 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/DatafeedConfig.java @@ -921,13 +921,12 @@ private static void checkNoMoreHistogramAggregations(Collection) histogramAggregation).field(); } if (histogramAggregation instanceof CompositeAggregationBuilder) { DateHistogramValuesSourceBuilder valueSource = ExtractorUtils.getDateHistogramValuesSource( (CompositeAggregationBuilder) histogramAggregation ); - assert valueSource != null : "unexpected null date_histogram composite agg source"; timeField = valueSource.field(); } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/extractor/ExtractorUtils.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/extractor/ExtractorUtils.java index 90e88c09824fe..5e8a049a3d91b 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/extractor/ExtractorUtils.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/datafeed/extractor/ExtractorUtils.java @@ -86,7 +86,9 @@ public static boolean isHistogram(AggregationBuilder aggregationBuilder) { public static boolean isCompositeWithDateHistogramSource(AggregationBuilder aggregationBuilder) { return aggregationBuilder instanceof CompositeAggregationBuilder - && getDateHistogramValuesSource((CompositeAggregationBuilder) aggregationBuilder) != null; + && ((CompositeAggregationBuilder) aggregationBuilder).sources() + .stream() + .anyMatch(DateHistogramValuesSourceBuilder.class::isInstance); } public static DateHistogramValuesSourceBuilder getDateHistogramValuesSource(CompositeAggregationBuilder compositeAggregationBuilder) { @@ -95,7 +97,7 @@ public static DateHistogramValuesSourceBuilder getDateHistogramValuesSource(Comp return (DateHistogramValuesSourceBuilder)valuesSourceBuilder; } } - return null; + throw ExceptionsHelper.badRequestException("[composite] aggregations require exactly one [date_histogram] value source"); } /** @@ -195,11 +197,7 @@ static DateHistogramAggOrValueSource fromAgg(DateHistogramAggregationBuilder agg } static DateHistogramAggOrValueSource fromCompositeAgg(CompositeAggregationBuilder compositeAggregationBuilder) { - DateHistogramValuesSourceBuilder valuesSourceBuilder = getDateHistogramValuesSource(compositeAggregationBuilder); - assert valuesSourceBuilder != null : "unexpected missing date histogram value source for composite agg [" - + compositeAggregationBuilder.getName() - + "]"; - return new DateHistogramAggOrValueSource(null, valuesSourceBuilder); + return new DateHistogramAggOrValueSource(null, getDateHistogramValuesSource(compositeAggregationBuilder)); } 
private final DateHistogramAggregationBuilder agg; diff --git a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java index 9c90f3e8ccdd3..697b4daf65d9b 100644 --- a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java +++ b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DatafeedJobsIT.java @@ -387,6 +387,7 @@ public void testStopAndRestartCompositeDatafeed() throws Exception { Collections.singletonList(indexName)) .setParsedAggregations(aggs) .setFrequency(TimeValue.timeValueHours(1)) + // Start off chunking at an hour so that it runs more slowly and the test has time to stop it in the middle of processing .setChunkingConfig(ChunkingConfig.newManual(TimeValue.timeValueHours(1))) .build(); registerDatafeed(compositeDatafeedConfig); @@ -403,6 +404,7 @@ public void testStopAndRestartCompositeDatafeed() throws Exception { if (getJobStats(compositeJobId).get(0).getState().equals(JobState.OPENED)) { updateDatafeed(new DatafeedUpdate.Builder() .setId(compositeDatafeedConfig.getId()) + // Set to auto to speed up and finish the job .setChunkingConfig(ChunkingConfig.newAuto()) .build()); startDatafeed( diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java index d41a05b9e58bd..620a690ff307b 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStopDatafeedAction.java @@ -220,7 +220,7 @@ private void normalStopDatafeed(Task task, StopDatafeedAction.Request request, A .execute(ActionListener.wrap( _unused -> listener.onResponse(finished), ex -> { - logger.info( + logger.warn( () -> new ParameterizedMessage( "failed to refresh job [{}] results indices when stopping datafeeds [{}]", startedDatafeedsJobs,
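As a closing illustration of the `ExtractorUtils` change in the last patch: a rough sketch, assuming the public helpers exactly as they appear in the diff above; the `airline` terms source and the class name are illustrative assumptions.

[source,java]
----------------------------------
import java.util.List;

import org.elasticsearch.ElasticsearchStatusException;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.composite.TermsValuesSourceBuilder;
import org.elasticsearch.xpack.core.ml.datafeed.extractor.ExtractorUtils;

public class CompositeSourceValidationSketch {
    public static void main(String[] args) {
        // A composite aggregation with only a terms value source: valid for a
        // plain search, but unusable by a datafeed, which needs exactly one
        // date_histogram value source to anchor the time series.
        CompositeAggregationBuilder termsOnly = AggregationBuilders.composite(
            "buckets",
            List.of(new TermsValuesSourceBuilder("airline").field("airline"))
        );

        // The stream-based check finds no date_histogram value source.
        System.out.println(ExtractorUtils.isCompositeWithDateHistogramSource(termsOnly)); // false

        // After this patch the lookup fails fast with a 400 (bad request)
        // instead of returning null for callers to trip over.
        try {
            ExtractorUtils.getDateHistogramValuesSource(termsOnly);
        } catch (ElasticsearchStatusException e) {
            System.out.println(e.getMessage());
        }
    }
}
----------------------------------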