From 34146093df82ed1ad53a26bfaed8460cef5aaadd Mon Sep 17 00:00:00 2001 From: Rishabh Maurya Date: Wed, 14 Feb 2024 11:40:43 -0800 Subject: [PATCH 01/17] Cardinality aggregation dynamic pruning changes Signed-off-by: bowenlan-amzn --- .../metrics/CardinalityAggregator.java | 10 +- .../DisjunctionWithDynamicPruningScorer.java | 264 ++++++++++++++++++ .../DynamicPruningCollectorWrapper.java | 106 +++++++ .../metrics/CardinalityAggregatorTests.java | 58 ++++ 4 files changed, 435 insertions(+), 3 deletions(-) create mode 100644 server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java create mode 100644 server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java index 99c4eaac4b777..91887e2e4a202 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java @@ -53,6 +53,7 @@ import org.opensearch.search.aggregations.Aggregator; import org.opensearch.search.aggregations.InternalAggregation; import org.opensearch.search.aggregations.LeafBucketCollector; +import org.opensearch.search.aggregations.support.FieldContext; import org.opensearch.search.aggregations.support.ValuesSource; import org.opensearch.search.aggregations.support.ValuesSourceConfig; import org.opensearch.search.internal.SearchContext; @@ -71,6 +72,8 @@ public class CardinalityAggregator extends NumericMetricsAggregator.SingleValue private final int precision; private final ValuesSource valuesSource; + private final FieldContext fieldContext; + // Expensive to initialize, so we only initialize it when we have an actual value source @Nullable private HyperLogLogPlusPlus counts; @@ -95,6 +98,7 @@ public CardinalityAggregator( // TODO: Stop using nulls here this.valuesSource = valuesSourceConfig.hasValues() ? valuesSourceConfig.getValuesSource() : null; this.precision = precision; + this.fieldContext = valuesSourceConfig.fieldContext(); this.counts = valuesSource == null ? 
null : new HyperLogLogPlusPlus(precision, context.bigArrays(), 1); } @@ -132,11 +136,11 @@ private Collector pickCollector(LeafReaderContext ctx) throws IOException { // only use ordinals if they don't increase memory usage by more than 25% if (ordinalsMemoryUsage < countsMemoryUsage / 4) { ordinalsCollectorsUsed++; - return new OrdinalsCollector(counts, ordinalValues, context.bigArrays()); + return new DynamicPruningCollectorWrapper(new OrdinalsCollector(counts, ordinalValues, context.bigArrays()), + context, ctx, fieldContext, source); } ordinalsCollectorsOverheadTooHigh++; } - stringHashingCollectorsUsed++; return new DirectCollector(counts, MurmurHash3Values.hash(valuesSource.bytesValues(ctx))); } @@ -206,7 +210,7 @@ public void collectDebugInfo(BiConsumer add) { * * @opensearch.internal */ - private abstract static class Collector extends LeafBucketCollector implements Releasable { + abstract static class Collector extends LeafBucketCollector implements Releasable { public abstract void postCollect() throws IOException; diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java new file mode 100644 index 0000000000000..6a7e66e8be2f0 --- /dev/null +++ b/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java @@ -0,0 +1,264 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.search.aggregations.metrics; + +import org.apache.lucene.search.DisiPriorityQueue; +import org.apache.lucene.search.DisiWrapper; +import org.apache.lucene.search.DisjunctionDISIApproximation; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.PriorityQueue; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + + +/** + * Clone of {@link org.apache.lucene.search} {@code DisjunctionScorer.java} in lucene with following modifications - + * 1. {@link #removeAllDISIsOnCurrentDoc()} - it removes all the DISIs for subscorer pointing to current doc. This is + * helpful in dynamic pruning for Cardinality aggregation, where once a term is found, it becomes irrelevant for + * rest of the search space, so this term's subscorer DISI can be safely removed from list of subscorer to process. + *

+ * 2. {@link #removeAllDISIsOnCurrentDoc()} breaks the invariant of Conjunction DISI i.e. the docIDs of all sub-scorers should be
+ * less than or equal to current docID iterator is pointing to. When we remove elements from priority, it results in heapify action, which modifies
+ * the top of the priority queue, which represents the current docID for subscorers here. To address this, we are wrapping the
+ * iterator with {@link SlowDocIdPropagatorDISI} which keeps the iterator pointing to last docID before {@link #removeAllDISIsOnCurrentDoc()}
+ * is called and updates this docID only when next() or advance() is called.
+ */
+public class DisjunctionWithDynamicPruningScorer extends Scorer {
+
+    private final boolean needsScores;
+    private final DisiPriorityQueue subScorers;
+    private final DocIdSetIterator approximation;
+    private final TwoPhase twoPhase;
+
+    private Integer docID;
+
+    public DisjunctionWithDynamicPruningScorer(Weight weight, List subScorers)
+        throws IOException {
+        super(weight);
+        if (subScorers.size() <= 1) {
+            throw new IllegalArgumentException("There must be at least 2 subScorers");
+        }
+        this.subScorers = new DisiPriorityQueue(subScorers.size());
+        for (Scorer scorer : subScorers) {
+            final DisiWrapper w = new DisiWrapper(scorer);
+            this.subScorers.add(w);
+        }
+        this.needsScores = false;
+        this.approximation = new DisjunctionDISIApproximation(this.subScorers);
+
+        boolean hasApproximation = false;
+        float sumMatchCost = 0;
+        long sumApproxCost = 0;
+        // Compute matchCost as the average over the matchCost of the subScorers.
+        // This is weighted by the cost, which is an expected number of matching documents.
+        for (DisiWrapper w : this.subScorers) {
+            long costWeight = (w.cost <= 1) ? 1 : w.cost;
+            sumApproxCost += costWeight;
+            if (w.twoPhaseView != null) {
+                hasApproximation = true;
+                sumMatchCost += w.matchCost * costWeight;
+            }
+        }
+
+        if (hasApproximation == false) { // no sub scorer supports approximations
+            twoPhase = null;
+        } else {
+            final float matchCost = sumMatchCost / sumApproxCost;
+            twoPhase = new TwoPhase(approximation, matchCost);
+        }
+    }
+
+    public void removeAllDISIsOnCurrentDoc() {
+        docID = this.docID();
+        while (subScorers.size() > 0 && subScorers.top().doc == docID) {
+            subScorers.pop();
+        }
+    }
+
+    @Override
+    public DocIdSetIterator iterator() {
+        DocIdSetIterator disi = getIterator();
+        docID = disi.docID();
+        return new SlowDocIdPropagatorDISI(getIterator(), docID);
+    }
+
+    private static class SlowDocIdPropagatorDISI extends DocIdSetIterator {
+        DocIdSetIterator disi;
+
+        Integer curDocId;
+
+        SlowDocIdPropagatorDISI(DocIdSetIterator disi, Integer curDocId) {
+            this.disi = disi;
+            this.curDocId = curDocId;
+        }
+
+        @Override
+        public int docID() {
+            assert curDocId <= disi.docID();
+            return curDocId;
+        }
+
+        @Override
+        public int nextDoc() throws IOException {
+            return advance(curDocId + 1);
+        }
+
+        @Override
+        public int advance(int i) throws IOException {
+            if (i <= disi.docID()) {
+                // since we are slow propagating docIDs, it may happen the disi is already advanced to a higher docID than i
+                // in such scenarios we can simply return the docID where disi is pointing to and update the curDocId
+                curDocId = disi.docID();
+                return disi.docID();
+            }
+            curDocId = disi.advance(i);
+            return curDocId;
+        }
+
+        @Override
+        public long cost() {
+            return disi.cost();
+        }
+    }
+
+    private DocIdSetIterator getIterator() {
+        if (twoPhase != null) {
+            return TwoPhaseIterator.asDocIdSetIterator(twoPhase);
+        } else {
+            return approximation;
+        }
+    }
+
+
@Override + public TwoPhaseIterator twoPhaseIterator() { + return twoPhase; + } + + @Override + public float getMaxScore(int i) throws IOException { + return 0; + } + + private class TwoPhase extends TwoPhaseIterator { + + private final float matchCost; + // list of verified matches on the current doc + DisiWrapper verifiedMatches; + // priority queue of approximations on the current doc that have not been verified yet + final PriorityQueue unverifiedMatches; + + private TwoPhase(DocIdSetIterator approximation, float matchCost) { + super(approximation); + this.matchCost = matchCost; + unverifiedMatches = + new PriorityQueue(DisjunctionWithDynamicPruningScorer.this.subScorers.size()) { + @Override + protected boolean lessThan(DisiWrapper a, DisiWrapper b) { + return a.matchCost < b.matchCost; + } + }; + } + + DisiWrapper getSubMatches() throws IOException { + // iteration order does not matter + for (DisiWrapper w : unverifiedMatches) { + if (w.twoPhaseView.matches()) { + w.next = verifiedMatches; + verifiedMatches = w; + } + } + unverifiedMatches.clear(); + return verifiedMatches; + } + + @Override + public boolean matches() throws IOException { + verifiedMatches = null; + unverifiedMatches.clear(); + + for (DisiWrapper w = subScorers.topList(); w != null; ) { + DisiWrapper next = w.next; + + if (w.twoPhaseView == null) { + // implicitly verified, move it to verifiedMatches + w.next = verifiedMatches; + verifiedMatches = w; + + if (needsScores == false) { + // we can stop here + return true; + } + } else { + unverifiedMatches.add(w); + } + w = next; + } + + if (verifiedMatches != null) { + return true; + } + + // verify subs that have an two-phase iterator + // least-costly ones first + while (unverifiedMatches.size() > 0) { + DisiWrapper w = unverifiedMatches.pop(); + if (w.twoPhaseView.matches()) { + w.next = null; + verifiedMatches = w; + return true; + } + } + + return false; + } + + @Override + public float matchCost() { + return matchCost; + } + } + + + @Override + public final int docID() { + return subScorers.top().doc; + } + + DisiWrapper getSubMatches() throws IOException { + if (twoPhase == null) { + return subScorers.topList(); + } else { + return twoPhase.getSubMatches(); + } + } + + @Override + public final float score() throws IOException { + return score(getSubMatches()); + } + + protected float score(DisiWrapper topList) throws IOException { + return 1f; + } + + @Override + public final Collection getChildren() throws IOException { + ArrayList children = new ArrayList<>(); + for (DisiWrapper scorer = getSubMatches(); scorer != null; scorer = scorer.next) { + children.add(new ChildScorable(scorer.scorer, "SHOULD")); + } + return children; + } +} diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java new file mode 100644 index 0000000000000..f4c3d59a3833f --- /dev/null +++ b/server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java @@ -0,0 +1,106 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.search.aggregations.metrics; + +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.CollectionTerminatedException; +import org.apache.lucene.search.ConjunctionUtils; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.Bits; +import org.opensearch.search.aggregations.support.FieldContext; +import org.opensearch.search.aggregations.support.ValuesSource; +import org.opensearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +class DynamicPruningCollectorWrapper extends CardinalityAggregator.Collector { + + private final LeafReaderContext ctx; + private final DisjunctionWithDynamicPruningScorer disjunctionScorer; + private final DocIdSetIterator disi; + private final CardinalityAggregator.Collector delegateCollector; + + DynamicPruningCollectorWrapper(CardinalityAggregator.Collector delegateCollector, + SearchContext context, LeafReaderContext ctx, FieldContext fieldContext, + ValuesSource.Bytes.WithOrdinals source) throws IOException { + this.ctx = ctx; + this.delegateCollector = delegateCollector; + final SortedSetDocValues ordinalValues = source.ordinalsValues(ctx); + boolean isCardinalityLow = ordinalValues.getValueCount() < 10; + boolean isCardinalityAggregationOnlyAggregation = true; + boolean isFieldSupportedForDynamicPruning = true; + if (isCardinalityLow && isCardinalityAggregationOnlyAggregation && isFieldSupportedForDynamicPruning) { + // create disjunctions from terms + // this logic should be pluggable depending on the type of leaf bucket collector by CardinalityAggregator + TermsEnum terms = ordinalValues.termsEnum(); + Weight weight = context.searcher().createWeight(context.searcher().rewrite(context.query()), ScoreMode.COMPLETE_NO_SCORES, 1f); + Map found = new HashMap<>(); + List subScorers = new ArrayList<>(); + while (terms.next() != null && !found.containsKey(terms.ord())) { + // TODO can we get rid of terms previously encountered in other segments? 
+ TermQuery termQuery = new TermQuery(new Term(fieldContext.field(), terms.term())); + Weight subWeight = context.searcher().createWeight(termQuery, ScoreMode.COMPLETE_NO_SCORES, 1f); + Scorer scorer = subWeight.scorer(ctx); + if (scorer != null) { + subScorers.add(scorer); + } + found.put(terms.ord(), true); + } + disjunctionScorer = new DisjunctionWithDynamicPruningScorer(weight, subScorers); + disi = ConjunctionUtils.intersectScorers(List.of(disjunctionScorer, weight.scorer(ctx))); + } else { + disjunctionScorer = null; + disi = null; + } + } + + @Override + public void collect(int doc, long bucketOrd) throws IOException { + if (disi == null || disjunctionScorer == null) { + delegateCollector.collect(doc, bucketOrd); + } else { + // perform the full iteration using dynamic pruning of DISIs and return right away + disi.advance(doc); + int currDoc = disi.docID(); + assert currDoc == doc; + final Bits liveDocs = ctx.reader().getLiveDocs(); + assert liveDocs == null || liveDocs.get(currDoc); + do { + if (liveDocs == null || liveDocs.get(currDoc)) { + delegateCollector.collect(currDoc, bucketOrd); + disjunctionScorer.removeAllDISIsOnCurrentDoc(); + } + currDoc = disi.nextDoc(); + } while (currDoc != DocIdSetIterator.NO_MORE_DOCS); + throw new CollectionTerminatedException(); + } + } + + @Override + public void close() { + delegateCollector.close(); + } + + @Override + public void postCollect() throws IOException { + delegateCollector.postCollect(); + } +} diff --git a/server/src/test/java/org/opensearch/search/aggregations/metrics/CardinalityAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/metrics/CardinalityAggregatorTests.java index cdd17e2fa7dd6..a9966c9e70e76 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/metrics/CardinalityAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/metrics/CardinalityAggregatorTests.java @@ -33,15 +33,22 @@ package org.opensearch.search.aggregations.metrics; import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.document.Field; import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.KeywordField; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.index.Term; import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.util.BytesRef; import org.opensearch.common.CheckedConsumer; import org.opensearch.common.geo.GeoPoint; +import org.opensearch.index.mapper.KeywordFieldMapper; import org.opensearch.index.mapper.MappedFieldType; import org.opensearch.index.mapper.NumberFieldMapper; import org.opensearch.index.mapper.RangeFieldMapper; @@ -56,6 +63,7 @@ import java.util.Set; import java.util.function.Consumer; +import static java.util.Arrays.asList; import static java.util.Collections.singleton; public class CardinalityAggregatorTests extends AggregatorTestCase { @@ -90,6 +98,56 @@ public void testRangeFieldValues() throws IOException { }, fieldType); } + public void testDynamicPruningOrdinalCollector() throws IOException { + final String fieldName = "testField"; + final String filterFieldName = "filterField"; + + MappedFieldType fieldType = new 
KeywordFieldMapper.KeywordFieldType(fieldName); + final CardinalityAggregationBuilder aggregationBuilder = new CardinalityAggregationBuilder("_name").field(fieldName); + testAggregation(aggregationBuilder, new TermQuery(new Term(filterFieldName, "foo")), iw -> { + iw.addDocument(asList( + new KeywordField(fieldName, "1", Field.Store.NO), + new KeywordField(fieldName, "2", Field.Store.NO), + new KeywordField(filterFieldName, "foo", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("1")), + new SortedSetDocValuesField(fieldName, new BytesRef("2")) + )); + iw.addDocument(asList( + new KeywordField(fieldName, "2", Field.Store.NO), + new KeywordField(filterFieldName, "foo", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("2")) + )); + iw.addDocument(asList( + new KeywordField(fieldName, "1", Field.Store.NO), + new KeywordField(filterFieldName, "foo", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("1")) + )); + iw.addDocument(asList( + new KeywordField(fieldName, "2", Field.Store.NO), + new KeywordField(filterFieldName, "foo", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("2")) + )); + iw.addDocument(asList( + new KeywordField(fieldName, "3", Field.Store.NO), + new KeywordField(filterFieldName, "foo", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("3")) + )); + iw.addDocument(asList( + new KeywordField(fieldName, "4", Field.Store.NO), + new KeywordField(filterFieldName, "bar", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("4")) + )); + iw.addDocument(asList( + new KeywordField(fieldName, "5", Field.Store.NO), + new KeywordField(filterFieldName, "bar", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("5")) + )); + }, card -> { + assertEquals(3.0, card.getValue(), 0); + assertTrue(AggregationInspectionHelper.hasValue(card)); + }, fieldType); + } + public void testNoMatchingField() throws IOException { testAggregation(new MatchAllDocsQuery(), iw -> { iw.addDocument(singleton(new SortedNumericDocValuesField("wrong_number", 7))); From 0d6f15140924cfc818fec05f56b88cac67464a05 Mon Sep 17 00:00:00 2001 From: bowenlan-amzn Date: Thu, 16 May 2024 08:37:06 -0700 Subject: [PATCH 02/17] Reading Signed-off-by: bowenlan-amzn --- .../metrics/DisjunctionWithDynamicPruningScorer.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java index 6a7e66e8be2f0..75fa3d2eb6f93 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java +++ b/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java @@ -24,8 +24,9 @@ /** - * Clone of {@link org.apache.lucene.search} {@code DisjunctionScorer.java} in lucene with following modifications - - * 1. {@link #removeAllDISIsOnCurrentDoc()} - it removes all the DISIs for subscorer pointing to current doc. This is + * Clone of {@link org.apache.lucene.search} {@code DisjunctionScorer.java} in lucene with following modifications + *

+ * 1. {@link #removeAllDISIsOnCurrentDoc()} removes all the DISIs for subscorer pointing to current doc. This is * helpful in dynamic pruning for Cardinality aggregation, where once a term is found, it becomes irrelevant for * rest of the search space, so this term's subscorer DISI can be safely removed from list of subscorer to process. *

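To make the pruning idea concrete before the next patch, here is a minimal, self-contained Java sketch. It is an illustration, not the scorer above: the class name, the hardcoded postings lists, and the use of java.util.PriorityQueue in place of Lucene's DisiPriorityQueue are all assumptions, and the intersection with the main query as well as the two-phase machinery are omitted. Each term owns an iterator over its sorted doc IDs; as soon as a term surfaces at the top of the queue it is counted and its whole iterator is discarded, the plain-Java analogue of removeAllDISIsOnCurrentDoc().

    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.PriorityQueue;

    public class DisjunctionPruningSketch {

        // One sub-iterator per term; only its first (smallest) doc ID ever matters,
        // because the iterator is discarded as soon as the term has been counted.
        private static final class Sub {
            final String term;
            final int[] docs; // sorted doc IDs containing this term

            Sub(String term, int[] docs) {
                this.term = term;
                this.docs = docs;
            }

            int doc() {
                return docs[0];
            }
        }

        public static void main(String[] args) {
            // Hypothetical postings lists: term -> sorted doc IDs.
            Map<String, int[]> postings = new LinkedHashMap<>();
            postings.put("a", new int[] { 0, 2, 7 });
            postings.put("b", new int[] { 0, 1, 3 });
            postings.put("c", new int[] { 4, 9 });

            PriorityQueue<Sub> pq = new PriorityQueue<>((x, y) -> Integer.compare(x.doc(), y.doc()));
            for (Map.Entry<String, int[]> e : postings.entrySet()) {
                pq.add(new Sub(e.getKey(), e.getValue()));
            }

            int distinctTerms = 0;
            while (!pq.isEmpty()) {
                int doc = pq.peek().doc();
                // Count every term positioned on this doc exactly once, then drop its
                // iterator for good: the rest of its postings cannot change the count.
                while (!pq.isEmpty() && pq.peek().doc() == doc) {
                    Sub sub = pq.poll();
                    distinctTerms++;
                    System.out.println("doc " + doc + " -> counted term " + sub.term);
                }
                // Iterators still in the queue are already positioned past this doc,
                // so no explicit advance is needed in this simplified model.
            }
            System.out.println("distinct terms: " + distinctTerms);
        }
    }

Once every term has been counted, the queue is empty and iteration stops early, which mirrors the early termination the real collector achieves by throwing CollectionTerminatedException.
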
From a18b597ea8dc47fd0211f599908d84008daf5a77 Mon Sep 17 00:00:00 2001
From: bowenlan-amzn
Date: Tue, 21 May 2024 07:49:37 -0700
Subject: [PATCH 03/17] Fully understand the remaining disjunction scorer

Signed-off-by: bowenlan-amzn
---
 .../metrics/CardinalityAggregator.java             |  8 +++++++-
 .../DisjunctionWithDynamicPruningScorer.java       | 14 ++++++--------
 .../aggregations/metrics/InternalCardinality.java  |  1 -
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java
index 91887e2e4a202..dd2e7458d81d2 100644
--- a/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java
+++ b/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java
@@ -125,6 +125,7 @@ private Collector pickCollector(LeafReaderContext ctx) throws IOException {
         if (valuesSource instanceof ValuesSource.Bytes.WithOrdinals) {
             ValuesSource.Bytes.WithOrdinals source = (ValuesSource.Bytes.WithOrdinals) valuesSource;
             final SortedSetDocValues ordinalValues = source.ordinalsValues(ctx);
+            final SortedSetDocValues globalOrdinalValues = source.globalOrdinalsValues(ctx);
             final long maxOrd = ordinalValues.getValueCount();
             if (maxOrd == 0) {
                 emptyCollectorsUsed++;
@@ -179,7 +180,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) {
         if (counts == null || owningBucketOrdinal >= counts.maxOrd() || counts.cardinality(owningBucketOrdinal) == 0) {
             return buildEmptyAggregation();
         }
-        // We need to build a copy because the returned Aggregation needs remain usable after
+        // We need to build a copy because the returned Aggregation needs to remain usable after
         // this Aggregator (and its HLL++ counters) is released.
         AbstractHyperLogLogPlusPlus copy = counts.clone(owningBucketOrdinal, BigArrays.NON_RECYCLING_INSTANCE);
         return new InternalCardinality(name, copy, metadata());
@@ -322,6 +323,9 @@ public void collect(int doc, long bucketOrd) throws IOException {
                     bits.set((int) ord);
                 }
             }
+            // for this owning bucket ord, save the values of the current doc as value ordinal bits
+            // visitedOrds (array with index as owning bucket, value as bits)
+            // ordinals is a number array, each element representing a text term, sorted by the values
         }

         @Override
@@ -336,6 +340,7 @@ public void postCollect() throws IOException {
             try (LongArray hashes = bigArrays.newLongArray(maxOrd, false)) {
                 final MurmurHash3.Hash128 hash = new MurmurHash3.Hash128();
+                // for every ordinal, we want the hash of its value
                 for (long ord = allVisitedOrds.nextSetBit(0); ord < Long.MAX_VALUE; ord = ord + 1 < maxOrd ?
                     allVisitedOrds.nextSetBit(ord + 1) : Long.MAX_VALUE) {
@@ -347,6 +352,7 @@ for (long bucket = visitedOrds.size() - 1; bucket >= 0; --bucket) {
                 final BitArray bits = visitedOrds.get(bucket);
                 if (bits != null) {
+                    // for every ordinal of this bucket, we collect by using its hash
                     for (long ord = bits.nextSetBit(0); ord < Long.MAX_VALUE; ord = ord + 1 < maxOrd ?
bits.nextSetBit(ord + 1) : Long.MAX_VALUE) { diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java index 75fa3d2eb6f93..9b6a0f42e0fa9 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java +++ b/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java @@ -31,7 +31,7 @@ * rest of the search space, so this term's subscorer DISI can be safely removed from list of subscorer to process. *

 * 2. {@link #removeAllDISIsOnCurrentDoc()} breaks the invariant of Conjunction DISI i.e. the docIDs of all sub-scorers should be
- * less than or equal to current docID iterator is pointing to. When we remove elements from priority, it results in heapify action, which modifies
+ * less than or equal to current docID iterator is pointing to. When we remove elements from disi priority queue, it results in heapify action, which modifies
 * the top of the priority queue, which represents the current docID for subscorers here. To address this, we are wrapping the
 * iterator with {@link SlowDocIdPropagatorDISI} which keeps the iterator pointing to last docID before {@link #removeAllDISIsOnCurrentDoc()}
 * is called and updates this docID only when next() or advance() is called.
 */
@@ -97,7 +97,6 @@ public DocIdSetIterator iterator() {

     private static class SlowDocIdPropagatorDISI extends DocIdSetIterator {
         DocIdSetIterator disi;
-
         Integer curDocId;

         SlowDocIdPropagatorDISI(DocIdSetIterator disi, Integer curDocId) {
@@ -147,11 +146,6 @@ public TwoPhaseIterator twoPhaseIterator() {
         return twoPhase;
     }

-    @Override
-    public float getMaxScore(int i) throws IOException {
-        return 0;
-    }
-
     private class TwoPhase extends TwoPhaseIterator {

         private final float matchCost;
@@ -231,7 +225,6 @@ public float matchCost() {
         }
     }

-
     @Override
     public final int docID() {
         return subScorers.top().doc;
@@ -262,4 +255,9 @@ public final Collection getChildren() throws IOException {
         }
         return children;
     }
+
+    @Override
+    public float getMaxScore(int i) throws IOException {
+        return 0;
+    }
 }
diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/InternalCardinality.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/InternalCardinality.java
index 7e9511ffdd379..9f9ad63220fea 100644
--- a/server/src/main/java/org/opensearch/search/aggregations/metrics/InternalCardinality.java
+++ b/server/src/main/java/org/opensearch/search/aggregations/metrics/InternalCardinality.java
@@ -117,7 +117,6 @@ public InternalAggregation reduce(List aggregations, Reduce
             return aggregations.get(0);
         } else {
             return new InternalCardinality(name, reduced, getMetadata());
-
         }
     }

From 85133c4c3ed43aaf339f665597f89cba98bb761d Mon Sep 17 00:00:00 2001
From: bowenlan-amzn
Date: Fri, 24 May 2024 10:53:25 -0700
Subject: [PATCH 04/17] Utilize competitive iterator API to perform pruning

Signed-off-by: bowenlan-amzn
---
 .../metrics/CardinalityAggregator.java             | 124 +++++++++++++++++-
 .../DisjunctionWithDynamicPruningScorer.java       |  25 ++--
 .../DynamicPruningCollectorWrapper.java            |  13 +-
 .../metrics/CardinalityAggregatorTests.java        |  90 +++++++------
 4 files changed, 193 insertions(+), 59 deletions(-)

diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java
index dd2e7458d81d2..6f66fc64f6dc7 100644
--- a/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java
+++ b/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java
@@ -35,7 +35,15 @@
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DisiPriorityQueue;
+import org.apache.lucene.search.DisiWrapper;
+import org.apache.lucene.search.DocIdSetIterator;
 import
org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.Weight; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.RamUsageEstimator; @@ -59,6 +67,7 @@ import org.opensearch.search.internal.SearchContext; import java.io.IOException; +import java.util.HashMap; import java.util.Map; import java.util.function.BiConsumer; @@ -137,8 +146,15 @@ private Collector pickCollector(LeafReaderContext ctx) throws IOException { // only use ordinals if they don't increase memory usage by more than 25% if (ordinalsMemoryUsage < countsMemoryUsage / 4) { ordinalsCollectorsUsed++; - return new DynamicPruningCollectorWrapper(new OrdinalsCollector(counts, ordinalValues, context.bigArrays()), - context, ctx, fieldContext, source); + // return new DynamicPruningCollectorWrapper(new OrdinalsCollector(counts, ordinalValues, context.bigArrays()), + // context, ctx, fieldContext, source); + return new CompetitiveCollector( + new OrdinalsCollector(counts, ordinalValues, context.bigArrays()), + source, + ctx, + context, + fieldContext + ); } ordinalsCollectorsOverheadTooHigh++; } @@ -217,6 +233,110 @@ abstract static class Collector extends LeafBucketCollector implements Releasabl } + private static class CompetitiveCollector extends Collector { + + private final Collector delegate; + private final DisiPriorityQueue pq; + + CompetitiveCollector( + Collector delegate, + ValuesSource.Bytes.WithOrdinals source, + LeafReaderContext ctx, + SearchContext context, + FieldContext fieldContext + ) throws IOException { + this.delegate = delegate; + + final SortedSetDocValues ordinalValues = source.ordinalsValues(ctx); + TermsEnum terms = ordinalValues.termsEnum(); + Map postingMap = new HashMap<>(); + while (terms.next() != null) { + BytesRef term = terms.term(); + + TermQuery termQuery = new TermQuery(new Term(fieldContext.field(), term)); + Weight subWeight = context.searcher().createWeight(termQuery, ScoreMode.COMPLETE_NO_SCORES, 1f); + Scorer scorer = subWeight.scorer(ctx); + + postingMap.put(term, scorer); + } + this.pq = new DisiPriorityQueue(postingMap.size()); + for (Map.Entry entry : postingMap.entrySet()) { + pq.add(new DisiWrapper(entry.getValue())); + } + } + + @Override + public void close() { + delegate.close(); + } + + @Override + public void collect(int doc, long owningBucketOrd) throws IOException { + delegate.collect(doc, owningBucketOrd); + } + + @Override + public DocIdSetIterator competitiveIterator() throws IOException { + return new DisjunctionDISIWithPruning(pq); + } + + @Override + public void postCollect() throws IOException { + delegate.postCollect(); + } + } + + private static class DisjunctionDISIWithPruning extends DocIdSetIterator { + + final DisiPriorityQueue queue; + + public DisjunctionDISIWithPruning(DisiPriorityQueue queue) { + this.queue = queue; + } + + @Override + public int docID() { + return queue.top().doc; + } + + @Override + public int nextDoc() throws IOException { + // don't expect this to be called + throw new UnsupportedOperationException(); + } + + @Override + public int advance(int target) throws IOException { + // more than advance to the next doc >= target + // we also do the pruning of current doc here + + DisiWrapper top = queue.top(); + + // after collecting the doc, before advancing to target + // we can safely remove all the iterators that having this doc + if (top.doc != -1) { + int curTopDoc = top.doc; + do { + 
top.doc = top.approximation.advance(Integer.MAX_VALUE); + top = queue.updateTop(); + } while (top.doc == curTopDoc); + } + + if (top.doc >= target) return top.doc; + do { + top.doc = top.approximation.advance(target); + top = queue.updateTop(); + } while (top.doc < target); + return top.doc; + } + + @Override + public long cost() { + // don't expect this to be called + throw new UnsupportedOperationException(); + } + } + /** * Empty Collector for the Cardinality agg * diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java index 9b6a0f42e0fa9..93a01aaa1e053 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java +++ b/server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java @@ -22,7 +22,6 @@ import java.util.Collection; import java.util.List; - /** * Clone of {@link org.apache.lucene.search} {@code DisjunctionScorer.java} in lucene with following modifications *

@@ -45,8 +44,7 @@ public class DisjunctionWithDynamicPruningScorer extends Scorer { private Integer docID; - public DisjunctionWithDynamicPruningScorer(Weight weight, List subScorers) - throws IOException { + public DisjunctionWithDynamicPruningScorer(Weight weight, List subScorers) throws IOException { super(weight); if (subScorers.size() <= 1) { throw new IllegalArgumentException("There must be at least 2 subScorers"); @@ -90,9 +88,9 @@ public void removeAllDISIsOnCurrentDoc() { @Override public DocIdSetIterator iterator() { - DocIdSetIterator disi = getIterator(); - docID = disi.docID(); - return new SlowDocIdPropagatorDISI(getIterator(), docID); + DocIdSetIterator disi = getIterator(); + docID = disi.docID(); + return new SlowDocIdPropagatorDISI(getIterator(), docID); } private static class SlowDocIdPropagatorDISI extends DocIdSetIterator { @@ -157,13 +155,12 @@ private class TwoPhase extends TwoPhaseIterator { private TwoPhase(DocIdSetIterator approximation, float matchCost) { super(approximation); this.matchCost = matchCost; - unverifiedMatches = - new PriorityQueue(DisjunctionWithDynamicPruningScorer.this.subScorers.size()) { - @Override - protected boolean lessThan(DisiWrapper a, DisiWrapper b) { - return a.matchCost < b.matchCost; - } - }; + unverifiedMatches = new PriorityQueue(DisjunctionWithDynamicPruningScorer.this.subScorers.size()) { + @Override + protected boolean lessThan(DisiWrapper a, DisiWrapper b) { + return a.matchCost < b.matchCost; + } + }; } DisiWrapper getSubMatches() throws IOException { @@ -183,7 +180,7 @@ public boolean matches() throws IOException { verifiedMatches = null; unverifiedMatches.clear(); - for (DisiWrapper w = subScorers.topList(); w != null; ) { + for (DisiWrapper w = subScorers.topList(); w != null;) { DisiWrapper next = w.next; if (w.twoPhaseView == null) { diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java index f4c3d59a3833f..cb735a3257289 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java +++ b/server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java @@ -8,7 +8,6 @@ package org.opensearch.search.aggregations.metrics; -import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Term; @@ -38,9 +37,13 @@ class DynamicPruningCollectorWrapper extends CardinalityAggregator.Collector { private final DocIdSetIterator disi; private final CardinalityAggregator.Collector delegateCollector; - DynamicPruningCollectorWrapper(CardinalityAggregator.Collector delegateCollector, - SearchContext context, LeafReaderContext ctx, FieldContext fieldContext, - ValuesSource.Bytes.WithOrdinals source) throws IOException { + DynamicPruningCollectorWrapper( + CardinalityAggregator.Collector delegateCollector, + SearchContext context, + LeafReaderContext ctx, + FieldContext fieldContext, + ValuesSource.Bytes.WithOrdinals source + ) throws IOException { this.ctx = ctx; this.delegateCollector = delegateCollector; final SortedSetDocValues ordinalValues = source.ordinalsValues(ctx); @@ -52,7 +55,7 @@ class DynamicPruningCollectorWrapper extends CardinalityAggregator.Collector { // this logic should be pluggable depending on the type of leaf bucket collector by CardinalityAggregator TermsEnum terms 
= ordinalValues.termsEnum(); Weight weight = context.searcher().createWeight(context.searcher().rewrite(context.query()), ScoreMode.COMPLETE_NO_SCORES, 1f); - Map found = new HashMap<>(); + Map found = new HashMap<>(); // ord : found or not List subScorers = new ArrayList<>(); while (terms.next() != null && !found.containsKey(terms.ord())) { // TODO can we get rid of terms previously encountered in other segments? diff --git a/server/src/test/java/org/opensearch/search/aggregations/metrics/CardinalityAggregatorTests.java b/server/src/test/java/org/opensearch/search/aggregations/metrics/CardinalityAggregatorTests.java index a9966c9e70e76..d21e7f6ed8550 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/metrics/CardinalityAggregatorTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/metrics/CardinalityAggregatorTests.java @@ -105,44 +105,58 @@ public void testDynamicPruningOrdinalCollector() throws IOException { MappedFieldType fieldType = new KeywordFieldMapper.KeywordFieldType(fieldName); final CardinalityAggregationBuilder aggregationBuilder = new CardinalityAggregationBuilder("_name").field(fieldName); testAggregation(aggregationBuilder, new TermQuery(new Term(filterFieldName, "foo")), iw -> { - iw.addDocument(asList( - new KeywordField(fieldName, "1", Field.Store.NO), - new KeywordField(fieldName, "2", Field.Store.NO), - new KeywordField(filterFieldName, "foo", Field.Store.NO), - new SortedSetDocValuesField(fieldName, new BytesRef("1")), - new SortedSetDocValuesField(fieldName, new BytesRef("2")) - )); - iw.addDocument(asList( - new KeywordField(fieldName, "2", Field.Store.NO), - new KeywordField(filterFieldName, "foo", Field.Store.NO), - new SortedSetDocValuesField(fieldName, new BytesRef("2")) - )); - iw.addDocument(asList( - new KeywordField(fieldName, "1", Field.Store.NO), - new KeywordField(filterFieldName, "foo", Field.Store.NO), - new SortedSetDocValuesField(fieldName, new BytesRef("1")) - )); - iw.addDocument(asList( - new KeywordField(fieldName, "2", Field.Store.NO), - new KeywordField(filterFieldName, "foo", Field.Store.NO), - new SortedSetDocValuesField(fieldName, new BytesRef("2")) - )); - iw.addDocument(asList( - new KeywordField(fieldName, "3", Field.Store.NO), - new KeywordField(filterFieldName, "foo", Field.Store.NO), - new SortedSetDocValuesField(fieldName, new BytesRef("3")) - )); - iw.addDocument(asList( - new KeywordField(fieldName, "4", Field.Store.NO), - new KeywordField(filterFieldName, "bar", Field.Store.NO), - new SortedSetDocValuesField(fieldName, new BytesRef("4")) - )); - iw.addDocument(asList( - new KeywordField(fieldName, "5", Field.Store.NO), - new KeywordField(filterFieldName, "bar", Field.Store.NO), - new SortedSetDocValuesField(fieldName, new BytesRef("5")) - )); - }, card -> { + iw.addDocument( + asList( + new KeywordField(fieldName, "1", Field.Store.NO), + new KeywordField(fieldName, "2", Field.Store.NO), + new KeywordField(filterFieldName, "foo", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("1")), + new SortedSetDocValuesField(fieldName, new BytesRef("2")) + ) + ); + iw.addDocument( + asList( + new KeywordField(fieldName, "2", Field.Store.NO), + new KeywordField(filterFieldName, "foo", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("2")) + ) + ); + iw.addDocument( + asList( + new KeywordField(fieldName, "1", Field.Store.NO), + new KeywordField(filterFieldName, "foo", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("1")) + ) + ); + 
iw.addDocument( + asList( + new KeywordField(fieldName, "2", Field.Store.NO), + new KeywordField(filterFieldName, "foo", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("2")) + ) + ); + iw.addDocument( + asList( + new KeywordField(fieldName, "3", Field.Store.NO), + new KeywordField(filterFieldName, "foo", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("3")) + ) + ); + iw.addDocument( + asList( + new KeywordField(fieldName, "4", Field.Store.NO), + new KeywordField(filterFieldName, "bar", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("4")) + ) + ); + iw.addDocument( + asList( + new KeywordField(fieldName, "5", Field.Store.NO), + new KeywordField(filterFieldName, "bar", Field.Store.NO), + new SortedSetDocValuesField(fieldName, new BytesRef("5")) + ) + ); + }, card -> { assertEquals(3.0, card.getValue(), 0); assertTrue(AggregationInspectionHelper.hasValue(card)); }, fieldType); From 9d4701c0866c8293e51896af44039574f5ff466f Mon Sep 17 00:00:00 2001 From: bowenlan-amzn Date: Fri, 24 May 2024 15:34:03 -0700 Subject: [PATCH 05/17] handle missing input Signed-off-by: bowenlan-amzn --- .../search/aggregations/metrics/CardinalityAggregator.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java index 6f66fc64f6dc7..6fe1e7345cda4 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/metrics/CardinalityAggregator.java @@ -81,6 +81,7 @@ public class CardinalityAggregator extends NumericMetricsAggregator.SingleValue private final int precision; private final ValuesSource valuesSource; + private final ValuesSourceConfig valuesSourceConfig; private final FieldContext fieldContext; // Expensive to initialize, so we only initialize it when we have an actual value source @@ -105,6 +106,7 @@ public CardinalityAggregator( ) throws IOException { super(name, context, parent, metadata); // TODO: Stop using nulls here + this.valuesSourceConfig = valuesSourceConfig; this.valuesSource = valuesSourceConfig.hasValues() ? 
valuesSourceConfig.getValuesSource() : null; this.precision = precision; this.fieldContext = valuesSourceConfig.fieldContext(); @@ -148,6 +150,9 @@ private Collector pickCollector(LeafReaderContext ctx) throws IOException { ordinalsCollectorsUsed++; // return new DynamicPruningCollectorWrapper(new OrdinalsCollector(counts, ordinalValues, context.bigArrays()), // context, ctx, fieldContext, source); + if (valuesSourceConfig.missing() != null) { + return new OrdinalsCollector(counts, ordinalValues, context.bigArrays()); + } return new CompetitiveCollector( new OrdinalsCollector(counts, ordinalValues, context.bigArrays()), source, From 77fceeaaed9f9ea6fa621e7aef1341a956e1716c Mon Sep 17 00:00:00 2001 From: bowenlan-amzn Date: Fri, 24 May 2024 15:46:53 -0700 Subject: [PATCH 06/17] add change log Signed-off-by: bowenlan-amzn --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b4a5e5f4f981..c62122ddd9ce9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Make outbound side of transport protocol dependent ([#13293](https://github.com/opensearch-project/OpenSearch/pull/13293)) - [Remote Store] Add dynamic cluster settings to set timeout for segments upload to Remote Store ([#13679](https://github.com/opensearch-project/OpenSearch/pull/13679)) - [Remote Store] Upload translog checkpoint as object metadata to translog.tlog([#13637](https://github.com/opensearch-project/OpenSearch/pull/13637)) +- Support Dynamic Pruning in Cardinality Aggregation ([#13821](https://github.com/opensearch-project/OpenSearch/pull/13821)) ### Dependencies - Bump `com.github.spullara.mustache.java:compiler` from 0.9.10 to 0.9.13 ([#13329](https://github.com/opensearch-project/OpenSearch/pull/13329), [#13559](https://github.com/opensearch-project/OpenSearch/pull/13559)) From d0043ae610fefd3c4d417d823c44e6354aec736a Mon Sep 17 00:00:00 2001 From: bowenlan-amzn Date: Tue, 4 Jun 2024 11:51:01 -0700 Subject: [PATCH 07/17] clean up Signed-off-by: bowenlan-amzn --- .idea/runConfigurations/Debug_OpenSearch.xml | 6 +- .../metrics/CardinalityAggregator.java | 11 +- .../DisjunctionWithDynamicPruningScorer.java | 260 ------------------ .../DynamicPruningCollectorWrapper.java | 109 -------- 4 files changed, 8 insertions(+), 378 deletions(-) delete mode 100644 server/src/main/java/org/opensearch/search/aggregations/metrics/DisjunctionWithDynamicPruningScorer.java delete mode 100644 server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java diff --git a/.idea/runConfigurations/Debug_OpenSearch.xml b/.idea/runConfigurations/Debug_OpenSearch.xml index 0d8bf59823acf..c18046f873477 100644 --- a/.idea/runConfigurations/Debug_OpenSearch.xml +++ b/.idea/runConfigurations/Debug_OpenSearch.xml @@ -6,6 +6,10 @@

- * 1. {@link #removeAllDISIsOnCurrentDoc()} removes all the DISIs for subscorer pointing to current doc. This is - * helpful in dynamic pruning for Cardinality aggregation, where once a term is found, it becomes irrelevant for - * rest of the search space, so this term's subscorer DISI can be safely removed from list of subscorer to process. - *

- * 2. {@link #removeAllDISIsOnCurrentDoc()} breaks the invariant of Conjunction DISI i.e. the docIDs of all sub-scorers should be
- * less than or equal to current docID iterator is pointing to. When we remove elements from disi priority queue, it results in heapify action, which modifies
- * the top of the priority queue, which represents the current docID for subscorers here. To address this, we are wrapping the
- * iterator with {@link SlowDocIdPropagatorDISI} which keeps the iterator pointing to last docID before {@link #removeAllDISIsOnCurrentDoc()}
- * is called and updates this docID only when next() or advance() is called.
- */
-public class DisjunctionWithDynamicPruningScorer extends Scorer {
-
-    private final boolean needsScores;
-    private final DisiPriorityQueue subScorers;
-    private final DocIdSetIterator approximation;
-    private final TwoPhase twoPhase;
-
-    private Integer docID;
-
-    public DisjunctionWithDynamicPruningScorer(Weight weight, List subScorers) throws IOException {
-        super(weight);
-        if (subScorers.size() <= 1) {
-            throw new IllegalArgumentException("There must be at least 2 subScorers");
-        }
-        this.subScorers = new DisiPriorityQueue(subScorers.size());
-        for (Scorer scorer : subScorers) {
-            final DisiWrapper w = new DisiWrapper(scorer);
-            this.subScorers.add(w);
-        }
-        this.needsScores = false;
-        this.approximation = new DisjunctionDISIApproximation(this.subScorers);
-
-        boolean hasApproximation = false;
-        float sumMatchCost = 0;
-        long sumApproxCost = 0;
-        // Compute matchCost as the average over the matchCost of the subScorers.
-        // This is weighted by the cost, which is an expected number of matching documents.
-        for (DisiWrapper w : this.subScorers) {
-            long costWeight = (w.cost <= 1) ? 1 : w.cost;
-            sumApproxCost += costWeight;
-            if (w.twoPhaseView != null) {
-                hasApproximation = true;
-                sumMatchCost += w.matchCost * costWeight;
-            }
-        }
-
-        if (hasApproximation == false) { // no sub scorer supports approximations
-            twoPhase = null;
-        } else {
-            final float matchCost = sumMatchCost / sumApproxCost;
-            twoPhase = new TwoPhase(approximation, matchCost);
-        }
-    }
-
-    public void removeAllDISIsOnCurrentDoc() {
-        docID = this.docID();
-        while (subScorers.size() > 0 && subScorers.top().doc == docID) {
-            subScorers.pop();
-        }
-    }
-
-    @Override
-    public DocIdSetIterator iterator() {
-        DocIdSetIterator disi = getIterator();
-        docID = disi.docID();
-        return new SlowDocIdPropagatorDISI(getIterator(), docID);
-    }
-
-    private static class SlowDocIdPropagatorDISI extends DocIdSetIterator {
-        DocIdSetIterator disi;
-        Integer curDocId;
-
-        SlowDocIdPropagatorDISI(DocIdSetIterator disi, Integer curDocId) {
-            this.disi = disi;
-            this.curDocId = curDocId;
-        }
-
-        @Override
-        public int docID() {
-            assert curDocId <= disi.docID();
-            return curDocId;
-        }
-
-        @Override
-        public int nextDoc() throws IOException {
-            return advance(curDocId + 1);
-        }
-
-        @Override
-        public int advance(int i) throws IOException {
-            if (i <= disi.docID()) {
-                // since we are slow propagating docIDs, it may happen the disi is already advanced to a higher docID than i
-                // in such scenarios we can simply return the docID where disi is pointing to and update the curDocId
-                curDocId = disi.docID();
-                return disi.docID();
-            }
-            curDocId = disi.advance(i);
-            return curDocId;
-        }
-
-        @Override
-        public long cost() {
-            return disi.cost();
-        }
-    }
-
-    private DocIdSetIterator getIterator() {
-        if (twoPhase != null) {
-            return TwoPhaseIterator.asDocIdSetIterator(twoPhase);
-        } else {
-            return approximation;
-        }
-    }
- - @Override - public TwoPhaseIterator twoPhaseIterator() { - return twoPhase; - } - - private class TwoPhase extends TwoPhaseIterator { - - private final float matchCost; - // list of verified matches on the current doc - DisiWrapper verifiedMatches; - // priority queue of approximations on the current doc that have not been verified yet - final PriorityQueue unverifiedMatches; - - private TwoPhase(DocIdSetIterator approximation, float matchCost) { - super(approximation); - this.matchCost = matchCost; - unverifiedMatches = new PriorityQueue(DisjunctionWithDynamicPruningScorer.this.subScorers.size()) { - @Override - protected boolean lessThan(DisiWrapper a, DisiWrapper b) { - return a.matchCost < b.matchCost; - } - }; - } - - DisiWrapper getSubMatches() throws IOException { - // iteration order does not matter - for (DisiWrapper w : unverifiedMatches) { - if (w.twoPhaseView.matches()) { - w.next = verifiedMatches; - verifiedMatches = w; - } - } - unverifiedMatches.clear(); - return verifiedMatches; - } - - @Override - public boolean matches() throws IOException { - verifiedMatches = null; - unverifiedMatches.clear(); - - for (DisiWrapper w = subScorers.topList(); w != null;) { - DisiWrapper next = w.next; - - if (w.twoPhaseView == null) { - // implicitly verified, move it to verifiedMatches - w.next = verifiedMatches; - verifiedMatches = w; - - if (needsScores == false) { - // we can stop here - return true; - } - } else { - unverifiedMatches.add(w); - } - w = next; - } - - if (verifiedMatches != null) { - return true; - } - - // verify subs that have an two-phase iterator - // least-costly ones first - while (unverifiedMatches.size() > 0) { - DisiWrapper w = unverifiedMatches.pop(); - if (w.twoPhaseView.matches()) { - w.next = null; - verifiedMatches = w; - return true; - } - } - - return false; - } - - @Override - public float matchCost() { - return matchCost; - } - } - - @Override - public final int docID() { - return subScorers.top().doc; - } - - DisiWrapper getSubMatches() throws IOException { - if (twoPhase == null) { - return subScorers.topList(); - } else { - return twoPhase.getSubMatches(); - } - } - - @Override - public final float score() throws IOException { - return score(getSubMatches()); - } - - protected float score(DisiWrapper topList) throws IOException { - return 1f; - } - - @Override - public final Collection getChildren() throws IOException { - ArrayList children = new ArrayList<>(); - for (DisiWrapper scorer = getSubMatches(); scorer != null; scorer = scorer.next) { - children.add(new ChildScorable(scorer.scorer, "SHOULD")); - } - return children; - } - - @Override - public float getMaxScore(int i) throws IOException { - return 0; - } -} diff --git a/server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java b/server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java deleted file mode 100644 index cb735a3257289..0000000000000 --- a/server/src/main/java/org/opensearch/search/aggregations/metrics/DynamicPruningCollectorWrapper.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -package org.opensearch.search.aggregations.metrics; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.CollectionTerminatedException; -import org.apache.lucene.search.ConjunctionUtils; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.Weight; -import org.apache.lucene.util.Bits; -import org.opensearch.search.aggregations.support.FieldContext; -import org.opensearch.search.aggregations.support.ValuesSource; -import org.opensearch.search.internal.SearchContext; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -class DynamicPruningCollectorWrapper extends CardinalityAggregator.Collector { - - private final LeafReaderContext ctx; - private final DisjunctionWithDynamicPruningScorer disjunctionScorer; - private final DocIdSetIterator disi; - private final CardinalityAggregator.Collector delegateCollector; - - DynamicPruningCollectorWrapper( - CardinalityAggregator.Collector delegateCollector, - SearchContext context, - LeafReaderContext ctx, - FieldContext fieldContext, - ValuesSource.Bytes.WithOrdinals source - ) throws IOException { - this.ctx = ctx; - this.delegateCollector = delegateCollector; - final SortedSetDocValues ordinalValues = source.ordinalsValues(ctx); - boolean isCardinalityLow = ordinalValues.getValueCount() < 10; - boolean isCardinalityAggregationOnlyAggregation = true; - boolean isFieldSupportedForDynamicPruning = true; - if (isCardinalityLow && isCardinalityAggregationOnlyAggregation && isFieldSupportedForDynamicPruning) { - // create disjunctions from terms - // this logic should be pluggable depending on the type of leaf bucket collector by CardinalityAggregator - TermsEnum terms = ordinalValues.termsEnum(); - Weight weight = context.searcher().createWeight(context.searcher().rewrite(context.query()), ScoreMode.COMPLETE_NO_SCORES, 1f); - Map found = new HashMap<>(); // ord : found or not - List subScorers = new ArrayList<>(); - while (terms.next() != null && !found.containsKey(terms.ord())) { - // TODO can we get rid of terms previously encountered in other segments? 
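-                // Each iteration below builds one term scorer for one distinct value of the field.
-                // Together the sub-scorers form a disjunction over all values of the field, which is
-                // then intersected with the scorer of the main query. Once a term has been collected
-                // it cannot change the cardinality again, so its sub-scorer can be pruned from the
-                // disjunction via removeAllDISIsOnCurrentDoc().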
- TermQuery termQuery = new TermQuery(new Term(fieldContext.field(), terms.term())); - Weight subWeight = context.searcher().createWeight(termQuery, ScoreMode.COMPLETE_NO_SCORES, 1f); - Scorer scorer = subWeight.scorer(ctx); - if (scorer != null) { - subScorers.add(scorer); - } - found.put(terms.ord(), true); - } - disjunctionScorer = new DisjunctionWithDynamicPruningScorer(weight, subScorers); - disi = ConjunctionUtils.intersectScorers(List.of(disjunctionScorer, weight.scorer(ctx))); - } else { - disjunctionScorer = null; - disi = null; - } - } - - @Override - public void collect(int doc, long bucketOrd) throws IOException { - if (disi == null || disjunctionScorer == null) { - delegateCollector.collect(doc, bucketOrd); - } else { - // perform the full iteration using dynamic pruning of DISIs and return right away - disi.advance(doc); - int currDoc = disi.docID(); - assert currDoc == doc; - final Bits liveDocs = ctx.reader().getLiveDocs(); - assert liveDocs == null || liveDocs.get(currDoc); - do { - if (liveDocs == null || liveDocs.get(currDoc)) { - delegateCollector.collect(currDoc, bucketOrd); - disjunctionScorer.removeAllDISIsOnCurrentDoc(); - } - currDoc = disi.nextDoc(); - } while (currDoc != DocIdSetIterator.NO_MORE_DOCS); - throw new CollectionTerminatedException(); - } - } - - @Override - public void close() { - delegateCollector.close(); - } - - @Override - public void postCollect() throws IOException { - delegateCollector.postCollect(); - } -} From 82a5f0676871e0116269606c837ec82f6b9b58cf Mon Sep 17 00:00:00 2001 From: bowenlan-amzn Date: Tue, 4 Jun 2024 16:46:50 -0700 Subject: [PATCH 08/17] Clean up Signed-off-by: bowenlan-amzn --- .idea/runConfigurations/Debug_OpenSearch.xml | 6 +- .../metrics/CardinalityAggregator.java | 85 +++++++++---------- 2 files changed, 42 insertions(+), 49 deletions(-) diff --git a/.idea/runConfigurations/Debug_OpenSearch.xml b/.idea/runConfigurations/Debug_OpenSearch.xml index c18046f873477..0d8bf59823acf 100644 --- a/.idea/runConfigurations/Debug_OpenSearch.xml +++ b/.idea/runConfigurations/Debug_OpenSearch.xml @@ -6,10 +6,6 @@