improved calculation of register change probability for UltraLogLog, …

…use of fixed size arrays in ML estimators to allow for compiler optimizations
dynatrace-oss · Dec 27, 2023 · 732909e · 732909e
1 parent 2bb98c1
commit 732909e
Show file tree

Hide file tree

Showing 4 changed files with 86 additions and 99 deletions.
diff --git a/src/main/java/com/dynatrace/hash4j/distinctcount/DistinctCountUtil.java b/src/main/java/com/dynatrace/hash4j/distinctcount/DistinctCountUtil.java
@@ -42,22 +42,26 @@ static void checkPrecisionParameter(int p, int minP, int maxP) {
   /**
    * Maximizes the expression
    *
-   * <p>{@code e^{-x*a} * (1 - e^{-x})^b[0] * (1 - e^{-x/2})^b[1] * (1 - e^{-x/2^2})^b[2] * ...}
+   * <p>{@code e^{-x*a} * (1 - e^{-x})^b[0] * (1 - e^{-x/2})^b[1] * (1 - e^{-x/2^2})^b[2] * ... * (1
+   * - e^{-x/2^n})^b[n]}
    *
    * <p>where {@code a} and all elements of {@code b} must be nonnegative. If this is not the case,
-   * or if neither {@code a} nor any element in {@code b} is positive, or if {@code b.length >= 64}
-   * the behavior of this function is not defined. {@code a} must be either zero or greater than or
-   * equal to 2^{-63}.
+   * or if neither {@code a} nor any element in {@code b} is positive, or if {@code n >= b.length}
+   * or {@code n >= 64} the behavior of this function is not defined. {@code a} must be either zero
+   * or greater than or equal to 2^{-64}.
    *
    * <p>This algorithm is based on Algorithm 8 described in Ertl, Otmar. "New cardinality estimation
    * algorithms for HyperLogLog sketches." arXiv preprint arXiv:1702.01284 (2017).
    *
    * @param a parameter a
    * @param b parameter b
+   * @param n parameter n
    * @param relativeErrorLimit the relative error limit
    * @return the value that maximizes the expression
+   * @throws RuntimeException if n >= b.length
    */
-  static double solveMaximumLikelihoodEquation(double a, int[] b, double relativeErrorLimit) {
+  static double solveMaximumLikelihoodEquation(
+      double a, int[] b, int n, double relativeErrorLimit) {
     // Maximizing the expression
     //
     // e^{-x*a} * (1 - e^{-x})^b[0] * (1 - e^{-x/2})^b[1] * (1 - e^{-x/2^2})^b[2] * ...
@@ -119,8 +123,7 @@ static double solveMaximumLikelihoodEquation(double a, int[] b, double relativeE
 
     if (a == 0.) return Double.POSITIVE_INFINITY;
 
-    int kMax;
-    kMax = b.length - 1;
+    int kMax = n;
     while (kMax >= 0 && b[kMax] == 0) {
       --kMax;
     }
@@ -296,12 +299,16 @@ static double estimateDistinctCountFromTokens(TokenIterable tokenIterable) {
     }
 
     double a = 0x1p27;
+    int maxNonZeroIndex = 0;
     for (int i = 0; i < MAX_NLZ_IN_TOKEN; ++i) {
       if (b[i] != 0) {
         a -= b[i] * Double.longBitsToDouble((0x3FFL - i) << 52);
+        maxNonZeroIndex = i;
       }
     }
-    return DistinctCountUtil.solveMaximumLikelihoodEquation(a, b, RELATIVE_ERROR_LIMIT) * 0x1p27;
+    return DistinctCountUtil.solveMaximumLikelihoodEquation(
+            a, b, maxNonZeroIndex, RELATIVE_ERROR_LIMIT)
+        * 0x1p27;
   }
 
   static double unsignedLongToDouble(long l) {

diff --git a/src/main/java/com/dynatrace/hash4j/distinctcount/HyperLogLog.java b/src/main/java/com/dynatrace/hash4j/distinctcount/HyperLogLog.java
@@ -354,7 +354,7 @@ public HyperLogLog addToken(int token, StateChangeObserver stateChangeObserver)
 
   // returns register change probability scaled by 2^64
   private long getScaledRegisterChangeProbability(int registerValue) {
-    return 0x4000000000000000L >>> (p + registerValue - 2);
+    return 0x4000000000000000L >>> (p - 2 + registerValue);
   }
 
   /**
@@ -431,25 +431,17 @@ public double getDistinctCountEstimate(Estimator estimator) {
   @Override
   public double getStateChangeProbability() {
     long sum = 0;
-    for (int off = 0; off + 5 < state.length; off += 6) {
+    for (int off = 0; off + 6 <= state.length; off += 6) {
       int s0 = getInt(state, off);
       int s1 = getInt(state, off + 2);
-      int r0 = s0 & 0x3F;
-      int r1 = (s0 >>> 6) & 0x3F;
-      int r2 = (s0 >>> 12) & 0x3F;
-      int r3 = (s0 >>> 18) & 0x3F;
-      int r4 = (s1 >>> 8) & 0x3F;
-      int r5 = (s1 >>> 14) & 0x3F;
-      int r6 = (s1 >>> 20) & 0x3F;
-      int r7 = (s1 >>> 26) & 0x3F;
-      sum += getScaledRegisterChangeProbability(r0);
-      sum += getScaledRegisterChangeProbability(r1);
-      sum += getScaledRegisterChangeProbability(r2);
-      sum += getScaledRegisterChangeProbability(r3);
-      sum += getScaledRegisterChangeProbability(r4);
-      sum += getScaledRegisterChangeProbability(r5);
-      sum += getScaledRegisterChangeProbability(r6);
-      sum += getScaledRegisterChangeProbability(r7);
+      sum += getScaledRegisterChangeProbability(s0);
+      sum += getScaledRegisterChangeProbability(s0 >>> 6);
+      sum += getScaledRegisterChangeProbability(s0 >>> 12);
+      sum += getScaledRegisterChangeProbability(s0 >>> 18);
+      sum += getScaledRegisterChangeProbability(s1 >>> 8);
+      sum += getScaledRegisterChangeProbability(s1 >>> 14);
+      sum += getScaledRegisterChangeProbability(s1 >>> 20);
+      sum += getScaledRegisterChangeProbability(s1 >>> 26);
     }
     if (sum == 0 && state[0] == 0) {
       // sum can only be zero if either all registers are 0 or all registers are saturated
@@ -540,12 +532,13 @@ static double tau(double x) {
     @Override
     public double estimate(HyperLogLog hyperLogLog) {
       byte[] state = hyperLogLog.state;
+      int p = hyperLogLog.p;
       int c0 = 0;
       int cMax = 0;
       long agg = 0;
-      int maxR = 65 - hyperLogLog.p;
-      long inc = 1L << -hyperLogLog.p;
-      for (int off = 0; off + 5 < state.length; off += 6) {
+      int maxR = 65 - p;
+      long inc = 1L << -p;
+      for (int off = 0; off + 6 <= state.length; off += 6) {
         int s0 = getInt(state, off);
         int s1 = getInt(state, off + 2);
         int r0 = s0 & 0x3F;
@@ -583,23 +576,20 @@ public double estimate(HyperLogLog hyperLogLog) {
       }
       double sum = 0;
 
+      double m = 1 << p;
       if (cMax > 0) {
-        double m = 1 << hyperLogLog.p;
         sum +=
-            Double.longBitsToDouble(0x3FF0000000000000L - ((32L - hyperLogLog.p) << 53))
-                * tau(1. - cMax / m);
+            Double.longBitsToDouble(0x3FF0000000000000L - ((32L - p) << 53)) * tau(1. - cMax / m);
       }
-
-      // if c0 == m agg could have overflown and would be zero, but sum would become infinite anyway
-      // due to the sigma function below, so this does not matter
-      sum += unsignedLongToDouble(agg) / inc;
+      // if c0 == m, agg could have overflown and would be zero, but sum would become infinite
+      // anyway due to the sigma function below, so this does not matter
+      sum += unsignedLongToDouble(agg) * m * 0x1p-64;
 
       if (c0 > 0) {
-        double m = 1 << hyperLogLog.p;
         sum += m * sigma(c0 / m);
       }
 
-      return ESTIMATION_FACTORS[hyperLogLog.p - MIN_P] / sum;
+      return ESTIMATION_FACTORS[p - MIN_P] / sum;
     }
   }
 
@@ -629,10 +619,10 @@ public double estimate(HyperLogLog hyperLogLog) {
       byte[] state = hyperLogLog.state;
       int p = hyperLogLog.p;
       long agg = 0;
-      int[] c = new int[66 - p];
+      int[] c = new int[64];
       long inc = 1L << -p;
 
-      for (int off = 0; off + 5 < state.length; off += 6) {
+      for (int off = 0; off + 6 <= state.length; off += 6) {
         int s0 = getInt(state, off);
         int s1 = getInt(state, off + 2);
         int r0 = s0 & 0x3F;
@@ -651,25 +641,24 @@ public double estimate(HyperLogLog hyperLogLog) {
         agg += inc >>> r5;
         agg += inc >>> r6;
         agg += inc >>> r7;
-        if (r0 < c.length) c[r0] += 1;
-        if (r1 < c.length) c[r1] += 1;
-        if (r2 < c.length) c[r2] += 1;
-        if (r3 < c.length) c[r3] += 1;
-        if (r4 < c.length) c[r4] += 1;
-        if (r5 < c.length) c[r5] += 1;
-        if (r6 < c.length) c[r6] += 1;
-        if (r7 < c.length) c[r7] += 1;
+        c[r0] += 1;
+        c[r1] += 1;
+        c[r2] += 1;
+        c[r3] += 1;
+        c[r4] += 1;
+        c[r5] += 1;
+        c[r6] += 1;
+        c[r7] += 1;
       }
       int m = 1 << p;
 
       if (c[0] == m) return 0.;
       c[0] = 0;
-      c[c.length - 2] += c[c.length - 1];
-      c[c.length - 1] = 0;
-      double a = unsignedLongToDouble(agg) / inc;
+      c[64 - p] += c[65 - p];
+      double a = unsignedLongToDouble(agg) * m * 0x1p-64;
       return m
           * DistinctCountUtil.solveMaximumLikelihoodEquation(
-              a, c, ML_EQUATION_SOLVER_EPS / Math.sqrt(m))
+              a, c, 64 - p, ML_EQUATION_SOLVER_EPS / Math.sqrt(m))
           / (1. + ML_BIAS_CORRECTION_CONSTANT / m);
     }
   }

diff --git a/src/main/java/com/dynatrace/hash4j/distinctcount/UltraLogLog.java b/src/main/java/com/dynatrace/hash4j/distinctcount/UltraLogLog.java
@@ -351,26 +351,9 @@ public double getDistinctCountEstimate(Estimator estimator) {
   // visible for testing
   // returns register change probability scaled by 2^64
   static long getScaledRegisterChangeProbability(byte reg, int p) {
-    int r = reg & 0xFF;
-    int r2 = r - (p << 2) - 4;
-    if (r2 < 0) {
-      long ret = 4L;
-      if (r2 == -2 || r2 == -8) {
-        ret -= 2;
-      }
-      if (r2 == -2 || r2 == -4) {
-        ret -= 1;
-      }
-      return ret << (62 - p);
-    } else {
-      int k = r2 >>> 2;
-      long ret = 0xE000000000000000L;
-      int y0 = r & 1;
-      int y1 = (r >>> 1) & 1;
-      ret -= (long) y0 << 63;
-      ret -= (long) y1 << 62;
-      return ret >>> (k + p);
-    }
+    if (reg == 0) return 1L << -p;
+    long k = (reg >>> 2) - p + 1;
+    return ((((reg & 2) | ((reg & 1) << 2)) ^ 7L) << ~k) >>> p;
   }
 
   /**
@@ -464,8 +447,7 @@ public double estimate(UltraLogLog ultraLogLog) {
       int p = ultraLogLog.getP();
 
       long sum = 0;
-      int[] b = new int[65 - p];
-
+      int[] b = new int[64];
       for (byte r : state) {
         sum += contribute(r & 0xff, b, p);
       }
@@ -477,13 +459,12 @@ public double estimate(UltraLogLog ultraLogLog) {
         return (state[0] == 0) ? 0 : Double.POSITIVE_INFINITY;
       }
       b[63 - p] += b[64 - p];
-      b[64 - p] = 0;
       double factor = m << 1;
       double a = unsignedLongToDouble(sum) * factor * 0x1p-64;
 
       return factor
           * DistinctCountUtil.solveMaximumLikelihoodEquation(
-              a, b, ML_EQUATION_SOLVER_EPS / Math.sqrt(m))
+              a, b, 63 - p, ML_EQUATION_SOLVER_EPS / Math.sqrt(m))
           / (1. + ML_BIAS_CORRECTION_CONSTANT / m);
     }
   }

diff --git a/src/test/java/com/dynatrace/hash4j/distinctcount/DistinctCountUtilTest.java b/src/test/java/com/dynatrace/hash4j/distinctcount/DistinctCountUtilTest.java
@@ -73,7 +73,7 @@ private static double solveN(double a, int... b) {
           if (b != null) {
             for (int i = 0; i < b.length; ++i) {
               if (b[i] > 0) {
-                double f = 1. / (1L << i);
+                double f = Double.longBitsToDouble((0x3ffL - i) << 52);
                 sum += b[i] * f / Math.expm1(x * f);
               }
             }
@@ -87,66 +87,76 @@ private static double solveN(double a, int... b) {
   @Test
   void testSolveMaximumLikelihoodEquation() {
     double relativeErrorLimit = 1e-14;
-    assertThat(solveMaximumLikelihoodEquation(1., new int[] {}, relativeErrorLimit)).isZero();
-    assertThat(solveMaximumLikelihoodEquation(2., new int[] {}, relativeErrorLimit)).isZero();
-    assertThat(solveMaximumLikelihoodEquation(0., new int[] {1}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(1., new int[] {}, -1, relativeErrorLimit)).isZero();
+    assertThat(solveMaximumLikelihoodEquation(2., new int[] {}, -1, relativeErrorLimit)).isZero();
+    assertThat(solveMaximumLikelihoodEquation(0., new int[] {1}, 0, relativeErrorLimit))
         .isPositive()
         .isInfinite();
-    assertThat(solveMaximumLikelihoodEquation(0., new int[] {1, 0}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(0., new int[] {1, 0}, 1, relativeErrorLimit))
         .isPositive()
         .isInfinite();
 
-    assertThat(solveMaximumLikelihoodEquation(1., new int[] {1}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(1., new int[] {1}, 0, relativeErrorLimit))
         .isCloseTo(solve1(1., 1), withPercentage(1e-6));
-    assertThat(solveMaximumLikelihoodEquation(2., new int[] {3}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(2., new int[] {3}, 0, relativeErrorLimit))
         .isCloseTo(solve1(2., 3), withPercentage(1e-6));
-    assertThat(solveMaximumLikelihoodEquation(3., new int[] {2}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(3., new int[] {2}, 0, relativeErrorLimit))
         .isCloseTo(solve1(3., 2), withPercentage(1e-6));
-    assertThat(solveMaximumLikelihoodEquation(5., new int[] {7}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(5., new int[] {7}, 0, relativeErrorLimit))
         .isCloseTo(solve1(5., 7), withPercentage(1e-6));
-    assertThat(solveMaximumLikelihoodEquation(11., new int[] {7}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(11., new int[] {7}, 0, relativeErrorLimit))
         .isCloseTo(solve1(11., 7), withPercentage(1e-6));
     assertThat(
             solveMaximumLikelihoodEquation(
-                0.03344574927673416, new int[] {238}, relativeErrorLimit))
+                0.03344574927673416, new int[] {238}, 0, relativeErrorLimit))
         .isCloseTo(solve1(0.03344574927673416, 238), withPercentage(1e-6));
 
-    assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 0}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 0}, 1, relativeErrorLimit))
         .isCloseTo(solve2(3., 2, 0), withPercentage(1e-6));
-    assertThat(solveMaximumLikelihoodEquation(5., new int[] {7, 0}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(5., new int[] {7, 0}, 1, relativeErrorLimit))
         .isCloseTo(solve2(5., 7, 0), withPercentage(1e-6));
-    assertThat(solveMaximumLikelihoodEquation(11., new int[] {7, 0}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(11., new int[] {7, 0}, 1, relativeErrorLimit))
         .isCloseTo(solve2(11., 7, 0), withPercentage(1e-6));
     assertThat(
             solveMaximumLikelihoodEquation(
-                0.12274207925281233, new int[] {574, 580}, relativeErrorLimit))
+                0.12274207925281233, new int[] {574, 580}, 1, relativeErrorLimit))
         .isCloseTo(solve2(0.12274207925281233, 574, 580), withPercentage(1e-6));
 
-    assertThat(solveMaximumLikelihoodEquation(1., new int[] {2, 3}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(1., new int[] {2, 3}, 1, relativeErrorLimit))
         .isCloseTo(solve2(1., 2, 3), withPercentage(1e-6));
-    assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 1}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 1}, 1, relativeErrorLimit))
         .isCloseTo(solve2(3., 2, 1), withPercentage(1e-6));
 
-    assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 1, 4, 5}, relativeErrorLimit))
+    assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 1, 4, 5}, 3, relativeErrorLimit))
         .isCloseTo(solveN(3., 2, 1, 4, 5), withPercentage(1e-6));
-    assertThat(solveMaximumLikelihoodEquation(3., new int[] {6, 7, 2, 1, 4, 5}, relativeErrorLimit))
+    assertThat(
+            solveMaximumLikelihoodEquation(3., new int[] {6, 7, 2, 1, 4, 5}, 5, relativeErrorLimit))
         .isCloseTo(solveN(3., 6, 7, 2, 1, 4, 5), withPercentage(1e-6));
     assertThat(
             solveMaximumLikelihoodEquation(
-                7., new int[] {0, 0, 6, 7, 2, 1, 4, 5, 0, 0, 0, 0}, relativeErrorLimit))
+                7., new int[] {0, 0, 6, 7, 2, 1, 4, 5, 0, 0, 0, 0}, 11, relativeErrorLimit))
         .isCloseTo(solveN(7., 0, 0, 6, 7, 2, 1, 4, 5, 0, 0, 0, 0), withPercentage(1e-6));
     assertThat(
             solveMaximumLikelihoodEquation(
-                7., new int[] {0, 0, 6, 7, 0, 0, 4, 5, 0, 0, 0, 0}, relativeErrorLimit))
+                7., new int[] {0, 0, 6, 7, 0, 0, 4, 5, 0, 0, 0, 0}, 11, relativeErrorLimit))
         .isCloseTo(solveN(7., 0, 0, 6, 7, 0, 0, 4, 5, 0, 0, 0, 0), withPercentage(1e-6));
-
+    assertThat(solveMaximumLikelihoodEquation(0x1p-64, new int[] {0}, -1, relativeErrorLimit))
+        .isCloseTo(solve1(0x1p-64, 0), withPercentage(1e-6));
+    assertThat(solveMaximumLikelihoodEquation(0x1p-64, new int[] {1}, 0, relativeErrorLimit))
+        .isCloseTo(solve1(0x1p-64, 1), withPercentage(1e-6));
+    {
+      int[] b = new int[65];
+      b[64] = 1;
+      assertThat(solveMaximumLikelihoodEquation(1., b, 64, relativeErrorLimit))
+          .isCloseTo(solveN(1., b), withPercentage(1e-6));
+    }
     // many more cases to have full code coverage
     SplittableRandom random = new SplittableRandom(0x93b723ca5f234685L);
 
     for (int i = 0; i < 10000; ++i) {
       double a = 1. - random.nextDouble();
       int b0 = random.nextInt(1000);
-      assertThat(solveMaximumLikelihoodEquation(a, new int[] {b0}, relativeErrorLimit))
+      assertThat(solveMaximumLikelihoodEquation(a, new int[] {b0}, 0, relativeErrorLimit))
           .isCloseTo(solve1(a, b0), withPercentage(1e-6));
     }
   }