Skip to content

Commit

Permalink
improved calculation of register change probability for UltraLogLog, …
Browse files Browse the repository at this point in the history
…use of fixed size arrays in ML estimators to allow for compiler optimizations
  • Loading branch information
oertl committed Dec 27, 2023
1 parent 2bb98c1 commit 732909e
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 99 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,26 @@ static void checkPrecisionParameter(int p, int minP, int maxP) {
/**
* Maximizes the expression
*
* <p>{@code e^{-x*a} * (1 - e^{-x})^b[0] * (1 - e^{-x/2})^b[1] * (1 - e^{-x/2^2})^b[2] * ...}
* <p>{@code e^{-x*a} * (1 - e^{-x})^b[0] * (1 - e^{-x/2})^b[1] * (1 - e^{-x/2^2})^b[2] * ... * (1
* - e^{-x/2^n})^b[n]}
*
* <p>where {@code a} and all elements of {@code b} must be nonnegative. If this is not the case,
* or if neither {@code a} nor any element in {@code b} is positive, or if {@code b.length >= 64}
* the behavior of this function is not defined. {@code a} must be either zero or greater than or
* equal to 2^{-63}.
* or if neither {@code a} nor any element in {@code b} is positive, or if {@code n >= b.length}
* or {@code n >= 64} the behavior of this function is not defined. {@code a} must be either zero
* or greater than or equal to 2^{-64}.
*
* <p>This algorithm is based on Algorithm 8 described in Ertl, Otmar. "New cardinality estimation
* algorithms for HyperLogLog sketches." arXiv preprint arXiv:1702.01284 (2017).
*
* @param a parameter a
* @param b parameter b
* @param n parameter n
* @param relativeErrorLimit the relative error limit
* @return the value that maximizes the expression
* @throws RuntimeException if n >= b.length
*/
static double solveMaximumLikelihoodEquation(double a, int[] b, double relativeErrorLimit) {
static double solveMaximumLikelihoodEquation(
double a, int[] b, int n, double relativeErrorLimit) {
// Maximizing the expression
//
// e^{-x*a} * (1 - e^{-x})^b[0] * (1 - e^{-x/2})^b[1] * (1 - e^{-x/2^2})^b[2] * ...
Expand Down Expand Up @@ -119,8 +123,7 @@ static double solveMaximumLikelihoodEquation(double a, int[] b, double relativeE

if (a == 0.) return Double.POSITIVE_INFINITY;

int kMax;
kMax = b.length - 1;
int kMax = n;
while (kMax >= 0 && b[kMax] == 0) {
--kMax;
}
Expand Down Expand Up @@ -296,12 +299,16 @@ static double estimateDistinctCountFromTokens(TokenIterable tokenIterable) {
}

double a = 0x1p27;
int maxNonZeroIndex = 0;
for (int i = 0; i < MAX_NLZ_IN_TOKEN; ++i) {
if (b[i] != 0) {
a -= b[i] * Double.longBitsToDouble((0x3FFL - i) << 52);
maxNonZeroIndex = i;
}
}
return DistinctCountUtil.solveMaximumLikelihoodEquation(a, b, RELATIVE_ERROR_LIMIT) * 0x1p27;
return DistinctCountUtil.solveMaximumLikelihoodEquation(
a, b, maxNonZeroIndex, RELATIVE_ERROR_LIMIT)
* 0x1p27;
}

static double unsignedLongToDouble(long l) {
Expand Down
77 changes: 33 additions & 44 deletions src/main/java/com/dynatrace/hash4j/distinctcount/HyperLogLog.java
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ public HyperLogLog addToken(int token, StateChangeObserver stateChangeObserver)

// returns register change probability scaled by 2^64
private long getScaledRegisterChangeProbability(int registerValue) {
return 0x4000000000000000L >>> (p + registerValue - 2);
return 0x4000000000000000L >>> (p - 2 + registerValue);
}

/**
Expand Down Expand Up @@ -431,25 +431,17 @@ public double getDistinctCountEstimate(Estimator estimator) {
@Override
public double getStateChangeProbability() {
long sum = 0;
for (int off = 0; off + 5 < state.length; off += 6) {
for (int off = 0; off + 6 <= state.length; off += 6) {
int s0 = getInt(state, off);
int s1 = getInt(state, off + 2);
int r0 = s0 & 0x3F;
int r1 = (s0 >>> 6) & 0x3F;
int r2 = (s0 >>> 12) & 0x3F;
int r3 = (s0 >>> 18) & 0x3F;
int r4 = (s1 >>> 8) & 0x3F;
int r5 = (s1 >>> 14) & 0x3F;
int r6 = (s1 >>> 20) & 0x3F;
int r7 = (s1 >>> 26) & 0x3F;
sum += getScaledRegisterChangeProbability(r0);
sum += getScaledRegisterChangeProbability(r1);
sum += getScaledRegisterChangeProbability(r2);
sum += getScaledRegisterChangeProbability(r3);
sum += getScaledRegisterChangeProbability(r4);
sum += getScaledRegisterChangeProbability(r5);
sum += getScaledRegisterChangeProbability(r6);
sum += getScaledRegisterChangeProbability(r7);
sum += getScaledRegisterChangeProbability(s0);
sum += getScaledRegisterChangeProbability(s0 >>> 6);
sum += getScaledRegisterChangeProbability(s0 >>> 12);
sum += getScaledRegisterChangeProbability(s0 >>> 18);
sum += getScaledRegisterChangeProbability(s1 >>> 8);
sum += getScaledRegisterChangeProbability(s1 >>> 14);
sum += getScaledRegisterChangeProbability(s1 >>> 20);
sum += getScaledRegisterChangeProbability(s1 >>> 26);
}
if (sum == 0 && state[0] == 0) {
// sum can only be zero if either all registers are 0 or all registers are saturated
Expand Down Expand Up @@ -540,12 +532,13 @@ static double tau(double x) {
@Override
public double estimate(HyperLogLog hyperLogLog) {
byte[] state = hyperLogLog.state;
int p = hyperLogLog.p;
int c0 = 0;
int cMax = 0;
long agg = 0;
int maxR = 65 - hyperLogLog.p;
long inc = 1L << -hyperLogLog.p;
for (int off = 0; off + 5 < state.length; off += 6) {
int maxR = 65 - p;
long inc = 1L << -p;
for (int off = 0; off + 6 <= state.length; off += 6) {
int s0 = getInt(state, off);
int s1 = getInt(state, off + 2);
int r0 = s0 & 0x3F;
Expand Down Expand Up @@ -583,23 +576,20 @@ public double estimate(HyperLogLog hyperLogLog) {
}
double sum = 0;

double m = 1 << p;
if (cMax > 0) {
double m = 1 << hyperLogLog.p;
sum +=
Double.longBitsToDouble(0x3FF0000000000000L - ((32L - hyperLogLog.p) << 53))
* tau(1. - cMax / m);
Double.longBitsToDouble(0x3FF0000000000000L - ((32L - p) << 53)) * tau(1. - cMax / m);
}

// if c0 == m agg could have overflown and would be zero, but sum would become infinite anyway
// due to the sigma function below, so this does not matter
sum += unsignedLongToDouble(agg) / inc;
// if c0 == m, agg could have overflown and would be zero, but sum would become infinite
// anyway due to the sigma function below, so this does not matter
sum += unsignedLongToDouble(agg) * m * 0x1p-64;

if (c0 > 0) {
double m = 1 << hyperLogLog.p;
sum += m * sigma(c0 / m);
}

return ESTIMATION_FACTORS[hyperLogLog.p - MIN_P] / sum;
return ESTIMATION_FACTORS[p - MIN_P] / sum;
}
}

Expand Down Expand Up @@ -629,10 +619,10 @@ public double estimate(HyperLogLog hyperLogLog) {
byte[] state = hyperLogLog.state;
int p = hyperLogLog.p;
long agg = 0;
int[] c = new int[66 - p];
int[] c = new int[64];
long inc = 1L << -p;

for (int off = 0; off + 5 < state.length; off += 6) {
for (int off = 0; off + 6 <= state.length; off += 6) {
int s0 = getInt(state, off);
int s1 = getInt(state, off + 2);
int r0 = s0 & 0x3F;
Expand All @@ -651,25 +641,24 @@ public double estimate(HyperLogLog hyperLogLog) {
agg += inc >>> r5;
agg += inc >>> r6;
agg += inc >>> r7;
if (r0 < c.length) c[r0] += 1;
if (r1 < c.length) c[r1] += 1;
if (r2 < c.length) c[r2] += 1;
if (r3 < c.length) c[r3] += 1;
if (r4 < c.length) c[r4] += 1;
if (r5 < c.length) c[r5] += 1;
if (r6 < c.length) c[r6] += 1;
if (r7 < c.length) c[r7] += 1;
c[r0] += 1;
c[r1] += 1;
c[r2] += 1;
c[r3] += 1;
c[r4] += 1;
c[r5] += 1;
c[r6] += 1;
c[r7] += 1;
}
int m = 1 << p;

if (c[0] == m) return 0.;
c[0] = 0;
c[c.length - 2] += c[c.length - 1];
c[c.length - 1] = 0;
double a = unsignedLongToDouble(agg) / inc;
c[64 - p] += c[65 - p];
double a = unsignedLongToDouble(agg) * m * 0x1p-64;
return m
* DistinctCountUtil.solveMaximumLikelihoodEquation(
a, c, ML_EQUATION_SOLVER_EPS / Math.sqrt(m))
a, c, 64 - p, ML_EQUATION_SOLVER_EPS / Math.sqrt(m))
/ (1. + ML_BIAS_CORRECTION_CONSTANT / m);
}
}
Expand Down
29 changes: 5 additions & 24 deletions src/main/java/com/dynatrace/hash4j/distinctcount/UltraLogLog.java
Original file line number Diff line number Diff line change
Expand Up @@ -351,26 +351,9 @@ public double getDistinctCountEstimate(Estimator estimator) {
// visible for testing
// returns register change probability scaled by 2^64
static long getScaledRegisterChangeProbability(byte reg, int p) {
int r = reg & 0xFF;
int r2 = r - (p << 2) - 4;
if (r2 < 0) {
long ret = 4L;
if (r2 == -2 || r2 == -8) {
ret -= 2;
}
if (r2 == -2 || r2 == -4) {
ret -= 1;
}
return ret << (62 - p);
} else {
int k = r2 >>> 2;
long ret = 0xE000000000000000L;
int y0 = r & 1;
int y1 = (r >>> 1) & 1;
ret -= (long) y0 << 63;
ret -= (long) y1 << 62;
return ret >>> (k + p);
}
if (reg == 0) return 1L << -p;
long k = (reg >>> 2) - p + 1;
return ((((reg & 2) | ((reg & 1) << 2)) ^ 7L) << ~k) >>> p;
}

/**
Expand Down Expand Up @@ -464,8 +447,7 @@ public double estimate(UltraLogLog ultraLogLog) {
int p = ultraLogLog.getP();

long sum = 0;
int[] b = new int[65 - p];

int[] b = new int[64];
for (byte r : state) {
sum += contribute(r & 0xff, b, p);
}
Expand All @@ -477,13 +459,12 @@ public double estimate(UltraLogLog ultraLogLog) {
return (state[0] == 0) ? 0 : Double.POSITIVE_INFINITY;
}
b[63 - p] += b[64 - p];
b[64 - p] = 0;
double factor = m << 1;
double a = unsignedLongToDouble(sum) * factor * 0x1p-64;

return factor
* DistinctCountUtil.solveMaximumLikelihoodEquation(
a, b, ML_EQUATION_SOLVER_EPS / Math.sqrt(m))
a, b, 63 - p, ML_EQUATION_SOLVER_EPS / Math.sqrt(m))
/ (1. + ML_BIAS_CORRECTION_CONSTANT / m);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ private static double solveN(double a, int... b) {
if (b != null) {
for (int i = 0; i < b.length; ++i) {
if (b[i] > 0) {
double f = 1. / (1L << i);
double f = Double.longBitsToDouble((0x3ffL - i) << 52);
sum += b[i] * f / Math.expm1(x * f);
}
}
Expand All @@ -87,66 +87,76 @@ private static double solveN(double a, int... b) {
@Test
void testSolveMaximumLikelihoodEquation() {
double relativeErrorLimit = 1e-14;
assertThat(solveMaximumLikelihoodEquation(1., new int[] {}, relativeErrorLimit)).isZero();
assertThat(solveMaximumLikelihoodEquation(2., new int[] {}, relativeErrorLimit)).isZero();
assertThat(solveMaximumLikelihoodEquation(0., new int[] {1}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(1., new int[] {}, -1, relativeErrorLimit)).isZero();
assertThat(solveMaximumLikelihoodEquation(2., new int[] {}, -1, relativeErrorLimit)).isZero();
assertThat(solveMaximumLikelihoodEquation(0., new int[] {1}, 0, relativeErrorLimit))
.isPositive()
.isInfinite();
assertThat(solveMaximumLikelihoodEquation(0., new int[] {1, 0}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(0., new int[] {1, 0}, 1, relativeErrorLimit))
.isPositive()
.isInfinite();

assertThat(solveMaximumLikelihoodEquation(1., new int[] {1}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(1., new int[] {1}, 0, relativeErrorLimit))
.isCloseTo(solve1(1., 1), withPercentage(1e-6));
assertThat(solveMaximumLikelihoodEquation(2., new int[] {3}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(2., new int[] {3}, 0, relativeErrorLimit))
.isCloseTo(solve1(2., 3), withPercentage(1e-6));
assertThat(solveMaximumLikelihoodEquation(3., new int[] {2}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(3., new int[] {2}, 0, relativeErrorLimit))
.isCloseTo(solve1(3., 2), withPercentage(1e-6));
assertThat(solveMaximumLikelihoodEquation(5., new int[] {7}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(5., new int[] {7}, 0, relativeErrorLimit))
.isCloseTo(solve1(5., 7), withPercentage(1e-6));
assertThat(solveMaximumLikelihoodEquation(11., new int[] {7}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(11., new int[] {7}, 0, relativeErrorLimit))
.isCloseTo(solve1(11., 7), withPercentage(1e-6));
assertThat(
solveMaximumLikelihoodEquation(
0.03344574927673416, new int[] {238}, relativeErrorLimit))
0.03344574927673416, new int[] {238}, 0, relativeErrorLimit))
.isCloseTo(solve1(0.03344574927673416, 238), withPercentage(1e-6));

assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 0}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 0}, 1, relativeErrorLimit))
.isCloseTo(solve2(3., 2, 0), withPercentage(1e-6));
assertThat(solveMaximumLikelihoodEquation(5., new int[] {7, 0}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(5., new int[] {7, 0}, 1, relativeErrorLimit))
.isCloseTo(solve2(5., 7, 0), withPercentage(1e-6));
assertThat(solveMaximumLikelihoodEquation(11., new int[] {7, 0}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(11., new int[] {7, 0}, 1, relativeErrorLimit))
.isCloseTo(solve2(11., 7, 0), withPercentage(1e-6));
assertThat(
solveMaximumLikelihoodEquation(
0.12274207925281233, new int[] {574, 580}, relativeErrorLimit))
0.12274207925281233, new int[] {574, 580}, 1, relativeErrorLimit))
.isCloseTo(solve2(0.12274207925281233, 574, 580), withPercentage(1e-6));

assertThat(solveMaximumLikelihoodEquation(1., new int[] {2, 3}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(1., new int[] {2, 3}, 1, relativeErrorLimit))
.isCloseTo(solve2(1., 2, 3), withPercentage(1e-6));
assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 1}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 1}, 1, relativeErrorLimit))
.isCloseTo(solve2(3., 2, 1), withPercentage(1e-6));

assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 1, 4, 5}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(3., new int[] {2, 1, 4, 5}, 3, relativeErrorLimit))
.isCloseTo(solveN(3., 2, 1, 4, 5), withPercentage(1e-6));
assertThat(solveMaximumLikelihoodEquation(3., new int[] {6, 7, 2, 1, 4, 5}, relativeErrorLimit))
assertThat(
solveMaximumLikelihoodEquation(3., new int[] {6, 7, 2, 1, 4, 5}, 5, relativeErrorLimit))
.isCloseTo(solveN(3., 6, 7, 2, 1, 4, 5), withPercentage(1e-6));
assertThat(
solveMaximumLikelihoodEquation(
7., new int[] {0, 0, 6, 7, 2, 1, 4, 5, 0, 0, 0, 0}, relativeErrorLimit))
7., new int[] {0, 0, 6, 7, 2, 1, 4, 5, 0, 0, 0, 0}, 11, relativeErrorLimit))
.isCloseTo(solveN(7., 0, 0, 6, 7, 2, 1, 4, 5, 0, 0, 0, 0), withPercentage(1e-6));
assertThat(
solveMaximumLikelihoodEquation(
7., new int[] {0, 0, 6, 7, 0, 0, 4, 5, 0, 0, 0, 0}, relativeErrorLimit))
7., new int[] {0, 0, 6, 7, 0, 0, 4, 5, 0, 0, 0, 0}, 11, relativeErrorLimit))
.isCloseTo(solveN(7., 0, 0, 6, 7, 0, 0, 4, 5, 0, 0, 0, 0), withPercentage(1e-6));

assertThat(solveMaximumLikelihoodEquation(0x1p-64, new int[] {0}, -1, relativeErrorLimit))
.isCloseTo(solve1(0x1p-64, 0), withPercentage(1e-6));
assertThat(solveMaximumLikelihoodEquation(0x1p-64, new int[] {1}, 0, relativeErrorLimit))
.isCloseTo(solve1(0x1p-64, 1), withPercentage(1e-6));
{
int[] b = new int[65];
b[64] = 1;
assertThat(solveMaximumLikelihoodEquation(1., b, 64, relativeErrorLimit))
.isCloseTo(solveN(1., b), withPercentage(1e-6));
}
// many more cases to have full code coverage
SplittableRandom random = new SplittableRandom(0x93b723ca5f234685L);

for (int i = 0; i < 10000; ++i) {
double a = 1. - random.nextDouble();
int b0 = random.nextInt(1000);
assertThat(solveMaximumLikelihoodEquation(a, new int[] {b0}, relativeErrorLimit))
assertThat(solveMaximumLikelihoodEquation(a, new int[] {b0}, 0, relativeErrorLimit))
.isCloseTo(solve1(a, b0), withPercentage(1e-6));
}
}
Expand Down

0 comments on commit 732909e

Please sign in to comment.