Switch PFCOUNT to LogLog-Beta algorithm.

The new algorithm provides the same speed with a smaller error for cardinalities in the range 0-100k. Before switching, the behavior of the new and old algorithms was studied in detail in the context of issue #3677. You can find a few graphs and the motivation there.
2025-06-17 19:21:21 +00:00 · 2016-12-16 11:05:10 +01:00
parent 0224be8811
commit 87538cb7fe
4 changed files with 15 additions and 56 deletions
--- a/src/hyperloglog.c
+++ b/src/hyperloglog.c
@@ -994,50 +994,21 @@ uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
        serverPanic("Unknown HyperLogLog encoding in hllCount()");
    }

-    if(server.hll_use_loglogbeta) {
-        /* For loglog-beta there is a single formula to compute
-         * cardinality for the enture range
-         */
+    /* Apply loglog-beta to the raw estimate. See:
+     * "LogLog-Beta and More: A New Algorithm for Cardinality Estimation
+     * Based on LogLog Counting" Jason Qin, Denys Kim, Yumei Tung
+     * arXiv:1612.02284 */
+    double zl = log(ez + 1);
+    double beta = -0.370393911*ez +
+                   0.070471823*zl +
+                   0.17393686*pow(zl,2) +
+                   0.16339839*pow(zl,3) +
+                  -0.09237745*pow(zl,4) +
+                   0.03738027*pow(zl,5) +
+                  -0.005384159*pow(zl,6) +
+                   0.00042419*pow(zl,7);

-        double zl = log(ez + 1);
-        double beta = -0.370393911*ez +
-                       0.070471823*zl +
-                       0.17393686*pow(zl,2) +
-                       0.16339839*pow(zl,3) +
-                      -0.09237745*pow(zl,4) +
-                       0.03738027*pow(zl,5) +
-                      -0.005384159*pow(zl,6) +
-                       0.00042419*pow(zl,7);
-
-        E  = llroundl(alpha*m*(m-ez)*(1/(E+beta)));
-    } else {
-        /* Muliply the inverse of E for alpha_m * m^2 to have the raw estimate. */
-        E = (1/E)*alpha*m*m;
-
-        /* Use the LINEARCOUNTING algorithm for small cardinalities.
-        * For larger values but up to 72000 HyperLogLog raw approximation is
-        * used since linear counting error starts to increase. However HyperLogLog
-        * shows a strong bias in the range 2.5*16384 - 72000, so we try to
-        * compensate for it. */
-        if (E < m*2.5 && ez != 0) {
-            E = m*log(m/ez); /* LINEARCOUNTING() */
-        } else if (m == 16384 && E < 72000) {
-            /* We did polynomial regression of the bias for this range, this
-            * way we can compute the bias for a given cardinality and correct
-            * according to it. Only apply the correction for P=14 that's what
-            * we use and the value the correction was verified with. */
-            double bias = 5.9119*1.0e-18*(E*E*E*E)
-                        -1.4253*1.0e-12*(E*E*E)+
-                        1.2940*1.0e-7*(E*E)
-                        -5.2921*1.0e-3*E+
-                        83.3216;
-            E -= E*(bias/100);
-        }
-        /* We don't apply the correction for E > 1/30 of 2^32 since we use
-        * a 64 bit function and 6 bit counters. To apply the correction for
-        * 1/30 of 2^64 is not needed since it would require a huge set
-        * to approach such a value. */
-    }
+    E  = llroundl(alpha*m*(m-ez)*(1/(E+beta)));
    return (uint64_t) E;
 }