Merge #531

531: Fix errors in hyperloglog implementation r=WireBaron a=WireBaron This change fixes bugs where the bias corrected estimated was first being calculated incorrectly, and then secondly not used. Tests have been updated to reflect the new implementation, and a new test has been added to randomly verify bias corrected results have an error less than 5%. Fixes #469 Fixes #508 Co-authored-by: Brian Rowe <brian@timescale.com>
timescale · Sep 12, 2022 · 3515244 · 3515244
2 parents b743334 + 3e83279
commit 3515244
Show file tree

Hide file tree

Showing 6 changed files with 47 additions and 14 deletions.
diff --git a/crates/hyperloglogplusplus/src/dense.rs b/crates/hyperloglogplusplus/src/dense.rs
@@ -100,7 +100,7 @@ impl<'s> Storage<'s> {
         if h <= self.threshold() {
             h as u64
         } else {
-            e as u64
+            e_p as u64
         }
     }
 
@@ -155,7 +155,7 @@ impl<'s> Storage<'s> {
 
         let mut value = 0.0;
         for i in 0..6 {
-            value += distances[i] * bias_data[i];
+            value += distances[i] * bias_data[neighbors[i]];
         }
 
         return value;
@@ -295,7 +295,7 @@ mod tests {
             hll.add_hash(hash(i));
         }
         // FIXME examine in more detail
-        assert_eq!(hll.estimate_count(), 469);
+        assert_eq!(hll.estimate_count(), 430);
     }
 
     #[test]
@@ -331,7 +331,7 @@ mod tests {
         for i in 0..10000 {
             hll.add_hash(hash(i));
         }
-        assert_eq!(hll.estimate_count(), 11513);
+        assert_eq!(hll.estimate_count(), 11347);
     }
 
     #[test]
@@ -349,7 +349,7 @@ mod tests {
         for i in 0..100000 {
             hll.add_hash(hash(i));
         }
-        assert_eq!(hll.estimate_count(), 126_448);
+        assert_eq!(hll.estimate_count(), 117304);
     }
 
     #[test]

diff --git a/crates/hyperloglogplusplus/src/lib.rs b/crates/hyperloglogplusplus/src/lib.rs
@@ -312,7 +312,7 @@ mod tests {
         for i in 0..100_000 {
             hll.add(&i);
         }
-        assert_eq!(hll.estimate_count(), 126_448);
+        assert_eq!(hll.estimate_count(), 117_304);
         assert!(!hll.is_sparse());
         assert_eq!(hll.num_bytes(), 49_153);
         assert!(hll.num_bytes() <= (1 << 16) * 6 / 8 + 1)

diff --git a/docs/hyperloglog.md b/docs/hyperloglog.md
@@ -94,7 +94,7 @@ FROM (
 ```output
  count
 -------
-   152
+   153
 ```
 
 ---
@@ -128,7 +128,7 @@ FROM generate_series(1, 100) data
 ```output
  distinct_count
 ----------------
-            114
+            104
 ```
 
 ## **stderror** <a id="hyperloglog_stderror"></a>

diff --git a/docs/test_caggs.md b/docs/test_caggs.md
@@ -49,9 +49,9 @@ ORDER BY week;
 ```output
           week          | distinct_count |       rate        |    skewness_x
 ------------------------+----------------+-------------------+-------------------
- 2020-06-08 00:00:00+00 |             73 | 0.000627079174983 |  0.0970167274813
- 2020-06-15 00:00:00+00 |             70 |  0.00065369261477 | -0.0885157388226
- 2020-06-22 00:00:00+00 |             68 | 0.000680306054558 |  0.0864685035294
+ 2020-06-08 00:00:00+00 |             49 | 0.000627079174983 |  0.0970167274813
+ 2020-06-15 00:00:00+00 |             45 |  0.00065369261477 | -0.0885157388226
+ 2020-06-22 00:00:00+00 |             42 | 0.000680306054558 |  0.0864685035294
  2020-06-29 00:00:00+00 |             36 | 0.000706919494345 | -0.0257336371983
  2020-07-06 00:00:00+00 |             31 | 0.000971390552229 |   0.169001960922
  2020-07-13 00:00:00+00 |             28 |   0.0011626746507 |  0.0432068720231
@@ -67,7 +67,7 @@ FROM weekly_aggs;
 ```output
  distinct_count | stderror
 ----------------+----------
-            123 |     0.13
+            115 |     0.13
 ```
 
 ```SQL

diff --git a/extension/src/hyperloglog.rs b/extension/src/hyperloglog.rs
@@ -455,6 +455,7 @@ mod tests {
     use super::*;
 
     use pgx_macros::pg_test;
+    use rand::distributions::{Distribution, Uniform};
 
     #[pg_test]
     fn test_hll_aggregate() {
@@ -815,7 +816,7 @@ mod tests {
                     ) q";
                 let count = client.select(query, None, None).first().get_one::<i64>();
 
-                assert_eq!(count, Some(152));
+                assert_eq!(count, Some(153));
             }
         });
     }
@@ -890,6 +891,38 @@ mod tests {
         });
     }
 
+    #[pg_test]
+    fn bias_correct_values_accurate() {
+        const NUM_BIAS_TRIALS: usize = 5;
+        const MAX_TRIAL_ERROR: f64 = 0.05;
+        Spi::execute(|client| {
+            // This should match THRESHOLD_DATA_VEC from b=12-18
+            let thresholds = vec![3100, 6500, 11500, 20000, 50000, 120000, 350000];
+            let rand_precision: Uniform<usize> = Uniform::new_inclusive(12, 18);
+            let mut rng = rand::thread_rng();
+            for _ in 0..NUM_BIAS_TRIALS {
+                let precision = rand_precision.sample(&mut rng);
+                let rand_cardinality: Uniform<usize> =
+                    Uniform::new_inclusive(thresholds[precision - 12], 5 * (1 << precision));
+                let cardinality = rand_cardinality.sample(&mut rng);
+                let query = format!(
+                    "SELECT hyperloglog({}, v) -> distinct_count() FROM generate_series(1, {}) v",
+                    1 << precision,
+                    cardinality
+                );
+
+                let estimate = client
+                    .select(&query, None, None)
+                    .first()
+                    .get_one::<i64>()
+                    .unwrap();
+
+                let error = (estimate as f64 / cardinality as f64).abs() - 1.;
+                assert!(error < MAX_TRIAL_ERROR, "hyperloglog with {} buckets on cardinality {} gave a result of {}.  Resulting error {} exceeds max allowed ({})", 2^precision, cardinality, estimate, error, MAX_TRIAL_ERROR);
+            }
+        });
+    }
+
     // FIXME these tests don't run on CI
     // #[pg_test(error = "Invalid value for size 262145. Size must be between 16 and 262144, though less than 1024 not recommended")]
     // fn test_hll_error_too_large() {

diff --git a/tools/update-tester/src/testrunner.rs b/tools/update-tester/src/testrunner.rs
@@ -210,7 +210,7 @@ impl TestClient {
             CREATE MATERIALIZED VIEW regression_view AS \
                 SELECT \
                     counter_agg(ts, val) AS countagg, \
-                    hyperloglog(32, val) AS hll, \
+                    hyperloglog(1024, val) AS hll, \
                     time_weight('locf', ts, val) AS twa, \
                     uddsketch(100, 0.001, val) as udd, \
                     tdigest(100, val) as tdig, \