diff --git a/crates/hyperloglogplusplus/src/dense.rs b/crates/hyperloglogplusplus/src/dense.rs
index 689bfbdf..073d0fbc 100644
--- a/crates/hyperloglogplusplus/src/dense.rs
+++ b/crates/hyperloglogplusplus/src/dense.rs
@@ -100,7 +100,7 @@ impl<'s> Storage<'s> {
         if h <= self.threshold() {
             h as u64
         } else {
-            e as u64
+            e_p as u64
         }
     }
 
@@ -155,7 +155,7 @@ impl<'s> Storage<'s> {
 
         let mut value = 0.0;
         for i in 0..6 {
-            value += distances[i] * bias_data[i];
+            value += distances[i] * bias_data[neighbors[i]];
         }
 
         return value;
@@ -295,7 +295,7 @@ mod tests {
             hll.add_hash(hash(i));
         }
         // FIXME examine in more detail
-        assert_eq!(hll.estimate_count(), 469);
+        assert_eq!(hll.estimate_count(), 430);
     }
 
     #[test]
@@ -331,7 +331,7 @@ mod tests {
        for i in 0..10000 {
             hll.add_hash(hash(i));
         }
-        assert_eq!(hll.estimate_count(), 11513);
+        assert_eq!(hll.estimate_count(), 11347);
     }
 
     #[test]
@@ -349,7 +349,7 @@ mod tests {
         for i in 0..100000 {
             hll.add_hash(hash(i));
         }
-        assert_eq!(hll.estimate_count(), 126_448);
+        assert_eq!(hll.estimate_count(), 117304);
     }
 
     #[test]
diff --git a/crates/hyperloglogplusplus/src/lib.rs b/crates/hyperloglogplusplus/src/lib.rs
index c7961e26..a4668e65 100644
--- a/crates/hyperloglogplusplus/src/lib.rs
+++ b/crates/hyperloglogplusplus/src/lib.rs
@@ -312,7 +312,7 @@ mod tests {
         for i in 0..100_000 {
             hll.add(&i);
         }
-        assert_eq!(hll.estimate_count(), 126_448);
+        assert_eq!(hll.estimate_count(), 117_304);
         assert!(!hll.is_sparse());
         assert_eq!(hll.num_bytes(), 49_153);
         assert!(hll.num_bytes() <= (1 << 16) * 6 / 8 + 1)
diff --git a/docs/hyperloglog.md b/docs/hyperloglog.md
index 9342c4c6..c5b11c2c 100644
--- a/docs/hyperloglog.md
+++ b/docs/hyperloglog.md
@@ -94,7 +94,7 @@ FROM (
 ```output
  count 
 -------
-   152
+   153
 ```
 
 ---
@@ -128,7 +128,7 @@ FROM generate_series(1, 100) data
 ```output
  distinct_count 
 ----------------
-            114
+            104
 ```
 
 ## **stderror**
diff --git a/docs/test_caggs.md b/docs/test_caggs.md
index 7ffe25fd..81ff2ea5 100644
--- a/docs/test_caggs.md
+++ b/docs/test_caggs.md
@@ -49,9 +49,9 @@ ORDER BY week;
 ```output
           week          | distinct_count |       rate        |    skewness_x     
 ------------------------+----------------+-------------------+-------------------
- 2020-06-08 00:00:00+00 |             73 | 0.000627079174983 |   0.0970167274813
- 2020-06-15 00:00:00+00 |             70 |  0.00065369261477 |  -0.0885157388226
- 2020-06-22 00:00:00+00 |             68 | 0.000680306054558 |   0.0864685035294
+ 2020-06-08 00:00:00+00 |             49 | 0.000627079174983 |   0.0970167274813
+ 2020-06-15 00:00:00+00 |             45 |  0.00065369261477 |  -0.0885157388226
+ 2020-06-22 00:00:00+00 |             42 | 0.000680306054558 |   0.0864685035294
  2020-06-29 00:00:00+00 |             36 | 0.000706919494345 |  -0.0257336371983
  2020-07-06 00:00:00+00 |             31 | 0.000971390552229 |    0.169001960922
  2020-07-13 00:00:00+00 |             28 |   0.0011626746507 |   0.0432068720231
@@ -67,7 +67,7 @@ FROM weekly_aggs;
 ```output
  distinct_count | stderror 
 ----------------+----------
-            123 |     0.13
+            115 |     0.13
 ```
 
 ```SQL
diff --git a/extension/src/hyperloglog.rs b/extension/src/hyperloglog.rs
index f3235f4f..60065689 100644
--- a/extension/src/hyperloglog.rs
+++ b/extension/src/hyperloglog.rs
@@ -455,6 +455,7 @@ mod tests {
     use super::*;
 
     use pgx_macros::pg_test;
+    use rand::distributions::{Distribution, Uniform};
 
     #[pg_test]
     fn test_hll_aggregate() {
@@ -815,7 +816,7 @@ mod tests {
 
                 ) q";
                 let count = client.select(query, None, None).first().get_one::<i32>();
-                assert_eq!(count, Some(152));
+                assert_eq!(count, Some(153));
             }
         });
     }
@@ -890,6 +891,38 @@ mod tests {
         });
     }
 
+    #[pg_test]
+    fn bias_correct_values_accurate() {
+        const NUM_BIAS_TRIALS: usize = 5;
+        const MAX_TRIAL_ERROR: f64 = 0.05;
+        Spi::execute(|client| {
+            // This should match THRESHOLD_DATA_VEC from b=12-18
+            let thresholds = vec![3100, 6500, 11500, 20000, 50000, 120000, 350000];
+            let rand_precision: Uniform<usize> = Uniform::new_inclusive(12, 18);
+            let mut rng = rand::thread_rng();
+            for _ in 0..NUM_BIAS_TRIALS {
+                let precision = rand_precision.sample(&mut rng);
+                let rand_cardinality: Uniform<usize> =
Uniform::new_inclusive(thresholds[precision - 12], 5 * (1 << precision)); + let cardinality = rand_cardinality.sample(&mut rng); + let query = format!( + "SELECT hyperloglog({}, v) -> distinct_count() FROM generate_series(1, {}) v", + 1 << precision, + cardinality + ); + + let estimate = client + .select(&query, None, None) + .first() + .get_one::() + .unwrap(); + + let error = (estimate as f64 / cardinality as f64).abs() - 1.; + assert!(error < MAX_TRIAL_ERROR, "hyperloglog with {} buckets on cardinality {} gave a result of {}. Resulting error {} exceeds max allowed ({})", 2^precision, cardinality, estimate, error, MAX_TRIAL_ERROR); + } + }); + } + // FIXME these tests don't run on CI // #[pg_test(error = "Invalid value for size 262145. Size must be between 16 and 262144, though less than 1024 not recommended")] // fn test_hll_error_too_large() { diff --git a/tools/update-tester/src/testrunner.rs b/tools/update-tester/src/testrunner.rs index dd343c12..c51c78a6 100644 --- a/tools/update-tester/src/testrunner.rs +++ b/tools/update-tester/src/testrunner.rs @@ -210,7 +210,7 @@ impl TestClient { CREATE MATERIALIZED VIEW regression_view AS \ SELECT \ counter_agg(ts, val) AS countagg, \ - hyperloglog(32, val) AS hll, \ + hyperloglog(1024, val) AS hll, \ time_weight('locf', ts, val) AS twa, \ uddsketch(100, 0.001, val) as udd, \ tdigest(100, val) as tdig, \