diff --git a/crates/hyperloglogplusplus/src/dense.rs b/crates/hyperloglogplusplus/src/dense.rs
index 689bfbdf..073d0fbc 100644
--- a/crates/hyperloglogplusplus/src/dense.rs
+++ b/crates/hyperloglogplusplus/src/dense.rs
@@ -100,7 +100,7 @@ impl<'s> Storage<'s> {
if h <= self.threshold() {
h as u64
} else {
- e as u64
+ e_p as u64
}
}
@@ -155,7 +155,7 @@ impl<'s> Storage<'s> {
let mut value = 0.0;
for i in 0..6 {
- value += distances[i] * bias_data[i];
+ value += distances[i] * bias_data[neighbors[i]];
}
return value;
@@ -295,7 +295,7 @@ mod tests {
hll.add_hash(hash(i));
}
// FIXME examine in more detail
- assert_eq!(hll.estimate_count(), 469);
+ assert_eq!(hll.estimate_count(), 430);
}
#[test]
@@ -331,7 +331,7 @@ mod tests {
for i in 0..10000 {
hll.add_hash(hash(i));
}
- assert_eq!(hll.estimate_count(), 11513);
+ assert_eq!(hll.estimate_count(), 11347);
}
#[test]
@@ -349,7 +349,7 @@ mod tests {
for i in 0..100000 {
hll.add_hash(hash(i));
}
- assert_eq!(hll.estimate_count(), 126_448);
+ assert_eq!(hll.estimate_count(), 117304);
}
#[test]
diff --git a/crates/hyperloglogplusplus/src/lib.rs b/crates/hyperloglogplusplus/src/lib.rs
index c7961e26..a4668e65 100644
--- a/crates/hyperloglogplusplus/src/lib.rs
+++ b/crates/hyperloglogplusplus/src/lib.rs
@@ -312,7 +312,7 @@ mod tests {
for i in 0..100_000 {
hll.add(&i);
}
- assert_eq!(hll.estimate_count(), 126_448);
+ assert_eq!(hll.estimate_count(), 117_304);
assert!(!hll.is_sparse());
assert_eq!(hll.num_bytes(), 49_153);
assert!(hll.num_bytes() <= (1 << 16) * 6 / 8 + 1)
diff --git a/docs/hyperloglog.md b/docs/hyperloglog.md
index 9342c4c6..c5b11c2c 100644
--- a/docs/hyperloglog.md
+++ b/docs/hyperloglog.md
@@ -94,7 +94,7 @@ FROM (
```output
count
-------
- 152
+ 153
```
---
@@ -128,7 +128,7 @@ FROM generate_series(1, 100) data
```output
distinct_count
----------------
- 114
+ 104
```
## **stderror**
diff --git a/docs/test_caggs.md b/docs/test_caggs.md
index 7ffe25fd..81ff2ea5 100644
--- a/docs/test_caggs.md
+++ b/docs/test_caggs.md
@@ -49,9 +49,9 @@ ORDER BY week;
```output
week | distinct_count | rate | skewness_x
------------------------+----------------+-------------------+-------------------
- 2020-06-08 00:00:00+00 | 73 | 0.000627079174983 | 0.0970167274813
- 2020-06-15 00:00:00+00 | 70 | 0.00065369261477 | -0.0885157388226
- 2020-06-22 00:00:00+00 | 68 | 0.000680306054558 | 0.0864685035294
+ 2020-06-08 00:00:00+00 | 49 | 0.000627079174983 | 0.0970167274813
+ 2020-06-15 00:00:00+00 | 45 | 0.00065369261477 | -0.0885157388226
+ 2020-06-22 00:00:00+00 | 42 | 0.000680306054558 | 0.0864685035294
2020-06-29 00:00:00+00 | 36 | 0.000706919494345 | -0.0257336371983
2020-07-06 00:00:00+00 | 31 | 0.000971390552229 | 0.169001960922
2020-07-13 00:00:00+00 | 28 | 0.0011626746507 | 0.0432068720231
@@ -67,7 +67,7 @@ FROM weekly_aggs;
```output
distinct_count | stderror
----------------+----------
- 123 | 0.13
+ 115 | 0.13
```
```SQL
diff --git a/extension/src/hyperloglog.rs b/extension/src/hyperloglog.rs
index f3235f4f..60065689 100644
--- a/extension/src/hyperloglog.rs
+++ b/extension/src/hyperloglog.rs
@@ -455,6 +455,7 @@ mod tests {
use super::*;
use pgx_macros::pg_test;
+ use rand::distributions::{Distribution, Uniform};
#[pg_test]
fn test_hll_aggregate() {
@@ -815,7 +816,7 @@ mod tests {
) q";
let count = client.select(query, None, None).first().get_one::<i64>();
- assert_eq!(count, Some(152));
+ assert_eq!(count, Some(153));
}
});
}
@@ -890,6 +891,38 @@ mod tests {
});
}
+    #[pg_test]
+    fn bias_correct_values_accurate() {
+        // Spot-check the bias-corrected estimator across the dense-storage
+        // precision range (b = 12..=18): relative error must stay under 5%.
+        const NUM_BIAS_TRIALS: usize = 5;
+        const MAX_TRIAL_ERROR: f64 = 0.05;
+        Spi::execute(|client| {
+            // This should match THRESHOLD_DATA_VEC from b=12-18
+            let thresholds = vec![3100, 6500, 11500, 20000, 50000, 120000, 350000];
+            let rand_precision: Uniform<usize> = Uniform::new_inclusive(12, 18);
+            let mut rng = rand::thread_rng();
+            for _ in 0..NUM_BIAS_TRIALS {
+                let precision = rand_precision.sample(&mut rng);
+                // Sample cardinalities in the bias-corrected regime: above the
+                // sparse/dense threshold for this precision, below 5 * 2^precision.
+                let rand_cardinality: Uniform<i64> =
+                    Uniform::new_inclusive(thresholds[precision - 12], 5 * (1 << precision));
+                let cardinality = rand_cardinality.sample(&mut rng);
+                let query = format!(
+                    "SELECT hyperloglog({}, v) -> distinct_count() FROM generate_series(1, {}) v",
+                    1 << precision,
+                    cardinality
+                );
+
+                let estimate = client
+                    .select(&query, None, None)
+                    .first()
+                    .get_one::<i64>()
+                    .unwrap();
+
+                // Relative error |estimate/actual - 1|: abs is taken AFTER the
+                // subtraction so underestimates are penalized, not just overestimates.
+                let error = (estimate as f64 / cardinality as f64 - 1.).abs();
+                // NOTE: `1 << precision` (not `2^precision`, which is XOR in Rust)
+                // so the failure message reports the real bucket count.
+                assert!(error < MAX_TRIAL_ERROR, "hyperloglog with {} buckets on cardinality {} gave a result of {}. Resulting error {} exceeds max allowed ({})", 1 << precision, cardinality, estimate, error, MAX_TRIAL_ERROR);
+            }
+        });
+    }
+
// FIXME these tests don't run on CI
// #[pg_test(error = "Invalid value for size 262145. Size must be between 16 and 262144, though less than 1024 not recommended")]
// fn test_hll_error_too_large() {
diff --git a/tools/update-tester/src/testrunner.rs b/tools/update-tester/src/testrunner.rs
index dd343c12..c51c78a6 100644
--- a/tools/update-tester/src/testrunner.rs
+++ b/tools/update-tester/src/testrunner.rs
@@ -210,7 +210,7 @@ impl TestClient {
CREATE MATERIALIZED VIEW regression_view AS \
SELECT \
counter_agg(ts, val) AS countagg, \
- hyperloglog(32, val) AS hll, \
+ hyperloglog(1024, val) AS hll, \
time_weight('locf', ts, val) AS twa, \
uddsketch(100, 0.001, val) as udd, \
tdigest(100, val) as tdig, \