Skip to content

Commit

Permalink
Merge #531
Browse files Browse the repository at this point in the history
531: Fix errors in hyperloglog implementation r=WireBaron a=WireBaron

This change fixes bugs where the bias corrected estimated was first being calculated incorrectly, and then secondly not used.  Tests have been updated to reflect the new implementation, and a new test has been added to randomly verify bias corrected results have an error less than 5%.

Fixes #469 
Fixes #508 

Co-authored-by: Brian Rowe <brian@timescale.com>
  • Loading branch information
bors[bot] and Brian Rowe authored Sep 12, 2022
2 parents b743334 + 3e83279 commit 3515244
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 14 deletions.
10 changes: 5 additions & 5 deletions crates/hyperloglogplusplus/src/dense.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ impl<'s> Storage<'s> {
if h <= self.threshold() {
h as u64
} else {
e as u64
e_p as u64
}
}

Expand Down Expand Up @@ -155,7 +155,7 @@ impl<'s> Storage<'s> {

let mut value = 0.0;
for i in 0..6 {
value += distances[i] * bias_data[i];
value += distances[i] * bias_data[neighbors[i]];
}

return value;
Expand Down Expand Up @@ -295,7 +295,7 @@ mod tests {
hll.add_hash(hash(i));
}
// FIXME examine in more detail
assert_eq!(hll.estimate_count(), 469);
assert_eq!(hll.estimate_count(), 430);
}

#[test]
Expand Down Expand Up @@ -331,7 +331,7 @@ mod tests {
for i in 0..10000 {
hll.add_hash(hash(i));
}
assert_eq!(hll.estimate_count(), 11513);
assert_eq!(hll.estimate_count(), 11347);
}

#[test]
Expand All @@ -349,7 +349,7 @@ mod tests {
for i in 0..100000 {
hll.add_hash(hash(i));
}
assert_eq!(hll.estimate_count(), 126_448);
assert_eq!(hll.estimate_count(), 117304);
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion crates/hyperloglogplusplus/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ mod tests {
for i in 0..100_000 {
hll.add(&i);
}
assert_eq!(hll.estimate_count(), 126_448);
assert_eq!(hll.estimate_count(), 117_304);
assert!(!hll.is_sparse());
assert_eq!(hll.num_bytes(), 49_153);
assert!(hll.num_bytes() <= (1 << 16) * 6 / 8 + 1)
Expand Down
4 changes: 2 additions & 2 deletions docs/hyperloglog.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ FROM (
```output
count
-------
152
153
```

---
Expand Down Expand Up @@ -128,7 +128,7 @@ FROM generate_series(1, 100) data
```output
distinct_count
----------------
114
104
```

## **stderror** <a id="hyperloglog_stderror"></a>
Expand Down
8 changes: 4 additions & 4 deletions docs/test_caggs.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ ORDER BY week;
```output
week | distinct_count | rate | skewness_x
------------------------+----------------+-------------------+-------------------
2020-06-08 00:00:00+00 | 73 | 0.000627079174983 | 0.0970167274813
2020-06-15 00:00:00+00 | 70 | 0.00065369261477 | -0.0885157388226
2020-06-22 00:00:00+00 | 68 | 0.000680306054558 | 0.0864685035294
2020-06-08 00:00:00+00 | 49 | 0.000627079174983 | 0.0970167274813
2020-06-15 00:00:00+00 | 45 | 0.00065369261477 | -0.0885157388226
2020-06-22 00:00:00+00 | 42 | 0.000680306054558 | 0.0864685035294
2020-06-29 00:00:00+00 | 36 | 0.000706919494345 | -0.0257336371983
2020-07-06 00:00:00+00 | 31 | 0.000971390552229 | 0.169001960922
2020-07-13 00:00:00+00 | 28 | 0.0011626746507 | 0.0432068720231
Expand All @@ -67,7 +67,7 @@ FROM weekly_aggs;
```output
distinct_count | stderror
----------------+----------
123 | 0.13
115 | 0.13
```

```SQL
Expand Down
35 changes: 34 additions & 1 deletion extension/src/hyperloglog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,7 @@ mod tests {
use super::*;

use pgx_macros::pg_test;
use rand::distributions::{Distribution, Uniform};

#[pg_test]
fn test_hll_aggregate() {
Expand Down Expand Up @@ -815,7 +816,7 @@ mod tests {
) q";
let count = client.select(query, None, None).first().get_one::<i64>();

assert_eq!(count, Some(152));
assert_eq!(count, Some(153));
}
});
}
Expand Down Expand Up @@ -890,6 +891,38 @@ mod tests {
});
}

#[pg_test]
fn bias_correct_values_accurate() {
const NUM_BIAS_TRIALS: usize = 5;
const MAX_TRIAL_ERROR: f64 = 0.05;
Spi::execute(|client| {
// This should match THRESHOLD_DATA_VEC from b=12-18
let thresholds = vec![3100, 6500, 11500, 20000, 50000, 120000, 350000];
let rand_precision: Uniform<usize> = Uniform::new_inclusive(12, 18);
let mut rng = rand::thread_rng();
for _ in 0..NUM_BIAS_TRIALS {
let precision = rand_precision.sample(&mut rng);
let rand_cardinality: Uniform<usize> =
Uniform::new_inclusive(thresholds[precision - 12], 5 * (1 << precision));
let cardinality = rand_cardinality.sample(&mut rng);
let query = format!(
"SELECT hyperloglog({}, v) -> distinct_count() FROM generate_series(1, {}) v",
1 << precision,
cardinality
);

let estimate = client
.select(&query, None, None)
.first()
.get_one::<i64>()
.unwrap();

let error = (estimate as f64 / cardinality as f64).abs() - 1.;
assert!(error < MAX_TRIAL_ERROR, "hyperloglog with {} buckets on cardinality {} gave a result of {}. Resulting error {} exceeds max allowed ({})", 2^precision, cardinality, estimate, error, MAX_TRIAL_ERROR);
}
});
}

// FIXME these tests don't run on CI
// #[pg_test(error = "Invalid value for size 262145. Size must be between 16 and 262144, though less than 1024 not recommended")]
// fn test_hll_error_too_large() {
Expand Down
2 changes: 1 addition & 1 deletion tools/update-tester/src/testrunner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ impl TestClient {
CREATE MATERIALIZED VIEW regression_view AS \
SELECT \
counter_agg(ts, val) AS countagg, \
hyperloglog(32, val) AS hll, \
hyperloglog(1024, val) AS hll, \
time_weight('locf', ts, val) AS twa, \
uddsketch(100, 0.001, val) as udd, \
tdigest(100, val) as tdig, \
Expand Down

0 comments on commit 3515244

Please sign in to comment.