[ML] Refactor DataStreamDiagnostics to use array

This commit refactors the DataStreamDiagnostics class achieving the following advantages: - simpler code; by encapsulating the moving bucket histogram into its own class - better performance; by using an array to store the buckets instead of a map - explicit handling of gap buckets; in preparation of fixing elastic#30080
dimitris-athanasiou · Apr 30, 2018 · 83c7814 · 83c7814
1 parent 05160e6
commit 83c7814
Show file tree

Hide file tree

Showing 5 changed files with 274 additions and 229 deletions.
diff --git a/...ck/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/DataCountsReporter.java b/...ck/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/DataCountsReporter.java
@@ -12,8 +12,9 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.xpack.core.ml.job.config.Job;
-import org.elasticsearch.xpack.ml.job.persistence.JobDataCountsPersister;
 import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.DataCounts;
+import org.elasticsearch.xpack.ml.job.persistence.JobDataCountsPersister;
+import org.elasticsearch.xpack.ml.job.process.diagnostics.DataStreamDiagnostics;
 
 import java.util.Date;
 import java.util.Locale;

diff --git a/...plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/DataStreamDiagnostics.java b/...plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/DataStreamDiagnostics.java
diff --git a/.../ml/src/main/java/org/elasticsearch/xpack/ml/job/process/diagnostics/BucketHistogram.java b/.../ml/src/main/java/org/elasticsearch/xpack/ml/job/process/diagnostics/BucketHistogram.java
@@ -0,0 +1,132 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.job.process.diagnostics;
+
+import org.elasticsearch.xpack.core.ml.job.config.Job;
+import org.elasticsearch.xpack.core.ml.utils.Intervals;
+
+/**
+ * A moving window of buckets that allow keeping
+ * track of some statistics like the bucket count,
+ * empty or sparse buckets, etc.
+ *
+ * The counts are stored in an array that functions as a
+ * circular buffer. When time is advanced, all buckets
+ * out of the window are flushed.
+ */
+class BucketHistogram {
+
+    private static final int MIN_BUCKETS = 10;
+
+    private final long bucketSpanMs;
+    private final long latencyMs;
+    private final int maxSize;
+    private final long[] buckets;
+    private long movingBucketCount = 0;
+    private long latestBucketStartMs = -1;
+    private int latestBucketIndex;
+    private long earliestBucketStartMs = -1;
+    private int earliestBucketIndex;
+    private long latestFlushedBucketStartMs = -1;
+    private final BucketFlushListener bucketFlushListener;
+
+    public BucketHistogram(Job job, BucketFlushListener bucketFlushListener) {
+        bucketSpanMs = job.getAnalysisConfig().getBucketSpan().millis();
+        latencyMs = job.getAnalysisConfig().getLatency() == null ? 0 : job.getAnalysisConfig().getLatency().millis();
+        maxSize = Math.max((int) (latencyMs / bucketSpanMs), MIN_BUCKETS);
+        buckets = new long[maxSize];
+        this.bucketFlushListener = bucketFlushListener;
+    }
+
+    public void addRecord(long recordTimestampMs) {
+        long bucketStartMs = Intervals.alignToFloor(recordTimestampMs, bucketSpanMs);
+
+        // Initialize earliest/latest times
+        if (latestBucketStartMs < 0) {
+            latestBucketStartMs = bucketStartMs;
+            earliestBucketStartMs = bucketStartMs;
+        }
+
+        advanceTime(bucketStartMs);
+        addToBucket(bucketStartMs);
+    }
+
+    private void advanceTime(long bucketStartMs) {
+        while (bucketStartMs > latestBucketStartMs) {
+            int flushBucketIndex = (latestBucketIndex + 1) % maxSize;
+
+            if (flushBucketIndex == earliestBucketIndex) {
+                flush(flushBucketIndex);
+                movingBucketCount -= buckets[flushBucketIndex];
+                earliestBucketStartMs += bucketSpanMs;
+                earliestBucketIndex = (earliestBucketIndex + 1) % maxSize;
+            }
+            buckets[flushBucketIndex] = 0L;
+
+            latestBucketStartMs += bucketSpanMs;
+            latestBucketIndex = flushBucketIndex;
+        }
+    }
+
+    private void addToBucket(long bucketStartMs) {
+        int offsetToLatest = (int) ((bucketStartMs - latestBucketStartMs) / bucketSpanMs);
+        int bucketIndex = (latestBucketIndex + offsetToLatest) % maxSize;
+        if (bucketIndex < 0) {
+            bucketIndex = maxSize + bucketIndex;
+        }
+
+        ++buckets[bucketIndex];
+        ++movingBucketCount;
+
+        if (bucketStartMs < earliestBucketStartMs) {
+            earliestBucketStartMs = bucketStartMs;
+            earliestBucketIndex = bucketIndex;
+        }
+    }
+
+    private void flush(int bucketIndex) {
+        long bucketStartMs = getTimestampMs(bucketIndex);
+        if (bucketStartMs > latestFlushedBucketStartMs) {
+            bucketFlushListener.onBucketFlush(bucketStartMs, buckets[bucketIndex]);
+            latestFlushedBucketStartMs = bucketStartMs;
+        }
+    }
+
+    private long getTimestampMs(int bucketIndex) {
+        int offsetToLatest = latestBucketIndex - bucketIndex;
+        if (offsetToLatest < 0) {
+            offsetToLatest = maxSize + offsetToLatest;
+        }
+        return latestBucketStartMs - offsetToLatest * bucketSpanMs;
+    }
+
+    public void flush() {
+        if (latestBucketStartMs < 0) {
+            return;
+        }
+
+        int bucketIndex = earliestBucketIndex;
+        while (bucketIndex != latestBucketIndex) {
+            flush(bucketIndex);
+            bucketIndex = (bucketIndex + 1) % maxSize;
+        }
+    }
+
+    public double averageBucketCount() {
+        return (double) movingBucketCount / size();
+    }
+
+    private int size() {
+        if (latestBucketStartMs < 0) {
+            return 0;
+        }
+        return (int) ((latestBucketStartMs - earliestBucketStartMs) / bucketSpanMs) + 1;
+    }
+
+    public interface BucketFlushListener {
+        void onBucketFlush(long bucketStartMs, long bucketCounts);
+    }
+}