From d56ba2a33587dd7476565ea86cffc6ac0da40055 Mon Sep 17 00:00:00 2001 From: Alexandre Fonseca Date: Wed, 31 Jan 2024 13:42:56 +0000 Subject: [PATCH] Preemptively adjust every 100 samples --- .../collectors_discrete_dynamic_sampler.c | 10 ++++++++-- .../collectors/discrete_dynamic_sampler_spec.rb | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/ext/ddtrace_profiling_native_extension/collectors_discrete_dynamic_sampler.c b/ext/ddtrace_profiling_native_extension/collectors_discrete_dynamic_sampler.c index 669d76fb29f..bf72ba5e109 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_discrete_dynamic_sampler.c +++ b/ext/ddtrace_profiling_native_extension/collectors_discrete_dynamic_sampler.c @@ -9,6 +9,7 @@ #define BASE_SAMPLING_INTERVAL 50 #define ADJUSTMENT_WINDOW_NS SECONDS_AS_NS(1) +#define ADJUSTMENT_WINDOW_SAMPLES 100 #define EMA_SMOOTHING_FACTOR 0.6 #define EXP_MOVING_AVERAGE(last, avg, first) first ? last : (1-EMA_SMOOTHING_FACTOR) * avg + EMA_SMOOTHING_FACTOR * last @@ -110,8 +111,13 @@ size_t discrete_dynamic_sampler_events_since_last_sample(discrete_dynamic_sample static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) { long window_time_ns = sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now - sampler->last_readjust_time_ns; - if (window_time_ns < ADJUSTMENT_WINDOW_NS) { - // not enough time has passed to perform a readjustment + if (window_time_ns < ADJUSTMENT_WINDOW_NS && sampler->events_since_last_readjustment < ADJUSTMENT_WINDOW_SAMPLES) { + // not enough time or samples have passed to perform a readjustment + return; + } + + if (window_time_ns == 0) { + // should not be possible given previous condition but lets protect against div by 0 below. return; } diff --git a/spec/datadog/profiling/collectors/discrete_dynamic_sampler_spec.rb b/spec/datadog/profiling/collectors/discrete_dynamic_sampler_spec.rb index c53648cf89b..209c5ddce35 100644 --- a/spec/datadog/profiling/collectors/discrete_dynamic_sampler_spec.rb +++ b/spec/datadog/profiling/collectors/discrete_dynamic_sampler_spec.rb @@ -157,6 +157,23 @@ def update_overhead_target(new_overhead_target, sampler_instance = sampler) expect(sampler._native_probability).to be > 30 # % end end + + context 'with a big spike <1s spike that fits within an adjustment window' do + it 'will readjust preemptively with smaller windows to prevent sampling overload' do + # Start with a very small constant load during a long time. So low in fact that we'll + # decide to sample everything + simulate_load(duration_seconds: 60, events_per_second: 1, sampling_seconds: 0.0001) + expect(sampler._native_probability).to eq(100) # % + + # Now lets do a big event spike over half a second. This is within an adjustment + # window so in theory we should only react after it occurred. But if this happened + # we'd sample all of these events. Instead, we expect our sampler to preempt + # adjustments with smaller windows to try and contain the deluge + stats = simulate_load(duration_seconds: 0.5, events_per_second: 5000, sampling_seconds: 0.0001) + expect(stats[:num_samples]).to be < 300 + expect(sampler._native_probability).to be < 10 # % + end + end end context 'when sampling time worsens' do