Skip to content

Commit

Permalink
Preemptively adjust every 100 samples
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexJF committed Jan 31, 2024
1 parent 36316cd commit d56ba2a
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#define BASE_SAMPLING_INTERVAL 50

#define ADJUSTMENT_WINDOW_NS SECONDS_AS_NS(1)
#define ADJUSTMENT_WINDOW_SAMPLES 100

#define EMA_SMOOTHING_FACTOR 0.6
#define EXP_MOVING_AVERAGE(last, avg, first) first ? last : (1-EMA_SMOOTHING_FACTOR) * avg + EMA_SMOOTHING_FACTOR * last
Expand Down Expand Up @@ -110,8 +111,13 @@ size_t discrete_dynamic_sampler_events_since_last_sample(discrete_dynamic_sample
static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) {
long window_time_ns = sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now - sampler->last_readjust_time_ns;

if (window_time_ns < ADJUSTMENT_WINDOW_NS) {
// not enough time has passed to perform a readjustment
if (window_time_ns < ADJUSTMENT_WINDOW_NS && sampler->events_since_last_readjustment < ADJUSTMENT_WINDOW_SAMPLES) {
// not enough time or samples have passed to perform a readjustment
return;
}

if (window_time_ns == 0) {
// should not be possible given previous condition but lets protect against div by 0 below.
return;
}

Expand Down
17 changes: 17 additions & 0 deletions spec/datadog/profiling/collectors/discrete_dynamic_sampler_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,23 @@ def update_overhead_target(new_overhead_target, sampler_instance = sampler)
expect(sampler._native_probability).to be > 30 # %
end
end

context 'with a big spike <1s spike that fits within an adjustment window' do
it 'will readjust preemptively with smaller windows to prevent sampling overload' do
# Start with a very small constant load during a long time. So low in fact that we'll
# decide to sample everything
simulate_load(duration_seconds: 60, events_per_second: 1, sampling_seconds: 0.0001)
expect(sampler._native_probability).to eq(100) # %

# Now lets do a big event spike over half a second. This is within an adjustment
# window so in theory we should only react after it occurred. But if this happened
# we'd sample all of these events. Instead, we expect our sampler to preempt
# adjustments with smaller windows to try and contain the deluge
stats = simulate_load(duration_seconds: 0.5, events_per_second: 5000, sampling_seconds: 0.0001)
expect(stats[:num_samples]).to be < 300
expect(sampler._native_probability).to be < 10 # %
end
end
end

context 'when sampling time worsens' do
Expand Down

0 comments on commit d56ba2a

Please sign in to comment.