[PROF-9044] Handle timing hiccups #3440

Merged · 31 commits · Mar 5, 2024

Commits
6c82a38
[PROF-9036] Include profiling worker and GC stats in profiles
AlexJF Feb 1, 2024
3dd16ab
Missing test fixes
AlexJF Feb 1, 2024
0699954
Fix typo
AlexJF Feb 1, 2024
ad85fdb
Make sampler snapshot work on all rubies
AlexJF Feb 1, 2024
b806172
Fix events/ns->events/s calc
AlexJF Feb 2, 2024
a5392a6
Use non-fraction percentage in sampler snapshot
AlexJF Feb 2, 2024
59b5fad
[PROF-9045] Test big spikes within adjustment window
AlexJF Feb 2, 2024
ed53cd1
[PROF-9045] Allow adjustment preemption based on number of samples
AlexJF Feb 2, 2024
7e759e7
[PROF-9045] Test preemption does not lead to early forgetting
AlexJF Feb 5, 2024
7f4df67
Use more widely available gc stat key.
AlexJF Feb 5, 2024
dd820cf
[PROF-9045] Weight moving averages by size of adjustment window
AlexJF Feb 5, 2024
bbcd788
Fix failures on older rubies
AlexJF Feb 5, 2024
f5ee30e
Fix failures on jrubies
AlexJF Feb 5, 2024
dc06f81
Match only on count
AlexJF Feb 5, 2024
3f76438
Merge branch 'master' into alexjf/prof-9036-stats-as-internal-data
AlexJF Feb 5, 2024
4988fb8
Merge branch 'alexjf/prof-9036-stats-as-internal-data' into alexjf/pr…
AlexJF Feb 5, 2024
6234ae2
[PROF-9035] Stop worker on allocation sampler failure
AlexJF Feb 6, 2024
973dc67
[PROF-9044] Test for timing hiccups
AlexJF Feb 6, 2024
a07ac12
[PROF-9044] Introduce maximum allowed sampling time
AlexJF Feb 6, 2024
500a3a9
Merge branch 'master' into alexjf/prof-9035-handle-clock-errors-alloc…
AlexJF Feb 12, 2024
f912b5b
Merge branch 'master' into alexjf/prof-9035-handle-clock-errors-alloc…
AlexJF Feb 16, 2024
640b1a2
Address comments
AlexJF Feb 16, 2024
4a03871
Merge branch 'master' into alexjf/prof-9035-handle-clock-errors-alloc…
AlexJF Feb 19, 2024
48efd30
Merge branch 'alexjf/prof-9035-handle-clock-errors-allocation-sampler…
AlexJF Feb 19, 2024
43fde70
Update spec/datadog/profiling/collectors/discrete_dynamic_sampler_spe…
AlexJF Feb 19, 2024
9539408
Merge branch 'alexjf/prof-9044-handle-timing-hiccups' of github.com:D…
AlexJF Feb 19, 2024
09a1f37
Address comments
AlexJF Feb 19, 2024
d09435b
Update spec/datadog/profiling/collectors/discrete_dynamic_sampler_spe…
AlexJF Mar 4, 2024
401065d
Merge branch 'master' into alexjf/prof-9044-handle-timing-hiccups
AlexJF Mar 5, 2024
759a17c
[PROF-9044] Fix botched merge
AlexJF Mar 5, 2024
101cb52
[PROF-9044] More botched merge leftovers
AlexJF Mar 5, 2024
Changes from all commits
@@ -10,6 +10,13 @@

#define ADJUSTMENT_WINDOW_NS SECONDS_AS_NS(1)
#define ADJUSTMENT_WINDOW_SAMPLES 100
// Any average sampling times above this value will be clamped to this value.
// In practice, this limits the budget consumption of a single sample to that of an adjustment window,
// thus aiming for a minimum sample rate of once per adjustment window (dependent on actual event rate).
// NOTE: This is our main strategy to deal with timing hiccups such as those that can be caused by
// suspensions, system overloads and other things that could lead to arbitrarily big sampling
// time measurements.
#define MAX_ALLOWED_SAMPLING_NS(target_overhead) (long) (ADJUSTMENT_WINDOW_NS * target_overhead / 100.)

#define EMA_SMOOTHING_FACTOR 0.6

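To make the clamp arithmetic concrete, here is a minimal standalone sketch; the SECONDS_AS_NS helper definition and the 2.0 (%) target are assumptions, mirroring the values used in the specs below:

#include <stdio.h>

#define SECONDS_AS_NS(value) ((long) (value) * 1000 * 1000 * 1000)
#define ADJUSTMENT_WINDOW_NS SECONDS_AS_NS(1)
#define MAX_ALLOWED_SAMPLING_NS(target_overhead) (long) (ADJUSTMENT_WINDOW_NS * target_overhead / 100.)

int main(void) {
  // With a 2% target overhead over a 1s adjustment window, a single sample may
  // consume at most the window's full sampling budget: 20ms.
  printf("max allowed sampling time: %ld ns\n", MAX_ALLOWED_SAMPLING_NS(2.0)); // => 20000000
  return 0;
}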
@@ -31,6 +38,7 @@ void discrete_dynamic_sampler_reset(discrete_dynamic_sampler *sampler, long now_
// This fake readjustment will use a hardcoded sampling interval
.sampling_interval = BASE_SAMPLING_INTERVAL,
.sampling_probability = 1.0 / BASE_SAMPLING_INTERVAL,
.max_sampling_time_ns = MAX_ALLOWED_SAMPLING_NS(target_overhead),
// But we want to make sure we sample at least once in the next window so that our first
// real readjustment has some notion of how heavy sampling is. Therefore, we'll make it so that
// the next event is automatically sampled by artificially locating it in the interval threshold.
@@ -134,6 +142,15 @@ static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) {

// Let's update our average sampling time per event
long avg_sampling_time_in_window_ns = sampler->samples_since_last_readjustment == 0 ? 0 : sampler->sampling_time_since_last_readjustment_ns / sampler->samples_since_last_readjustment;
if (avg_sampling_time_in_window_ns > sampler->max_sampling_time_ns) {
// If the average sampling time in the previous window was deemed unacceptable, clamp it to the
// maximum acceptable value and register this operation in our counter.
// NOTE: This is important so that events like suspensions or system overloads do not lead us to
// learn arbitrarily big sampling times which may then result in us not sampling anything
// for very long periods of time.
avg_sampling_time_in_window_ns = sampler->max_sampling_time_ns;
sampler->sampling_time_clamps++;
}
sampler->sampling_time_ns = ewma_adj_window(
avg_sampling_time_in_window_ns,
sampler->sampling_time_ns,
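A minimal sketch of this clamp-then-average step in isolation; the plain EWMA below is a stand-in assumption for the ewma_adj_window helper (not shown in this diff), which additionally weights by adjustment-window size:

#include <stdio.h>

#define EMA_SMOOTHING_FACTOR 0.6

// Stand-in for ewma_adj_window: a plain exponential moving average.
static double ewma(double latest, double current) {
  return EMA_SMOOTHING_FACTOR * latest + (1 - EMA_SMOOTHING_FACTOR) * current;
}

int main(void) {
  long max_sampling_time_ns = 20000000;   // 20ms cap: 2% of a 1s window
  long avg_in_window_ns = 1000000000;     // a 1s process-suspension hiccup
  unsigned long sampling_time_clamps = 0;

  if (avg_in_window_ns > max_sampling_time_ns) {
    // Clamp before the hiccup poisons the learned average.
    avg_in_window_ns = max_sampling_time_ns;
    sampling_time_clamps++;
  }

  double learned_ns = ewma((double) avg_in_window_ns, 500000 /* prior: 0.5ms */);
  printf("learned=%.0f ns, clamps=%lu\n", learned_ns, sampling_time_clamps); // ~12200000, 1
  return 0;
}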
@@ -260,6 +277,9 @@ static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) {
fprintf(stderr, "expected allocs in 60s=%f\n", allocs_in_60s);
fprintf(stderr, "expected samples in 60s=%f\n", samples_in_60s);
fprintf(stderr, "expected sampling time in 60s=%f (previous real=%f)\n", expected_total_sampling_time_in_60s, real_total_sampling_time_in_60s);
fprintf(stderr, "expected max overhead in 60s=%f\n", target_overhead / 100.0 * 60);
fprintf(stderr, "\n");
fprintf(stderr, "sampling_time_clamps=%zu\n", sampler->sampling_time_clamps);
fprintf(stderr, "-------\n");
#endif

@@ -280,6 +300,8 @@ VALUE discrete_dynamic_sampler_state_snapshot(discrete_dynamic_sampler *sampler)
ID2SYM(rb_intern("sampling_probability")), /* => */ DBL2NUM(sampler->sampling_probability * 100),
ID2SYM(rb_intern("events_since_last_readjustment")), /* => */ ULONG2NUM(sampler->events_since_last_readjustment),
ID2SYM(rb_intern("samples_since_last_readjustment")), /* => */ ULONG2NUM(sampler->samples_since_last_readjustment),
ID2SYM(rb_intern("max_sampling_time_ns")), /* => */ LONG2NUM(sampler->max_sampling_time_ns),
ID2SYM(rb_intern("sampling_time_clamps")), /* => */ ULONG2NUM(sampler->sampling_time_clamps),
};
VALUE hash = rb_hash_new();
for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(hash, arguments[i], arguments[i+1]);
@@ -33,6 +33,8 @@ typedef struct discrete_dynamic_sampler {
// NOTE: This is an inverted view of the probability.
// NOTE: A value of 0 works as +inf, effectively disabling sampling (to align with probability=0)
unsigned long sampling_interval;
// Max allowed value for an individual sampling time measurement.
long max_sampling_time_ns;

// -- Sampling State --
// How many events have we seen since we last decided to sample.
@@ -55,6 +57,9 @@
// A negative number that we add to target_overhead to serve as extra padding to
// try and mitigate observed overshooting of max sampling time.
double target_overhead_adjustment;

// -- Interesting stats --
unsigned long sampling_time_clamps;
} discrete_dynamic_sampler;


122 changes: 106 additions & 16 deletions spec/datadog/profiling/collectors/discrete_dynamic_sampler_spec.rb
@@ -10,15 +10,25 @@
end

subject!(:sampler) do
- sampler = Datadog::Profiling::Collectors::DiscreteDynamicSampler::Testing::Sampler.new((@now * 1e9).to_i)
+ sampler = Datadog::Profiling::Collectors::DiscreteDynamicSampler::Testing::Sampler.new(to_ns(@now))
update_overhead_target(max_overhead_target, sampler)
sampler
end

def to_ns(secs)
(secs * 1e9).to_i
end

def to_sec(ns)
(ns / 1e9)
end

def maybe_sample(sampling_seconds:)
- start_ns = (@now * 1e9).to_i
- end_ns = start_ns + (sampling_seconds * 1e9).to_i
- sampler._native_after_sample(end_ns) / 1e9 if sampler._native_should_sample(start_ns)
+ return false unless sampler._native_should_sample(to_ns(@now))
+
+ sampler._native_after_sample(to_ns(@now + sampling_seconds))
+ @now += sampling_seconds
+ true
end

def simulate_load(duration_seconds:, events_per_second:, sampling_seconds:)
@@ -31,14 +41,13 @@ def simulate_load(duration_seconds:, events_per_second:, sampling_seconds:)
# We update time at the beginning on purpose to force the last event to
# occur at the end of the specified duration window. In other words, we
# consciously go with end-aligned allocations in these simulated loads
- # so that it's easier to force
+ # so that it's easier to force an adjustment to occur at the end of it.
@now += time_between_events
- sampling_time = maybe_sample(sampling_seconds: sampling_seconds)
- next if sampling_time.nil?
+ sampled = maybe_sample(sampling_seconds: sampling_seconds)
+ next unless sampled

num_samples += 1
- total_sampling_seconds += sampling_time
- @now += sampling_time
+ total_sampling_seconds += sampling_seconds
end
{
sampling_ratio: num_samples.to_f / num_events,
@@ -245,19 +254,100 @@ def sampler_current_events_per_sec
simulate_load(duration_seconds: 1, events_per_second: 4, sampling_seconds: 0.08)
expect(sampler_current_probability).to eq(0)

- # Since that initial readjustment set probability to 0, all events in the next window will be ignored but
- # ideally we should slowly relax this over time otherwise probability = 0 would be a terminal state (if we
+ # Since that initial readjustment set probability to 0, all events in the next window will be ignored
+ stats = simulate_load(duration_seconds: 1, events_per_second: 4, sampling_seconds: 0.08)
+ expect(stats[:num_samples]).to eq(0)
+
+ # Ideally we should slowly relax this over time otherwise probability = 0 would be a terminal state (if we
# never sample again, we'll be "stuck" with the same sampling overhead view that determined probability = 0
# in the first place since no new sampling data came in). Because of that, over a large enough window, we
# should get some "are things still as bad as before?" probing samples.
#
# Question is: how long do we have to wait for probing samples? Intuitively, we need to build enough budget over
# time for us to be able to take that probing hit assuming things remain the same. Each adjustment window
- # with no sampling activity earns us 0.02 seconds of budget. Therefore we need 4 of these to go by before
- # we see the next probing sample.
- stats = simulate_load(duration_seconds: 3, events_per_second: 4, sampling_seconds: 2)
- expect(stats[:num_samples]).to eq(0)
- stats = simulate_load(duration_seconds: 1, events_per_second: 4, sampling_seconds: 2)
+ # with no sampling activity earns us 0.02 seconds of budget. Therefore in theory we'd need 4 of these windows
+ # to go by before we see the next probing sample. However, an additional factor comes in -- the minimum sampling
+ # target -- and we are actually clamping the 0.08 sampling_seconds for an individual event to just 0.02 so we expect
+ # to see a sample in the next window already
+ stats = simulate_load(duration_seconds: 1, events_per_second: 4, sampling_seconds: 0.08)
expect(stats[:num_samples]).to eq(1)
end
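To spell out the budget arithmetic from the comments above, a minimal standalone C sketch; the numbers mirror the spec's hypothetical 2% overhead target over 1-second adjustment windows:

#include <stdio.h>

int main(void) {
  double window_s = 1.0;
  double target_overhead_pct = 2.0;
  double budget_per_idle_window_s = window_s * target_overhead_pct / 100.0; // 0.02s

  double measured_sample_cost_s = 0.08;                    // what the sampler observed
  double clamped_sample_cost_s = budget_per_idle_window_s; // cost after clamping: 0.02s

  // Without clamping we would need 4 idle windows to pay for the next probing sample...
  printf("idle windows before probe (unclamped): %.0f\n", measured_sample_cost_s / budget_per_idle_window_s); // 4
  // ...with clamping, a single idle window already covers it.
  printf("idle windows before probe (clamped): %.0f\n", clamped_sample_cost_s / budget_per_idle_window_s); // 1
  return 0;
}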

it "a weird hiccup shouldn't be able to disable sampling for whole minutes" do
# A normal load
simulate_load(duration_seconds: 60, events_per_second: 50, sampling_seconds: 0.0001)

# Followed by a single super weird sampling event that takes an apparent 1000 seconds to complete
# (e.g. due to a process suspension)
maybe_sample(sampling_seconds: 1000)

# Followed by another normal load
stats = simulate_load(duration_seconds: 60, events_per_second: 50, sampling_seconds: 0.001)

# We expect the weird sample not to prevent us from taking at least 1 sample per second from
# the above load
expect(stats[:num_samples]).to be >= 60
end
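For scale, a back-of-the-envelope C sketch of what the 1000-second hiccup would cost without the clamp; the numbers are hypothetical, matching the test above:

#include <stdio.h>

int main(void) {
  double hiccup_s = 1000.0;
  double budget_per_window_s = 0.02; // 2% of a 1s adjustment window

  // Naively learned, the hiccup's cost would have to be amortized before sampling resumes:
  double windows = hiccup_s / budget_per_window_s;
  printf("unclamped back-off: %.0f windows (~%.1f hours)\n", windows, windows / 3600.0); // 50000, ~13.9

  // Clamped, the hiccup is recorded as at most one window's worth of budget:
  printf("clamped cost: %.2fs, so sampling can resume in the very next window\n", budget_per_window_s);
  return 0;
}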

describe '.state_snapshot' do
let(:state_snapshot) { sampler._native_state_snapshot }

it 'fills a Ruby hash with relevant data from a sampler instance' do
# Max overhead of 2% over 1 second means a max of 0.02 seconds of sampling.
# With each sample taking 0.01 seconds, we can afford to do 2 of these every second.
# At an event rate of 8/sec we can sample 1/4 of total events.
simulate_load(duration_seconds: 120, events_per_second: 8, sampling_seconds: 0.01)

expect(state_snapshot).to match(
{
target_overhead: max_overhead_target,
target_overhead_adjustment: be_between(-max_overhead_target, 0),
events_per_sec: be_within(1).of(8),
sampling_time_ns: be_within(to_ns(0.001)).of(to_ns(0.01)),
sampling_interval: 4,
sampling_probability: be_within(2).of(25), # %
events_since_last_readjustment: be_between(0, 8),
samples_since_last_readjustment: be_between(0, 2),
max_sampling_time_ns: to_ns(1) * 0.02,
sampling_time_clamps: 0
}
)
end

context 'in a situation that triggers our minimum sampling target mechanism' do
let(:max_overhead_target) { 1.0 }

it 'captures time clamps accurately' do
# Max overhead of 1% over 1 second means a max of 0.01 seconds of sampling.
# With each sample taking 0.02 seconds, we can in theory only do one sample every 2 seconds.
# However, our minimum sampling target ensures we try at least one sample per window and we'll
# clamp its sampling time.
simulate_load(duration_seconds: 120, events_per_second: 20, sampling_seconds: 0.02)

expect(state_snapshot).to match(
{
target_overhead: max_overhead_target,
target_overhead_adjustment: be_between(-max_overhead_target, 0),
events_per_sec: be_within(1).of(20),
# This is clamped to 0.01
sampling_time_ns: be_within(to_ns(0.001)).of(to_ns(0.01)),
# In theory we should be sampling every 20 events but let's make this flexible to account
# for our target overhead adjustment logic which may increase this
sampling_interval: be >= 20,
# Same as above. In theory 5% but overhead adjustment may push this down a bit.
sampling_probability: be <= 5, # %
events_since_last_readjustment: be_between(0, 20),
samples_since_last_readjustment: be_between(0, 1),
max_sampling_time_ns: to_ns(1) * 0.01,
# In theory we should be sampling every adjustment window and each of those samples should
# have been clamped for a grand total of 120. However, target overhead adjustment logic will
# be triggered given we have a consistent 2x overshooting of maximum sampling time. This
# adjustment logic will essentially move our target overhead to be closer to 0.5% rather than
# the real 1% so we'd expect approx. 120 / 2 = 60 (clamped) samples.
sampling_time_clamps: be_between(50, 70),
}
)
end
end
end
end
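Finally, a small C sketch of the clamp-count expectation in the last test: 120 one-second windows, each in theory sampling once with a clamped cost, roughly halved by the target-overhead-adjustment behaviour; the 2x factor is an assumption taken from the comments above:

#include <stdio.h>

int main(void) {
  double duration_s = 120.0;
  double window_s = 1.0;

  // One (clamped) probing sample per adjustment window in theory:
  printf("naive clamp count: %.0f\n", duration_s / window_s); // 120

  // A consistent 2x overshoot drives target_overhead_adjustment to roughly
  // halve the effective target (1% -> ~0.5%), halving the sampling rate too:
  printf("expected clamp count: ~%.0f\n", duration_s / window_s / 2.0); // ~60
  return 0;
}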