[PROF-9044] Handle timing hiccups #3440

Merged on Mar 5, 2024
Changes from 27 commits
Commits
31 commits
6c82a38
[PROF-9036] Include profiling worker and GC stats in profiles
AlexJF Feb 1, 2024
3dd16ab
Missing test fixes
AlexJF Feb 1, 2024
0699954
Fix typo
AlexJF Feb 1, 2024
ad85fdb
Make sampler snapshot work on all rubies
AlexJF Feb 1, 2024
b806172
Fix events/ns->events/s calc
AlexJF Feb 2, 2024
a5392a6
Use non-fraction percentage in sampler snapshot
AlexJF Feb 2, 2024
59b5fad
[PROF-9045] Test big spikes within adjustment window
AlexJF Feb 2, 2024
ed53cd1
[PROF-9045] Allow adjustment preemption based on number of samples
AlexJF Feb 2, 2024
7e759e7
[PROF-9045] Test preemption does not lead to early forgetting
AlexJF Feb 5, 2024
7f4df67
Use more widely available gc stat key.
AlexJF Feb 5, 2024
dd820cf
[PROF-9045] Weight moving averages by size of adjustment window
AlexJF Feb 5, 2024
bbcd788
Fix failures on older rubies
AlexJF Feb 5, 2024
f5ee30e
Fix failures on jrubies
AlexJF Feb 5, 2024
dc06f81
Match only on count
AlexJF Feb 5, 2024
3f76438
Merge branch 'master' into alexjf/prof-9036-stats-as-internal-data
AlexJF Feb 5, 2024
4988fb8
Merge branch 'alexjf/prof-9036-stats-as-internal-data' into alexjf/pr…
AlexJF Feb 5, 2024
6234ae2
[PROF-9035] Stop worker on allocation sampler failure
AlexJF Feb 6, 2024
973dc67
[PROF-9044] Test for timing hiccups
AlexJF Feb 6, 2024
a07ac12
[PROF-9044] Introduce maximum allowed sampling time
AlexJF Feb 6, 2024
500a3a9
Merge branch 'master' into alexjf/prof-9035-handle-clock-errors-alloc…
AlexJF Feb 12, 2024
f912b5b
Merge branch 'master' into alexjf/prof-9035-handle-clock-errors-alloc…
AlexJF Feb 16, 2024
640b1a2
Address comments
AlexJF Feb 16, 2024
4a03871
Merge branch 'master' into alexjf/prof-9035-handle-clock-errors-alloc…
AlexJF Feb 19, 2024
48efd30
Merge branch 'alexjf/prof-9035-handle-clock-errors-allocation-sampler…
AlexJF Feb 19, 2024
43fde70
Update spec/datadog/profiling/collectors/discrete_dynamic_sampler_spe…
AlexJF Feb 19, 2024
9539408
Merge branch 'alexjf/prof-9044-handle-timing-hiccups' of github.com:D…
AlexJF Feb 19, 2024
09a1f37
Address comments
AlexJF Feb 19, 2024
d09435b
Update spec/datadog/profiling/collectors/discrete_dynamic_sampler_spe…
AlexJF Mar 4, 2024
401065d
Merge branch 'master' into alexjf/prof-9044-handle-timing-hiccups
AlexJF Mar 5, 2024
759a17c
[PROF-9044] Fix botched merge
AlexJF Mar 5, 2024
101cb52
[PROF-9044] More botched merge leftovers
AlexJF Mar 5, 2024
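
The worker diff shown below threads error handling from the discrete dynamic sampler into the CPU/wall-time worker: sampler calls that previously returned nothing (or a bare number) now report either a const char * error or a small result struct, and the worker routes any error through the new handle_allocation_sampler_error. As a reading aid, here is a sketch of the sampler API shape those call sites assume. Only the type names discrete_dynamic_sampler_should_sample_result and discrete_dynamic_sampler_after_sample_result, the fields error, should_sample and sampling_time_ns, and the error-returning convention are taken from the diff; the parameter names and header layout are illustrative guesses and may not match the real discrete_dynamic_sampler.h.

#include <stdbool.h>
#include <stdint.h>

// Opaque here: the real sampler struct lives in discrete_dynamic_sampler.h and is not part of this diff.
typedef struct discrete_dynamic_sampler discrete_dynamic_sampler;

typedef struct {
  const char *error;   // NULL on success; otherwise a message describing the timing/clock problem
  bool should_sample;  // only meaningful when error is NULL
} discrete_dynamic_sampler_should_sample_result;

typedef struct {
  const char *error;          // NULL on success
  uint64_t sampling_time_ns;  // time spent on the sample that just finished; only meaningful when error is NULL
} discrete_dynamic_sampler_after_sample_result;

// Setup, configuration and reset return NULL on success or an error message on failure.
const char *discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name);
const char *discrete_dynamic_sampler_reset(discrete_dynamic_sampler *sampler);
const char *discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sampler *sampler, double target_overhead_percentage);

discrete_dynamic_sampler_should_sample_result discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler);
discrete_dynamic_sampler_after_sample_result discrete_dynamic_sampler_after_sample(discrete_dynamic_sampler *sampler);

// Test-only hook exercised via _native_force_allocation_sampler_failure in the worker.
void discrete_dynamic_sampler_testing_force_fail(discrete_dynamic_sampler *sampler);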
@@ -181,6 +181,7 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
static VALUE _native_sampling_loop(VALUE self, VALUE instance);
static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE worker_thread);
static VALUE stop(VALUE self_instance, VALUE optional_exception);
static void stop_state(struct cpu_and_wall_time_worker_state *state, VALUE optional_exception);
static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
static void *run_sampling_trigger_loop(void *state_ptr);
static void interrupt_sampling_trigger_loop(void *state_ptr);
@@ -213,6 +214,8 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state);
static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self);
static VALUE rescued_sample_allocation(VALUE tracepoint_data);
static void handle_allocation_sampler_error(struct cpu_and_wall_time_worker_state *state, const char *error, bool can_raise);
static VALUE _native_force_allocation_sampler_failure(DDTRACE_UNUSED VALUE self);

// Note on sampler global state safety:
//
@@ -277,6 +280,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
rb_define_singleton_method(testing_module, "_native_simulate_sample_from_postponed_job", _native_simulate_sample_from_postponed_job, 0);
rb_define_singleton_method(testing_module, "_native_is_sigprof_blocked_in_current_thread", _native_is_sigprof_blocked_in_current_thread, 0);
rb_define_singleton_method(testing_module, "_native_with_blocked_sigprof", _native_with_blocked_sigprof, 0);
rb_define_singleton_method(testing_module, "_native_force_allocation_sampler_failure", _native_force_allocation_sampler_failure, 0);
}

// This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
@@ -306,7 +310,6 @@ static VALUE _native_new(VALUE klass) {
state->idle_sampling_helper_instance = Qnil;
state->owner_thread = Qnil;
dynamic_sampling_rate_init(&state->cpu_dynamic_sampling_rate);
discrete_dynamic_sampler_init(&state->allocation_sampler, "allocation");
state->gc_tracepoint = Qnil;
state->object_allocation_tracepoint = Qnil;

@@ -318,6 +321,11 @@

reset_stats_not_thread_safe(state);

const char *error = discrete_dynamic_sampler_init(&state->allocation_sampler, "allocation");
if (error) {
handle_allocation_sampler_error(state, error, false);
}

return state->self_instance = TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
}

@@ -353,7 +361,10 @@ static VALUE _native_initialize(
// TODO: May be nice to offer customization here? Distribute available "overhead" margin with a bias towards one or the other
// sampler.
dynamic_sampling_rate_set_overhead_target_percentage(&state->cpu_dynamic_sampling_rate, total_overhead_target_percentage / 2);
discrete_dynamic_sampler_set_overhead_target_percentage(&state->allocation_sampler, total_overhead_target_percentage / 2);
const char *error = discrete_dynamic_sampler_set_overhead_target_percentage(&state->allocation_sampler, total_overhead_target_percentage / 2);
if (error) {
handle_allocation_sampler_error(state, error, true);
}
}

state->thread_context_collector_instance = enforce_thread_context_collector_instance(thread_context_collector_instance);
@@ -407,7 +418,10 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {

// Reset the dynamic sampling rate state, if any (reminder: the monotonic clock reference may change after a fork)
dynamic_sampling_rate_reset(&state->cpu_dynamic_sampling_rate);
discrete_dynamic_sampler_reset(&state->allocation_sampler);
const char *error = discrete_dynamic_sampler_reset(&state->allocation_sampler);
if (error) {
handle_allocation_sampler_error(state, error, true);
}

// This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
active_sampler_instance_state = state;
@@ -469,15 +483,19 @@ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE
return stop(self_instance, /* optional_exception: */ Qnil);
}

static VALUE stop(VALUE self_instance, VALUE optional_exception) {
struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

static void stop_state(struct cpu_and_wall_time_worker_state *state, VALUE optional_exception) {
atomic_store(&state->should_run, false);
state->failure_exception = optional_exception;

// Disable the tracepoints as soon as possible, so the VM doesn't keep on calling them
disable_tracepoints(state);
}

static VALUE stop(VALUE self_instance, VALUE optional_exception) {
struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

stop_state(state, optional_exception);

return Qtrue;
}
@@ -995,9 +1013,16 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused)
return;
}

if (state->dynamic_sampling_rate_enabled && !discrete_dynamic_sampler_should_sample(&state->allocation_sampler)) {
state->stats.allocation_skipped++;
return;
if (state->dynamic_sampling_rate_enabled) {
discrete_dynamic_sampler_should_sample_result result = discrete_dynamic_sampler_should_sample(&state->allocation_sampler);
if (result.error) {
handle_allocation_sampler_error(state, result.error, false);
return;
}
if (!result.should_sample) {
state->stats.allocation_skipped++;
return;
}
}

// @ivoanjo: Strictly speaking, this is not needed because Ruby should not call the same tracepoint while a previous
@@ -1010,8 +1035,14 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused)
safely_call(rescued_sample_allocation, tracepoint_data, state->self_instance);

if (state->dynamic_sampling_rate_enabled) {
// NOTE: To keep things lean when dynamic sampling rate is disabled we skip clock interactions.
uint64_t sampling_time_ns = discrete_dynamic_sampler_after_sample(&state->allocation_sampler);
// NOTE: To keep things lean when dynamic sampling rate is disabled we skip clock interactions, which is
// why we're fine with having this inside this conditional.
discrete_dynamic_sampler_after_sample_result result = discrete_dynamic_sampler_after_sample(&state->allocation_sampler);
if (result.error) {
handle_allocation_sampler_error(state, result.error, false);
return;
}
uint64_t sampling_time_ns = result.sampling_time_ns;
state->stats.allocation_sampling_time_ns_min = uint64_min_of(sampling_time_ns, state->stats.allocation_sampling_time_ns_min);
state->stats.allocation_sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.allocation_sampling_time_ns_max);
state->stats.allocation_sampling_time_ns_total += sampling_time_ns;
@@ -1061,3 +1092,29 @@ static VALUE rescued_sample_allocation(VALUE tracepoint_data) {
// Return a dummy VALUE because we're called from rb_rescue2 which requires it
return Qnil;
}

static void handle_allocation_sampler_error(struct cpu_and_wall_time_worker_state *state, const char *error, bool can_raise) {
if (error == NULL) {
return;
}

// Let's make sure our error messages explicitly mention this is for the allocation sampler. We may end up having more than one
// discrete sampler in the future.
VALUE error_rbstr = rb_str_new_cstr("allocation sampler - ");
rb_str_cat_cstr(error_rbstr, error);

if (can_raise) {
rb_raise(rb_eRuntimeError, "%"PRIsVALUE, error_rbstr);
} else {
// If we can't raise an immediate exception at the calling site, use the asynchronous flow through the main worker loop.
stop_state(state, rb_exc_new_str(rb_eRuntimeError, error_rbstr));
}
}

static VALUE _native_force_allocation_sampler_failure(DDTRACE_UNUSED VALUE self) {
struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above

discrete_dynamic_sampler_testing_force_fail(&state->allocation_sampler);

return Qnil;
}
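
One note on the can_raise split in handle_allocation_sampler_error above: the tracepoint and allocation paths (on_newobj_event, _native_new) pass can_raise = false, so instead of raising from a context where a Ruby exception cannot safely propagate, the error is stored on the worker state via stop_state and surfaced later. The snippet below is a hypothetical illustration of that "asynchronous flow through the main worker loop", not code from this PR; it only uses the failure_exception field set in stop_state and the standard rb_exc_raise API.

// Hypothetical illustration only (not part of this diff): once stop_state() has recorded the failure and
// cleared should_run, the worker's main loop is expected to exit and re-raise the stored exception on the
// worker thread, roughly along these lines:
//
//   // ... sampling loop exits because should_run is now false ...
//   if (state->failure_exception != Qnil) rb_exc_raise(state->failure_exception);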