[PROF-8864] Dynamic allocation sampling #3395
Merged (17 commits) on Jan 31, 2024
@@ -12,10 +12,14 @@
#include "collectors_thread_context.h"
#include "collectors_dynamic_sampling_rate.h"
#include "collectors_idle_sampling_helper.h"
#include "collectors_discrete_dynamic_sampler.h"
#include "private_vm_api_access.h"
#include "setup_signal_handler.h"
#include "time_helpers.h"

// Maximum allowed value for an allocation weight. Attempts to use higher values will result in clamping.
unsigned int MAX_ALLOC_WEIGHT = 65535;

// Used to trigger the execution of Collectors::ThreadState, which implements all of the sampling logic
// itself; this class only implements the "when to do it" part.
//
@@ -89,13 +93,13 @@ struct cpu_and_wall_time_worker_state {
bool gc_profiling_enabled;
bool no_signals_workaround_enabled;
bool dynamic_sampling_rate_enabled;
int allocation_sample_every;
bool allocation_profiling_enabled;
VALUE self_instance;
VALUE thread_context_collector_instance;
VALUE idle_sampling_helper_instance;
VALUE owner_thread;
dynamic_sampling_rate_state dynamic_sampling_rate;
dynamic_sampling_rate_state cpu_dynamic_sampling_rate;
discrete_dynamic_sampler allocation_sampler;
VALUE gc_tracepoint; // Used to get gc start/finish information
VALUE object_allocation_tracepoint; // Used to get allocation counts and allocation profiling
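
This hunk replaces the fixed allocation_sample_every counter with a discrete_dynamic_sampler and renames the existing dynamic_sampling_rate_state to cpu_dynamic_sampling_rate, making clear it only throttles cpu/wall-time sampling. For reference, a rough sketch of the allocation sampler interface as this diff uses it; signatures are inferred from the call sites below, so the real declarations in collectors_discrete_dynamic_sampler.h may differ.

// Illustrative sketch, inferred from call sites in this diff (not the real header).
#include <stdbool.h>

typedef struct discrete_dynamic_sampler discrete_dynamic_sampler;

// One-time setup and (re)configuration.
void discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name);
void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sampler *sampler, double target_overhead);
void discrete_dynamic_sampler_reset(discrete_dynamic_sampler *sampler);

// Per-event protocol: gate on should_sample, then report back after sampling.
bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler);
void discrete_dynamic_sampler_after_sample(discrete_dynamic_sampler *sampler);

// Number of events seen since the last sampled one (used as the sample weight).
unsigned long discrete_dynamic_sampler_events_since_last_sample(discrete_dynamic_sampler *sampler);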

@@ -159,7 +163,6 @@ static VALUE _native_initialize(
VALUE no_signals_workaround_enabled,
VALUE dynamic_sampling_rate_enabled,
VALUE dynamic_sampling_rate_overhead_target_percentage,
VALUE allocation_sample_every,
VALUE allocation_profiling_enabled
);
static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
@@ -244,7 +247,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
// https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);

rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 9);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 8);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
@@ -284,12 +287,12 @@ static VALUE _native_new(VALUE klass) {
state->gc_profiling_enabled = false;
state->no_signals_workaround_enabled = false;
state->dynamic_sampling_rate_enabled = true;
state->allocation_sample_every = 0;
state->allocation_profiling_enabled = false;
state->thread_context_collector_instance = Qnil;
state->idle_sampling_helper_instance = Qnil;
state->owner_thread = Qnil;
dynamic_sampling_rate_init(&state->dynamic_sampling_rate);
dynamic_sampling_rate_init(&state->cpu_dynamic_sampling_rate);
discrete_dynamic_sampler_init(&state->allocation_sampler, "allocation");
state->gc_tracepoint = Qnil;
state->object_allocation_tracepoint = Qnil;

@@ -313,13 +316,11 @@ static VALUE _native_initialize(
VALUE no_signals_workaround_enabled,
VALUE dynamic_sampling_rate_enabled,
VALUE dynamic_sampling_rate_overhead_target_percentage,
VALUE allocation_sample_every,
VALUE allocation_profiling_enabled
) {
ENFORCE_BOOLEAN(gc_profiling_enabled);
ENFORCE_BOOLEAN(no_signals_workaround_enabled);
ENFORCE_BOOLEAN(dynamic_sampling_rate_enabled);
ENFORCE_TYPE(allocation_sample_every, T_FIXNUM);
ENFORCE_TYPE(dynamic_sampling_rate_overhead_target_percentage, T_FLOAT);
ENFORCE_BOOLEAN(allocation_profiling_enabled);

@@ -329,12 +330,16 @@
state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue);
state->no_signals_workaround_enabled = (no_signals_workaround_enabled == Qtrue);
state->dynamic_sampling_rate_enabled = (dynamic_sampling_rate_enabled == Qtrue);
dynamic_sampling_rate_set_overhead_target_percentage(&state->dynamic_sampling_rate, NUM2DBL(dynamic_sampling_rate_overhead_target_percentage));
state->allocation_sample_every = NUM2INT(allocation_sample_every);
state->allocation_profiling_enabled = (allocation_profiling_enabled == Qtrue);

if (state->allocation_sample_every <= 0) {
rb_raise(rb_eArgError, "Unexpected value for allocation_sample_every: %d. This value must be > 0.", state->allocation_sample_every);
double total_overhead_target_percentage = NUM2DBL(dynamic_sampling_rate_overhead_target_percentage);
if (!state->allocation_profiling_enabled) {
dynamic_sampling_rate_set_overhead_target_percentage(&state->cpu_dynamic_sampling_rate, total_overhead_target_percentage);
} else {
// TODO: May be nice to offer customization here? Distribute available "overhead" margin with a bias towards one or the other
// sampler.
dynamic_sampling_rate_set_overhead_target_percentage(&state->cpu_dynamic_sampling_rate, total_overhead_target_percentage / 2);
discrete_dynamic_sampler_set_overhead_target_percentage(&state->allocation_sampler, total_overhead_target_percentage / 2);
}

state->thread_context_collector_instance = enforce_thread_context_collector_instance(thread_context_collector_instance);
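
The overhead handling above splits a single budget between the two samplers: with allocation profiling enabled each sampler gets half, otherwise the cpu/wall-time sampler keeps it all. A minimal, self-contained illustration of that arithmetic (the 2.0% figure is an example value, not necessarily the configured default):

// Illustrative example of the 50/50 overhead split (values are examples only).
#include <stdbool.h>
#include <stdio.h>

int main(void) {
  double total_overhead_target_percentage = 2.0; // example input
  bool allocation_profiling_enabled = true;

  double cpu_budget = allocation_profiling_enabled ? total_overhead_target_percentage / 2 : total_overhead_target_percentage;
  double allocation_budget = allocation_profiling_enabled ? total_overhead_target_percentage / 2 : 0.0;

  // Prints "cpu: 1.0%, allocation: 1.0%"; with allocation profiling disabled it would be "cpu: 2.0%, allocation: 0.0%".
  printf("cpu: %.1f%%, allocation: %.1f%%\n", cpu_budget, allocation_budget);
  return 0;
}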
@@ -387,7 +392,8 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
if (state->stop_thread == rb_thread_current()) return Qnil;

// Reset the dynamic sampling rate state, if any (reminder: the monotonic clock reference may change after a fork)
dynamic_sampling_rate_reset(&state->dynamic_sampling_rate);
dynamic_sampling_rate_reset(&state->cpu_dynamic_sampling_rate);
discrete_dynamic_sampler_reset(&state->allocation_sampler);

// This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
active_sampler_instance_state = state;
@@ -560,7 +566,7 @@ static void *run_sampling_trigger_loop(void *state_ptr) {
// Note that we deliberately should NOT combine this sleep_for with the one above because the result of
// `dynamic_sampling_rate_get_sleep` may have changed while the above sleep was ongoing.
uint64_t extra_sleep =
dynamic_sampling_rate_get_sleep(&state->dynamic_sampling_rate, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
dynamic_sampling_rate_get_sleep(&state->cpu_dynamic_sampling_rate, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
if (state->dynamic_sampling_rate_enabled && extra_sleep > 0) sleep_for(extra_sleep);
}
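
The comment above is the key constraint in this hunk: the extra sleep is computed only after the first sleep finishes, because the result of dynamic_sampling_rate_get_sleep can change while the loop is sleeping. A stripped-down sketch of that two-phase structure, with hypothetical stand-ins for the real helpers:

// Illustrative two-phase sleep; helper bodies here are hypothetical stand-ins.
#include <stdbool.h>
#include <stdint.h>

static void sleep_for(uint64_t ns) { (void) ns; /* would block for ns nanoseconds */ }
static long monotonic_wall_time_now_ns(void) { return 0; }
static uint64_t dynamic_sampling_rate_get_sleep(long now_ns) { (void) now_ns; return 0; }

static void one_trigger_loop_iteration(bool dynamic_sampling_rate_enabled) {
  sleep_for(10 * 1000 * 1000); // baseline wait between sampling triggers (value illustrative)

  // Re-query only AFTER the baseline sleep: folding the two sleeps into one call
  // would bake in a value that may already be stale by the time it is needed.
  uint64_t extra_sleep = dynamic_sampling_rate_get_sleep(monotonic_wall_time_now_ns());
  if (dynamic_sampling_rate_enabled && extra_sleep > 0) sleep_for(extra_sleep);
}

int main(void) { one_trigger_loop_iteration(true); return 0; }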

@@ -600,7 +606,7 @@ static VALUE rescued_sample_from_postponed_job(VALUE self_instance) {

long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);

if (state->dynamic_sampling_rate_enabled && !dynamic_sampling_rate_should_sample(&state->dynamic_sampling_rate, wall_time_ns_before_sample)) {
if (state->dynamic_sampling_rate_enabled && !dynamic_sampling_rate_should_sample(&state->cpu_dynamic_sampling_rate, wall_time_ns_before_sample)) {
state->stats.skipped_sample_because_of_dynamic_sampling_rate++;
return Qnil;
}
@@ -620,7 +626,7 @@ static VALUE rescued_sample_from_postponed_job(VALUE self_instance) {
state->stats.sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.sampling_time_ns_max);
state->stats.sampling_time_ns_total += sampling_time_ns;

dynamic_sampling_rate_after_sample(&state->dynamic_sampling_rate, wall_time_ns_after_sample, sampling_time_ns);
dynamic_sampling_rate_after_sample(&state->cpu_dynamic_sampling_rate, wall_time_ns_after_sample, sampling_time_ns);

// Return a dummy VALUE because we're called from rb_rescue2 which requires it
return Qnil;
@@ -931,18 +937,20 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused)
return;
}

if (state->dynamic_sampling_rate_enabled && !discrete_dynamic_sampler_should_sample(&state->allocation_sampler)) {
return;
}

// @ivoanjo: Strictly speaking, this is not needed because Ruby should not call the same tracepoint while a previous
// invocation is still pending, (e.g. it wouldn't call `on_newobj_event` while it's already running), but I decided
// to keep this here for consistency -- every call to the thread context (other than the special gc calls which are
// defined as not being able to allocate) sets this.
state->during_sample = true;

// TODO: This is a placeholder sampling decision strategy. We plan to replace it with a better one soon (e.g. before
// beta), and having something here allows us to test the rest of feature, sampling decision aside.
if (allocation_count % state->allocation_sample_every == 0) {
// Rescue against any exceptions that happen during sampling
safely_call(rescued_sample_allocation, tracepoint_data, state->self_instance);
}
// Rescue against any exceptions that happen during sampling
safely_call(rescued_sample_allocation, tracepoint_data, state->self_instance);

discrete_dynamic_sampler_after_sample(&state->allocation_sampler);

state->during_sample = false;
}
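
With this change the allocation tracepoint no longer uses the allocation_count % allocation_sample_every placeholder: every allocation is gated by discrete_dynamic_sampler_should_sample, and whenever a sample is actually taken the sampler is told about it via discrete_dynamic_sampler_after_sample so it can account for the cost. A compilable sketch of that control flow with stub helpers (names and bodies are illustrative, not the real profiler code):

// Illustrative control flow of the new allocation hook; everything here is a stub.
#include <stdbool.h>

typedef struct { int unused; } sampler_t;
static bool sampler_should_sample(sampler_t *s) { (void) s; return true; }
static void sampler_after_sample(sampler_t *s)  { (void) s; }
static void take_allocation_sample(void) { /* would sample the current thread's stack */ }

static sampler_t allocation_sampler;
static bool dynamic_sampling_rate_enabled = true;
static bool during_sample = false;

static void on_allocation_event(void) {
  // Cheap path: most allocations return here without sampling.
  if (dynamic_sampling_rate_enabled && !sampler_should_sample(&allocation_sampler)) return;

  during_sample = true;
  take_allocation_sample();                  // potentially expensive
  sampler_after_sample(&allocation_sampler); // feed the sampling cost back into the rate control
  during_sample = false;
}

int main(void) { on_allocation_event(); return 0; }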
@@ -974,7 +982,14 @@ static VALUE rescued_sample_allocation(VALUE tracepoint_data) {
rb_trace_arg_t *data = rb_tracearg_from_tracepoint(tracepoint_data);
VALUE new_object = rb_tracearg_object(data);

thread_context_collector_sample_allocation(state->thread_context_collector_instance, state->allocation_sample_every, new_object);
unsigned long allocations_since_last_sample = state->dynamic_sampling_rate_enabled ?
// if we're doing dynamic sampling, ask the sampler how many events since last sample
discrete_dynamic_sampler_events_since_last_sample(&state->allocation_sampler) :
// if we aren't, then we're sampling every event
1;
// TODO: Signal in the profile that clamping happened?
unsigned int weight = allocations_since_last_sample > MAX_ALLOC_WEIGHT ? MAX_ALLOC_WEIGHT : (unsigned int) allocations_since_last_sample;
thread_context_collector_sample_allocation(state->thread_context_collector_instance, weight, new_object);

// Return a dummy VALUE because we're called from rb_rescue2 which requires it
return Qnil;
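
The weight passed to thread_context_collector_sample_allocation is the number of allocations since the last sampled one, as reported by the sampler, clamped to MAX_ALLOC_WEIGHT; past that point the profile under-counts, which is what the TODO about signaling clamping refers to. A small worked example of the clamp (the 100000 count is made up for demonstration):

// Illustrative clamping example; values are examples only.
#include <stdio.h>

#define MAX_ALLOC_WEIGHT 65535u

int main(void) {
  unsigned long allocations_since_last_sample = 100000; // example value
  unsigned int weight = allocations_since_last_sample > MAX_ALLOC_WEIGHT
    ? MAX_ALLOC_WEIGHT
    : (unsigned int) allocations_since_last_sample;

  // Prints "weight = 65535": the remaining 34465 allocations are not reflected in the profile.
  printf("weight = %u\n", weight);
  return 0;
}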