[PROF-9044] Handle timing hiccups #3440

Merged on Mar 5, 2024
Changes from 27 commits
Commits
31 commits
6c82a38
[PROF-9036] Include profiling worker and GC stats in profiles
AlexJF Feb 1, 2024
3dd16ab
Missing test fixes
AlexJF Feb 1, 2024
0699954
Fix typo
AlexJF Feb 1, 2024
ad85fdb
Make sampler snapshot work on all rubies
AlexJF Feb 1, 2024
b806172
Fix events/ns->events/s calc
AlexJF Feb 2, 2024
a5392a6
Use non-fraction percentage in sampler snapshot
AlexJF Feb 2, 2024
59b5fad
[PROF-9045] Test big spikes within adjustment window
AlexJF Feb 2, 2024
ed53cd1
[PROF-9045] Allow adjustment preemption based on number of samples
AlexJF Feb 2, 2024
7e759e7
[PROF-9045] Test preemption does not lead to early forgetting
AlexJF Feb 5, 2024
7f4df67
Use more widely available gc stat key.
AlexJF Feb 5, 2024
dd820cf
[PROF-9045] Weight moving averages by size of adjustment window
AlexJF Feb 5, 2024
bbcd788
Fix failures on older rubies
AlexJF Feb 5, 2024
f5ee30e
Fix failures on jrubies
AlexJF Feb 5, 2024
dc06f81
Match only on count
AlexJF Feb 5, 2024
3f76438
Merge branch 'master' into alexjf/prof-9036-stats-as-internal-data
AlexJF Feb 5, 2024
4988fb8
Merge branch 'alexjf/prof-9036-stats-as-internal-data' into alexjf/pr…
AlexJF Feb 5, 2024
6234ae2
[PROF-9035] Stop worker on allocation sampler failure
AlexJF Feb 6, 2024
973dc67
[PROF-9044] Test for timing hiccups
AlexJF Feb 6, 2024
a07ac12
[PROF-9044] Introduce maximum allowed sampling time
AlexJF Feb 6, 2024
500a3a9
Merge branch 'master' into alexjf/prof-9035-handle-clock-errors-alloc…
AlexJF Feb 12, 2024
f912b5b
Merge branch 'master' into alexjf/prof-9035-handle-clock-errors-alloc…
AlexJF Feb 16, 2024
640b1a2
Address comments
AlexJF Feb 16, 2024
4a03871
Merge branch 'master' into alexjf/prof-9035-handle-clock-errors-alloc…
AlexJF Feb 19, 2024
48efd30
Merge branch 'alexjf/prof-9035-handle-clock-errors-allocation-sampler…
AlexJF Feb 19, 2024
43fde70
Update spec/datadog/profiling/collectors/discrete_dynamic_sampler_spe…
AlexJF Feb 19, 2024
9539408
Merge branch 'alexjf/prof-9044-handle-timing-hiccups' of github.com:D…
AlexJF Feb 19, 2024
09a1f37
Address comments
AlexJF Feb 19, 2024
d09435b
Update spec/datadog/profiling/collectors/discrete_dynamic_sampler_spe…
AlexJF Mar 4, 2024
401065d
Merge branch 'master' into alexjf/prof-9044-handle-timing-hiccups
AlexJF Mar 5, 2024
759a17c
[PROF-9044] Fix botched merge
AlexJF Mar 5, 2024
101cb52
[PROF-9044] More botched merge leftovers
AlexJF Mar 5, 2024
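
The worker diff shown below threads error handling from the discrete dynamic sampler into the CPU/wall-time worker: sampler calls that previously returned nothing (or a bare number) now report either a const char * error or a small result struct, and the worker routes any error through the new handle_allocation_sampler_error. As a reading aid, here is a sketch of the sampler API shape those call sites assume. Only the type names discrete_dynamic_sampler_should_sample_result and discrete_dynamic_sampler_after_sample_result, the fields error, should_sample and sampling_time_ns, and the error-returning convention are taken from the diff; the parameter names and header layout are illustrative guesses and may not match the real discrete_dynamic_sampler.h.

#include <stdbool.h>
#include <stdint.h>

// Opaque here: the real sampler struct lives in discrete_dynamic_sampler.h and is not part of this diff.
typedef struct discrete_dynamic_sampler discrete_dynamic_sampler;

typedef struct {
  const char *error;   // NULL on success; otherwise a message describing the timing/clock problem
  bool should_sample;  // only meaningful when error is NULL
} discrete_dynamic_sampler_should_sample_result;

typedef struct {
  const char *error;          // NULL on success
  uint64_t sampling_time_ns;  // time spent on the sample that just finished; only meaningful when error is NULL
} discrete_dynamic_sampler_after_sample_result;

// Setup, configuration and reset return NULL on success or an error message on failure.
const char *discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name);
const char *discrete_dynamic_sampler_reset(discrete_dynamic_sampler *sampler);
const char *discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sampler *sampler, double target_overhead_percentage);

discrete_dynamic_sampler_should_sample_result discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler);
discrete_dynamic_sampler_after_sample_result discrete_dynamic_sampler_after_sample(discrete_dynamic_sampler *sampler);

// Test-only hook exercised via _native_force_allocation_sampler_failure in the worker.
void discrete_dynamic_sampler_testing_force_fail(discrete_dynamic_sampler *sampler);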
@@ -181,6 +181,7 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
static VALUE _native_sampling_loop(VALUE self, VALUE instance);
static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE worker_thread);
static VALUE stop(VALUE self_instance, VALUE optional_exception);
static void stop_state(struct cpu_and_wall_time_worker_state *state, VALUE optional_exception);
static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
static void *run_sampling_trigger_loop(void *state_ptr);
static void interrupt_sampling_trigger_loop(void *state_ptr);
@@ -213,6 +214,8 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state);
static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self);
static VALUE rescued_sample_allocation(VALUE tracepoint_data);
static void handle_allocation_sampler_error(struct cpu_and_wall_time_worker_state *state, const char *error, bool can_raise);
static VALUE _native_force_allocation_sampler_failure(DDTRACE_UNUSED VALUE self);

// Note on sampler global state safety:
//
@@ -277,6 +280,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
rb_define_singleton_method(testing_module, "_native_simulate_sample_from_postponed_job", _native_simulate_sample_from_postponed_job, 0);
rb_define_singleton_method(testing_module, "_native_is_sigprof_blocked_in_current_thread", _native_is_sigprof_blocked_in_current_thread, 0);
rb_define_singleton_method(testing_module, "_native_with_blocked_sigprof", _native_with_blocked_sigprof, 0);
rb_define_singleton_method(testing_module, "_native_force_allocation_sampler_failure", _native_force_allocation_sampler_failure, 0);
}

// This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
@@ -306,7 +310,6 @@ static VALUE _native_new(VALUE klass) {
state->idle_sampling_helper_instance = Qnil;
state->owner_thread = Qnil;
dynamic_sampling_rate_init(&state->cpu_dynamic_sampling_rate);
discrete_dynamic_sampler_init(&state->allocation_sampler, "allocation");
state->gc_tracepoint = Qnil;
state->object_allocation_tracepoint = Qnil;

@@ -318,6 +321,11 @@

reset_stats_not_thread_safe(state);

const char *error = discrete_dynamic_sampler_init(&state->allocation_sampler, "allocation");
if (error) {
handle_allocation_sampler_error(state, error, false);
}

return state->self_instance = TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
}

@@ -353,7 +361,10 @@ static VALUE _native_initialize(
// TODO: May be nice to offer customization here? Distribute available "overhead" margin with a bias towards one or the other
// sampler.
dynamic_sampling_rate_set_overhead_target_percentage(&state->cpu_dynamic_sampling_rate, total_overhead_target_percentage / 2);
discrete_dynamic_sampler_set_overhead_target_percentage(&state->allocation_sampler, total_overhead_target_percentage / 2);
const char *error = discrete_dynamic_sampler_set_overhead_target_percentage(&state->allocation_sampler, total_overhead_target_percentage / 2);
if (error) {
handle_allocation_sampler_error(state, error, true);
}
}

state->thread_context_collector_instance = enforce_thread_context_collector_instance(thread_context_collector_instance);
@@ -407,7 +418,10 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {

// Reset the dynamic sampling rate state, if any (reminder: the monotonic clock reference may change after a fork)
dynamic_sampling_rate_reset(&state->cpu_dynamic_sampling_rate);
discrete_dynamic_sampler_reset(&state->allocation_sampler);
const char *error = discrete_dynamic_sampler_reset(&state->allocation_sampler);
if (error) {
handle_allocation_sampler_error(state, error, true);
}

// This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
active_sampler_instance_state = state;
@@ -469,15 +483,19 @@ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE
return stop(self_instance, /* optional_exception: */ Qnil);
}

static VALUE stop(VALUE self_instance, VALUE optional_exception) {
struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

static void stop_state(struct cpu_and_wall_time_worker_state *state, VALUE optional_exception) {
atomic_store(&state->should_run, false);
state->failure_exception = optional_exception;

// Disable the tracepoints as soon as possible, so the VM doesn't keep on calling them
disable_tracepoints(state);
}

static VALUE stop(VALUE self_instance, VALUE optional_exception) {
struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

stop_state(state, optional_exception);

return Qtrue;
}
@@ -995,9 +1013,16 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused)
return;
}

if (state->dynamic_sampling_rate_enabled && !discrete_dynamic_sampler_should_sample(&state->allocation_sampler)) {
state->stats.allocation_skipped++;
return;
if (state->dynamic_sampling_rate_enabled) {
discrete_dynamic_sampler_should_sample_result result = discrete_dynamic_sampler_should_sample(&state->allocation_sampler);
if (result.error) {
handle_allocation_sampler_error(state, result.error, false);
return;
}
if (!result.should_sample) {
state->stats.allocation_skipped++;
return;
}
}

// @ivoanjo: Strictly speaking, this is not needed because Ruby should not call the same tracepoint while a previous
@@ -1010,8 +1035,14 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused)
safely_call(rescued_sample_allocation, tracepoint_data, state->self_instance);

if (state->dynamic_sampling_rate_enabled) {
// NOTE: To keep things lean when dynamic sampling rate is disabled we skip clock interactions.
uint64_t sampling_time_ns = discrete_dynamic_sampler_after_sample(&state->allocation_sampler);
// NOTE: To keep things lean when dynamic sampling rate is disabled we skip clock interactions, which is
// why we're fine with having this inside this conditional.
discrete_dynamic_sampler_after_sample_result result = discrete_dynamic_sampler_after_sample(&state->allocation_sampler);
if (result.error) {
handle_allocation_sampler_error(state, result.error, false);
return;
}
uint64_t sampling_time_ns = result.sampling_time_ns;
state->stats.allocation_sampling_time_ns_min = uint64_min_of(sampling_time_ns, state->stats.allocation_sampling_time_ns_min);
state->stats.allocation_sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.allocation_sampling_time_ns_max);
state->stats.allocation_sampling_time_ns_total += sampling_time_ns;
@@ -1061,3 +1092,29 @@ static VALUE rescued_sample_allocation(VALUE tracepoint_data) {
// Return a dummy VALUE because we're called from rb_rescue2 which requires it
return Qnil;
}

static void handle_allocation_sampler_error(struct cpu_and_wall_time_worker_state *state, const char *error, bool can_raise) {
if (error == NULL) {
return;
}

// Let's make sure our error messages explicitly mention this is for the allocation sampler. We may end up having more than one
// discrete sampler in the future.
VALUE error_rbstr = rb_str_new_cstr("allocation sampler - ");
rb_str_cat_cstr(error_rbstr, error);

if (can_raise) {
rb_raise(rb_eRuntimeError, "%"PRIsVALUE, error_rbstr);
} else {
// If we can't raise an immediate exception at the calling site, use the asynchronous flow through the main worker loop.
stop_state(state, rb_exc_new_str(rb_eRuntimeError, error_rbstr));
}
}

static VALUE _native_force_allocation_sampler_failure(DDTRACE_UNUSED VALUE self) {
struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above

discrete_dynamic_sampler_testing_force_fail(&state->allocation_sampler);

return Qnil;
}
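
One note on the can_raise split in handle_allocation_sampler_error above: the tracepoint and allocation paths (on_newobj_event, _native_new) pass can_raise = false, so instead of raising from a context where a Ruby exception cannot safely propagate, the error is stored on the worker state via stop_state and surfaced later. The snippet below is a hypothetical illustration of that "asynchronous flow through the main worker loop", not code from this PR; it only uses the failure_exception field set in stop_state and the standard rb_exc_raise API.

// Hypothetical illustration only (not part of this diff): once stop_state() has recorded the failure and
// cleared should_run, the worker's main loop is expected to exit and re-raise the stored exception on the
// worker thread, roughly along these lines:
//
//   // ... sampling loop exits because should_run is now false ...
//   if (state->failure_exception != Qnil) rb_exc_raise(state->failure_exception);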