Skip to content

Commit

Permalink
Merge pull request #3384 from DataDog/ivoanjo/prof-8917-crash-tracker…
Browse files Browse the repository at this point in the history
…-ruby

[PROF-8917] Add support for the libdatadog crash tracker
  • Loading branch information
ivoanjo committed May 13, 2024
2 parents 1b10c12 + 42f6ca5 commit 3bd8b05
Show file tree
Hide file tree
Showing 22 changed files with 778 additions and 119 deletions.
108 changes: 108 additions & 0 deletions ext/datadog_profiling_native_extension/crashtracker.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#include <ruby.h>
#include <datadog/common.h>
#include <libdatadog_helpers.h>

static VALUE _native_start_or_update_on_fork(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self);
static VALUE _native_stop(DDTRACE_UNUSED VALUE _self);

// Used to report Ruby VM crashes.
// Once initialized, segfaults will be reported automatically using libdatadog.

// Defines the Crashtracker class under the given profiling module and exposes the native entry points on it as
// singleton (class-level) methods.
void crashtracker_init(VALUE profiling_module) {
  VALUE klass = rb_define_class_under(profiling_module, "Crashtracker", rb_cObject);

  // Takes no arguments
  rb_define_singleton_method(klass, "_native_stop", _native_stop, 0);
  // Arity -1: arguments arrive as (argc, argv) so that the function can parse keyword arguments itself
  rb_define_singleton_method(klass, "_native_start_or_update_on_fork", _native_start_or_update_on_fork, -1);
}

// Starts the libdatadog crash tracker (action: :start) or refreshes it after a fork (action: :update_on_fork).
//
// Takes keyword arguments only, all of them required:
//   exporter_configuration:                Array, turned into an endpoint by endpoint_from()
//   path_to_crashtracking_receiver_binary: String
//   ld_library_path:                       String, handed to the receiver as its LD_LIBRARY_PATH environment variable
//   tags_as_array:                         Array, turned into libdatadog tags by convert_tags()
//   action:                                Symbol, either :start or :update_on_fork
//   upload_timeout_seconds:                Integer (fixnum)
//
// Returns true on success. Raises when an argument is missing or invalid, and raises a RuntimeError when libdatadog
// reports a failure.
static VALUE _native_start_or_update_on_fork(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self) {
  VALUE options;
  rb_scan_args(argc, argv, "0:", &options);
  // When no keyword arguments are given, rb_scan_args leaves options set to nil, and nil must not be handed to
  // rb_hash_fetch. Use an empty hash instead so that the first missing key gets reported with the usual KeyError.
  if (options == Qnil) options = rb_hash_new();

  VALUE exporter_configuration = rb_hash_fetch(options, ID2SYM(rb_intern("exporter_configuration")));
  VALUE path_to_crashtracking_receiver_binary = rb_hash_fetch(options, ID2SYM(rb_intern("path_to_crashtracking_receiver_binary")));
  VALUE ld_library_path = rb_hash_fetch(options, ID2SYM(rb_intern("ld_library_path")));
  VALUE tags_as_array = rb_hash_fetch(options, ID2SYM(rb_intern("tags_as_array")));
  VALUE action = rb_hash_fetch(options, ID2SYM(rb_intern("action")));
  VALUE upload_timeout_seconds = rb_hash_fetch(options, ID2SYM(rb_intern("upload_timeout_seconds")));

  VALUE start_action = ID2SYM(rb_intern("start"));
  VALUE update_on_fork_action = ID2SYM(rb_intern("update_on_fork"));

  ENFORCE_TYPE(exporter_configuration, T_ARRAY);
  ENFORCE_TYPE(tags_as_array, T_ARRAY);
  ENFORCE_TYPE(path_to_crashtracking_receiver_binary, T_STRING);
  ENFORCE_TYPE(ld_library_path, T_STRING);
  ENFORCE_TYPE(action, T_SYMBOL);
  ENFORCE_TYPE(upload_timeout_seconds, T_FIXNUM);

  if (action != start_action && action != update_on_fork_action) rb_raise(rb_eArgError, "Unexpected action: %+"PRIsVALUE, action);

  // FIX2INT can raise when the value does not fit in an int, so the conversion needs to happen here, before the
  // exception-free zone below starts.
  int timeout_secs = FIX2INT(upload_timeout_seconds);

  VALUE version = ddtrace_version();
  ddog_prof_Endpoint endpoint = endpoint_from(exporter_configuration);

  // Tags are heap-allocated, so after here we can't raise exceptions otherwise we'll leak this memory
  // Start of exception-free zone to prevent leaks {{
  ddog_Vec_Tag tags = convert_tags(tags_as_array);

  ddog_prof_CrashtrackerConfiguration config = {
    .additional_files = {},
    // The Ruby VM already uses an alt stack to detect stack overflows so the crash handler must not overwrite it.
    //
    // @ivoanjo: Specifically, with `create_alt_stack = true` I saw a segfault, such as Ruby 2.6's bug with
    // "Process.detach(fork { exit! }).instance_variable_get(:@foo)" being turned into a
    // "-e:1:in `instance_variable_get': stack level too deep (SystemStackError)" by Ruby.
    //
    // The Ruby crash handler also seems to get confused when this option is enabled and
    // "Process.kill('SEGV', Process.pid)" gets run.
    .create_alt_stack = false,
    .endpoint = endpoint,
    .resolve_frames = DDOG_PROF_STACKTRACE_COLLECTION_ENABLED,
    .timeout_secs = timeout_secs,
  };

  ddog_prof_CrashtrackerMetadata metadata = {
    .profiling_library_name = DDOG_CHARSLICE_C("dd-trace-rb"),
    .profiling_library_version = char_slice_from_ruby_string(version),
    .family = DDOG_CHARSLICE_C("ruby"),
    .tags = &tags,
  };

  // The only environment variable that gets set for the receiver binary
  ddog_prof_EnvVar ld_library_path_env = {
    .key = DDOG_CHARSLICE_C("LD_LIBRARY_PATH"),
    .val = char_slice_from_ruby_string(ld_library_path),
  };

  ddog_prof_CrashtrackerReceiverConfig receiver_config = {
    .args = {},
    .env = {.ptr = &ld_library_path_env, .len = 1},
    .path_to_receiver_binary = char_slice_from_ruby_string(path_to_crashtracking_receiver_binary),
    .optional_stderr_filename = {},
    .optional_stdout_filename = {},
  };

  ddog_prof_CrashtrackerResult result =
    action == start_action ?
      ddog_prof_Crashtracker_init(config, receiver_config, metadata) :
      ddog_prof_Crashtracker_update_on_fork(config, receiver_config, metadata);

  // Clean up before potentially raising any exceptions
  ddog_Vec_Tag_drop(tags);
  // }} End of exception-free zone to prevent leaks

  if (result.tag == DDOG_PROF_CRASHTRACKER_RESULT_ERR) {
    rb_raise(rb_eRuntimeError, "Failed to start/update the crash tracker: %"PRIsVALUE, get_error_details_and_drop(&result.err));
  }

  return Qtrue;
}

// Asks libdatadog to shut down the crash tracker.
//
// Returns true on success; raises a RuntimeError carrying the libdatadog error details otherwise.
static VALUE _native_stop(DDTRACE_UNUSED VALUE _self) {
  ddog_prof_CrashtrackerResult shutdown_result = ddog_prof_Crashtracker_shutdown();

  if (shutdown_result.tag != DDOG_PROF_CRASHTRACKER_RESULT_ERR) return Qtrue;

  // get_error_details_and_drop is evaluated before rb_raise transfers control, so the error does not get leaked
  rb_raise(
    rb_eRuntimeError,
    "Failed to stop the crash tracker: %"PRIsVALUE,
    get_error_details_and_drop(&shutdown_result.err)
  );
}
93 changes: 0 additions & 93 deletions ext/datadog_profiling_native_extension/http_transport.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,6 @@
static VALUE ok_symbol = Qnil; // :ok in Ruby
static VALUE error_symbol = Qnil; // :error in Ruby

static ID agentless_id; // id of :agentless in Ruby
static ID agent_id; // id of :agent in Ruby

static ID log_failure_to_process_tag_id; // id of :log_failure_to_process_tag in Ruby

static VALUE library_version_string = Qnil;

struct call_exporter_without_gvl_arguments {
Expand All @@ -30,9 +25,6 @@ inline static ddog_ByteSlice byte_slice_from_ruby_string(VALUE string);
static VALUE _native_validate_exporter(VALUE self, VALUE exporter_configuration);
static ddog_prof_Exporter_NewResult create_exporter(VALUE exporter_configuration, VALUE tags_as_array);
static VALUE handle_exporter_failure(ddog_prof_Exporter_NewResult exporter_result);
static ddog_prof_Endpoint endpoint_from(VALUE exporter_configuration);
static ddog_Vec_Tag convert_tags(VALUE tags_as_array);
static void safely_log_failure_to_process_tag(ddog_Vec_Tag tags, VALUE err_details);
static VALUE _native_do_export(
VALUE self,
VALUE exporter_configuration,
Expand Down Expand Up @@ -60,9 +52,6 @@ void http_transport_init(VALUE profiling_module) {

ok_symbol = ID2SYM(rb_intern_const("ok"));
error_symbol = ID2SYM(rb_intern_const("error"));
agentless_id = rb_intern_const("agentless");
agent_id = rb_intern_const("agent");
log_failure_to_process_tag_id = rb_intern_const("log_failure_to_process_tag");

library_version_string = ddtrace_version();
rb_global_variable(&library_version_string);
Expand Down Expand Up @@ -116,88 +105,6 @@ static VALUE handle_exporter_failure(ddog_prof_Exporter_NewResult exporter_resul
rb_ary_new_from_args(2, error_symbol, get_error_details_and_drop(&exporter_result.err));
}

static ddog_prof_Endpoint endpoint_from(VALUE exporter_configuration) {
ENFORCE_TYPE(exporter_configuration, T_ARRAY);

ID working_mode = SYM2ID(rb_ary_entry(exporter_configuration, 0)); // SYM2ID verifies its input so we can do this safely

if (working_mode != agentless_id && working_mode != agent_id) {
rb_raise(rb_eArgError, "Failed to initialize transport: Unexpected working mode, expected :agentless or :agent");
}

if (working_mode == agentless_id) {
VALUE site = rb_ary_entry(exporter_configuration, 1);
VALUE api_key = rb_ary_entry(exporter_configuration, 2);
ENFORCE_TYPE(site, T_STRING);
ENFORCE_TYPE(api_key, T_STRING);

return ddog_prof_Endpoint_agentless(char_slice_from_ruby_string(site), char_slice_from_ruby_string(api_key));
} else { // agent_id
VALUE base_url = rb_ary_entry(exporter_configuration, 1);
ENFORCE_TYPE(base_url, T_STRING);

return ddog_prof_Endpoint_agent(char_slice_from_ruby_string(base_url));
}
}

__attribute__((warn_unused_result))
static ddog_Vec_Tag convert_tags(VALUE tags_as_array) {
ENFORCE_TYPE(tags_as_array, T_ARRAY);

long tags_count = RARRAY_LEN(tags_as_array);
ddog_Vec_Tag tags = ddog_Vec_Tag_new();

for (long i = 0; i < tags_count; i++) {
VALUE name_value_pair = rb_ary_entry(tags_as_array, i);

if (!RB_TYPE_P(name_value_pair, T_ARRAY)) {
ddog_Vec_Tag_drop(tags);
ENFORCE_TYPE(name_value_pair, T_ARRAY);
}

// Note: We can index the array without checking its size first because rb_ary_entry returns Qnil if out of bounds
VALUE tag_name = rb_ary_entry(name_value_pair, 0);
VALUE tag_value = rb_ary_entry(name_value_pair, 1);

if (!(RB_TYPE_P(tag_name, T_STRING) && RB_TYPE_P(tag_value, T_STRING))) {
ddog_Vec_Tag_drop(tags);
ENFORCE_TYPE(tag_name, T_STRING);
ENFORCE_TYPE(tag_value, T_STRING);
}

ddog_Vec_Tag_PushResult push_result =
ddog_Vec_Tag_push(&tags, char_slice_from_ruby_string(tag_name), char_slice_from_ruby_string(tag_value));

if (push_result.tag == DDOG_VEC_TAG_PUSH_RESULT_ERR) {
// libdatadog validates tags and may catch invalid tags that ddtrace didn't actually catch.
// We warn users about such tags, and then just ignore them.
safely_log_failure_to_process_tag(tags, get_error_details_and_drop(&push_result.err));
}
}

return tags;
}

static VALUE log_failure_to_process_tag(VALUE err_details) {
VALUE datadog_module = rb_const_get(rb_cObject, rb_intern("Datadog"));
VALUE profiling_module = rb_const_get(datadog_module, rb_intern("Profiling"));
VALUE http_transport_class = rb_const_get(profiling_module, rb_intern("HttpTransport"));

return rb_funcall(http_transport_class, log_failure_to_process_tag_id, 1, err_details);
}

// Since we are calling into Ruby code, it may raise an exception. This method ensure that dynamically-allocated tags
// get cleaned before propagating the exception.
static void safely_log_failure_to_process_tag(ddog_Vec_Tag tags, VALUE err_details) {
int exception_state;
rb_protect(log_failure_to_process_tag, err_details, &exception_state);

if (exception_state) { // An exception was raised
ddog_Vec_Tag_drop(tags); // clean up
rb_jump_tag(exception_state); // "Re-raise" exception
}
}

// Note: This function handles a bunch of libdatadog dynamically-allocated objects, so it MUST not use any Ruby APIs
// which can raise exceptions, otherwise the objects will be leaked.
static VALUE perform_export(
Expand Down
86 changes: 86 additions & 0 deletions ext/datadog_profiling_native_extension/libdatadog_helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include <ruby.h>

static VALUE log_failure_to_process_tag(VALUE err_details);

const char *ruby_value_type_to_string(enum ruby_value_type type) {
return ruby_value_type_to_char_slice(type).ptr;
}
Expand Down Expand Up @@ -60,3 +62,87 @@ size_t read_ddogerr_string_and_drop(ddog_Error *error, char *string, size_t capa
ddog_Error_drop(error);
return error_msg_size;
}

// Builds a libdatadog endpoint from an exporter configuration array, which is expected to look like one of:
//   [:agentless, site (String), api_key (String)]
//   [:agent, base_url (String)]
//
// Raises when the configuration is not an array, when the working mode is not one of the two symbols above, or when
// any of the remaining entries is not a string.
__attribute__((warn_unused_result))
ddog_prof_Endpoint endpoint_from(VALUE exporter_configuration) {
  ENFORCE_TYPE(exporter_configuration, T_ARRAY);

  VALUE exporter_working_mode = rb_ary_entry(exporter_configuration, 0);
  ENFORCE_TYPE(exporter_working_mode, T_SYMBOL);
  ID mode_id = SYM2ID(exporter_working_mode);

  if (mode_id == rb_intern("agent")) {
    VALUE base_url = rb_ary_entry(exporter_configuration, 1);
    ENFORCE_TYPE(base_url, T_STRING);

    return ddog_prof_Endpoint_agent(char_slice_from_ruby_string(base_url));
  }

  if (mode_id == rb_intern("agentless")) {
    VALUE site = rb_ary_entry(exporter_configuration, 1);
    VALUE api_key = rb_ary_entry(exporter_configuration, 2);
    ENFORCE_TYPE(site, T_STRING);
    ENFORCE_TYPE(api_key, T_STRING);

    return ddog_prof_Endpoint_agentless(char_slice_from_ruby_string(site), char_slice_from_ruby_string(api_key));
  }

  // Neither of the supported working modes
  rb_raise(rb_eArgError, "Failed to initialize transport: Unexpected working mode, expected :agentless or :agent");
}

// Converts a Ruby array of [name, value] string pairs into a libdatadog tag vector.
//
// The returned vector is dynamically allocated: the caller is responsible for releasing it with ddog_Vec_Tag_drop.
// Whenever this function raises, it drops the partially-built vector first.
//
// Tags rejected by libdatadog get logged as a warning and are otherwise skipped.
__attribute__((warn_unused_result))
ddog_Vec_Tag convert_tags(VALUE tags_as_array) {
  ENFORCE_TYPE(tags_as_array, T_ARRAY);

  const long total = RARRAY_LEN(tags_as_array);
  ddog_Vec_Tag tags = ddog_Vec_Tag_new();

  for (long index = 0; index < total; index++) {
    VALUE name_value_pair = rb_ary_entry(tags_as_array, index);

    if (!RB_TYPE_P(name_value_pair, T_ARRAY)) {
      // Release the tags first; the ENFORCE_TYPE below is only here to raise the type error
      ddog_Vec_Tag_drop(tags);
      ENFORCE_TYPE(name_value_pair, T_ARRAY);
    }

    // No size check needed before indexing: rb_ary_entry returns Qnil when out of bounds, and Qnil then fails the
    // string check below
    VALUE tag_name = rb_ary_entry(name_value_pair, 0);
    VALUE tag_value = rb_ary_entry(name_value_pair, 1);

    if (!RB_TYPE_P(tag_name, T_STRING) || !RB_TYPE_P(tag_value, T_STRING)) {
      ddog_Vec_Tag_drop(tags);
      ENFORCE_TYPE(tag_name, T_STRING);
      ENFORCE_TYPE(tag_value, T_STRING);
    }

    ddog_Vec_Tag_PushResult push_result = ddog_Vec_Tag_push(
      &tags,
      char_slice_from_ruby_string(tag_name),
      char_slice_from_ruby_string(tag_value)
    );

    if (push_result.tag != DDOG_VEC_TAG_PUSH_RESULT_ERR) continue;

    // libdatadog validates tags and may reject ones that ddtrace itself did not catch. Such tags get reported to the
    // user and are then ignored.
    int exception_state;
    rb_protect(log_failure_to_process_tag, get_error_details_and_drop(&push_result.err), &exception_state);

    // Logging calls into Ruby code, which may raise. If that happened, release the tags before propagating the
    // exception.
    if (exception_state) {
      ddog_Vec_Tag_drop(tags);
      rb_jump_tag(exception_state); // "Re-raise" exception
    }
  }

  return tags;
}

// Reports a tag that could not be added by calling `Datadog.logger.warn` with the given error details.
//
// This calls into Ruby code and thus may raise; convert_tags invokes it through rb_protect for that reason.
static VALUE log_failure_to_process_tag(VALUE err_details) {
  VALUE datadog = rb_const_get(rb_cObject, rb_intern("Datadog"));
  VALUE datadog_logger = rb_funcall(datadog, rb_intern("logger"), 0);
  VALUE warning = rb_sprintf("Failed to add tag to profiling request: %"PRIsVALUE, err_details);

  return rb_funcall(datadog_logger, rb_intern("warn"), 1, warning);
}
4 changes: 4 additions & 0 deletions ext/datadog_profiling_native_extension/libdatadog_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@ ddog_CharSlice ruby_value_type_to_char_slice(enum ruby_value_type type);
inline static char* string_from_char_slice(ddog_CharSlice slice) {
return ruby_strndup(slice.ptr, slice.len);
}

// Builds a libdatadog endpoint from an exporter configuration array; raises if the configuration is invalid.
// The attribute is repeated here (and not only on the definition) so that callers in other files get the warning too.
__attribute__((warn_unused_result))
ddog_prof_Endpoint endpoint_from(VALUE exporter_configuration);

// Converts a Ruby array of [name, value] string pairs into libdatadog tags. The result is dynamically allocated and
// must be released by the caller using ddog_Vec_Tag_drop; ignoring it would leak memory, hence the attribute.
__attribute__((warn_unused_result))
ddog_Vec_Tag convert_tags(VALUE tags_as_array);
2 changes: 2 additions & 0 deletions ext/datadog_profiling_native_extension/profiling.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ void collectors_dynamic_sampling_rate_init(VALUE profiling_module);
void collectors_idle_sampling_helper_init(VALUE profiling_module);
void collectors_stack_init(VALUE profiling_module);
void collectors_thread_context_init(VALUE profiling_module);
void crashtracker_init(VALUE profiling_module);
void http_transport_init(VALUE profiling_module);
void stack_recorder_init(VALUE profiling_module);

Expand Down Expand Up @@ -53,6 +54,7 @@ void DDTRACE_EXPORT Init_datadog_profiling_native_extension(void) {
collectors_idle_sampling_helper_init(profiling_module);
collectors_stack_init(profiling_module);
collectors_thread_context_init(profiling_module);
crashtracker_init(profiling_module);
http_transport_init(profiling_module);
stack_recorder_init(profiling_module);

Expand Down
10 changes: 10 additions & 0 deletions lib/datadog/core/configuration/settings.rb
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,16 @@ def initialize(*_)
o.env 'DD_PROFILING_UPLOAD_PERIOD'
o.default 60
end

# Enables reporting of information when the Ruby VM crashes.
#
# This is an experimental setting and is disabled unless explicitly turned on.
#
# @default `DD_PROFILING_EXPERIMENTAL_CRASH_TRACKING_ENABLED` environment variable as a boolean,
# otherwise `false`
option :experimental_crash_tracking_enabled do |o|
o.type :bool
o.env 'DD_PROFILING_EXPERIMENTAL_CRASH_TRACKING_ENABLED'
o.default false
end
end

# @public_api
Expand Down
1 change: 1 addition & 0 deletions lib/datadog/profiling.rb
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def self.allocation_count # rubocop:disable Lint/NestedMethodDefinition (On purp
require_relative 'profiling/collectors/idle_sampling_helper'
require_relative 'profiling/collectors/stack'
require_relative 'profiling/collectors/thread_context'
require_relative 'profiling/crashtracker'
require_relative 'profiling/stack_recorder'
require_relative 'profiling/exporter'
require_relative 'profiling/flush'
Expand Down
Loading

0 comments on commit 3bd8b05

Please sign in to comment.