diff --git a/ext/datadog_profiling_native_extension/crashtracker.c b/ext/datadog_profiling_native_extension/crashtracker.c new file mode 100644 index 00000000000..a34ec04d1e4 --- /dev/null +++ b/ext/datadog_profiling_native_extension/crashtracker.c @@ -0,0 +1,108 @@ +#include +#include +#include + +static VALUE _native_start_or_update_on_fork(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self); +static VALUE _native_stop(DDTRACE_UNUSED VALUE _self); + +// Used to report Ruby VM crashes. +// Once initialized, segfaults will be reported automatically using libdatadog. + +void crashtracker_init(VALUE profiling_module) { + VALUE crashtracker_class = rb_define_class_under(profiling_module, "Crashtracker", rb_cObject); + + rb_define_singleton_method(crashtracker_class, "_native_start_or_update_on_fork", _native_start_or_update_on_fork, -1); + rb_define_singleton_method(crashtracker_class, "_native_stop", _native_stop, 0); +} + +static VALUE _native_start_or_update_on_fork(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self) { + VALUE options; + rb_scan_args(argc, argv, "0:", &options); + + VALUE exporter_configuration = rb_hash_fetch(options, ID2SYM(rb_intern("exporter_configuration"))); + VALUE path_to_crashtracking_receiver_binary = rb_hash_fetch(options, ID2SYM(rb_intern("path_to_crashtracking_receiver_binary"))); + VALUE ld_library_path = rb_hash_fetch(options, ID2SYM(rb_intern("ld_library_path"))); + VALUE tags_as_array = rb_hash_fetch(options, ID2SYM(rb_intern("tags_as_array"))); + VALUE action = rb_hash_fetch(options, ID2SYM(rb_intern("action"))); + VALUE upload_timeout_seconds = rb_hash_fetch(options, ID2SYM(rb_intern("upload_timeout_seconds"))); + + VALUE start_action = ID2SYM(rb_intern("start")); + VALUE update_on_fork_action = ID2SYM(rb_intern("update_on_fork")); + + ENFORCE_TYPE(exporter_configuration, T_ARRAY); + ENFORCE_TYPE(tags_as_array, T_ARRAY); + ENFORCE_TYPE(path_to_crashtracking_receiver_binary, T_STRING); + ENFORCE_TYPE(ld_library_path, T_STRING); + ENFORCE_TYPE(action, T_SYMBOL); + ENFORCE_TYPE(upload_timeout_seconds, T_FIXNUM); + + if (action != start_action && action != update_on_fork_action) rb_raise(rb_eArgError, "Unexpected action: %+"PRIsVALUE, action); + + VALUE version = ddtrace_version(); + ddog_prof_Endpoint endpoint = endpoint_from(exporter_configuration); + + // Tags are heap-allocated, so after here we can't raise exceptions otherwise we'll leak this memory + // Start of exception-free zone to prevent leaks {{ + ddog_Vec_Tag tags = convert_tags(tags_as_array); + + ddog_prof_CrashtrackerConfiguration config = { + .additional_files = {}, + // The Ruby VM already uses an alt stack to detect stack overflows so the crash handler must not overwrite it. + // + // @ivoanjo: Specifically, with `create_alt_stack = true` I saw a segfault, such as Ruby 2.6's bug with + // "Process.detach(fork { exit! }).instance_variable_get(:@foo)" being turned into a + // "-e:1:in `instance_variable_get': stack level too deep (SystemStackError)" by Ruby. + // + // The Ruby crash handler also seems to get confused when this option is enabled and + // "Process.kill('SEGV', Process.pid)" gets run. + .create_alt_stack = false, + .endpoint = endpoint, + .resolve_frames = DDOG_PROF_STACKTRACE_COLLECTION_ENABLED, + .timeout_secs = FIX2INT(upload_timeout_seconds), + }; + + ddog_prof_CrashtrackerMetadata metadata = { + .profiling_library_name = DDOG_CHARSLICE_C("dd-trace-rb"), + .profiling_library_version = char_slice_from_ruby_string(version), + .family = DDOG_CHARSLICE_C("ruby"), + .tags = &tags, + }; + + ddog_prof_EnvVar ld_library_path_env = { + .key = DDOG_CHARSLICE_C("LD_LIBRARY_PATH"), + .val = char_slice_from_ruby_string(ld_library_path), + }; + + ddog_prof_CrashtrackerReceiverConfig receiver_config = { + .args = {}, + .env = {.ptr = &ld_library_path_env, .len = 1}, + .path_to_receiver_binary = char_slice_from_ruby_string(path_to_crashtracking_receiver_binary), + .optional_stderr_filename = {}, + .optional_stdout_filename = {}, + }; + + ddog_prof_CrashtrackerResult result = + action == start_action ? + ddog_prof_Crashtracker_init(config, receiver_config, metadata) : + ddog_prof_Crashtracker_update_on_fork(config, receiver_config, metadata); + + // Clean up before potentially raising any exceptions + ddog_Vec_Tag_drop(tags); + // }} End of exception-free zone to prevent leaks + + if (result.tag == DDOG_PROF_CRASHTRACKER_RESULT_ERR) { + rb_raise(rb_eRuntimeError, "Failed to start/update the crash tracker: %"PRIsVALUE, get_error_details_and_drop(&result.err)); + } + + return Qtrue; +} + +static VALUE _native_stop(DDTRACE_UNUSED VALUE _self) { + ddog_prof_CrashtrackerResult result = ddog_prof_Crashtracker_shutdown(); + + if (result.tag == DDOG_PROF_CRASHTRACKER_RESULT_ERR) { + rb_raise(rb_eRuntimeError, "Failed to stop the crash tracker: %"PRIsVALUE, get_error_details_and_drop(&result.err)); + } + + return Qtrue; +} diff --git a/ext/datadog_profiling_native_extension/http_transport.c b/ext/datadog_profiling_native_extension/http_transport.c index d5690238710..02fb136e0ba 100644 --- a/ext/datadog_profiling_native_extension/http_transport.c +++ b/ext/datadog_profiling_native_extension/http_transport.c @@ -11,11 +11,6 @@ static VALUE ok_symbol = Qnil; // :ok in Ruby static VALUE error_symbol = Qnil; // :error in Ruby -static ID agentless_id; // id of :agentless in Ruby -static ID agent_id; // id of :agent in Ruby - -static ID log_failure_to_process_tag_id; // id of :log_failure_to_process_tag in Ruby - static VALUE library_version_string = Qnil; struct call_exporter_without_gvl_arguments { @@ -30,9 +25,6 @@ inline static ddog_ByteSlice byte_slice_from_ruby_string(VALUE string); static VALUE _native_validate_exporter(VALUE self, VALUE exporter_configuration); static ddog_prof_Exporter_NewResult create_exporter(VALUE exporter_configuration, VALUE tags_as_array); static VALUE handle_exporter_failure(ddog_prof_Exporter_NewResult exporter_result); -static ddog_prof_Endpoint endpoint_from(VALUE exporter_configuration); -static ddog_Vec_Tag convert_tags(VALUE tags_as_array); -static void safely_log_failure_to_process_tag(ddog_Vec_Tag tags, VALUE err_details); static VALUE _native_do_export( VALUE self, VALUE exporter_configuration, @@ -60,9 +52,6 @@ void http_transport_init(VALUE profiling_module) { ok_symbol = ID2SYM(rb_intern_const("ok")); error_symbol = ID2SYM(rb_intern_const("error")); - agentless_id = rb_intern_const("agentless"); - agent_id = rb_intern_const("agent"); - log_failure_to_process_tag_id = rb_intern_const("log_failure_to_process_tag"); library_version_string = ddtrace_version(); rb_global_variable(&library_version_string); @@ -116,88 +105,6 @@ static VALUE handle_exporter_failure(ddog_prof_Exporter_NewResult exporter_resul rb_ary_new_from_args(2, error_symbol, get_error_details_and_drop(&exporter_result.err)); } -static ddog_prof_Endpoint endpoint_from(VALUE exporter_configuration) { - ENFORCE_TYPE(exporter_configuration, T_ARRAY); - - ID working_mode = SYM2ID(rb_ary_entry(exporter_configuration, 0)); // SYM2ID verifies its input so we can do this safely - - if (working_mode != agentless_id && working_mode != agent_id) { - rb_raise(rb_eArgError, "Failed to initialize transport: Unexpected working mode, expected :agentless or :agent"); - } - - if (working_mode == agentless_id) { - VALUE site = rb_ary_entry(exporter_configuration, 1); - VALUE api_key = rb_ary_entry(exporter_configuration, 2); - ENFORCE_TYPE(site, T_STRING); - ENFORCE_TYPE(api_key, T_STRING); - - return ddog_prof_Endpoint_agentless(char_slice_from_ruby_string(site), char_slice_from_ruby_string(api_key)); - } else { // agent_id - VALUE base_url = rb_ary_entry(exporter_configuration, 1); - ENFORCE_TYPE(base_url, T_STRING); - - return ddog_prof_Endpoint_agent(char_slice_from_ruby_string(base_url)); - } -} - -__attribute__((warn_unused_result)) -static ddog_Vec_Tag convert_tags(VALUE tags_as_array) { - ENFORCE_TYPE(tags_as_array, T_ARRAY); - - long tags_count = RARRAY_LEN(tags_as_array); - ddog_Vec_Tag tags = ddog_Vec_Tag_new(); - - for (long i = 0; i < tags_count; i++) { - VALUE name_value_pair = rb_ary_entry(tags_as_array, i); - - if (!RB_TYPE_P(name_value_pair, T_ARRAY)) { - ddog_Vec_Tag_drop(tags); - ENFORCE_TYPE(name_value_pair, T_ARRAY); - } - - // Note: We can index the array without checking its size first because rb_ary_entry returns Qnil if out of bounds - VALUE tag_name = rb_ary_entry(name_value_pair, 0); - VALUE tag_value = rb_ary_entry(name_value_pair, 1); - - if (!(RB_TYPE_P(tag_name, T_STRING) && RB_TYPE_P(tag_value, T_STRING))) { - ddog_Vec_Tag_drop(tags); - ENFORCE_TYPE(tag_name, T_STRING); - ENFORCE_TYPE(tag_value, T_STRING); - } - - ddog_Vec_Tag_PushResult push_result = - ddog_Vec_Tag_push(&tags, char_slice_from_ruby_string(tag_name), char_slice_from_ruby_string(tag_value)); - - if (push_result.tag == DDOG_VEC_TAG_PUSH_RESULT_ERR) { - // libdatadog validates tags and may catch invalid tags that ddtrace didn't actually catch. - // We warn users about such tags, and then just ignore them. - safely_log_failure_to_process_tag(tags, get_error_details_and_drop(&push_result.err)); - } - } - - return tags; -} - -static VALUE log_failure_to_process_tag(VALUE err_details) { - VALUE datadog_module = rb_const_get(rb_cObject, rb_intern("Datadog")); - VALUE profiling_module = rb_const_get(datadog_module, rb_intern("Profiling")); - VALUE http_transport_class = rb_const_get(profiling_module, rb_intern("HttpTransport")); - - return rb_funcall(http_transport_class, log_failure_to_process_tag_id, 1, err_details); -} - -// Since we are calling into Ruby code, it may raise an exception. This method ensure that dynamically-allocated tags -// get cleaned before propagating the exception. -static void safely_log_failure_to_process_tag(ddog_Vec_Tag tags, VALUE err_details) { - int exception_state; - rb_protect(log_failure_to_process_tag, err_details, &exception_state); - - if (exception_state) { // An exception was raised - ddog_Vec_Tag_drop(tags); // clean up - rb_jump_tag(exception_state); // "Re-raise" exception - } -} - // Note: This function handles a bunch of libdatadog dynamically-allocated objects, so it MUST not use any Ruby APIs // which can raise exceptions, otherwise the objects will be leaked. static VALUE perform_export( diff --git a/ext/datadog_profiling_native_extension/libdatadog_helpers.c b/ext/datadog_profiling_native_extension/libdatadog_helpers.c index f6f8b69b1b9..101f2b50c7e 100644 --- a/ext/datadog_profiling_native_extension/libdatadog_helpers.c +++ b/ext/datadog_profiling_native_extension/libdatadog_helpers.c @@ -2,6 +2,8 @@ #include +static VALUE log_failure_to_process_tag(VALUE err_details); + const char *ruby_value_type_to_string(enum ruby_value_type type) { return ruby_value_type_to_char_slice(type).ptr; } @@ -60,3 +62,87 @@ size_t read_ddogerr_string_and_drop(ddog_Error *error, char *string, size_t capa ddog_Error_drop(error); return error_msg_size; } + +__attribute__((warn_unused_result)) +ddog_prof_Endpoint endpoint_from(VALUE exporter_configuration) { + ENFORCE_TYPE(exporter_configuration, T_ARRAY); + + VALUE exporter_working_mode = rb_ary_entry(exporter_configuration, 0); + ENFORCE_TYPE(exporter_working_mode, T_SYMBOL); + ID working_mode = SYM2ID(exporter_working_mode); + + ID agentless_id = rb_intern("agentless"); + ID agent_id = rb_intern("agent"); + + if (working_mode != agentless_id && working_mode != agent_id) { + rb_raise(rb_eArgError, "Failed to initialize transport: Unexpected working mode, expected :agentless or :agent"); + } + + if (working_mode == agentless_id) { + VALUE site = rb_ary_entry(exporter_configuration, 1); + VALUE api_key = rb_ary_entry(exporter_configuration, 2); + ENFORCE_TYPE(site, T_STRING); + ENFORCE_TYPE(api_key, T_STRING); + + return ddog_prof_Endpoint_agentless(char_slice_from_ruby_string(site), char_slice_from_ruby_string(api_key)); + } else { // agent_id + VALUE base_url = rb_ary_entry(exporter_configuration, 1); + ENFORCE_TYPE(base_url, T_STRING); + + return ddog_prof_Endpoint_agent(char_slice_from_ruby_string(base_url)); + } +} + +__attribute__((warn_unused_result)) +ddog_Vec_Tag convert_tags(VALUE tags_as_array) { + ENFORCE_TYPE(tags_as_array, T_ARRAY); + + long tags_count = RARRAY_LEN(tags_as_array); + ddog_Vec_Tag tags = ddog_Vec_Tag_new(); + + for (long i = 0; i < tags_count; i++) { + VALUE name_value_pair = rb_ary_entry(tags_as_array, i); + + if (!RB_TYPE_P(name_value_pair, T_ARRAY)) { + ddog_Vec_Tag_drop(tags); + ENFORCE_TYPE(name_value_pair, T_ARRAY); + } + + // Note: We can index the array without checking its size first because rb_ary_entry returns Qnil if out of bounds + VALUE tag_name = rb_ary_entry(name_value_pair, 0); + VALUE tag_value = rb_ary_entry(name_value_pair, 1); + + if (!(RB_TYPE_P(tag_name, T_STRING) && RB_TYPE_P(tag_value, T_STRING))) { + ddog_Vec_Tag_drop(tags); + ENFORCE_TYPE(tag_name, T_STRING); + ENFORCE_TYPE(tag_value, T_STRING); + } + + ddog_Vec_Tag_PushResult push_result = + ddog_Vec_Tag_push(&tags, char_slice_from_ruby_string(tag_name), char_slice_from_ruby_string(tag_value)); + + if (push_result.tag == DDOG_VEC_TAG_PUSH_RESULT_ERR) { + // libdatadog validates tags and may catch invalid tags that ddtrace didn't actually catch. + // We warn users about such tags, and then just ignore them. + + int exception_state; + rb_protect(log_failure_to_process_tag, get_error_details_and_drop(&push_result.err), &exception_state); + + // Since we are calling into Ruby code, it may raise an exception. Ensure that dynamically-allocated tags + // get cleaned before propagating the exception. + if (exception_state) { + ddog_Vec_Tag_drop(tags); + rb_jump_tag(exception_state); // "Re-raise" exception + } + } + } + + return tags; +} + +static VALUE log_failure_to_process_tag(VALUE err_details) { + VALUE datadog_module = rb_const_get(rb_cObject, rb_intern("Datadog")); + VALUE logger = rb_funcall(datadog_module, rb_intern("logger"), 0); + + return rb_funcall(logger, rb_intern("warn"), 1, rb_sprintf("Failed to add tag to profiling request: %"PRIsVALUE, err_details)); +} diff --git a/ext/datadog_profiling_native_extension/libdatadog_helpers.h b/ext/datadog_profiling_native_extension/libdatadog_helpers.h index 307d4d0fb3e..99bfd50b342 100644 --- a/ext/datadog_profiling_native_extension/libdatadog_helpers.h +++ b/ext/datadog_profiling_native_extension/libdatadog_helpers.h @@ -40,3 +40,7 @@ ddog_CharSlice ruby_value_type_to_char_slice(enum ruby_value_type type); inline static char* string_from_char_slice(ddog_CharSlice slice) { return ruby_strndup(slice.ptr, slice.len); } + +ddog_prof_Endpoint endpoint_from(VALUE exporter_configuration); + +ddog_Vec_Tag convert_tags(VALUE tags_as_array); diff --git a/ext/datadog_profiling_native_extension/profiling.c b/ext/datadog_profiling_native_extension/profiling.c index df8bfa244d8..f256937722b 100644 --- a/ext/datadog_profiling_native_extension/profiling.c +++ b/ext/datadog_profiling_native_extension/profiling.c @@ -19,6 +19,7 @@ void collectors_dynamic_sampling_rate_init(VALUE profiling_module); void collectors_idle_sampling_helper_init(VALUE profiling_module); void collectors_stack_init(VALUE profiling_module); void collectors_thread_context_init(VALUE profiling_module); +void crashtracker_init(VALUE profiling_module); void http_transport_init(VALUE profiling_module); void stack_recorder_init(VALUE profiling_module); @@ -53,6 +54,7 @@ void DDTRACE_EXPORT Init_datadog_profiling_native_extension(void) { collectors_idle_sampling_helper_init(profiling_module); collectors_stack_init(profiling_module); collectors_thread_context_init(profiling_module); + crashtracker_init(profiling_module); http_transport_init(profiling_module); stack_recorder_init(profiling_module); diff --git a/lib/datadog/core/configuration/settings.rb b/lib/datadog/core/configuration/settings.rb index d676687d366..de4a2b75457 100644 --- a/lib/datadog/core/configuration/settings.rb +++ b/lib/datadog/core/configuration/settings.rb @@ -415,6 +415,16 @@ def initialize(*_) o.env 'DD_PROFILING_UPLOAD_PERIOD' o.default 60 end + + # Enables reporting of information when the Ruby VM crashes. + # + # @default `DD_PROFILING_EXPERIMENTAL_CRASH_TRACKING_ENABLED` environment variable as a boolean, + # otherwise `false` + option :experimental_crash_tracking_enabled do |o| + o.type :bool + o.env 'DD_PROFILING_EXPERIMENTAL_CRASH_TRACKING_ENABLED' + o.default false + end end # @public_api diff --git a/lib/datadog/profiling.rb b/lib/datadog/profiling.rb index 79b3da8127b..604e9ac4462 100644 --- a/lib/datadog/profiling.rb +++ b/lib/datadog/profiling.rb @@ -143,6 +143,7 @@ def self.allocation_count # rubocop:disable Lint/NestedMethodDefinition (On purp require_relative 'profiling/collectors/idle_sampling_helper' require_relative 'profiling/collectors/stack' require_relative 'profiling/collectors/thread_context' + require_relative 'profiling/crashtracker' require_relative 'profiling/stack_recorder' require_relative 'profiling/exporter' require_relative 'profiling/flush' diff --git a/lib/datadog/profiling/component.rb b/lib/datadog/profiling/component.rb index 19f3b8830a3..cd6acf551b2 100644 --- a/lib/datadog/profiling/component.rb +++ b/lib/datadog/profiling/component.rb @@ -72,8 +72,10 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) exporter = build_profiler_exporter(settings, recorder, worker, internal_metadata: internal_metadata) transport = build_profiler_transport(settings, agent_settings) scheduler = Profiling::Scheduler.new(exporter: exporter, transport: transport, interval: upload_period_seconds) + crashtracker = build_crashtracker(settings, transport) + profiler = Profiling::Profiler.new(worker: worker, scheduler: scheduler, optional_crashtracker: crashtracker) - [Profiling::Profiler.new(worker: worker, scheduler: scheduler), { profiling_enabled: true }] + [profiler, { profiling_enabled: true }] end private_class_method def self.build_thread_context_collector(settings, recorder, optional_tracer, timeline_enabled) @@ -110,6 +112,28 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) ) end + private_class_method def self.build_crashtracker(settings, transport) + return unless settings.profiling.advanced.experimental_crash_tracking_enabled + + # By default, the transport is an instance of HttpTransport, which validates the configuration and makes + # it available for us to use here. + # But we support overriding the transport with a user-specific one, which may e.g. write stuff to a file, + # and thus can't really provide a valid configuration to talk to a Datadog agent. Thus, in this situation, + # we can't use the crashtracker, even if enabled. + unless transport.respond_to?(:exporter_configuration) + Datadog.logger.warn( + 'Cannot enable profiling crash tracking as a custom settings.profiling.exporter.transport is configured' + ) + return + end + + Datadog::Profiling::Crashtracker.new( + exporter_configuration: transport.exporter_configuration, + tags: Datadog::Profiling::TagBuilder.call(settings: settings), + upload_timeout_seconds: settings.profiling.upload.timeout_seconds, + ) + end + private_class_method def self.enable_gc_profiling?(settings) return false unless settings.profiling.advanced.gc_enabled diff --git a/lib/datadog/profiling/crashtracker.rb b/lib/datadog/profiling/crashtracker.rb new file mode 100644 index 00000000000..00080bd8419 --- /dev/null +++ b/lib/datadog/profiling/crashtracker.rb @@ -0,0 +1,91 @@ +# frozen_string_literal: true + +require 'libdatadog' + +module Datadog + module Profiling + # Used to report Ruby VM crashes. + # The interesting bits are implemented as native code and using libdatadog. + # + # NOTE: The crashtracker native state is a singleton; so even if you create multiple instances of `Crashtracker` + # and start them, it only works as "last writer wins". Same for stop -- there's only one state, so calling stop + # on it will stop the crash tracker, regardless of which instance started it. + # + # Methods prefixed with _native_ are implemented in `crashtracker.c` + class Crashtracker + private + + attr_reader \ + :exporter_configuration, + :tags_as_array, + :path_to_crashtracking_receiver_binary, + :ld_library_path, + :upload_timeout_seconds + + public + + def initialize( + exporter_configuration:, + tags:, + upload_timeout_seconds:, + path_to_crashtracking_receiver_binary: Libdatadog.path_to_crashtracking_receiver_binary, + ld_library_path: Libdatadog.ld_library_path + ) + @exporter_configuration = exporter_configuration + @tags_as_array = tags.to_a + @upload_timeout_seconds = upload_timeout_seconds + @path_to_crashtracking_receiver_binary = path_to_crashtracking_receiver_binary + @ld_library_path = ld_library_path + end + + def start + start_or_update_on_fork(action: :start) + end + + def reset_after_fork + start_or_update_on_fork(action: :update_on_fork) + end + + def stop + begin + self.class._native_stop + Datadog.logger.debug('Crash tracking stopped successfully') + rescue => e + Datadog.logger.error("Failed to stop crash tracking: #{e.message}") + end + end + + private + + def start_or_update_on_fork(action:) + unless path_to_crashtracking_receiver_binary + Datadog.logger.warn( + "Cannot #{action} profiling crash tracking as no path_to_crashtracking_receiver_binary was found" + ) + return + end + + unless ld_library_path + Datadog.logger.warn( + "Cannot #{action} profiling crash tracking as no ld_library_path was found" + ) + return + end + + begin + self.class._native_start_or_update_on_fork( + action: action, + exporter_configuration: exporter_configuration, + path_to_crashtracking_receiver_binary: path_to_crashtracking_receiver_binary, + ld_library_path: ld_library_path, + tags_as_array: tags_as_array, + upload_timeout_seconds: Integer(upload_timeout_seconds), + ) + Datadog.logger.debug("Crash tracking #{action} successful") + rescue => e + Datadog.logger.error("Failed to #{action} crash tracking: #{e.message}") + end + end + end + end +end diff --git a/lib/datadog/profiling/http_transport.rb b/lib/datadog/profiling/http_transport.rb index 14dae1cf6fd..8b2f98bde7d 100644 --- a/lib/datadog/profiling/http_transport.rb +++ b/lib/datadog/profiling/http_transport.rb @@ -7,6 +7,8 @@ module Profiling # Used to report profiling data to Datadog. # Methods prefixed with _native_ are implemented in `http_transport.c` class HttpTransport + attr_reader :exporter_configuration + def initialize(agent_settings:, site:, api_key:, upload_timeout_seconds:) @upload_timeout_milliseconds = (upload_timeout_seconds * 1_000).to_i @@ -14,19 +16,19 @@ def initialize(agent_settings:, site:, api_key:, upload_timeout_seconds:) @exporter_configuration = if agentless?(site, api_key) - [:agentless, site, api_key] + [:agentless, site, api_key].freeze else - [:agent, base_url_from(agent_settings)] + [:agent, base_url_from(agent_settings)].freeze end - status, result = validate_exporter(@exporter_configuration) + status, result = validate_exporter(exporter_configuration) raise(ArgumentError, "Failed to initialize transport: #{result}") if status == :error end def export(flush) status, result = do_export( - exporter_configuration: @exporter_configuration, + exporter_configuration: exporter_configuration, upload_timeout_milliseconds: @upload_timeout_milliseconds, # why "timespec"? @@ -66,12 +68,6 @@ def export(flush) end end - # Used to log soft failures in `ddog_Vec_tag_push` (e.g. we still report the profile in these cases) - # Called from native code - def self.log_failure_to_process_tag(failure_details) - Datadog.logger.warn("Failed to add tag to profiling request: #{failure_details}") - end - private def base_url_from(agent_settings) @@ -136,7 +132,7 @@ def do_export( end def config_without_api_key - [@exporter_configuration[0..1]].to_h + [exporter_configuration[0..1]].to_h end end end diff --git a/lib/datadog/profiling/profiler.rb b/lib/datadog/profiling/profiler.rb index b90494eb21e..3231eabc330 100644 --- a/lib/datadog/profiling/profiler.rb +++ b/lib/datadog/profiling/profiler.rb @@ -8,21 +8,24 @@ class Profiler private - attr_reader :worker, :scheduler + attr_reader :worker, :scheduler, :optional_crashtracker public - def initialize(worker:, scheduler:) + def initialize(worker:, scheduler:, optional_crashtracker:) @worker = worker @scheduler = scheduler + @optional_crashtracker = optional_crashtracker end def start after_fork! do + optional_crashtracker.reset_after_fork if optional_crashtracker worker.reset_after_fork scheduler.reset_after_fork end + optional_crashtracker.start if optional_crashtracker worker.start(on_failure_proc: proc { component_failed(:worker) }) scheduler.start(on_failure_proc: proc { component_failed(:scheduler) }) end @@ -32,6 +35,7 @@ def shutdown! stop_worker stop_scheduler + optional_crashtracker.stop if optional_crashtracker end private @@ -51,6 +55,9 @@ def component_failed(failed_component) 'See previous log messages for details.' ) + # We explicitly not stop the crash tracker in this situation, under the assumption that, if a component failed, + # we're operating in a degraded state and crash tracking may still be helpful. + if failed_component == :worker stop_scheduler elsif failed_component == :scheduler diff --git a/sig/datadog/profiling/component.rbs b/sig/datadog/profiling/component.rbs index a2e8490dcff..9e74c5c7ea0 100644 --- a/sig/datadog/profiling/component.rbs +++ b/sig/datadog/profiling/component.rbs @@ -26,6 +26,11 @@ module Datadog Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings agent_settings ) -> untyped + def self.build_crashtracker: ( + untyped settings, + untyped transport, + ) -> Datadog::Profiling::Crashtracker? + def self.enable_gc_profiling?: (untyped settings) -> bool def self.enable_allocation_profiling?: (untyped settings) -> bool def self.get_heap_sample_every: (untyped settings) -> ::Integer diff --git a/sig/datadog/profiling/crashtracker.rbs b/sig/datadog/profiling/crashtracker.rbs new file mode 100644 index 00000000000..60b2057880c --- /dev/null +++ b/sig/datadog/profiling/crashtracker.rbs @@ -0,0 +1,44 @@ +module Datadog + module Profiling + class Crashtracker + type exporter_configuration_array = [:agentless | :agent, untyped] + + private + + attr_reader exporter_configuration: exporter_configuration_array + attr_reader tags_as_array: ::Array[[::String, ::String]] + attr_reader path_to_crashtracking_receiver_binary: ::String + attr_reader ld_library_path: ::String + attr_reader upload_timeout_seconds: ::Integer + + public + + def initialize: ( + exporter_configuration: exporter_configuration_array, + tags: ::Hash[::String, ::String], + upload_timeout_seconds: ::Integer, + ?path_to_crashtracking_receiver_binary: ::String, + ?ld_library_path: ::String, + ) -> void + + def start: -> void + def stop: -> void + def reset_after_fork: -> void + + private + + def start_or_update_on_fork: (action: :start | :update_on_fork) -> void + + def self._native_start_or_update_on_fork: ( + action: :start | :update_on_fork, + exporter_configuration: exporter_configuration_array, + path_to_crashtracking_receiver_binary: ::String, + ld_library_path: ::String, + tags_as_array: ::Array[[::String, ::String]], + upload_timeout_seconds: ::Integer, + ) -> void + + def self._native_stop: -> void + end + end +end diff --git a/sig/datadog/profiling/http_transport.rbs b/sig/datadog/profiling/http_transport.rbs index 0e27cdfac45..4713ebfb57e 100644 --- a/sig/datadog/profiling/http_transport.rbs +++ b/sig/datadog/profiling/http_transport.rbs @@ -3,6 +3,8 @@ module Datadog class HttpTransport type exporter_configuration_array = [:agentless | :agent, untyped] + attr_reader exporter_configuration: exporter_configuration_array + @upload_timeout_milliseconds: ::Integer @exporter_configuration: exporter_configuration_array @@ -15,8 +17,6 @@ module Datadog def export: (Datadog::Profiling::Flush flush) -> bool - def self.log_failure_to_process_tag: (::String failure_details) -> void - private def base_url_from: (Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings agent_settings) -> ::String diff --git a/sig/datadog/profiling/profiler.rbs b/sig/datadog/profiling/profiler.rbs index 37b360d4abf..b9c3180d41e 100644 --- a/sig/datadog/profiling/profiler.rbs +++ b/sig/datadog/profiling/profiler.rbs @@ -7,12 +7,14 @@ module Datadog attr_reader worker: Datadog::Profiling::Collectors::CpuAndWallTimeWorker attr_reader scheduler: Datadog::Profiling::Scheduler + attr_reader optional_crashtracker: Datadog::Profiling::Crashtracker public def initialize: ( worker: Datadog::Profiling::Collectors::CpuAndWallTimeWorker, scheduler: Datadog::Profiling::Scheduler, + optional_crashtracker: Datadog::Profiling::Crashtracker?, ) -> void def start: () -> void diff --git a/spec/datadog/core/configuration/settings_spec.rb b/spec/datadog/core/configuration/settings_spec.rb index 87900dde35c..b88625e75fb 100644 --- a/spec/datadog/core/configuration/settings_spec.rb +++ b/spec/datadog/core/configuration/settings_spec.rb @@ -765,6 +765,41 @@ .to(90) end end + + describe '#experimental_crash_tracking_enabled' do + subject(:experimental_crash_tracking_enabled) { settings.profiling.advanced.experimental_crash_tracking_enabled } + + context 'when DD_PROFILING_EXPERIMENTAL_CRASH_TRACKING_ENABLED' do + around do |example| + ClimateControl.modify('DD_PROFILING_EXPERIMENTAL_CRASH_TRACKING_ENABLED' => environment) do + example.run + end + end + + context 'is not defined' do + let(:environment) { nil } + + it { is_expected.to be false } + end + + [true, false].each do |value| + context "is defined as #{value}" do + let(:environment) { value.to_s } + + it { is_expected.to be value } + end + end + end + end + + describe '#experimental_crash_tracking_enabled=' do + it 'updates the #experimental_crash_tracking_enabled setting' do + expect { settings.profiling.advanced.experimental_crash_tracking_enabled = true } + .to change { settings.profiling.advanced.experimental_crash_tracking_enabled } + .from(false) + .to(true) + end + end end describe '#upload' do diff --git a/spec/datadog/profiling/component_spec.rb b/spec/datadog/profiling/component_spec.rb index b4740cf2c71..3ff73da8c4f 100644 --- a/spec/datadog/profiling/component_spec.rb +++ b/spec/datadog/profiling/component_spec.rb @@ -388,6 +388,7 @@ expect(Datadog::Profiling::Profiler).to receive(:new).with( worker: instance_of(Datadog::Profiling::Collectors::CpuAndWallTimeWorker), scheduler: anything, + optional_crashtracker: anything, ) build_profiler_component @@ -530,6 +531,61 @@ build_profiler_component end end + + context 'when crash tracking is disabled' do + before { settings.profiling.advanced.experimental_crash_tracking_enabled = false } + + it 'does not initialize the crash tracker' do + expect(Datadog::Profiling::Crashtracker).to_not receive(:new) + + build_profiler_component + end + end + + context 'when crash tracking is enabled' do + before { settings.profiling.advanced.experimental_crash_tracking_enabled = true } + + it 'initializes the crash tracker' do + expect(Datadog::Profiling::Crashtracker).to receive(:new).with( + exporter_configuration: array_including(:agent), + tags: hash_including('runtime' => 'ruby'), + upload_timeout_seconds: settings.profiling.upload.timeout_seconds, + ) + + build_profiler_component + end + + context 'when a custom transport is provided' do + let(:custom_transport) { double('Custom transport') } + + before do + settings.profiling.exporter.transport = custom_transport + allow(Datadog.logger).to receive(:warn) + end + + it 'warns that crash tracking will not be enabled' do + expect(Datadog.logger).to receive(:warn).with(/Cannot enable profiling crash tracking/) + + build_profiler_component + end + + it 'does not initialize the crash tracker' do + expect(Datadog::Profiling::Crashtracker).to_not receive(:new) + + build_profiler_component + end + end + + it 'initializes the profiler instance with the crash tracker' do + expect(Datadog::Profiling::Profiler).to receive(:new).with( + worker: anything, + scheduler: anything, + optional_crashtracker: instance_of(Datadog::Profiling::Crashtracker), + ) + + build_profiler_component + end + end end end diff --git a/spec/datadog/profiling/crashtracker_spec.rb b/spec/datadog/profiling/crashtracker_spec.rb new file mode 100644 index 00000000000..67f6258a970 --- /dev/null +++ b/spec/datadog/profiling/crashtracker_spec.rb @@ -0,0 +1,207 @@ +require 'datadog/profiling/spec_helper' +require 'datadog/profiling/crashtracker' + +require 'webrick' + +RSpec.describe Datadog::Profiling::Crashtracker do + before do + skip_if_profiling_not_supported(self) + + crash_tracker_pids = `pgrep -f libdatadog-crashtracking-receiver` + expect(crash_tracker_pids).to be_empty, "No crash tracker process should be running, found #{crash_tracker_pids}" + end + + let(:exporter_configuration) { [:agent, 'http://localhost:6006'] } + + let(:crashtracker_options) do + { + exporter_configuration: exporter_configuration, + tags: { 'tag1' => 'value1', 'tag2' => 'value2' }, + upload_timeout_seconds: 123, + } + end + + subject(:crashtracker) { described_class.new(**crashtracker_options) } + + describe '#start' do + subject(:start) { crashtracker.start } + + context 'when _native_start_or_update_on_fork raises an exception' do + it 'logs the exception' do + expect(described_class).to receive(:_native_start_or_update_on_fork) { raise 'Test failure' } + expect(Datadog.logger).to receive(:error).with(/Failed to start crash tracking: Test failure/) + + start + end + end + + context 'when path_to_crashtracking_receiver_binary is nil' do + subject(:crashtracker) { described_class.new(**crashtracker_options, path_to_crashtracking_receiver_binary: nil) } + + it 'logs a warning' do + expect(Datadog.logger).to receive(:warn).with(/no path_to_crashtracking_receiver_binary was found/) + + start + end + end + + context 'when ld_library_path is nil' do + subject(:crashtracker) { described_class.new(**crashtracker_options, ld_library_path: nil) } + + it 'logs a warning' do + expect(Datadog.logger).to receive(:warn).with(/no ld_library_path was found/) + + start + end + end + + it 'starts the crash tracker' do + start + + expect(`pgrep -f libdatadog-crashtracking-receiver`).to_not be_empty + + crashtracker.stop + end + + context 'when calling start multiple times in a row' do + it 'only starts the crash tracker once' do + 3.times { crashtracker.start } + + expect(`pgrep -f libdatadog-crashtracking-receiver`.lines.size).to be 1 + + crashtracker.stop + end + end + + context 'when upload_timeout_seconds is not an Integer' do + let(:crashtracker_options) { { **super(), upload_timeout_seconds: 12.34 } } + + it 'converts it to an Integer before calling _native_start_or_update_on_fork' do + expect(described_class) + .to receive(:_native_start_or_update_on_fork).with(hash_including(upload_timeout_seconds: 12)) + + start + end + end + end + + describe '#reset_after_fork' do + subject(:reset_after_fork) { crashtracker.reset_after_fork } + + context 'when called in a fork' do + before { crashtracker.start } + after { crashtracker.stop } + + it 'starts a second crash tracker for the fork' do + expect_in_fork do + crashtracker.reset_after_fork + + expect(`pgrep -f libdatadog-crashtracking-receiver`.lines.size).to be 2 + + crashtracker.stop + + expect(`pgrep -f libdatadog-crashtracking-receiver`.lines.size).to be 1 + end + end + end + end + + describe '#stop' do + subject(:stop) { crashtracker.stop } + + context 'when _native_stop_crashtracker raises an exception' do + it 'logs the exception' do + expect(described_class).to receive(:_native_stop) { raise 'Test failure' } + expect(Datadog.logger).to receive(:error).with(/Failed to stop crash tracking: Test failure/) + + stop + end + end + + it 'stops the crash tracker' do + crashtracker.start + + stop + + expect(`pgrep -f libdatadog-crashtracking-receiver`).to be_empty + end + end + + context 'integration testing' do + shared_context 'HTTP server' do + let(:server) do + WEBrick::HTTPServer.new( + Port: 0, + Logger: log, + AccessLog: access_log, + StartCallback: -> { init_signal.push(1) } + ) + end + let(:hostname) { '127.0.0.1' } + let(:log) { WEBrick::Log.new(StringIO.new, WEBrick::Log::WARN) } + let(:access_log_buffer) { StringIO.new } + let(:access_log) { [[access_log_buffer, WEBrick::AccessLog::COMBINED_LOG_FORMAT]] } + let(:server_proc) do + proc do |req, res| + messages << req.tap { req.body } # Read body, store message before socket closes. + res.body = '{}' + end + end + let(:init_signal) { Queue.new } + + let(:messages) { [] } + + before do + server.mount_proc('/', &server_proc) + @server_thread = Thread.new { server.start } + init_signal.pop + end + + after do + unless RSpec.current_example.skipped? + # When the test is skipped, server has not been initialized and @server_thread would be nil; thus we only + # want to touch them when the test actually run, otherwise we would cause the server to start (incorrectly) + # and join to be called on a nil @server_thread + server.shutdown + @server_thread.join + end + end + end + + include_context 'HTTP server' + + let(:request) { messages.first } + let(:port) { server[:Port] } + + let(:exporter_configuration) { [:agent, "http://#{hostname}:#{port}"] } + + it 'reports crashes via http' do + fork_expectations = proc do |status:, stdout:, stderr:| + expect(Signal.signame(status.termsig)).to eq('SEGV').or eq('ABRT') + expect(stderr).to include('[BUG] Segmentation fault') + end + + expect_in_fork(fork_expectations: fork_expectations) do + crashtracker.start + + Process.kill('SEGV', Process.pid) + end + + crash_report = JSON.parse(request.body, symbolize_names: true)[:payload].first + + expect(crash_report[:stack_trace]).to_not be_empty + expect(crash_report[:tags]).to include('signum:11', 'signame:SIGSEGV') + + crash_report_message = JSON.parse(crash_report[:message], symbolize_names: true) + + expect(crash_report_message[:metadata]).to include( + profiling_library_name: 'dd-trace-rb', + profiling_library_version: Datadog::VERSION::STRING, + family: 'ruby', + tags: ['tag1:value1', 'tag2:value2'], + ) + expect(crash_report_message[:files][:'/proc/self/maps']).to_not be_empty + expect(crash_report_message[:os_info]).to_not be_empty + end + end +end diff --git a/spec/datadog/profiling/http_transport_spec.rb b/spec/datadog/profiling/http_transport_spec.rb index 472e100874f..bc900a3dec1 100644 --- a/spec/datadog/profiling/http_transport_spec.rb +++ b/spec/datadog/profiling/http_transport_spec.rb @@ -279,18 +279,23 @@ end end + describe '#exporter_configuration' do + it 'returns the current exporter configuration' do + expect(http_transport.exporter_configuration).to eq [:agent, 'http://192.168.0.1:12345/'] + end + end + context 'integration testing' do shared_context 'HTTP server' do let(:server) do WEBrick::HTTPServer.new( - Port: port, + Port: 0, Logger: log, AccessLog: access_log, StartCallback: -> { init_signal.push(1) } ) end let(:hostname) { '127.0.0.1' } - let(:port) { 6006 } let(:log) { WEBrick::Log.new($stderr, WEBrick::Log::WARN) } let(:access_log_buffer) { StringIO.new } let(:access_log) { [[access_log_buffer, WEBrick::AccessLog::COMBINED_LOG_FORMAT]] } @@ -326,7 +331,7 @@ let(:request) { messages.first } let(:hostname) { '127.0.0.1' } - let(:port) { '6006' } + let(:port) { server[:Port] } shared_examples 'correctly reports profiling data' do it 'correctly reports profiling data' do @@ -378,7 +383,7 @@ it 'exports data via http to the agent url' do http_transport.export(flush) - expect(request.request_uri.to_s).to eq 'http://127.0.0.1:6006/profiling/v1/input' + expect(request.request_uri.to_s).to eq "http://127.0.0.1:#{port}/profiling/v1/input" end context 'when code provenance data is not available' do diff --git a/spec/datadog/profiling/profiler_spec.rb b/spec/datadog/profiling/profiler_spec.rb index 32bd0a7bf79..98ee1722754 100644 --- a/spec/datadog/profiling/profiler_spec.rb +++ b/spec/datadog/profiling/profiler_spec.rb @@ -6,10 +6,13 @@ RSpec.describe Datadog::Profiling::Profiler do before { skip_if_profiling_not_supported(self) } - subject(:profiler) { described_class.new(worker: worker, scheduler: scheduler) } + subject(:profiler) do + described_class.new(worker: worker, scheduler: scheduler, optional_crashtracker: optional_crashtracker) + end let(:worker) { instance_double(Datadog::Profiling::Collectors::CpuAndWallTimeWorker) } let(:scheduler) { instance_double(Datadog::Profiling::Scheduler) } + let(:optional_crashtracker) { nil } describe '#start' do subject(:start) { profiler.start } @@ -21,6 +24,19 @@ start end + context 'when a crash tracker instance is provided' do + let(:optional_crashtracker) { instance_double(Datadog::Profiling::Crashtracker) } + + it 'signals the crash tracker to start before other components' do + expect(optional_crashtracker).to receive(:start).ordered + + expect(worker).to receive(:start).ordered + expect(scheduler).to receive(:start).ordered + + start + end + end + context 'when called after a fork' do before { skip('Spec requires Ruby VM supporting fork') unless PlatformHelpers.supports_fork? } @@ -37,6 +53,26 @@ start end end + + context 'when a crash tracker instance is provided' do + let(:optional_crashtracker) { instance_double(Datadog::Profiling::Crashtracker) } + + it 'resets the crash tracker before other coponents, as well as restarts it before other components' do + profiler # make sure instance is created in parent, so it detects the forking + + expect_in_fork do + expect(optional_crashtracker).to receive(:reset_after_fork).ordered + expect(worker).to receive(:reset_after_fork).ordered + expect(scheduler).to receive(:reset_after_fork).ordered + + expect(optional_crashtracker).to receive(:start).ordered + expect(worker).to receive(:start).ordered + expect(scheduler).to receive(:start).ordered + + start + end + end + end end end @@ -51,11 +87,26 @@ shutdown! end + + context 'when a crash tracker instance is provided' do + let(:optional_crashtracker) { instance_double(Datadog::Profiling::Crashtracker) } + + it 'signals the crash tracker to stop, after other components have stopped' do + expect(worker).to receive(:stop).ordered + allow(scheduler).to receive(:enabled=) + expect(scheduler).to receive(:stop).ordered + + expect(optional_crashtracker).to receive(:stop).ordered + + shutdown! + end + end end describe 'Component failure handling' do let(:worker) { instance_double(Datadog::Profiling::Collectors::CpuAndWallTimeWorker, start: nil) } let(:scheduler) { instance_double(Datadog::Profiling::Scheduler, start: nil) } + let(:optional_crashtracker) { instance_double(Datadog::Profiling::Crashtracker, start: nil) } before { allow(Datadog.logger).to receive(:warn) } @@ -69,10 +120,12 @@ on_failure.call end - it 'logs the issue' do + before do allow(scheduler).to receive(:enabled=) allow(scheduler).to receive(:stop) + end + it 'logs the issue' do expect(Datadog.logger).to receive(:warn).with(/worker component/) worker_on_failure @@ -84,6 +137,12 @@ worker_on_failure end + + it 'does not stop the crashtracker' do + expect(optional_crashtracker).to_not receive(:stop) + + worker_on_failure + end end context 'when the scheduler failed' do @@ -96,9 +155,11 @@ on_failure.call end - it 'logs the issue' do + before do allow(worker).to receive(:stop) + end + it 'logs the issue' do expect(Datadog.logger).to receive(:warn).with(/scheduler component/) scheduler_on_failure @@ -109,6 +170,12 @@ scheduler_on_failure end + + it 'does not stop the crashtracker' do + expect(optional_crashtracker).to_not receive(:stop) + + scheduler_on_failure + end end context 'when unknown component failed' do diff --git a/spec/datadog/tracing/transport/http/adapters/net_integration_spec.rb b/spec/datadog/tracing/transport/http/adapters/net_integration_spec.rb index 2c70b30cac8..9edf8756dc0 100644 --- a/spec/datadog/tracing/transport/http/adapters/net_integration_spec.rb +++ b/spec/datadog/tracing/transport/http/adapters/net_integration_spec.rb @@ -7,7 +7,7 @@ require 'datadog/core/transport/http/adapters/net' RSpec.describe 'Adapters::Net tracing integration tests' do - before { skip unless ENV['TEST_DATADOG_INTEGRATION'] } + before { skip('Skipping test as ENV["TEST_DATADOG_INTEGRATION"] is not set') unless ENV['TEST_DATADOG_INTEGRATION'] } subject(:adapter) { Datadog::Core::Transport::HTTP::Adapters::Net.new(agent_settings) } @@ -26,14 +26,14 @@ # HTTP let(:server) do WEBrick::HTTPServer.new( - Port: port, + Port: 0, Logger: log, AccessLog: access_log, StartCallback: -> { init_signal.push(1) } ) end let(:hostname) { '127.0.0.1' } - let(:port) { 6218 } + let(:port) { server[:Port] } let(:log) { WEBrick::Log.new(log_buffer) } let(:log_buffer) { StringIO.new } let(:access_log) { [[log_buffer, WEBrick::AccessLog::COMBINED_LOG_FORMAT]] } diff --git a/vendor/rbs/libdatadog/0/libdatadog.rbs b/vendor/rbs/libdatadog/0/libdatadog.rbs index e47cb4dd456..04a25f36cfd 100644 --- a/vendor/rbs/libdatadog/0/libdatadog.rbs +++ b/vendor/rbs/libdatadog/0/libdatadog.rbs @@ -1,3 +1,5 @@ module Libdatadog + def self.path_to_crashtracking_receiver_binary: () -> ::String? + def self.ld_library_path: () -> ::String? def self.pkgconfig_folder: () -> ::String? end