Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Disable GC profiling on Ruby 3 by default due to Ruby VM Ractor-related bug #2354

Merged
merged 2 commits into from
Nov 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ struct cpu_and_wall_time_worker_state {
// telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
// atomic operations. stdatomic.h seems a nice thing to reach out for.
volatile bool should_run;

bool gc_profiling_enabled;
VALUE cpu_and_wall_time_collector_instance;

// When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
Expand All @@ -76,7 +76,12 @@ struct cpu_and_wall_time_worker_state {
};

static VALUE _native_new(VALUE klass);
static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance);
static VALUE _native_initialize(
DDTRACE_UNUSED VALUE _self,
VALUE self_instance,
VALUE cpu_and_wall_time_collector_instance,
VALUE gc_profiling_enabled
);
static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
static VALUE _native_sampling_loop(VALUE self, VALUE instance);
static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
Expand Down Expand Up @@ -128,7 +133,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
// https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);

rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 2);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 3);
Comment on lines -131 to +136
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For context, the 2 to 3 change here is the number of positional arguments to the method. This is documented here.

Effectively there’s three ways to “wire up” a C function to be called as a Ruby method:

  • if you pass in a number >= 0 then that’s the number of positional arguments to the method. E.g. def foo(a, b, c) => rb_define_method(..., foo_implementation, 3) and foo_implementation is expected to have a signature that receives self + 3 other arguments, e.g. VALUE foo_implementation(VALUE self, VALUE a, VALUE b, VALUE c).
  • if you pass in -1 then Ruby will pass the arguments as a C array
  • if you pas in -2 then Ruby will pass in the arguments as a Ruby array (e.g. it’s the C equivalent of def foo(*args))

I moved it from 2 to 3 since I was passing the new gc_profiling_enabled value

rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
Expand Down Expand Up @@ -156,17 +161,26 @@ static VALUE _native_new(VALUE klass) {
struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));

state->should_run = false;
state->gc_profiling_enabled = false;
state->cpu_and_wall_time_collector_instance = Qnil;
state->failure_exception = Qnil;
state->gc_tracepoint = Qnil;

return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
}

static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance) {
static VALUE _native_initialize(
DDTRACE_UNUSED VALUE _self,
VALUE self_instance,
VALUE cpu_and_wall_time_collector_instance,
VALUE gc_profiling_enabled
) {
ENFORCE_BOOLEAN(gc_profiling_enabled);

struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue);
state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);

Expand Down Expand Up @@ -203,7 +217,7 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one

install_sigprof_signal_handler(handle_sampling_signal);
rb_tracepoint_enable(state->gc_tracepoint);
if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);

// Release GVL, get to the actual work!
int exception_state;
Expand Down
1 change: 0 additions & 1 deletion ext/ddtrace_profiling_native_extension/ruby_helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

void raise_unexpected_type(
VALUE value,
DDTRACE_UNUSED enum ruby_value_type _type,
const char *value_name,
const char *type_name,
const char *file,
Expand Down
6 changes: 4 additions & 2 deletions ext/ddtrace_profiling_native_extension/ruby_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,14 @@ static inline int check_if_pending_exception(void) {
// Ruby has a Check_Type(value, type) that is roughly equivalent to this BUT Ruby's version is rather cryptic when it fails
// e.g. "wrong argument type nil (expected String)". This is a replacement that prints more information to help debugging.
#define ENFORCE_TYPE(value, type) \
{ if (RB_UNLIKELY(!RB_TYPE_P(value, type))) raise_unexpected_type(value, type, ADD_QUOTES(value), ADD_QUOTES(type), __FILE__, __LINE__, __func__); }
{ if (RB_UNLIKELY(!RB_TYPE_P(value, type))) raise_unexpected_type(value, ADD_QUOTES(value), ADD_QUOTES(type), __FILE__, __LINE__, __func__); }

#define ENFORCE_BOOLEAN(value) \
{ if (RB_UNLIKELY(value != Qtrue && value != Qfalse)) raise_unexpected_type(value, ADD_QUOTES(value), "true or false", __FILE__, __LINE__, __func__); }

// Called by ENFORCE_TYPE; should not be used directly
NORETURN(void raise_unexpected_type(
VALUE value,
enum ruby_value_type type,
const char *value_name,
const char *type_name,
const char *file,
Expand Down
17 changes: 17 additions & 0 deletions lib/datadog/core/configuration/components.rb
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ def build_profiler(settings, agent_settings, tracer)
recorder: recorder,
max_frames: settings.profiling.advanced.max_frames,
tracer: tracer,
gc_profiling_enabled: should_enable_gc_profiling?(settings)
)
else
trace_identifiers_helper = Profiling::TraceIdentifiers::Helper.new(
Expand Down Expand Up @@ -328,6 +329,22 @@ def build_profiler_transport(settings, agent_settings)
upload_timeout_seconds: settings.profiling.upload.timeout_seconds,
)
end

def should_enable_gc_profiling?(settings)
return true if Gem::Version.new(RUBY_VERSION) < Gem::Version.new('3')

# See comments on the setting definition for more context on why it exists.
if settings.profiling.advanced.force_enable_gc_profiling
Datadog.logger.debug(
'Profiling time/resources spent in Garbage Collection force enabled. Do not use Ractors in combination ' \
'with this option as profiles will be incomplete.'
)

true
else
false
end
end
end

attr_reader \
Expand Down
19 changes: 19 additions & 0 deletions lib/datadog/core/configuration/settings.rb
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,25 @@ def initialize(*_)
o.default { env_to_bool('DD_PROFILING_FORCE_ENABLE_NEW', false) }
o.lazy
end

# Forces enabling of profiling of time/resources spent in Garbage Collection.
#
# Note that setting this to "false" (or not setting it) will not prevent the feature from being
# being automatically enabled in the future.
#
# This toggle was added because, although this feature is safe and enabled by default on Ruby 2.x,
# on Ruby 3.x it can break in applications that make use of Ractors due to a Ruby VM bug
# (https://bugs.ruby-lang.org/issues/19112).
#
# If you use Ruby 3.x and your application does not use Ractors (or if your Ruby has been patched), the
# feature is fully safe to enable and this toggle can be used to do so.
#
# We expect that once the above issue is patched, we'll automatically re-enable the feature on fixed Ruby
# versions.
option :force_enable_gc_profiling do |o|
o.default { env_to_bool('DD_PROFILING_FORCE_ENABLE_GC', false) }
o.lazy
end
end

# @public_api
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ def initialize(
recorder:,
max_frames:,
tracer:,
gc_profiling_enabled:,
cpu_and_wall_time_collector: CpuAndWallTime.new(recorder: recorder, max_frames: max_frames, tracer: tracer)
)
self.class._native_initialize(self, cpu_and_wall_time_collector)
self.class._native_initialize(self, cpu_and_wall_time_collector, gc_profiling_enabled)
@worker_thread = nil
@failure_exception = nil
@start_stop_mutex = Mutex.new
Expand Down
47 changes: 47 additions & 0 deletions spec/datadog/core/configuration/components_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1031,11 +1031,58 @@
recorder: instance_of(Datadog::Profiling::StackRecorder),
max_frames: settings.profiling.advanced.max_frames,
tracer: tracer,
gc_profiling_enabled: anything,
)

build_profiler
end

context 'on Ruby 2.x' do
before { skip 'Behavior does not apply to current Ruby version' if RUBY_VERSION >= '3' }

it 'initializes a CpuAndWallTimeWorker collector with gc_profiling_enabled set to true' do
expect(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to receive(:new).with hash_including(
gc_profiling_enabled: true,
)

build_profiler
end
end

context 'on Ruby 3.x' do
before { skip 'Behavior does not apply to current Ruby version' if RUBY_VERSION < '3' }

it 'initializes a CpuAndWallTimeWorker collector with gc_profiling_enabled set to false' do
expect(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to receive(:new).with hash_including(
gc_profiling_enabled: false,
)

build_profiler
end

context 'when force_enable_gc_profiling is enabled' do
before do
settings.profiling.advanced.force_enable_gc_profiling = true

allow(Datadog.logger).to receive(:debug)
end

it 'initializes a CpuAndWallTimeWorker collector with gc_profiling_enabled set to true' do
expect(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to receive(:new).with hash_including(
gc_profiling_enabled: true,
)

build_profiler
end

it 'logs a debug message' do
expect(Datadog.logger).to receive(:debug).with(/Garbage Collection force enabled/)

build_profiler
end
end
end

it 'sets up the Profiler with the CpuAndWallTimeWorker collector' do
expect(Datadog::Profiling::Profiler).to receive(:new).with(
[instance_of(Datadog::Profiling::Collectors::CpuAndWallTimeWorker)],
Expand Down
35 changes: 35 additions & 0 deletions spec/datadog/core/configuration/settings_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,41 @@
.to(true)
end
end

describe '#force_enable_gc_profiling' do
subject(:force_enable_gc_profiling) { settings.profiling.advanced.force_enable_gc_profiling }

context 'when DD_PROFILING_FORCE_ENABLE_GC' do
around do |example|
ClimateControl.modify('DD_PROFILING_FORCE_ENABLE_GC' => environment) do
example.run
end
end

context 'is not defined' do
let(:environment) { nil }

it { is_expected.to be false }
end

{ 'true' => true, 'false' => false }.each do |string, value|
context "is defined as #{string}" do
let(:environment) { string }

it { is_expected.to be value }
end
end
end
end

describe '#force_enable_gc_profiling=' do
it 'updates the #force_enable_gc_profiling setting' do
expect { settings.profiling.advanced.force_enable_gc_profiling = true }
.to change { settings.profiling.advanced.force_enable_gc_profiling }
.from(false)
.to(true)
end
end
end

describe '#upload' do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
before { skip_if_profiling_not_supported(self) }

let(:recorder) { Datadog::Profiling::StackRecorder.new }
let(:gc_profiling_enabled) { true }

subject(:cpu_and_wall_time_worker) { described_class.new(recorder: recorder, max_frames: 400, tracer: nil) }
subject(:cpu_and_wall_time_worker) do
described_class.new(recorder: recorder, max_frames: 400, tracer: nil, gc_profiling_enabled: gc_profiling_enabled)
end

describe '.new' do
it 'creates the garbage collection tracepoint in the disabled state' do
Expand Down Expand Up @@ -47,7 +50,12 @@

allow(Datadog.logger).to receive(:warn)

another_instance = described_class.new(recorder: Datadog::Profiling::StackRecorder.new, max_frames: 400, tracer: nil)
another_instance = described_class.new(
recorder: Datadog::Profiling::StackRecorder.new,
max_frames: 400,
tracer: nil,
gc_profiling_enabled: gc_profiling_enabled,
)
another_instance.start

exception = try_wait_until(backoff: 0.01) { another_instance.send(:failure_exception) }
Expand All @@ -61,10 +69,24 @@
expect(described_class::Testing._native_current_sigprof_signal_handler).to be :profiling
end

it 'enables the garbage collection tracepoint' do
start
context 'when gc_profiling_enabled is true' do
let(:gc_profiling_enabled) { true }

it 'enables the garbage collection tracepoint' do
start

expect(described_class::Testing._native_gc_tracepoint(cpu_and_wall_time_worker)).to be_enabled
expect(described_class::Testing._native_gc_tracepoint(cpu_and_wall_time_worker)).to be_enabled
end
end

context 'when gc_profiling_enabled is false' do
let(:gc_profiling_enabled) { false }

it 'does not enable the garbage collection tracepoint' do
start

expect(described_class::Testing._native_gc_tracepoint(cpu_and_wall_time_worker)).to_not be_enabled
end
end

context 'when a previous signal handler existed' do
Expand Down Expand Up @@ -117,6 +139,15 @@
end

it 'records garbage collection cycles' do
if RUBY_VERSION.start_with?('3.')
skip(
'This test (and feature...) is broken on Ruby 3 if any Ractors get used due to a bug in the VM during ' \
'Ractor GC, see https://bugs.ruby-lang.org/issues/19112 for details. ' \
'For that reason, we disable this feature on Ruby 3 by default by passing `gc_profiling_enabled: false` during ' \
'profiler initialization.'
)
end

start

described_class::Testing._native_trigger_sample
Expand Down