Skip to content

Commit

Permalink
[PLAT-14867] Make sure restart alerts don't trigger for small time updates during NTP sync
Browse files Browse the repository at this point in the history

Summary:
Currently some customers are seeing bogus DB node restart alerts.
The cause is that the node boot time (and process start time) metrics shift by 1 second during NTP time syncs.
This diff makes sure we only trigger node/process restart alerts when the node/process boot time increases significantly (by more than 10 seconds).

Test Plan:
Restart DB node via cloud console.
Make sure both DB node and DB instance restart alerts are triggered with WARNING severity.
Restart the DB node via cloud console 2 more times, with a 1-minute delay between restarts.
Make sure both DB node and DB instance restart alerts are triggered with SEVERE severity.
Wait 30 minutes for the alerts to clear.

Reviewers: vbansal, #yba-api-review!

Reviewed By: vbansal

Subscribers: sanketh, yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D37773
  • Loading branch information
anmalysh-yb committed Sep 5, 2024
1 parent 6614afb commit 1153b56
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ private List<Collector.MetricFamilySamples> getPrecalculatedMetrics() {
// Prometheus client library expects empty string in case metric has no unit
unit = StringUtils.EMPTY;
}
if (!unit.isEmpty() && !metricName.endsWith("_" + unit)) {
// Seems like one of the metrics has an invalid unit.
// Let's just clear the unit for this one and log it.
log.warn("Metric name {} should end with '{}'", metricName, "_" + unit);
unit = StringUtils.EMPTY;
}
Collector.Type type = metrics.get(0).getType().getPrometheusType();

List<Collector.MetricFamilySamples.Sample> samples =
Expand Down
13 changes: 7 additions & 6 deletions managed/src/main/resources/alert/alert_templates.yml
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ templates:
name: DB Instance restart
description: Unexpected Master or TServer process restart(s) occurred during
last 30 minutes
queryTemplate: max by (universe_uuid) (changes(yb_node_boot_time{universe_uuid="__universeUuid__"}[30m])
queryTemplate: max by (universe_uuid) ((sum_over_time((increase(yb_node_boot_time{universe_uuid="__universeUuid__"}[1m]) > bool 10)[30m:1m]))
and on (universe_uuid) (max_over_time(ybp_universe_update_in_progress{universe_uuid="__universeUuid__"}[35m])
== 0)) {{ query_condition }} {{ query_threshold }}
createForNewCustomer: true
Expand All @@ -335,7 +335,7 @@ templates:
labels:
affected_node_names: >-
{{ range $index, $element := query "max by (universe_uuid, node_name)
(changes(yb_node_boot_time{universe_uuid='{{ $labels.universe_uuid }}'}[30m])
((sum_over_time((increase(yb_node_boot_time{universe_uuid='{{ $labels.universe_uuid }}'}[1m]) > bool 10)[30m:1m]))
and on (universe_uuid) (max_over_time(ybp_universe_update_in_progress{universe_uuid='{{ $labels.universe_uuid }}'}[35m])
== 0)) {{ query_condition }} {{ query_threshold }}" }}{{if $index}},{{end}}{{ $element.Labels.node_name }}{{ end }}
annotations:
Expand Down Expand Up @@ -1079,7 +1079,8 @@ templates:
NODE_RESTART:
name: DB node restart
description: Unexpected DB node restart(s) occurred during last 30 minutes
queryTemplate: max by (universe_uuid) (changes(node_boot_time_seconds{universe_uuid="__universeUuid__"}[30m]))
queryTemplate: max by (universe_uuid) (sum_over_time(
(increase(node_boot_time_seconds{universe_uuid="__universeUuid__"}[1m]) > bool 10)[30m:1m]))
{{ query_condition }} {{ query_threshold }}
createForNewCustomer: true
defaultThresholdMap:
Expand All @@ -1094,7 +1095,7 @@ templates:
labels:
affected_node_names: >-
{{ range $index, $element := query "max by (universe_uuid, node_name)
(changes(node_boot_time_seconds{universe_uuid='{{ $labels.universe_uuid }}'}[30m]))
(sum_over_time((increase(node_boot_time_seconds{universe_uuid='{{ $labels.universe_uuid }}'}[1m]) > bool 10)[30m:1m]))
{{ query_condition }} {{ query_threshold }}" }}{{if $index}},{{end}}{{ $element.Labels.node_name }}{{ end }}
annotations:
summary: >-
Expand Down Expand Up @@ -1906,7 +1907,7 @@ templates:
name: High clock drift
description: Local clock on the node has drift too far from the actual time in the past 10 minutes.
queryTemplate: max by (universe_uuid) (yb_node_clock_drift_check_ms{universe_uuid="__universeUuid__"}) {{ query_condition }} {{ query_threshold }}
createForNewCustomer: true
createForNewCustomer: true
defaultThresholdMap:
WARNING:
threshold: 200.0
Expand All @@ -1925,4 +1926,4 @@ templates:
summary: >-
Clock drift is high for universe '{{ $labels.source_name }}'.
Current drift from actual clock is {{ $value }} milliseconds.
Affected nodes: {{ $labels.affected_node_names }}
Affected nodes: {{ $labels.affected_node_names }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Copyright (c) YugaByte, Inc.

-- Recreate alert definitions for the DB instance restart and DB node restart
-- templates: reset config_written to false on every alert definition whose
-- configuration uses the DB_INSTANCE_RESTART or NODE_RESTART template.
-- NOTE(review): presumably this makes the definitions get rewritten with the
-- updated restart-alert queries - confirm against the alert config writer.
update alert_definition set config_written = false where configuration_uuid IN
(select uuid from alert_configuration where template in ('DB_INSTANCE_RESTART','NODE_RESTART'));
2 changes: 1 addition & 1 deletion managed/src/main/resources/health/node_health.py.template
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ YB_NODE_CONTROLLER_CHECK = MetricDefinition(
YB_NODE_CLOCK_DRIFT_CHECK = MetricDefinition(
"yb_node_clock_drift_check_ms",
"Time Drift between nodes within limits",
"millisec")
"ms")
YB_NODE_NTP_SERVICE_STATUS = MetricDefinition(
"yb_node_ntp_service_status",
"If chronyd or ntp(d) services are running")
Expand Down

0 comments on commit 1153b56

Please sign in to comment.