[PLAT-14867] Make sure restart alerts don't trigger for small time updates during NTP sync

Summary: Currently some customers are seeing bogus DB node restart alerts. The reason is that the node boot time (and process start time) metrics experience 1-second changes during NTP time syncs. This diff makes sure we only trigger node/process restart alerts when the node/process boot time has increased significantly (> 10 seconds).

Test Plan: Restart a DB node via the cloud console. Make sure both the DB node and DB instance restart alerts are triggered with WARNING severity. Restart the DB node via the cloud console 2 more times with a 1-minute delay between restarts. Make sure both the DB node and DB instance restart alerts are triggered with SEVERE severity. Wait 30 minutes for the alerts to clear.

Reviewers: vbansal, #yba-api-review!

Reviewed By: vbansal

Subscribers: sanketh, yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D37773
yugabyte · Sep 5, 2024 · 1153b56 · 1153b56
1 parent 6614afb
commit 1153b56
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 7 deletions.
diff --git a/managed/src/main/java/com/yugabyte/yw/controllers/MetricsController.java b/managed/src/main/java/com/yugabyte/yw/controllers/MetricsController.java
@@ -128,6 +128,12 @@ private List<Collector.MetricFamilySamples> getPrecalculatedMetrics() {
         // Prometheus client library expects empty string in case metric has no unit
         unit = StringUtils.EMPTY;
       }
+      if (!unit.isEmpty() && !metricName.endsWith("_" + unit)) {
+        // Seems like one of the metrics has an invalid unit.
+        // Let's just clear the unit for this one and log it.
+        log.warn("Metric name {} should end with '{}'", metricName, "_" + unit);
+        unit = StringUtils.EMPTY;
+      }
       Collector.Type type = metrics.get(0).getType().getPrometheusType();
 
       List<Collector.MetricFamilySamples.Sample> samples =

diff --git a/managed/src/main/resources/alert/alert_templates.yml b/managed/src/main/resources/alert/alert_templates.yml
@@ -319,7 +319,7 @@ templates:
     name: DB Instance restart
     description: Unexpected Master or TServer process restart(s) occurred during
       last 30 minutes
-    queryTemplate: max by (universe_uuid) (changes(yb_node_boot_time{universe_uuid="__universeUuid__"}[30m])
+    queryTemplate: max by (universe_uuid) ((sum_over_time((increase(yb_node_boot_time{universe_uuid="__universeUuid__"}[1m]) > bool 10)[30m:1m]))
       and on (universe_uuid) (max_over_time(ybp_universe_update_in_progress{universe_uuid="__universeUuid__"}[35m])
       == 0)) {{ query_condition }} {{ query_threshold }}
     createForNewCustomer: true
@@ -335,7 +335,7 @@ templates:
     labels:
       affected_node_names: >-
         {{ range $index, $element := query "max by (universe_uuid, node_name)
-        (changes(yb_node_boot_time{universe_uuid='{{ $labels.universe_uuid }}'}[30m])
+        ((sum_over_time((increase(yb_node_boot_time{universe_uuid='{{ $labels.universe_uuid }}'}[1m]) > bool 10)[30m:1m]))
         and on (universe_uuid) (max_over_time(ybp_universe_update_in_progress{universe_uuid='{{ $labels.universe_uuid }}'}[35m])
         == 0)) {{ query_condition }} {{ query_threshold }}" }}{{if $index}},{{end}}{{ $element.Labels.node_name }}{{ end }}
     annotations:
@@ -1079,7 +1079,8 @@ templates:
   NODE_RESTART:
     name: DB node restart
     description: Unexpected DB node restart(s) occurred during last 30 minutes
-    queryTemplate: max by (universe_uuid) (changes(node_boot_time_seconds{universe_uuid="__universeUuid__"}[30m]))
+    queryTemplate: max by (universe_uuid) (sum_over_time(
+      (increase(node_boot_time_seconds{universe_uuid="__universeUuid__"}[1m]) > bool 10)[30m:1m]))
       {{ query_condition }} {{ query_threshold }}
     createForNewCustomer: true
     defaultThresholdMap:
@@ -1094,7 +1095,7 @@ templates:
     labels:
       affected_node_names: >-
         {{ range $index, $element := query "max by (universe_uuid, node_name)
-        (changes(node_boot_time_seconds{universe_uuid='{{ $labels.universe_uuid }}'}[30m]))
+        (sum_over_time((increase(node_boot_time_seconds{universe_uuid='{{ $labels.universe_uuid }}'}[1m]) > bool 10)[30m:1m]))
         {{ query_condition }} {{ query_threshold }}" }}{{if $index}},{{end}}{{ $element.Labels.node_name }}{{ end }}
     annotations:
       summary: >-
@@ -1906,7 +1907,7 @@ templates:
     name: High clock drift
     description: Local clock on the node has drift too far from the actual time in the past 10 minutes.
     queryTemplate: max by (universe_uuid) (yb_node_clock_drift_check_ms{universe_uuid="__universeUuid__"}) {{ query_condition }} {{ query_threshold }}
-    createForNewCustomer: true 
+    createForNewCustomer: true
     defaultThresholdMap:
       WARNING:
         threshold: 200.0
@@ -1925,4 +1926,4 @@ templates:
       summary: >-
         Clock drift is high for universe '{{ $labels.source_name }}'.
         Current drift from actual clock is {{ $value }} milliseconds.
-        Affected nodes: {{ $labels.affected_node_names }}
+        Affected nodes: {{ $labels.affected_node_names }}
diff --git a/managed/src/main/resources/db/migration/default_/postgres/V373__Restart_Alerts_Update.sql b/managed/src/main/resources/db/migration/default_/postgres/V373__Restart_Alerts_Update.sql
@@ -0,0 +1,5 @@
+-- Copyright (c) YugaByte, Inc.
+
+-- Recreate restart alert definitions (mark them as not written) so they pick up the updated queries
+update alert_definition set config_written = false where configuration_uuid IN
+  (select uuid from alert_configuration where template in ('DB_INSTANCE_RESTART','NODE_RESTART'));
diff --git a/managed/src/main/resources/health/node_health.py.template b/managed/src/main/resources/health/node_health.py.template
@@ -226,7 +226,7 @@ YB_NODE_CONTROLLER_CHECK = MetricDefinition(
 YB_NODE_CLOCK_DRIFT_CHECK = MetricDefinition(
     "yb_node_clock_drift_check_ms",
     "Time Drift between nodes within limits",
-    "millisec")
+    "ms")
 YB_NODE_NTP_SERVICE_STATUS = MetricDefinition(
     "yb_node_ntp_service_status",
     "If chronyd or ntp(d) services are running")