diff --git a/managed/src/main/java/com/yugabyte/yw/controllers/MetricsController.java b/managed/src/main/java/com/yugabyte/yw/controllers/MetricsController.java index 26e0bd5ab075..86842bc1eba8 100644 --- a/managed/src/main/java/com/yugabyte/yw/controllers/MetricsController.java +++ b/managed/src/main/java/com/yugabyte/yw/controllers/MetricsController.java @@ -128,6 +128,12 @@ private List getPrecalculatedMetrics() { // Prometheus client library expects empty string in case metric has no unit unit = StringUtils.EMPTY; } + if (!unit.isEmpty() && !metricName.endsWith("_" + unit)) { + // Seems like one of the metrics has an invalid unit. + // Let's just clear the unit for this one and log it. + log.warn("Metric name {} should end with '{}'", metricName, "_" + unit); + unit = StringUtils.EMPTY; + } Collector.Type type = metrics.get(0).getType().getPrometheusType(); List samples = diff --git a/managed/src/main/resources/alert/alert_templates.yml b/managed/src/main/resources/alert/alert_templates.yml index c8b4b34a6e06..0f5dac96287d 100644 --- a/managed/src/main/resources/alert/alert_templates.yml +++ b/managed/src/main/resources/alert/alert_templates.yml @@ -319,7 +319,7 @@ templates: name: DB Instance restart description: Unexpected Master or TServer process restart(s) occurred during last 30 minutes - queryTemplate: max by (universe_uuid) (changes(yb_node_boot_time{universe_uuid="__universeUuid__"}[30m]) + queryTemplate: max by (universe_uuid) ((sum_over_time((increase(yb_node_boot_time{universe_uuid="__universeUuid__"}[1m]) > bool 10)[30m:1m])) and on (universe_uuid) (max_over_time(ybp_universe_update_in_progress{universe_uuid="__universeUuid__"}[35m]) == 0)) {{ query_condition }} {{ query_threshold }} createForNewCustomer: true @@ -335,7 +335,7 @@ templates: labels: affected_node_names: >- {{ range $index, $element := query "max by (universe_uuid, node_name) - (changes(yb_node_boot_time{universe_uuid='{{ $labels.universe_uuid }}'}[30m]) + 
((sum_over_time((increase(yb_node_boot_time{universe_uuid='{{ $labels.universe_uuid }}'}[1m]) > bool 10)[30m:1m])) and on (universe_uuid) (max_over_time(ybp_universe_update_in_progress{universe_uuid='{{ $labels.universe_uuid }}'}[35m]) == 0)) {{ query_condition }} {{ query_threshold }}" }}{{if $index}},{{end}}{{ $element.Labels.node_name }}{{ end }} annotations: @@ -1079,7 +1079,8 @@ templates: NODE_RESTART: name: DB node restart description: Unexpected DB node restart(s) occurred during last 30 minutes - queryTemplate: max by (universe_uuid) (changes(node_boot_time_seconds{universe_uuid="__universeUuid__"}[30m])) + queryTemplate: max by (universe_uuid) (sum_over_time( + (increase(node_boot_time_seconds{universe_uuid="__universeUuid__"}[1m]) > bool 10)[30m:1m])) {{ query_condition }} {{ query_threshold }} createForNewCustomer: true defaultThresholdMap: @@ -1094,7 +1095,7 @@ templates: labels: affected_node_names: >- {{ range $index, $element := query "max by (universe_uuid, node_name) - (changes(node_boot_time_seconds{universe_uuid='{{ $labels.universe_uuid }}'}[30m])) + (sum_over_time((increase(node_boot_time_seconds{universe_uuid='{{ $labels.universe_uuid }}'}[1m]) > bool 10)[30m:1m])) {{ query_condition }} {{ query_threshold }}" }}{{if $index}},{{end}}{{ $element.Labels.node_name }}{{ end }} annotations: summary: >- @@ -1906,7 +1907,7 @@ templates: name: High clock drift description: Local clock on the node has drift too far from the actual time in the past 10 minutes. queryTemplate: max by (universe_uuid) (yb_node_clock_drift_check_ms{universe_uuid="__universeUuid__"}) {{ query_condition }} {{ query_threshold }} - createForNewCustomer: true + createForNewCustomer: true defaultThresholdMap: WARNING: threshold: 200.0 @@ -1925,4 +1926,4 @@ templates: summary: >- Clock drift is high for universe '{{ $labels.source_name }}'. Current drift from actual clock is {{ $value }} milliseconds. 
- Affected nodes: {{ $labels.affected_node_names }} \ No newline at end of file + Affected nodes: {{ $labels.affected_node_names }} diff --git a/managed/src/main/resources/db/migration/default_/postgres/V373__Restart_Alerts_Update.sql b/managed/src/main/resources/db/migration/default_/postgres/V373__Restart_Alerts_Update.sql new file mode 100644 index 000000000000..1bff0292be8d --- /dev/null +++ b/managed/src/main/resources/db/migration/default_/postgres/V373__Restart_Alerts_Update.sql @@ -0,0 +1,5 @@ +-- Copyright (c) YugaByte, Inc. + +-- Recreate DB_INSTANCE_RESTART and NODE_RESTART alert definitions so they pick up the updated restart queries +update alert_definition set config_written = false where configuration_uuid IN + (select uuid from alert_configuration where template in ('DB_INSTANCE_RESTART','NODE_RESTART')); diff --git a/managed/src/main/resources/health/node_health.py.template b/managed/src/main/resources/health/node_health.py.template index fa58d37aa51f..eb6232a24685 100755 --- a/managed/src/main/resources/health/node_health.py.template +++ b/managed/src/main/resources/health/node_health.py.template @@ -226,7 +226,7 @@ YB_NODE_CONTROLLER_CHECK = MetricDefinition( YB_NODE_CLOCK_DRIFT_CHECK = MetricDefinition( "yb_node_clock_drift_check_ms", "Time Drift between nodes within limits", - "millisec") + "ms") YB_NODE_NTP_SERVICE_STATUS = MetricDefinition( "yb_node_ntp_service_status", "If chronyd or ntp(d) services are running")