Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NAS-131446 / 25.04 / Fix enclosure alert #14594

Merged
merged 3 commits into from
Sep 30, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 64 additions & 48 deletions src/middlewared/middlewared/alert/source/enclosure_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,34 @@
#
# Licensed under the terms of the TrueNAS Enterprise License Agreement
# See the file LICENSE.IX for complete terms and conditions
from dataclasses import dataclass

from middlewared.alert.base import AlertClass, AlertCategory, AlertLevel, Alert, AlertSource
from middlewared.alert.base import (
AlertClass,
AlertCategory,
AlertLevel,
Alert,
AlertSource,
)


@dataclass(slots=True, frozen=True, kw_only=True)
class BadElement:
enc_name: str
descriptor: str
status: str
value: str
value_raw: int

def args(self):
return [self.enc_name, self.descriptor, self.status, self.value, self.value_raw]


class EnclosureUnhealthyAlertClass(AlertClass):
category = AlertCategory.HARDWARE
level = AlertLevel.CRITICAL
title = "Enclosure Status Is Not Healthy"
text = "Enclosure (%s): Element \"%s\" is reporting a status of \"%s\" with a value of \"%s\". (raw value \"%s\")"
text = 'Enclosure (%s): Element "%s" is reporting a status of "%s" with a value of "%s". (raw value "%s")'
products = ("SCALE_ENTERPRISE",)


Expand All @@ -26,65 +45,62 @@ class EnclosureStatusAlertSource(AlertSource):
products = ("SCALE_ENTERPRISE",)
failover_related = True
run_on_backup_node = False
bad = ('critical', 'noncritical', 'unknown', 'unrecoverable')
bad_elements = []

async def should_report(self, enclosure, element):
should_report = True
if element['status'].lower() in self.bad and element['value'] != 'None':
if element['name'] == 'Enclosure':
# this is an element that provides an "overview" for all the other elements
# i.e. if a power supply element is reporting critical, this will (should)
# report critical as well. Sometimes, however, this will constantly report
# a bad status, just ignore it #11918
should_report = False
elif enclosure['name'] == 'ECStream 3U16+4R-4X6G.3 d10c' and element['descriptor'] == '1.8V Sensor':
# The 1.8V sensor is bugged on the echostream enclosure (Z-series). The
# management chip loses it's mind and claims undervoltage, but scoping
# this confirms the voltage is fine. Ignore alerts from this element. #10077
should_report = False
else:
should_report = False

return should_report
bad = ("critical", "noncritical", "unknown", "unrecoverable")
bad_elements: list | list[tuple[BadElement, int]] = list()

async def should_report(self, ele_type: str, ele_value: dict[str]):
"""We only want to raise an alert for an element's status
if it meets a certain criteria"""
if not ele_value["value"]:
# if we don't have an actual value, doesn't
# matter what status the element is reporting
# we'll skip it so we don't raise alarm to
# end-user unnecessarily
return False
elif ele_value["status"].lower() not in self.bad:
return False

return True

async def check(self):
good_enclosures = []
bad_elements = []
for enc in await self.middleware.call('enclosure.query'):
good_enclosures.append([enc['name']])

for element_values in enc['elements']:
for value in element_values['elements']:
if await self.should_report(enc, value):
args = [
enc['name'],
value['name'],
value['status'],
value['value'],
value['value_raw']
]
for i, (another_args, count) in enumerate(self.bad_elements):
if another_args == args:
bad_elements.append((args, count + 1))
good_enclosures, bad_elements = [], []
for enc in await self.middleware.call("enclosure2.query"):
good_enclosures.append([f"{enc['name']} (id: {enc['id']})")
enc["elements"].pop("Array Device Slot") # dont care about disk slots
for element_type, element_values in enc["elements"].items():
for ele_value in element_values.values():
if await self.should_report(element_type, ele_value):
current_bad_element = BadElement(
enc_name=enc["name"],
descriptor=ele_value["descriptor"],
status=ele_value["status"],
value=ele_value["value"],
value_raw=ele_value["value_raw"],
)
for previous_bad_element, count in self.bad_elements:
if previous_bad_element == current_bad_element:
bad_elements.append((current_bad_element, count + 1))
break
else:
bad_elements.append((args, 1))
bad_elements.append((current_bad_element, 1))

self.bad_elements = bad_elements

alerts = []
for args, count in bad_elements:
# We only report unhealthy enclosure elements if they were unhealthy 5 probes in a row (1 probe = 1 minute)
for current_bad_element, count in bad_elements:
# We only report unhealthy enclosure elements if
# they were unhealthy 5 probes in a row (1 probe = 1 minute)
if count >= 5:
try:
good_enclosures.remove(args[:1])
good_enclosures.remove(current_bad_element.enc_name)
except ValueError:
pass

alerts.append(Alert(EnclosureUnhealthyAlertClass, args=args))
alerts.append(
Alert(EnclosureUnhealthyAlertClass, args=current_bad_element.args())
)

for args in good_enclosures:
alerts.append(Alert(EnclosureHealthyAlertClass, args=args))
for enclosure in good_enclosures:
alerts.append(Alert(EnclosureHealthyAlertClass, args=enclosure))

return alerts
Loading