Skip to content

Commit

Permalink
Support ASIC/SDK health event (#3129)
Browse files Browse the repository at this point in the history
#### What I did

Support ASIC/SDK health event
1. config asic-sdk-health-event suppress
2. show asic-sdk-health-event [received|suppress]
3. sonic-clear asic-sdk-health-event

Depends on sonic-net/sonic-buildimage#17879

#### How to verify it

Unit test
  • Loading branch information
stephenxs authored May 13, 2024
1 parent a01a0a6 commit 8629b68
Show file tree
Hide file tree
Showing 12 changed files with 680 additions and 1 deletion.
24 changes: 23 additions & 1 deletion clear/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
import click
import utilities_common.cli as clicommon
import utilities_common.multi_asic as multi_asic_util
from sonic_py_common import multi_asic
from sonic_py_common.general import getstatusoutput_noshell_pipe
from flow_counter_util.route import exit_if_route_flow_counter_not_support
from utilities_common import util_base
from show.plugins.pbh import read_pbh_counters
from config.plugins.pbh import serialize_pbh_counters
from . import plugins


# This is from the aliases example:
# https://github.com/pallets/click/blob/57c6f09611fc47ca80db0bd010f05998b3c0aa95/examples/aliases/aliases.py
class Config(object):
Expand Down Expand Up @@ -550,6 +550,28 @@ def route(prefix, vrf, namespace):
helper = util_base.UtilHelper()
helper.load_and_register_plugins(plugins, cli)

# ("sonic-clear asic-sdk-health-event")
@cli.command()
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def asic_sdk_health_event(db, namespace):
    """Clear received ASIC/SDK health events"""
    # NOTE: the docstring above is rendered by click as the command help text.
    #
    # db:        wrapper injected by clicommon.pass_db; db.db_clients maps a
    #            namespace to its database connector.
    # namespace: when given, only that namespace is cleared; otherwise every
    #            namespace in the list below is cleared.
    if multi_asic.get_num_asics() > 1:
        namespace_list = multi_asic.get_namespaces_from_linux()
    else:
        namespace_list = [multi_asic.DEFAULT_NAMESPACE]

    for ns in namespace_list:
        if namespace and namespace != ns:
            continue

        state_db = db.db_clients[ns]
        # Use the per-namespace connector for both the lookup and the delete
        # so the DB identifier always comes from the client being operated on.
        keys = state_db.keys(state_db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE*")
        # Guard against a connector that reports "no match" as None instead
        # of an empty list; clearing an already empty table must be a no-op.
        for key in keys or []:
            state_db.delete(state_db.STATE_DB, key)


# Run the click command group only when this file is executed as a script.
if __name__ == '__main__':
    cli()
119 changes: 119 additions & 0 deletions config/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7548,5 +7548,124 @@ def date(date, time):
clicommon.run_command(['timedatectl', 'set-time', date_time])


#
# 'asic-sdk-health-event' group ('config asic-sdk-health-event ...')
#
@config.group()
def asic_sdk_health_event():
    """Configuring asic-sdk-health-event"""
    # Pure container group: the docstring (click help text) is the whole body
    # and the sub-groups attach themselves via decorators.


@asic_sdk_health_event.group()
def suppress():
    """Suppress ASIC/SDK health event"""
    # Pure container group: one sub-command per severity is registered on it
    # via decorators, so no body beyond the help docstring is needed.


def handle_asic_sdk_health_suppress(db, severity, category_list, max_events, namespace):
    """Apply an ASIC/SDK health event suppress configuration for one severity.

    Args:
        db: Wrapper injected by ``clicommon.pass_db``; ``db.cfgdb_clients`` and
            ``db.db_clients`` are indexed by namespace.
        severity: ``fatal``, ``warning`` or ``notice``; used as the key of the
            ``SUPPRESS_ASIC_SDK_HEALTH_EVENT`` entry and to pick the
            capability field checked in ``SWITCH_CAPABILITY|switch``.
        category_list: ``none``, ``all`` or a comma-separated list of
            categories. A falsy value means the option was not supplied.
        max_events: Non-negative integer; ``0`` removes a stored limit.
            ``None`` means the option was not supplied.
        namespace: When truthy, only this namespace is configured; otherwise
            all namespaces are.

    All failures are reported through ``ctx.fail`` (click usage error).
    Argument and capability validation is completed for every targeted
    namespace before anything is written, so a failure never leaves only
    some of the ASICs configured.
    """
    ctx = click.get_current_context()

    categories_given = bool(category_list)

    # Loop-invariant check: reject an empty invocation before touching any DB.
    if not categories_given and max_events is None:
        ctx.fail("At least one argument should be provided!")

    severityCapabilities = {
        "fatal": "REG_FATAL_ASIC_SDK_HEALTH_CATEGORY",
        "warning": "REG_WARNING_ASIC_SDK_HEALTH_CATEGORY",
        "notice": "REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY"
    }

    suppressedCategoriesList = []
    if categories_given:
        # A tuple (not a set) so that 'all' expands in a stable order and the
        # value written to CONFIG_DB is the same on every run.
        categories = ("software", "firmware", "cpu_hw", "asic_hw")

        if category_list == 'none':
            suppressedCategoriesList = []
        elif category_list == 'all':
            suppressedCategoriesList = list(categories)
        else:
            # Tolerate blanks around the separators, e.g. "cpu_hw, software".
            suppressedCategoriesList = [name.strip() for name in category_list.split(',')]

        unsupportCategories = set(suppressedCategoriesList) - set(categories)
        if unsupportCategories:
            ctx.fail("Invalid category(ies): {}".format(unsupportCategories))

    if multi_asic.get_num_asics() > 1:
        namespace_list = multi_asic.get_namespaces_from_linux()
    else:
        namespace_list = [DEFAULT_NAMESPACE]

    target_namespaces = [ns for ns in namespace_list if not namespace or namespace == ns]

    # Pass 1: make sure every targeted ASIC supports the request.
    entry_name = "SWITCH_CAPABILITY|switch"
    for ns in target_namespaces:
        state_db = db.db_clients[ns]

        if "true" != state_db.get(state_db.STATE_DB, entry_name, "ASIC_SDK_HEALTH_EVENT"):
            ctx.fail("ASIC/SDK health event is not supported on the platform")

        if "true" != state_db.get(state_db.STATE_DB, entry_name, severityCapabilities[severity]):
            ctx.fail("Suppressing ASIC/SDK health {} event is not supported on the platform".format(severity))

    # Pass 2: update the configuration of every targeted ASIC.
    for ns in target_namespaces:
        config_db = db.cfgdb_clients[ns]

        entry = config_db.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)
        need_remove = False

        if categories_given:
            if suppressedCategoriesList:
                entry["categories"] = suppressedCategoriesList
            elif entry.get("categories"):
                # 'none': drop the stored list instead of storing an empty one.
                entry.pop("categories")
                need_remove = True

        if max_events is not None:
            if max_events > 0:
                entry["max_events"] = max_events
            elif entry.get("max_events"):
                # 0 means "no limit": drop the stored limit.
                entry.pop("max_events")
                need_remove = True

        if entry:
            config_db.set_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, entry)
        elif need_remove:
            # Nothing left to store: delete the whole entry.
            config_db.set_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, None)


@suppress.command()
@click.option('--category-list', metavar='<category_list>', type=str, help="Categories to be suppressed")
@click.option('--max-events', metavar='<max_events>', type=click.IntRange(0), help="Maximum number of received events")
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def fatal(db, category_list, max_events, namespace):
    """Suppress ASIC/SDK health events of fatal severity"""
    # The docstring doubles as the click help text of this sub-command; the
    # actual work is shared by all severities.
    handle_asic_sdk_health_suppress(db, 'fatal', category_list, max_events, namespace)


@suppress.command()
@click.option('--category-list', metavar='<category_list>', type=str, help="Categories to be suppressed")
@click.option('--max-events', metavar='<max_events>', type=click.IntRange(0), help="Maximum number of received events")
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def warning(db, category_list, max_events, namespace):
    """Suppress ASIC/SDK health events of warning severity"""
    # The docstring doubles as the click help text of this sub-command; the
    # actual work is shared by all severities.
    handle_asic_sdk_health_suppress(db, 'warning', category_list, max_events, namespace)


@suppress.command()
@click.option('--category-list', metavar='<category_list>', type=str, help="Categories to be suppressed")
@click.option('--max-events', metavar='<max_events>', type=click.IntRange(0), help="Maximum number of received events")
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def notice(db, category_list, max_events, namespace):
    """Suppress ASIC/SDK health events of notice severity"""
    # The docstring doubles as the click help text of this sub-command; the
    # actual work is shared by all severities.
    handle_asic_sdk_health_suppress(db, 'notice', category_list, max_events, namespace)


# Run the click command group only when this file is executed as a script.
if __name__ == '__main__':
    config()
156 changes: 156 additions & 0 deletions doc/Command-Reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
* [ARP & NDP](#arp--ndp)
* [ARP show commands](#arp-show-commands)
* [NDP show commands](#ndp-show-commands)
* [ASIC SDK health event](#asic-sdk-health-event)
* [ASIC SDK health event config commands](#asic-sdk-health-event-config-commands)
* [ASIC SDK health event show commands](#asic-sdk-health-event-show-commands)
* [ASIC SDK health event clear commands](#asic-sdk-health-event-clear-commands)
* [BFD](#bfd)
* [BFD show commands](#bfd-show-commands)
* [BGP](#bgp)
Expand Down Expand Up @@ -1930,6 +1934,158 @@ This command is used to display: ACL rules, tables and their priority, ACL packe
If the `PACKETS COUNT` and `BYTES COUNT` fields have some numeric value it means that it is a SONiC ACL's and those counters are created in SONiC `COUNTERS_DB`.
## ASIC SDK health event
### ASIC SDK health event config commands
**config asic-sdk-health-event suppress**
This command configures the categories of ASIC/SDK health events to be suppressed for a given severity.
- Usage:
```
config asic-sdk-health-event suppress <severity> [--category-list <category-list>|<none>|<all>] [--max-events <max-events>]
```
- Parameters:
- severity: Specify the severity of the ASIC/SDK health events to be suppressed. It can be one of `fatal`, `warning`, and `notice`.
- category-list: Specify the categories whose ASIC/SDK health events are to be suppressed. It is a comma-separated list in which each element is one of `software`, `firmware`, `cpu_hw`, `asic_hw`.
If the category-list is `none`, no category is suppressed and events of all categories will be notified for `severity`. In this case, it will not be stored in the CONFIG_DB.
If the category-list is `all`, all the categories are suppressed and no category will be notified for `severity`.
- max-events: Specify the maximum number of events of the severity to be stored in the STATE_DB.
There is no limitation if the max-events is 0. In this case, it will not be stored in the CONFIG_DB.
- Examples:
```
admin@sonic:~$ sudo config asic-sdk-health-event suppress fatal --category-list cpu_hw,software --max-events 10240
```
This command will suppress ASIC/SDK health events whose severity is fatal and category is cpu_hw or software. The maximum number of such events in the STATE_DB is 10240.
### ASIC SDK health event show commands
**show asic-sdk-health-event received**
This command displays the received ASIC/SDK health events.
- Usage:
```
show asic-sdk-health-event received [-n <asicname>]
```
- Details:
- show asic-sdk-health-event received: Display the ASIC/SDK health events received on all ASICs
- show asic-sdk-health-event received -n asic0: Display all the ASIC/SDK health events received on asic0
- Example:
```
admin@sonic:~$ show asic-sdk-health-event received
Time Severity Category Description
------------------- ----------- --------- -----------------
2023-10-20 05:07:34 fatal firmware Command timeout
2023-10-20 03:06:25 fatal software SDK daemon keep alive failed
2023-10-20 05:07:34 fatal asic_hw Uncorrectable ECC error
2023-10-20 01:58:43 notice asic_hw Correctable ECC error
```
- Example on a multi ASIC system:
```
admin@sonic:~$ show asic-sdk-health-event received
asic0:
Time Severity Category Description
------------------- ----------- --------- -----------------
2023-10-20 05:07:34 fatal firmware Command timeout
2023-10-20 03:06:25 fatal software SDK daemon keep alive failed
asic1:
Time Severity Category Description
------------------- ----------- --------- -----------------
2023-10-20 05:07:34 fatal asic_hw Uncorrectable ECC error
2023-10-20 01:58:43 notice asic_hw Correctable ECC error
```
Optionally, you can specify the asic name in order to display the ASIC/SDK health events received on that particular ASIC on a multi ASIC system
- Example:
```
admin@sonic:~$ show asic-sdk-health-event received -n asic1
asic1:
Time Severity Category Description
------------------- ----------- --------- -----------------
2023-10-20 05:07:34 fatal firmware Command timeout
```
**show asic-sdk-health-event suppress-configuration**
This command displays the suppressed category list and maximum number of events of ASIC/SDK health events.
- Usage:
```
show asic-sdk-health-event suppress-configuration [-n <asicname>]
```
- Details:
- show asic-sdk-health-event suppress-configuration: Display the ASIC/SDK health event suppress category list and maximum number of events on all ASICs
- show asic-sdk-health-event suppress-configuration -n asic0: Display all the ASIC/SDK health event suppress category list and maximum number of events on asic0
- Example:
```
admin@sonic:~$ show asic-sdk-health-event suppress-configuration
Severity Suppressed category-list Max events
---------- -------------------------- ------------
fatal software unlimited
notice none 1024
warning firmware,asic_hw 10240
```
- Example on a multi ASIC system:
```
admin@sonic:~$ show asic-sdk-health-event suppress-configuration
asic0:
Severity Suppressed category-list Max events
---------- -------------------------- ------------
notice none 1024
warning firmware,asic_hw 10240
asic1:
Severity Suppressed category-list Max events
---------- -------------------------- ------------
fatal software unlimited
```
Optionally, you can specify the asic name in order to display the ASIC/SDK health event suppress category list on that particular ASIC on a multi ASIC system
- Example:
```
admin@sonic:~$ show asic-sdk-health-event suppress-configuration -n asic1
asic1:
Severity Suppressed category-list Max events
---------- -------------------------- ------------
fatal software unlimited
```
### ASIC SDK health event clear commands
**sonic-clear asic-sdk-health-event**
This command clears all the received ASIC/SDK health events.
- Usage:
```
sonic-clear asic-sdk-health-event [-n <asicname>]
```
- Details:
- sonic-clear asic-sdk-health-event: Clear the ASIC/SDK health events received on all ASICs
- sonic-clear asic-sdk-health-event -n asic0: Clear all the ASIC/SDK health events received on asic0
- Example:
```
admin@sonic:~$ sonic-clear asic-sdk-health-event
```
Go Back To [Beginning of the document](#) or [Beginning of this section](#asic-sdk-health-event)
## ARP & NDP
Expand Down
2 changes: 2 additions & 0 deletions scripts/generate_dump
Original file line number Diff line number Diff line change
Expand Up @@ -1952,6 +1952,8 @@ main() {
# 1st counter snapshot early. Need 2 snapshots to make sense of counters trend.
save_counter_snapshot $asic 1

save_cmd "show asic-sdk-health-event received" "asic.sdk.health.event" &

save_cmd "systemd-analyze blame" "systemd.analyze.blame" &
save_cmd "systemd-analyze dump" "systemd.analyze.dump" &
save_cmd "systemd-analyze plot" "systemd.analyze.plot.svg" &
Expand Down
Loading

0 comments on commit 8629b68

Please sign in to comment.