Skip to content

Commit

Permalink
ASIC/SDK health event
Browse files Browse the repository at this point in the history
Signed-off-by: Stephen Sun <stephens@nvidia.com>
  • Loading branch information
stephenxs committed Apr 16, 2024
1 parent 1941023 commit 93acfe5
Show file tree
Hide file tree
Showing 12 changed files with 796 additions and 18 deletions.
1 change: 1 addition & 0 deletions orchagent/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ CFLAGS_SAI = -I /usr/include/sai
swssdir = $(datadir)/swss

dist_swss_DATA = \
eliminate_events.lua \
rif_rates.lua \
pfc_detect_innovium.lua \
pfc_detect_mellanox.lua \
Expand Down
63 changes: 63 additions & 0 deletions orchagent/eliminate_events.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
-- Trim stored ASIC/SDK health events so that, for every severity that has a
-- 'max_events' limit configured, at most that many events stay in the state
-- database.
--
-- KEYS - None
-- ARGV - None
--
-- Reads  : config DB (4), keys SUPPRESS_ASIC_SDK_HEALTH_EVENT*, field 'max_events'
-- Updates: state DB (6), keys ASIC_SDK_HEALTH_EVENT_TABLE*, grouped by field 'severity'
-- Returns: a list of strings, one per configured severity (sorted by severity
--          name), describing what was done. The list is empty when nothing is
--          configured or when no event is stored at all.
--
-- Events are removed in ascending key order.
-- NOTE(review): this presumably removes the oldest events first, assuming the
-- event key embeds a sortable timestamp -- confirm against the producer.

local state_db = "6"
local config_db = "4"

local suppress_table = 'SUPPRESS_ASIC_SDK_HEALTH_EVENT'
local event_table = 'ASIC_SDK_HEALTH_EVENT_TABLE'

local result = {}

redis.call('SELECT', config_db)
local severity_keys = redis.call('KEYS', suppress_table .. '*')
if #severity_keys == 0 then
    return result
end

-- A key looks like "<table name><separator><severity>": skip the table name
-- and the one-character separator to get the severity.
local severity_offset = #suppress_table + 2

local max_events = {}
for i = 1, #severity_keys do
    local key = severity_keys[i]
    -- HGET yields false when the field is absent; tonumber() yields nil for
    -- non-numeric text, in which case the severity is simply not limited.
    local max_event = redis.call('HGET', key, 'max_events')
    if max_event then
        max_events[string.sub(key, severity_offset)] = tonumber(max_event)
    end
end

if next(max_events) == nil then
    return result
end

redis.call('SELECT', state_db)

local event_keys = redis.call('KEYS', event_table .. '*')
if #event_keys == 0 then
    return result
end

-- Group the stored event keys by severity, ignoring severities without a limit.
local events = {}
for i = 1, #event_keys do
    local key = event_keys[i]
    local severity = redis.call('HGET', key, 'severity')
    if severity and max_events[severity] ~= nil then
        local bucket = events[severity]
        if bucket == nil then
            bucket = {}
            events[severity] = bucket
        end
        bucket[#bucket + 1] = key
    end
end

-- pairs() order is unspecified; sort the severities so the report is stable.
local severities = {}
for severity in pairs(max_events) do
    severities[#severities + 1] = severity
end
table.sort(severities)

for _, severity in ipairs(severities) do
    local limit = max_events[severity]
    local received = events[severity] or {}
    local number_received_events = #received
    -- Never try to delete more events than exist: a negative limit would
    -- otherwise sort a nil table or pass nil to DEL and abort the script
    -- after some keys had already been removed.
    local number_to_eliminate = math.min(number_received_events - limit, number_received_events)
    if number_to_eliminate > 0 then
        table.sort(received)
        for i = 1, number_to_eliminate do
            redis.call('DEL', received[i])
        end
        table.insert(result, severity .. " events: maximum " .. limit .. ", received " .. number_received_events .. ", eliminated " .. number_to_eliminate)
    else
        table.insert(result, severity .. " events: maximum " .. limit .. ", received " .. number_received_events .. ", not exceeding the maximum")
    end
end

return result
23 changes: 23 additions & 0 deletions orchagent/notifications.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ extern "C" {

#include "logger.h"
#include "notifications.h"
#include "switchorch.h"

extern SwitchOrch *gSwitchOrch;

#ifdef ASAN_ENABLED
#include <sanitizer/lsan_interface.h>
Expand Down Expand Up @@ -40,6 +43,11 @@ void on_switch_shutdown_request(sai_object_id_t switch_id)
/* TODO: Later a better restart story will be told here */
SWSS_LOG_ERROR("Syncd stopped");

if (gSwitchOrch->isFatalEventReceived())
{
abort();
}

/*
The quick_exit() is used instead of the exit() to avoid a following data race:
* the exit() calls the destructors for global static variables (e.g.BufferOrch::m_buffer_type_maps)
Expand All @@ -59,3 +67,18 @@ void on_port_host_tx_ready(sai_object_id_t switch_id, sai_object_id_t port_id, s
// don't use this event handler, because it runs by libsairedis in a separate thread
// which causes concurrency access to the DB
}

// Notification callback for ASIC/SDK health events: passes every argument,
// unchanged, to the global SwitchOrch instance.
void on_switch_asic_sdk_health_event(sai_object_id_t switch_id,
                                     sai_switch_asic_sdk_health_severity_t severity,
                                     sai_timespec_t timestamp,
                                     sai_switch_asic_sdk_health_category_t category,
                                     sai_switch_health_data_t data,
                                     const sai_u8_list_t description)
{
    SwitchOrch *const switchOrch = gSwitchOrch;

    switchOrch->onSwitchAsicSdkHealthEvent(switch_id, severity, timestamp, category, data, description);
}
8 changes: 8 additions & 0 deletions orchagent/notifications.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,12 @@ void on_twamp_session_event(uint32_t count, sai_twamp_session_event_notification
// The function prototype information can be found here:
// https://github.com/sonic-net/sonic-sairedis/blob/master/meta/NotificationSwitchShutdownRequest.cpp#L49
void on_switch_shutdown_request(sai_object_id_t switch_id);

void on_port_host_tx_ready(sai_object_id_t switch_id, sai_object_id_t port_id, sai_port_host_tx_ready_status_t m_portHostTxReadyStatus);

// Callback for ASIC/SDK health event notifications.
// The definition in notifications.cpp forwards all arguments, unchanged, to
// SwitchOrch::onSwitchAsicSdkHealthEvent on the global gSwitchOrch instance.
void on_switch_asic_sdk_health_event(sai_object_id_t switch_id,
sai_switch_asic_sdk_health_severity_t severity,
sai_timespec_t timestamp,
sai_switch_asic_sdk_health_category_t category,
sai_switch_health_data_t data,
const sai_u8_list_t description);
2 changes: 2 additions & 0 deletions orchagent/orchdaemon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,12 @@ bool OrchDaemon::init()
TableConnector app_switch_table(m_applDb, APP_SWITCH_TABLE_NAME);
TableConnector conf_asic_sensors(m_configDb, CFG_ASIC_SENSORS_TABLE_NAME);
TableConnector conf_switch_hash(m_configDb, CFG_SWITCH_HASH_TABLE_NAME);
TableConnector conf_suppress_asic_sdk_health_categories(m_configDb, CFG_SUPPRESS_ASIC_SDK_HEALTH_EVENT_NAME);

vector<TableConnector> switch_tables = {
conf_switch_hash,
conf_asic_sensors,
conf_suppress_asic_sdk_health_categories,
app_switch_table
};

Expand Down
1 change: 1 addition & 0 deletions orchagent/p4orch/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ p4orch_tests_SOURCES = $(ORCHAGENT_DIR)/orch.cpp \
$(ORCHAGENT_DIR)/flex_counter/flow_counter_handler.cpp \
$(ORCHAGENT_DIR)/port/port_capabilities.cpp \
$(ORCHAGENT_DIR)/port/porthlpr.cpp \
$(ORCHAGENT_DIR)/notifications.cpp \
$(P4ORCH_DIR)/p4oidmapper.cpp \
$(P4ORCH_DIR)/p4orch.cpp \
$(P4ORCH_DIR)/p4orch_util.cpp \
Expand Down
2 changes: 2 additions & 0 deletions orchagent/p4orch/tests/test_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ sai_object_id_t kMirrorSessionOid1 = 9001;
char *gMirrorSession2 = "mirror-session-2";
sai_object_id_t kMirrorSessionOid2 = 9002;
sai_object_id_t gUnderlayIfId;
string gMyAsicName = "";
event_handle_t g_events_handle;

#define DEFAULT_BATCH_SIZE 128
#define DEFAULT_MAX_BULK_SIZE 1000
Expand Down
Loading

0 comments on commit 93acfe5

Please sign in to comment.