diff --git a/orchagent/Makefile.am b/orchagent/Makefile.am
index e7743ab44d0..7e1d0b7143e 100644
--- a/orchagent/Makefile.am
+++ b/orchagent/Makefile.am
@@ -17,6 +17,7 @@ CFLAGS_SAI = -I /usr/include/sai
 swssdir = $(datadir)/swss
 
 dist_swss_DATA = \
+	eliminate_events.lua \
 	rif_rates.lua \
 	pfc_detect_innovium.lua \
 	pfc_detect_mellanox.lua \
diff --git a/orchagent/eliminate_events.lua b/orchagent/eliminate_events.lua
new file mode 100644
index 00000000000..871e6c1fb0f
--- /dev/null
+++ b/orchagent/eliminate_events.lua
@@ -0,0 +1,77 @@
+-- KEYS - None
+-- ARGV - None
+--
+-- Trims the ASIC/SDK health events stored in STATE_DB. For every severity that
+-- has a usable "max_events" limit in CONFIG_DB, the events with the smallest
+-- keys are deleted until at most that many events remain for the severity.
+-- Returns one summary string per severity that has a usable limit.
+
+local state_db = "6"
+local config_db = "4"
+local suppress_table = 'SUPPRESS_ASIC_SDK_HEALTH_EVENT'
+local event_table = 'ASIC_SDK_HEALTH_EVENT_TABLE'
+
+local result = {}
+
+redis.call('SELECT', config_db)
+local severity_keys = redis.call('KEYS', suppress_table .. '*')
+if #severity_keys == 0 then
+    return result
+end
+
+-- A key is "<table name><one-character separator><severity>"
+local severity_offset = #suppress_table + 2
+
+local max_events = {}
+for i = 1, #severity_keys do
+    -- HGET yields false for a missing field and tonumber() yields nil for a
+    -- non-numeric value. Negative limits are skipped as well: they would make
+    -- the trimming loop below try to delete more events than were received.
+    local raw_max = redis.call('HGET', severity_keys[i], 'max_events')
+    local max_event = raw_max and tonumber(raw_max)
+    if max_event and max_event >= 0 then
+        max_events[string.sub(severity_keys[i], severity_offset)] = math.floor(max_event)
+    end
+end
+
+if next(max_events) == nil then
+    return result
+end
+
+redis.call('SELECT', state_db)
+local event_keys = redis.call('KEYS', event_table .. '*')
+if #event_keys == 0 then
+    return result
+end
+
+-- Group the event keys by severity; events of a severity without limit are ignored
+local events = {}
+for i = 1, #event_keys do
+    local severity = redis.call('HGET', event_keys[i], 'severity')
+    if severity and max_events[severity] ~= nil then
+        local bucket = events[severity]
+        if bucket == nil then
+            bucket = {}
+            events[severity] = bucket
+        end
+        bucket[#bucket + 1] = event_keys[i]
+    end
+end
+
+for severity, limit in pairs(max_events) do
+    local received = events[severity] or {}
+    local number_received_events = #received
+    if number_received_events > limit then
+        -- Keys are sorted as strings; the smallest ones are removed first
+        table.sort(received)
+        local number_to_eliminate = number_received_events - limit
+        for i = 1, number_to_eliminate do
+            redis.call('DEL', received[i])
+        end
+        table.insert(result, severity .. " events: maximum " .. limit .. ", received " .. number_received_events .. ", eliminated " .. number_to_eliminate)
+    else
+        table.insert(result, severity .. " events: maximum " .. limit .. ", received " .. number_received_events .. ", not exceeding the maximum")
+    end
+end
+
+return result
diff --git a/orchagent/notifications.cpp b/orchagent/notifications.cpp
index 9455620fb57..72a62b97fc4 100644
--- a/orchagent/notifications.cpp
+++ b/orchagent/notifications.cpp
@@ -4,6 +4,9 @@ extern "C" {
 
 #include "logger.h"
 #include "notifications.h"
+#include "switchorch.h"
+
+extern SwitchOrch *gSwitchOrch;
 
 #ifdef ASAN_ENABLED
 #include
@@ -40,6 +43,11 @@ void on_switch_shutdown_request(sai_object_id_t switch_id)
     /* TODO: Later a better restart story will be told here */
     SWSS_LOG_ERROR("Syncd stopped");
 
+    if (gSwitchOrch != nullptr && gSwitchOrch->isFatalEventReceived())
+    {
+        abort();
+    }
+
     /*
         The quick_exit() is used instead of the exit() to avoid a following data race:
         * the exit() calls the destructors for global static variables (e.g.BufferOrch::m_buffer_type_maps)
@@ -59,3 +67,26 @@ void on_port_host_tx_ready(sai_object_id_t switch_id, sai_object_id_t port_id, s
     // don't use this event handler, because it runs by libsairedis in a separate thread
     // which causes concurrency access to the DB
 }
+
+void on_switch_asic_sdk_health_event(sai_object_id_t switch_id,
+                                     sai_switch_asic_sdk_health_severity_t severity,
+                                     sai_timespec_t timestamp,
+                                     sai_switch_asic_sdk_health_category_t category,
+                                     sai_switch_health_data_t data,
+                                     const sai_u8_list_t description)
+{
+    // The handler is registered from the SwitchOrch constructor, i.e. before
+    // gSwitchOrch is assigned, so an early event must not be dereferenced
+    if (gSwitchOrch == nullptr)
+    {
+        SWSS_LOG_ERROR("ASIC/SDK health event received before SwitchOrch is ready");
+        return;
+    }
+
+    gSwitchOrch->onSwitchAsicSdkHealthEvent(switch_id,
+                                            severity,
+                                            timestamp,
+                                            category,
+                                            data,
+                                            description);
+}
diff --git a/orchagent/notifications.h b/orchagent/notifications.h
index 403b358a12b..f639d332c52 100644
--- a/orchagent/notifications.h
+++ b/orchagent/notifications.h
@@ -12,4 +12,12 @@ void on_twamp_session_event(uint32_t count, sai_twamp_session_event_notification
 // The function prototype information can be found here:
 // 
https://github.com/sonic-net/sonic-sairedis/blob/master/meta/NotificationSwitchShutdownRequest.cpp#L49 void on_switch_shutdown_request(sai_object_id_t switch_id); + void on_port_host_tx_ready(sai_object_id_t switch_id, sai_object_id_t port_id, sai_port_host_tx_ready_status_t m_portHostTxReadyStatus); + +void on_switch_asic_sdk_health_event(sai_object_id_t switch_id, + sai_switch_asic_sdk_health_severity_t severity, + sai_timespec_t timestamp, + sai_switch_asic_sdk_health_category_t category, + sai_switch_health_data_t data, + const sai_u8_list_t description); diff --git a/orchagent/orchdaemon.cpp b/orchagent/orchdaemon.cpp index 05e58c6ae92..568c84e7339 100644 --- a/orchagent/orchdaemon.cpp +++ b/orchagent/orchdaemon.cpp @@ -117,10 +117,12 @@ bool OrchDaemon::init() TableConnector app_switch_table(m_applDb, APP_SWITCH_TABLE_NAME); TableConnector conf_asic_sensors(m_configDb, CFG_ASIC_SENSORS_TABLE_NAME); TableConnector conf_switch_hash(m_configDb, CFG_SWITCH_HASH_TABLE_NAME); + TableConnector conf_suppress_asic_sdk_health_categories(m_configDb, CFG_SUPPRESS_ASIC_SDK_HEALTH_EVENT_NAME); vector switch_tables = { conf_switch_hash, conf_asic_sensors, + conf_suppress_asic_sdk_health_categories, app_switch_table }; diff --git a/orchagent/p4orch/tests/Makefile.am b/orchagent/p4orch/tests/Makefile.am index d541bbe6372..2c7301311f5 100644 --- a/orchagent/p4orch/tests/Makefile.am +++ b/orchagent/p4orch/tests/Makefile.am @@ -35,6 +35,7 @@ p4orch_tests_SOURCES = $(ORCHAGENT_DIR)/orch.cpp \ $(ORCHAGENT_DIR)/flex_counter/flow_counter_handler.cpp \ $(ORCHAGENT_DIR)/port/port_capabilities.cpp \ $(ORCHAGENT_DIR)/port/porthlpr.cpp \ + $(ORCHAGENT_DIR)/notifications.cpp \ $(P4ORCH_DIR)/p4oidmapper.cpp \ $(P4ORCH_DIR)/p4orch.cpp \ $(P4ORCH_DIR)/p4orch_util.cpp \ diff --git a/orchagent/p4orch/tests/test_main.cpp b/orchagent/p4orch/tests/test_main.cpp index 0170588a425..96d005ea240 100644 --- a/orchagent/p4orch/tests/test_main.cpp +++ b/orchagent/p4orch/tests/test_main.cpp @@ -35,6 
+35,8 @@ sai_object_id_t kMirrorSessionOid1 = 9001; char *gMirrorSession2 = "mirror-session-2"; sai_object_id_t kMirrorSessionOid2 = 9002; sai_object_id_t gUnderlayIfId; +string gMyAsicName = ""; +event_handle_t g_events_handle; #define DEFAULT_BATCH_SIZE 128 #define DEFAULT_MAX_BULK_SIZE 1000 diff --git a/orchagent/switchorch.cpp b/orchagent/switchorch.cpp index 06dc36e4723..737ef2155c1 100644 --- a/orchagent/switchorch.cpp +++ b/orchagent/switchorch.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include "switchorch.h" #include "crmorch.h" @@ -10,6 +11,9 @@ #include "macaddress.h" #include "return_code.h" #include "saihelper.h" +#include "sai_serialize.h" +#include "notifications.h" +#include "redisapi.h" using namespace std; using namespace swss; @@ -20,6 +24,8 @@ extern sai_acl_api_t *sai_acl_api; extern sai_hash_api_t *sai_hash_api; extern MacAddress gVxlanMacAddress; extern CrmOrch *gCrmOrch; +extern event_handle_t g_events_handle; +extern string gMyAsicName; const map switch_attribute_map = { @@ -47,6 +53,43 @@ const map packet_action_map = {"trap", SAI_PACKET_ACTION_TRAP} }; +const map switch_asic_sdk_health_event_severity_to_switch_attribute_map = +{ + {"fatal", SAI_SWITCH_ATTR_REG_FATAL_SWITCH_ASIC_SDK_HEALTH_CATEGORY}, + {"warning", SAI_SWITCH_ATTR_REG_WARNING_SWITCH_ASIC_SDK_HEALTH_CATEGORY}, + {"notice", SAI_SWITCH_ATTR_REG_NOTICE_SWITCH_ASIC_SDK_HEALTH_CATEGORY} +}; + +const map switch_asic_sdk_health_event_severity_reverse_map = +{ + {SAI_SWITCH_ASIC_SDK_HEALTH_SEVERITY_FATAL, "fatal"}, + {SAI_SWITCH_ASIC_SDK_HEALTH_SEVERITY_WARNING, "warning"}, + {SAI_SWITCH_ASIC_SDK_HEALTH_SEVERITY_NOTICE, "notice"}, +}; + +const map switch_asic_sdk_health_event_category_reverse_map = +{ + {SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_SW, "software"}, + {SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_FW, "firmware"}, + {SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_CPU_HW, "cpu_hw"}, + {SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_ASIC_HW, "asic_hw"} +}; + +const map 
switch_asic_sdk_health_event_category_map = +{ + {"software", SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_SW}, + {"firmware", SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_FW}, + {"cpu_hw", SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_CPU_HW}, + {"asic_hw", SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_ASIC_HW} +}; + +const std::set switch_asic_sdk_health_event_category_universal_set = +{ + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_SW, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_FW, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_CPU_HW, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_ASIC_HW +}; const std::set switch_non_sai_attribute_set = {"ordered_ecmp"}; @@ -77,12 +120,15 @@ SwitchOrch::SwitchOrch(DBConnector *db, vector& connectors, Tabl m_db(db), m_stateDb(new DBConnector(STATE_DB, DBConnector::DEFAULT_UNIXSOCKET, 0)), m_asicSensorsTable(new Table(m_stateDb.get(), ASIC_TEMPERATURE_INFO_TABLE_NAME)), - m_sensorsPollerTimer (new SelectableTimer((timespec { .tv_sec = DEFAULT_ASIC_SENSORS_POLLER_INTERVAL, .tv_nsec = 0 }))) + m_sensorsPollerTimer (new SelectableTimer((timespec { .tv_sec = DEFAULT_ASIC_SENSORS_POLLER_INTERVAL, .tv_nsec = 0 }))), + m_stateDbForNotification(new DBConnector(STATE_DB, DBConnector::DEFAULT_UNIXSOCKET, 0)), + m_asicSdkHealthEventTable(new Table(m_stateDbForNotification.get(), STATE_ASIC_SDK_HEALTH_EVENT_TABLE_NAME)) { m_restartCheckNotificationConsumer = new NotificationConsumer(db, "RESTARTCHECK"); auto restartCheckNotifier = new Notifier(m_restartCheckNotificationConsumer, this, "RESTARTCHECK"); Orch::addExecutor(restartCheckNotifier); + initAsicSdkHealthEventNotification(); set_switch_pfc_dlr_init_capability(); initSensorsTable(); querySwitchTpidCapability(); @@ -93,6 +139,96 @@ SwitchOrch::SwitchOrch(DBConnector *db, vector& connectors, Tabl Orch::addExecutor(executorT); } +void SwitchOrch::initAsicSdkHealthEventNotification() +{ + sai_attribute_t attr; + sai_status_t status; + vector fvVector; + vector> reg_severities = { + {SAI_SWITCH_ATTR_REG_FATAL_SWITCH_ASIC_SDK_HEALTH_CATEGORY, 
SWITCH_CAPABILITY_TABLE_REG_FATAL_ASIC_SDK_HEALTH_CATEGORY, "fatal"}, + {SAI_SWITCH_ATTR_REG_WARNING_SWITCH_ASIC_SDK_HEALTH_CATEGORY, SWITCH_CAPABILITY_TABLE_REG_WARNING_ASIC_SDK_HEALTH_CATEGORY, "warning"}, + {SAI_SWITCH_ATTR_REG_NOTICE_SWITCH_ASIC_SDK_HEALTH_CATEGORY, SWITCH_CAPABILITY_TABLE_REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY, "notice"} + }; + + bool supported = querySwitchCapability(SAI_OBJECT_TYPE_SWITCH, SAI_SWITCH_ATTR_SWITCH_ASIC_SDK_HEALTH_EVENT_NOTIFY); + if (supported) + { + attr.id = SAI_SWITCH_ATTR_SWITCH_ASIC_SDK_HEALTH_EVENT_NOTIFY; + attr.value.ptr = (void *)on_switch_asic_sdk_health_event; + status = sai_switch_api->set_switch_attribute(gSwitchId, &attr); + if (status != SAI_STATUS_SUCCESS) + { + SWSS_LOG_ERROR("Failed to register ASIC/SDK health event handler: %s", sai_serialize_status(status).c_str()); + supported = false; + } + else + { + fvVector.emplace_back(SWITCH_CAPABILITY_TABLE_ASIC_SDK_HEALTH_EVENT_CAPABLE, "true"); + } + } + else + { + SWSS_LOG_NOTICE("ASIC/SDK health event is not supported"); + } + + if (!supported) + { + fvVector.emplace_back(SWITCH_CAPABILITY_TABLE_ASIC_SDK_HEALTH_EVENT_CAPABLE, "false"); + for (auto c : reg_severities) + { + fvVector.emplace_back(get<1>(c), "false"); + } + set_switch_capability(fvVector); + + return; + } + + for (auto c : reg_severities) + { + supported = querySwitchCapability(SAI_OBJECT_TYPE_SWITCH, get<0>(c)); + if (supported) + { + status = registerAsicSdkHealthEventCategories(get<0>(c), get<2>(c)); + supported = (status == SAI_STATUS_SUCCESS); + } + else + { + SWSS_LOG_NOTICE("Unsupport to register ASIC/SDK health categories for severity %s", get<2>(c).c_str()); + } + + if (supported) + { + m_supportedAsicSdkHealthEventAttributes.insert(get<0>(c)); + fvVector.emplace_back(get<1>(c), "true"); + } + else + { + fvVector.emplace_back(get<1>(c), "false"); + } + } + + set_switch_capability(fvVector); + + try + { + // Load the Lua script to eliminate oldest entries + string eliminateEventsLuaScript = 
swss::loadLuaScript("eliminate_events.lua"); + m_eliminateEventsSha = swss::loadRedisScript(m_stateDb.get(), eliminateEventsLuaScript); + + // Init timer + auto interv = timespec { .tv_sec = ASIC_SDK_HEALTH_EVENT_ELIMINATE_INTERVAL, .tv_nsec = 0 }; + m_eliminateEventsTimer = new SelectableTimer(interv); + auto executor = new ExecutableTimer(m_eliminateEventsTimer, this, "ASIC_SDK_HEALTH_EVENT_ELIMINATE_TIMER"); + Orch::addExecutor(executor); + m_eliminateEventsTimer->start(); + } + catch (...) + { + // This can happen only on mock test. If it happens on a real switch, we should log an error message + SWSS_LOG_ERROR("Unable to load the Lua script to eliminate events\n"); + } +} + void SwitchOrch::initAclGroupsBindToSwitch() { // Create an ACL group per stage, INGRESS, EGRESS and PRE_INGRESS @@ -726,6 +862,129 @@ void SwitchOrch::doCfgSwitchHashTableTask(Consumer &consumer) } } +sai_status_t SwitchOrch::registerAsicSdkHealthEventCategories(sai_switch_attr_t saiSeverity, const string &severityString, const string &suppressed_category_list) +{ + sai_status_t status; + set interested_categories_set = switch_asic_sdk_health_event_category_universal_set; + + SWSS_LOG_INFO("Register ASIC/SDK health event for severity %s(%d) with categories [%s] suppressed", severityString.c_str(), saiSeverity, suppressed_category_list.c_str()); + + if (!suppressed_category_list.empty()) + { + auto &&categories = tokenize(suppressed_category_list, ','); + for (auto category : categories) + { + try + { + interested_categories_set.erase(switch_asic_sdk_health_event_category_map.at(category)); + } + catch (std::out_of_range &e) + { + SWSS_LOG_ERROR("Unknown ASIC/SDK health category %s to suppress", category.c_str()); + continue; + } + } + } + + vector sai_categories(interested_categories_set.begin(), interested_categories_set.end()); + sai_attribute_t attr; + + attr.id = saiSeverity; + attr.value.s32list.count = (uint32_t)sai_categories.size(); + attr.value.s32list.list = 
sai_categories.data(); + status = sai_switch_api->set_switch_attribute(gSwitchId, &attr); + + if (status != SAI_STATUS_SUCCESS) + { + SWSS_LOG_ERROR("Failed to register ASIC/SDK health event categories for severity %s, status: %s", severityString.c_str(), sai_serialize_status(status).c_str()); + } + + return status; +} + +void SwitchOrch::doCfgSuppressAsicSdkHealthEventTableTask(Consumer &consumer) +{ + SWSS_LOG_ENTER(); + + auto &map = consumer.m_toSync; + auto it = map.begin(); + + while (it != map.end()) + { + auto keyOpFieldsValues = it->second; + auto key = kfvKey(keyOpFieldsValues); + auto op = kfvOp(keyOpFieldsValues); + + SWSS_LOG_INFO("KEY: %s, OP: %s", key.c_str(), op.c_str()); + + if (key.empty()) + { + SWSS_LOG_ERROR("Failed to parse switch hash key: empty string"); + it = map.erase(it); + continue; + } + + sai_switch_attr_t saiSeverity; + try + { + saiSeverity = switch_asic_sdk_health_event_severity_to_switch_attribute_map.at(key); + } + catch (std::out_of_range &e) + { + SWSS_LOG_ERROR("Unknown severity %s in SUPPRESS_ASIC_SDK_HEALTH_EVENT table", key.c_str()); + it = map.erase(it); + continue; + } + + if (op == SET_COMMAND) + { + bool categoriesConfigured = false; + bool continueMainLoop = false; + for (const auto &cit : kfvFieldsValues(keyOpFieldsValues)) + { + auto fieldName = fvField(cit); + auto fieldValue = fvValue(cit); + + SWSS_LOG_INFO("FIELD: %s, VALUE: %s", fieldName.c_str(), fieldValue.c_str()); + + if (m_supportedAsicSdkHealthEventAttributes.find(saiSeverity) == m_supportedAsicSdkHealthEventAttributes.end()) + { + SWSS_LOG_NOTICE("Unsupport to register categories on severity %d", saiSeverity); + it = map.erase(it); + continueMainLoop = true; + break; + } + + if (fieldName == "categories") + { + registerAsicSdkHealthEventCategories(saiSeverity, key, fieldValue); + categoriesConfigured = true; + } + } + + if (continueMainLoop) + { + continue; + } + + if (!categoriesConfigured) + { + registerAsicSdkHealthEventCategories(saiSeverity, key); + 
}
+        }
+        else if (op == DEL_COMMAND)
+        {
+            registerAsicSdkHealthEventCategories(saiSeverity, key);
+        }
+        else
+        {
+            SWSS_LOG_ERROR("Unknown operation(%s)", op.c_str());
+        }
+
+        it = map.erase(it);
+    }
+}
+
 void SwitchOrch::doTask(Consumer &consumer)
 {
     SWSS_LOG_ENTER();
@@ -744,6 +1003,10 @@ void SwitchOrch::doTask(Consumer &consumer)
         {
             doCfgSwitchHashTableTask(consumer);
         }
+        else if (tableName == CFG_SUPPRESS_ASIC_SDK_HEALTH_EVENT_NAME)
+        {
+            doCfgSuppressAsicSdkHealthEventTableTask(consumer);
+        }
         else
         {
             SWSS_LOG_ERROR("Unknown table : %s", tableName.c_str());
@@ -799,6 +1062,108 @@ void SwitchOrch::restartCheckReply(const string &op, const string &data, std::ve
     checkRestartReadyDone();
 }
 
+void SwitchOrch::onSwitchAsicSdkHealthEvent(sai_object_id_t switch_id,
+                                            sai_switch_asic_sdk_health_severity_t severity,
+                                            sai_timespec_t timestamp,
+                                            sai_switch_asic_sdk_health_category_t category,
+                                            sai_switch_health_data_t data,
+                                            const sai_u8_list_t &description)
+{
+    // Records an ASIC/SDK health event: logs it, stores it in the state DB table
+    // keyed by the event time, publishes it and counts fatal events.
+    // NOTE(review): the key has one-second resolution, so events reported within
+    // the same second share (and overwrite) one entry - confirm this is intended.
+    static const string unknown_str = "unknown";
+
+    // Do not use map::at() here: the callback wrapper in notifications.cpp has no
+    // try/catch, so an unexpected enum value must not raise std::out_of_range.
+    const auto severity_it = switch_asic_sdk_health_event_severity_reverse_map.find(severity);
+    const auto category_it = switch_asic_sdk_health_event_category_reverse_map.find(category);
+    const bool severity_known = (severity_it != switch_asic_sdk_health_event_severity_reverse_map.end());
+    const bool category_known = (category_it != switch_asic_sdk_health_event_category_reverse_map.end());
+    if (!severity_known || !category_known)
+    {
+        SWSS_LOG_ERROR("Unknown severity %d or category %d in ASIC/SDK health event", severity, category);
+    }
+    const string &severity_str = severity_known ? severity_it->second : unknown_str;
+    const string &category_str = category_known ? category_it->second : unknown_str;
+
+    // localtime_r() instead of localtime(): the latter returns a shared static buffer
+    const std::time_t t = (std::time_t)timestamp.tv_sec;
+    std::tm local_tm {};
+    localtime_r(&t, &local_tm);
+    stringstream time_ss;
+    time_ss << std::put_time(&local_tm, "%Y-%m-%d %H:%M:%S");
+    const string time_str = time_ss.str();
+
+    string description_str;
+    switch (data.data_type)
+    {
+    case SAI_HEALTH_DATA_TYPE_GENERAL:
+    {
+        if (description.list != nullptr && description.count > 0)
+        {
+            description_str.assign(reinterpret_cast<const char *>(description.list), description.count);
+        }
+        // The payload is handled as a C string: drop everything from the first NUL on
+        const auto nul_pos = description_str.find('\0');
+        if (nul_pos != string::npos)
+        {
+            description_str.erase(nul_pos);
+        }
+        // Remove unprintable characters but keep CR and NL.
+        // The removal point is compared with end() before erasing, so no
+        // iterator invalidated by erase() takes part in the comparison.
+        const auto first_removed = std::remove_if(
+            description_str.begin(),
+            description_str.end(),
+            [](unsigned char x) {
+                return (x != 0x0d) && (x != 0x0a) && !std::isprint(x);
+            });
+        if (first_removed != description_str.end())
+        {
+            description_str.erase(first_removed, description_str.end());
+            SWSS_LOG_NOTICE("Unprintable characters in description of ASIC/SDK health event");
+        }
+        break;
+    }
+    default:
+        SWSS_LOG_ERROR("Unknown data type %d when receiving ASIC/SDK health event", data.data_type);
+        // Do not return. The ASIC/SDK health event will still be recorded but without the description
+        break;
+    }
+
+    event_params_t params = {
+        { "sai_timestamp", time_str },
+        { "severity", severity_str },
+        { "category", category_str },
+        { "description", description_str }};
+
+    if (gMyAsicName.empty())
+    {
+        SWSS_LOG_NOTICE("[%s] ASIC/SDK health event occurred at %s, category %s: %s", severity_str.c_str(), time_str.c_str(), category_str.c_str(), description_str.c_str());
+    }
+    else
+    {
+        SWSS_LOG_NOTICE("[%s] ASIC/SDK health event occurred at %s, asic %s, category %s: %s", severity_str.c_str(), time_str.c_str(), gMyAsicName.c_str(), category_str.c_str(), description_str.c_str());
+        params["asic_name"] = gMyAsicName;
+    }
+
+    std::vector<FieldValueTuple> values;
+    values.emplace_back("severity", severity_str);
+    values.emplace_back("category", category_str);
+    values.emplace_back("description", description_str);
+
+    m_asicSdkHealthEventTable->set(time_str, values);
+
+    event_publish(g_events_handle, "asic-sdk-health-event", &params);
+
+    if (severity == SAI_SWITCH_ASIC_SDK_HEALTH_SEVERITY_FATAL)
+    {
+        m_fatalEventCount++;
+    }
+}
+
 bool SwitchOrch::setAgingFDB(uint32_t sec)
 {
     sai_attribute_t attr;
@@ -912,6 +1277,14 @@ void SwitchOrch::doTask(SelectableTimer &timer)
             }
         }
     }
+    else if (&timer == m_eliminateEventsTimer)
+    {
+        auto ret = swss::runRedisScript(*m_stateDb, m_eliminateEventsSha, {}, {});
+        for (const auto &str: ret)
+        {
+            SWSS_LOG_INFO("Eliminate ASIC/SDK health %s", str.c_str());
+        }
+    }
 }
 
 void SwitchOrch::initSensorsTable()
diff --git a/orchagent/switchorch.h b/orchagent/switchorch.h
index 7135bcdc395..bf915774d6f 100644
--- a/orchagent/switchorch.h
+++ b/orchagent/switchorch.h
@@ -16,6 +16,12 @@
 #define SWITCH_CAPABILITY_TABLE_PFC_DLR_INIT_CAPABLE                   "PFC_DLR_INIT_CAPABLE"
 #define 
SWITCH_CAPABILITY_TABLE_PORT_EGRESS_SAMPLE_CAPABLE "PORT_EGRESS_SAMPLE_CAPABLE" +#define ASIC_SDK_HEALTH_EVENT_ELIMINATE_INTERVAL 3600 +#define SWITCH_CAPABILITY_TABLE_ASIC_SDK_HEALTH_EVENT_CAPABLE "ASIC_SDK_HEALTH_EVENT" +#define SWITCH_CAPABILITY_TABLE_REG_FATAL_ASIC_SDK_HEALTH_CATEGORY "REG_FATAL_ASIC_SDK_HEALTH_CATEGORY" +#define SWITCH_CAPABILITY_TABLE_REG_WARNING_ASIC_SDK_HEALTH_CATEGORY "REG_WARNING_ASIC_SDK_HEALTH_CATEGORY" +#define SWITCH_CAPABILITY_TABLE_REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY "REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY" + struct WarmRestartCheck { bool checkRestartReadyState; @@ -46,11 +52,24 @@ class SwitchOrch : public Orch bool checkOrderedEcmpEnable() { return m_orderedEcmpEnable; } + void onSwitchAsicSdkHealthEvent(sai_object_id_t switch_id, + sai_switch_asic_sdk_health_severity_t severity, + sai_timespec_t timestamp, + sai_switch_asic_sdk_health_category_t category, + sai_switch_health_data_t data, + const sai_u8_list_t &description); + + inline bool isFatalEventReceived() const + { + return (m_fatalEventCount != 0); + } + private: void doTask(Consumer &consumer); void doTask(swss::SelectableTimer &timer); void doCfgSwitchHashTableTask(Consumer &consumer); void doCfgSensorsTableTask(Consumer &consumer); + void doCfgSuppressAsicSdkHealthEventTableTask(Consumer &consumer); void doAppSwitchTableTask(Consumer &consumer); void initSensorsTable(); void querySwitchTpidCapability(); @@ -79,6 +98,8 @@ class SwitchOrch : public Orch swss::NotificationConsumer* m_restartCheckNotificationConsumer; void doTask(swss::NotificationConsumer& consumer); + void doAsicSdkHealthEventNotificationConsumerTask(swss::NotificationConsumer& consumer); + void doRestartCheckNotificationConsumerTask(swss::NotificationConsumer& consumer); swss::DBConnector *m_db; swss::Table m_switchTable; std::map m_aclGroups; @@ -99,6 +120,17 @@ class SwitchOrch : public Orch bool m_orderedEcmpEnable = false; bool m_PfcDlrInitEnable = false; + // ASIC SDK health event + std::shared_ptr 
m_stateDbForNotification = nullptr; + std::shared_ptr m_asicSdkHealthEventTable = nullptr; + std::set m_supportedAsicSdkHealthEventAttributes; + std::string m_eliminateEventsSha; + swss::SelectableTimer* m_eliminateEventsTimer = nullptr; + uint32_t m_fatalEventCount = 0; + + void initAsicSdkHealthEventNotification(); + sai_status_t registerAsicSdkHealthEventCategories(sai_switch_attr_t saiSeverity, const std::string &severityString, const std::string &suppressed_category_list=""); + // Switch hash SAI defaults struct { struct { diff --git a/tests/mock_tests/Makefile.am b/tests/mock_tests/Makefile.am index 96c95b121b8..db410907c51 100644 --- a/tests/mock_tests/Makefile.am +++ b/tests/mock_tests/Makefile.am @@ -57,6 +57,7 @@ tests_SOURCES = aclorch_ut.cpp \ mux_rollback_ut.cpp \ warmrestartassist_ut.cpp \ test_failure_handling.cpp \ + switchorch_ut.cpp \ warmrestarthelper_ut.cpp \ neighorch_ut.cpp \ twamporch_ut.cpp \ diff --git a/tests/mock_tests/mock_table.cpp b/tests/mock_tests/mock_table.cpp index 4d512a98354..df5e7e57533 100644 --- a/tests/mock_tests/mock_table.cpp +++ b/tests/mock_tests/mock_table.cpp @@ -23,6 +23,26 @@ namespace swss using namespace testing_db; + void merge_values(std::vector &existing_values, const std::vector &values) + { + std::vector new_values(values); + std::set field_set; + for (auto &value : values) + { + field_set.insert(fvField(value)); + } + for (auto &value : existing_values) + { + auto &field = fvField(value); + if (field_set.find(field) != field_set.end()) + { + continue; + } + new_values.push_back(value); + } + existing_values.swap(new_values); + } + bool Table::get(const std::string &key, std::vector &ovalues) { auto table = gDB[m_pipe->getDbId()][getTableName()]; @@ -61,7 +81,15 @@ namespace swss const std::string &prefix) { auto &table = gDB[m_pipe->getDbId()][getTableName()]; - table[key] = values; + auto iter = table.find(key); + if (iter == table.end()) + { + table[key] = values; + } + else + { + merge_values(iter->second, 
values); + } } void Table::getKeys(std::vector &keys) @@ -95,22 +123,7 @@ namespace swss } else { - std::vector new_values(values); - std::set field_set; - for (auto &value : values) - { - field_set.insert(fvField(value)); - } - for (auto &value : iter->second) - { - auto &field = fvField(value); - if (field_set.find(field) != field_set.end()) - { - continue; - } - new_values.push_back(value); - } - iter->second.swap(new_values); + merge_values(iter->second, values); } } diff --git a/tests/mock_tests/switchorch_ut.cpp b/tests/mock_tests/switchorch_ut.cpp new file mode 100644 index 00000000000..53c1ad673ae --- /dev/null +++ b/tests/mock_tests/switchorch_ut.cpp @@ -0,0 +1,288 @@ +#define private public // make Directory::m_values available to clean it. +#include "directory.h" +#undef private +#define protected public +#include "orch.h" +#undef protected +#include "ut_helper.h" +#include "mock_orchagent_main.h" +#include "mock_table.h" +#include "mock_response_publisher.h" + +extern void on_switch_asic_sdk_health_event(sai_object_id_t switch_id, + sai_switch_asic_sdk_health_severity_t severity, + sai_timespec_t timestamp, + sai_switch_asic_sdk_health_category_t category, + sai_switch_health_data_t data, + const sai_u8_list_t description); + +namespace switchorch_test +{ + using namespace std; + + sai_switch_api_t ut_sai_switch_api; + sai_switch_api_t *pold_sai_switch_api; + + shared_ptr m_app_db; + shared_ptr m_config_db; + shared_ptr m_state_db; + + sai_switch_attr_t _ut_stub_asic_sdk_health_event_attribute_to_check; + set _ut_stub_asic_sdk_health_event_passed_categories; + + bool _ut_reg_event_unsupported; + + sai_status_t _ut_stub_sai_set_switch_attribute( + _In_ sai_object_id_t switch_id, + _In_ const sai_attribute_t *attr) + { + switch (attr[0].id) + { + case SAI_SWITCH_ATTR_SWITCH_ASIC_SDK_HEALTH_EVENT_NOTIFY: + if (_ut_reg_event_unsupported) + { + return SAI_STATUS_NOT_IMPLEMENTED; + } + break; + case SAI_SWITCH_ATTR_REG_FATAL_SWITCH_ASIC_SDK_HEALTH_CATEGORY: + 
case SAI_SWITCH_ATTR_REG_WARNING_SWITCH_ASIC_SDK_HEALTH_CATEGORY: + case SAI_SWITCH_ATTR_REG_NOTICE_SWITCH_ASIC_SDK_HEALTH_CATEGORY: + if (_ut_stub_asic_sdk_health_event_attribute_to_check == attr[0].id) + { + auto *passed_category_list = reinterpret_cast(attr[0].value.s32list.list); + _ut_stub_asic_sdk_health_event_passed_categories = set(passed_category_list, passed_category_list + attr[0].value.s32list.count); + } + return SAI_STATUS_SUCCESS; + default: + break; + } + return pold_sai_switch_api->set_switch_attribute(switch_id, attr); + } + + void _hook_sai_apis() + { + ut_sai_switch_api = *sai_switch_api; + pold_sai_switch_api = sai_switch_api; + ut_sai_switch_api.set_switch_attribute = _ut_stub_sai_set_switch_attribute; + sai_switch_api = &ut_sai_switch_api; + } + + void _unhook_sai_apis() + { + sai_switch_api = pold_sai_switch_api; + } + + struct SwitchOrchTest : public ::testing::Test + { + SwitchOrchTest() + { + } + + void SetUp() override + { + _ut_reg_event_unsupported = false; + + map profile = { + { "SAI_VS_SWITCH_TYPE", "SAI_VS_SWITCH_TYPE_BCM56850" }, + { "KV_DEVICE_MAC_ADDRESS", "20:03:04:05:06:00" } + }; + + ut_helper::initSaiApi(profile); + + sai_attribute_t attr; + + attr.id = SAI_SWITCH_ATTR_INIT_SWITCH; + attr.value.booldata = true; + + auto status = sai_switch_api->create_switch(&gSwitchId, 1, &attr); + ASSERT_EQ(status, SAI_STATUS_SUCCESS); + } + + void initSwitchOrch() + { + // Init switch and create dependencies + m_app_db = make_shared("APPL_DB", 0); + m_config_db = make_shared("CONFIG_DB", 0); + m_state_db = make_shared("STATE_DB", 0); + + TableConnector stateDbSwitchTable(m_state_db.get(), "SWITCH_CAPABILITY"); + TableConnector conf_asic_sensors(m_config_db.get(), CFG_ASIC_SENSORS_TABLE_NAME); + TableConnector app_switch_table(m_app_db.get(), APP_SWITCH_TABLE_NAME); + TableConnector conf_suppress_asic_sdk_health_categories(m_config_db.get(), CFG_SUPPRESS_ASIC_SDK_HEALTH_EVENT_NAME); + + vector switch_tables = { + conf_asic_sensors, + 
conf_suppress_asic_sdk_health_categories, + app_switch_table + }; + + ASSERT_EQ(gSwitchOrch, nullptr); + gSwitchOrch = new SwitchOrch(m_app_db.get(), switch_tables, stateDbSwitchTable); + } + + void TearDown() override + { + gDirectory.m_values.clear(); + + delete gSwitchOrch; + gSwitchOrch = nullptr; + + ut_helper::uninitSaiApi(); + } + }; + + TEST_F(SwitchOrchTest, SwitchOrchTestSuppressCategories) + { + initSwitchOrch(); + _hook_sai_apis(); + + vector ts; + std::deque entries; + Table suppressAsicSdkHealthEventTable = Table(m_config_db.get(), CFG_SUPPRESS_ASIC_SDK_HEALTH_EVENT_NAME); + set all_categories({ + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_SW, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_FW, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_CPU_HW, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_ASIC_HW}); + set empty_category; + + // case: severity: fatal, operation: suppress all categories + entries.push_back({"fatal", "SET", + { + {"categories", "software,firmware,cpu_hw,asic_hw"} + }}); + auto consumer = dynamic_cast(gSwitchOrch->getExecutor(CFG_SUPPRESS_ASIC_SDK_HEALTH_EVENT_NAME)); + consumer->addToSync(entries); + entries.clear(); + _ut_stub_asic_sdk_health_event_attribute_to_check = SAI_SWITCH_ATTR_REG_FATAL_SWITCH_ASIC_SDK_HEALTH_CATEGORY; + static_cast(gSwitchOrch)->doTask(); + ASSERT_EQ(_ut_stub_asic_sdk_health_event_passed_categories, empty_category); + + // case: severity: warning, operation: suppress partial categories + entries.push_back({"warning", "SET", + { + {"categories", "software,cpu_hw,invalid_category"} + }}); + consumer->addToSync(entries); + entries.clear(); + _ut_stub_asic_sdk_health_event_attribute_to_check = SAI_SWITCH_ATTR_REG_WARNING_SWITCH_ASIC_SDK_HEALTH_CATEGORY; + static_cast(gSwitchOrch)->doTask(); + ASSERT_EQ(_ut_stub_asic_sdk_health_event_passed_categories, set({ + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_FW, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_ASIC_HW})); + + // case: invalid severity, nothing changed (to satisfy coverate) + 
entries.push_back({"warninga", "SET", + { + {"categories", "software,cpu_hw,asic_hw"} + }}); + consumer->addToSync(entries); + entries.clear(); + _ut_stub_asic_sdk_health_event_attribute_to_check = SAI_SWITCH_ATTR_REG_WARNING_SWITCH_ASIC_SDK_HEALTH_CATEGORY; + static_cast(gSwitchOrch)->doTask(); + ASSERT_EQ(_ut_stub_asic_sdk_health_event_passed_categories, set({ + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_FW, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_ASIC_HW})); + + // case: severity: warning, operation: set max_events only, which means to remove suppress list + entries.push_back({"warning", "SET", + { + {"max_events", "10"} + }}); + consumer->addToSync(entries); + entries.clear(); + _ut_stub_asic_sdk_health_event_attribute_to_check = SAI_SWITCH_ATTR_REG_WARNING_SWITCH_ASIC_SDK_HEALTH_CATEGORY; + static_cast(gSwitchOrch)->doTask(); + ASSERT_EQ(_ut_stub_asic_sdk_health_event_passed_categories, set({ + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_SW, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_FW, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_CPU_HW, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_ASIC_HW})); + + // case: severity: notice, operation: suppress no category + entries.push_back({"notice", "DEL", {}}); + consumer->addToSync(entries); + entries.clear(); + _ut_stub_asic_sdk_health_event_attribute_to_check = SAI_SWITCH_ATTR_REG_NOTICE_SWITCH_ASIC_SDK_HEALTH_CATEGORY; + static_cast(gSwitchOrch)->doTask(); + ASSERT_EQ(_ut_stub_asic_sdk_health_event_passed_categories, all_categories); + + _unhook_sai_apis(); + } + + TEST_F(SwitchOrchTest, SwitchOrchTestCheckCapability) + { + initSwitchOrch(); + + string value; + gSwitchOrch->m_switchTable.hget("switch", SWITCH_CAPABILITY_TABLE_ASIC_SDK_HEALTH_EVENT_CAPABLE, value); + ASSERT_EQ(value, "true"); + gSwitchOrch->m_switchTable.hget("switch", SWITCH_CAPABILITY_TABLE_REG_FATAL_ASIC_SDK_HEALTH_CATEGORY, value); + ASSERT_EQ(value, "true"); + gSwitchOrch->m_switchTable.hget("switch", SWITCH_CAPABILITY_TABLE_REG_WARNING_ASIC_SDK_HEALTH_CATEGORY, value); + 
ASSERT_EQ(value, "true"); + gSwitchOrch->m_switchTable.hget("switch", SWITCH_CAPABILITY_TABLE_REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY, value); + ASSERT_EQ(value, "true"); + } + + TEST_F(SwitchOrchTest, SwitchOrchTestCheckCapabilityUnsupported) + { + _ut_reg_event_unsupported = true; + _hook_sai_apis(); + initSwitchOrch(); + + string value; + gSwitchOrch->m_switchTable.hget("switch", SWITCH_CAPABILITY_TABLE_ASIC_SDK_HEALTH_EVENT_CAPABLE, value); + ASSERT_EQ(value, "false"); + gSwitchOrch->m_switchTable.hget("switch", SWITCH_CAPABILITY_TABLE_REG_FATAL_ASIC_SDK_HEALTH_CATEGORY, value); + ASSERT_EQ(value, "false"); + gSwitchOrch->m_switchTable.hget("switch", SWITCH_CAPABILITY_TABLE_REG_WARNING_ASIC_SDK_HEALTH_CATEGORY, value); + ASSERT_EQ(value, "false"); + gSwitchOrch->m_switchTable.hget("switch", SWITCH_CAPABILITY_TABLE_REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY, value); + ASSERT_EQ(value, "false"); + + // case: unsupported severity. To satisfy coverage. + vector ts; + std::deque entries; + Table suppressAsicSdkHealthEventTable = Table(m_config_db.get(), CFG_SUPPRESS_ASIC_SDK_HEALTH_EVENT_NAME); + entries.push_back({"fatal", "SET", + { + {"categories", "software,firmware,cpu_hw,asic_hw"} + }}); + set empty_category; + auto consumer = dynamic_cast(gSwitchOrch->getExecutor(CFG_SUPPRESS_ASIC_SDK_HEALTH_EVENT_NAME)); + consumer->addToSync(entries); + entries.clear(); + _ut_stub_asic_sdk_health_event_attribute_to_check = SAI_SWITCH_ATTR_REG_FATAL_SWITCH_ASIC_SDK_HEALTH_CATEGORY; + _ut_stub_asic_sdk_health_event_passed_categories = empty_category; + static_cast(gSwitchOrch)->doTask(); + ASSERT_EQ(_ut_stub_asic_sdk_health_event_passed_categories, empty_category); + } + + TEST_F(SwitchOrchTest, SwitchOrchTestHandleEvent) + { + initSwitchOrch(); + + sai_timespec_t timestamp = {.tv_sec = 1701160447, .tv_nsec = 538710245}; + sai_switch_health_data_t data = {.data_type = SAI_HEALTH_DATA_TYPE_GENERAL}; + vector data_from_sai({100, 101, 115, 99, 114, 105, 112, 116, 105, 245, 111, 110, 245, 
10, 123, 125, 100, 100}); + sai_u8_list_t description; + description.list = data_from_sai.data(); + description.count = (uint32_t)(data_from_sai.size() - 2); + on_switch_asic_sdk_health_event(gSwitchId, + SAI_SWITCH_ASIC_SDK_HEALTH_SEVERITY_FATAL, + timestamp, + SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_FW, + data, + description); + + string key = "2023-11-28 08:34:07"; + string value; + gSwitchOrch->m_asicSdkHealthEventTable->hget(key, "category", value); + ASSERT_EQ(value, "firmware"); + gSwitchOrch->m_asicSdkHealthEventTable->hget(key, "severity", value); + ASSERT_EQ(value, "fatal"); + gSwitchOrch->m_asicSdkHealthEventTable->hget(key, "description", value); + ASSERT_EQ(value, "description\n{}"); + } +}