Skip to content

Commit

Permalink
[Mellanox] Extend mellanox platform API to report SFP error event (#4365)
Browse files Browse the repository at this point in the history

* extend mellanox platform API to report SFP error event
* remove unnecessary loop code
* install enum34 to pmon to support using Enum
  • Loading branch information
keboliu authored and abdosi committed Apr 15, 2020
1 parent c3e030b commit e4bd7ab
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 26 deletions.
3 changes: 2 additions & 1 deletion dockers/docker-platform-monitor/Dockerfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ RUN apt-get update && \
rrdtool \
python-smbus \
ethtool \
dmidecode
dmidecode && \
pip install enum34

{% if docker_platform_monitor_debs.strip() -%}
# Copy locally-built Debian package dependencies
Expand Down
15 changes: 1 addition & 14 deletions platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,25 +433,12 @@ def get_change_event(self, timeout=0):
timeout = MAX_SELECT_DELAY
while True:
status = self.sfp_event.check_sfp_status(port_dict, timeout)
if not port_dict == {}:
if bool(port_dict):
break
else:
status = self.sfp_event.check_sfp_status(port_dict, timeout)

if status:
# get_change_event has the meaning of retrieving all the notifications through a single call.
# Typically this is implemented via a select framework, which requires the underlying file-reading
# interface to be able to retrieve all notifications without blocking once the fd has been selected.
# However, the SDK doesn't provide any interface that satisfies this requirement. As a result,
# a check_sfp_status call that returns only one notification may indicate more notifications in its queue.
# For this reason, we have to iterate in a loop to get all the notifications whenever
# the first call returns at least one.
i = 0
while i < self.MAX_SELECT_EVENT_RETURNED:
status = self.sfp_event.check_sfp_status(port_dict, 0)
if not status:
break
i = i + 1
return True, {'sfp':port_dict}
else:
return True, {'sfp':{}}
94 changes: 83 additions & 11 deletions platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,69 @@
from python_sdk_api.sx_api import *
from sonic_daemon_base.daemon_base import Logger

SDK_SFP_STATE_IN = 0x1
# SFP status from PMAOS register
# 0x1 plug in
# 0x2 plug out
# 0x3 plug in with error
# 0x4 disabled; in this state the SFP eeprom is not accessible
# and the presence status is also reported as not present,
# so treat it as plug out.
SDK_SFP_STATE_IN = 0x1
SDK_SFP_STATE_OUT = 0x2
SDK_SFP_STATE_ERR = 0x3
SDK_SFP_STATE_DIS = 0x4

# SFP status that will be handled by XCVRD
STATUS_PLUGIN = '1'
STATUS_PLUGOUT = '0'
STATUS_UNKNOWN = '2'
STATUS_ERR_I2C_STUCK = '2'
STATUS_ERR_BAD_EEPROM = '3'
STATUS_ERR_UNSUPPORTED_CABLE = '4'
STATUS_ERR_HIGH_TEMP = '5'
STATUS_ERR_BAD_CABLE = '6'

# SFP statuses used in this file only; they are not exposed to XCVRD.
# STATUS_ERROR is mapped to a specific error status according to the error code.
STATUS_UNKNOWN = '-1'
STATUS_ERROR = '-2'

# SFP error codes, only valid when the SFP is in SDK_SFP_STATE_ERR status.
# Only 0x2, 0x3, 0x5, 0x6 and 0x7 block eeprom access,
# so only those errors are reported to XCVRD; all other errors are
# just printed to syslog.

'''
0x0: "Power_Budget_Exceeded",
0x1: "Long_Range_for_non_MLNX_cable_or_module",
0x2: "Bus_stuck",
0x3: "bad_or_unsupported_EEPROM",
0x4: "Enforce_part_number_list",
0x5: "unsupported_cable",
0x6: "High_Temperature",
0x7: "bad_cable",
0x8: "PMD_type_is_not_enabled",
0x9: "[internal]Laser_TEC_failure",
0xa: "[internal]High_current",
0xb: "[internal]High_voltage",
0xd: "[internal]High_power",
0xe: "[internal]Module_state_machine_fault",
0xc: "pcie_system_power_slot_Exceeded"
'''

# SFP errors that will block eeprom accessing
sdk_sfp_err_type_dict = {
0x2: STATUS_ERR_I2C_STUCK,
0x3: STATUS_ERR_BAD_EEPROM,
0x5: STATUS_ERR_UNSUPPORTED_CABLE,
0x6: STATUS_ERR_HIGH_TEMP,
0x7: STATUS_ERR_BAD_CABLE
}

sfp_value_status_dict = {
SDK_SFP_STATE_IN: STATUS_PLUGIN,
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
SDK_SFP_STATE_IN: STATUS_PLUGIN,
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
SDK_SFP_STATE_ERR: STATUS_ERROR,
SDK_SFP_STATE_DIS: STATUS_PLUGOUT,
}

# system level event/error
Expand Down Expand Up @@ -174,7 +228,7 @@ def check_sfp_status(self, port_change, timeout):

for fd in read:
if fd == self.rx_fd_p.fd:
success, port_list, module_state = self.on_pmpe(self.rx_fd_p)
success, port_list, module_state, error_type = self.on_pmpe(self.rx_fd_p)
if not success:
logger.log_error("failed to read from {}".format(fd))
break
Expand All @@ -192,15 +246,23 @@ def check_sfp_status(self, port_change, timeout):
found += 1
continue

# If the SDK reports SFP status error (0x3), read error_type to get the detailed error
if sfp_state == STATUS_ERROR:
if error_type in sdk_sfp_err_type_dict.keys():
# When the SFP is in error status, overwrite sfp_state with the exact error code
sfp_state = sdk_sfp_err_type_dict[error_type]
else:
# Errors that don't block eeprom access are not reported to XCVRD
logger.log_info("SFP error on port but not blocking eeprom read, error_type {}".format(error_type))
found +=1
continue

for port in port_list:
logger.log_info("SFP on port {} state {}".format(port, sfp_state))
port_change[port] = sfp_state
found += 1

if found == 0:
return False
else:
return True
return found != 0

def on_pmpe(self, fd_p):
''' on port module plug event handler '''
Expand Down Expand Up @@ -228,7 +290,17 @@ def on_pmpe(self, fd_p):
port_list_size = pmpe_t.list_size
logical_port_list = pmpe_t.log_port_list
module_state = pmpe_t.module_state

error_type = pmpe_t.error_type
module_id = pmpe_t.module_id

if module_state == SDK_SFP_STATE_ERR:
logger.log_error("Receive PMPE error event on module {}: status {} error type {}".format(module_id, module_state, error_type))
elif module_state == SDK_SFP_STATE_DIS:
logger.log_info("Receive PMPE disable event on module {}: status {}".format(module_id, module_state))
elif module_state == SDK_SFP_STATE_IN or module_state == SDK_SFP_STATE_OUT:
logger.log_info("Receive PMPE plug in/out event on module {}: status {}".format(module_id, module_state))
else:
logger.log_error("Receive PMPE unknown event on module {}: status {}".format(module_id, module_state))
for i in xrange(port_list_size):
logical_port = sx_port_log_id_t_arr_getitem(logical_port_list, i)
rc = sx_api_port_device_get(self.handle, 1 , 0, port_attributes_list, port_cnt_p)
Expand All @@ -247,4 +319,4 @@ def on_pmpe(self, fd_p):
delete_sx_port_attributes_t_arr(port_attributes_list)
delete_uint32_t_p(port_cnt_p)

return status, label_port_list, module_state,
return status, label_port_list, module_state, error_type

0 comments on commit e4bd7ab

Please sign in to comment.