Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox] mlnx-sfpd init flow enhancement #3294

Merged
merged 9 commits into from
Aug 8, 2019
104 changes: 73 additions & 31 deletions platform/mellanox/mlnx-sfpd/scripts/mlnx-sfpd
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ STATUS_UNKNOWN = '2'

SFPD_LIVENESS_EXPIRE_SECS = 30

SDK_DAEMON_READY_FILE = '/tmp/sdk_ready'

sfp_value_status_dict = {
SDK_SFP_STATE_IN: STATUS_PLUGIN,
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
Expand Down Expand Up @@ -64,7 +66,8 @@ def log_error(msg, also_print_to_console=False):
class MlnxSfpd:
''' Listen to plugin/plugout cable events '''

SX_OPEN_RETRIES = 20
SX_OPEN_RETRIES = 30
SX_OPEN_TIMEOUT = 5
SELECT_TIMEOUT = 1

def __init__(self):
Expand All @@ -75,7 +78,6 @@ class MlnxSfpd:
# Allocate SDK fd and user channel structures
self.rx_fd_p = new_sx_fd_t_p()
self.user_channel_p = new_sx_user_channel_t_p()

self.state_db = SonicV2Connector(host=REDIS_HOSTIP)

# Register our signal handlers
Expand All @@ -98,37 +100,78 @@ class MlnxSfpd:
def initialize(self):
self.state_db.connect("STATE_DB")

# open SDK API handle
# retry at most SX_OPEN_RETRIES times to wait
# until SDK is started during system startup
retry = 1
while True:
rc, self.handle = sx_api_open(None)
if rc == SX_STATUS_SUCCESS:
break

log_warning("failed to open SDK API handle... retrying {}".format(retry))
swid_cnt_p = None

time.sleep(2 ** retry)
retry += 1

if retry > self.SX_OPEN_RETRIES:
raise RuntimeError("failed to open SDK API handle after {} retries".format(retry))

rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("sx_api_host_ifc_open exited with error, rc {}".format(rc))
try:
# Wait for the SDK daemon to start by polling for the sdk_ready file
retry = 0
while not os.path.exists(SDK_DAEMON_READY_FILE):
stephenxs marked this conversation as resolved.
Show resolved Hide resolved
if retry >= self.SX_OPEN_RETRIES:
raise RuntimeError("SDK daemon failed to start after {} retries and {} seconds waiting, exiting..."
.format(retry, self.SX_OPEN_TIMEOUT * self.SX_OPEN_RETRIES))
else:
log_info("SDK daemon not started yet, retry {} times".format(retry))
retry = retry + 1
time.sleep(self.SX_OPEN_TIMEOUT)

self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
self.user_channel_p.channel.fd = self.rx_fd_p
# to make sure SDK daemon has started
time.sleep(self.SX_OPEN_TIMEOUT)

rc = sx_api_host_ifc_trap_id_register_set(self.handle,
SX_ACCESS_CMD_REGISTER,
self.swid,
SX_TRAP_ID_PMPE,
self.user_channel_p)
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(c))
# Once the SDK daemon has started, sx_api_open and sx_api_host_ifc_open are ready to be called
rc, self.handle = sx_api_open(None)
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("failed to call sx_api_open with rc {}, exiting...".format(rc))

rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
if rc != SX_STATUS_SUCCESS:
raise RuntimeError("failed to call sx_api_host_ifc_open with rc {}, exiting...".format(rc))

self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
self.user_channel_p.channel.fd = self.rx_fd_p

# Wait for switch to be created and initialized inside SDK
retry = 0
swid_cnt_p = new_uint32_t_p()
uint32_t_p_assign(swid_cnt_p, 0)
swid_cnt = 0
while True:
if retry >= self.SX_OPEN_RETRIES:
raise RuntimeError("switch not created after {} retries and {} seconds waiting, exiting..."
.format(retry, self.SX_OPEN_RETRIES * self.SX_OPEN_TIMEOUT))
else:
rc = sx_api_port_swid_list_get(self.handle, None, swid_cnt_p)
if rc == SX_STATUS_SUCCESS:
swid_cnt = uint32_t_p_value(swid_cnt_p)
if swid_cnt > 0:
delete_uint32_t_p(swid_cnt_p)
swid_cnt_p = None
break
else:
log_info("switch not created yet, swid_cnt {}, retry {} times and wait for {} seconds"
.format(swid_cnt, retry, self.SX_OPEN_TIMEOUT * retry))
else:
raise RuntimeError("sx_api_port_swid_list_get fail with rc {}, retry {} times and wait for {} seconds".
format(rc, retry, self.SX_OPEN_TIMEOUT * retry))

retry = retry + 1
time.sleep(self.SX_OPEN_TIMEOUT)

# Once the switch has been created inside the SDK, sx_api_host_ifc_trap_id_register_set is ready to be called
rc = sx_api_host_ifc_trap_id_register_set(self.handle,
SX_ACCESS_CMD_REGISTER,
self.swid,
SX_TRAP_ID_PMPE,
self.user_channel_p)

if rc != SX_STATUS_SUCCESS:
raise RuntimeError("sx_api_host_ifc_trap_id_register_set failed with rc {}, exiting...".format(rc))

self.running = True
except Exception as e:
log_error("mlnx-sfpd initialization failed due to {}, exiting...".format(repr(e)))
if swid_cnt_p is not None:
delete_uint32_t_p(swid_cnt_p)
self.deinitialize()

def deinitialize(self):
# remove mlnx-sfpd liveness key in DB if not expired yet
Expand Down Expand Up @@ -156,7 +199,6 @@ class MlnxSfpd:
log_error("sx_api_close exited with error, rc {}".format(rc))

def run(self):
self.running = True

while self.running:
try:
Expand Down