Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLS Silverstone: enable the init sequence for all of the QSFPDD CMIS4… #95

Merged
merged 4 commits into from
Dec 1, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 52 additions & 18 deletions device/celestica/x86_64-cel_silverstone-r0/plugins/sfputil.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,16 +409,32 @@ def _write_byte(self, port_num, devid, page, off, val):
f.seek(addr)
f.write(chr(val))
except Exception as ex:
log_info("write failed: {0}".format(ex))
log_info("PORT {0}: pg={1}, ofs={2}: write failed: {3}".format(port_num, page, off, ex))
finally:
f.close()

def _init_cmis_module_custom(self, port_num, xcvr, hwsku):
# As of now, init sequence is only necessary for 'INNOLIGHT T-DP4CNT-N00'
if xcvr != 'INNOLIGHT T-DP4CNT-N00':
return True

log_info("PORT {0}: {1}: _init_cmis_module_custom".format(port_num, xcvr))
def _get_application_code(self, port_num, host_intf):
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no guarantee that 100G PAM4 mode is always the application 2, and hence we'll need to decode the application advertisement and figure the correct application code

buf = self._read_eeprom_devid(port_num, self.IDENTITY_EEPROM_ADDR, 85, 36)
if buf is None:
return 1
tbl = self.parse_media_type(buf, 0)
app = 1
if tbl is not None:
zhenggen-xu marked this conversation as resolved.
Show resolved Hide resolved
offset = 0
hid = int(buf[1 + offset], 16)
while (app <= 8) and (hid != 0) and (hid != 0xff):
(h, m) = self.parse_application(tbl, buf[1 + offset], buf[2 + offset])
if h in host_intf:
log_info("PORT {0}: {1}-{2} looks good".format(port_num, app, h))
break
app += 1
offset += 4
hid = int(buf[1 + offset], 16)
return app if app <= 8 else 1

def _init_cmis4_module(self, port_num, xcvr, hwsku):

log_info("PORT {0}: {1}: _init_cmis4_module".format(port_num, xcvr))

# Allow 1s for software reset
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, -1, 26, 0x08)
Expand All @@ -429,15 +445,18 @@ def _init_cmis_module_custom(self, port_num, xcvr, hwsku):
# Hi-Power
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, -1, 26, 0x00)
# Application selection
app = 1
if '128x100' in hwsku:
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 145, 0x21)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 146, 0x21)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 147, 0x25)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 148, 0x25)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 149, 0x29)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 150, 0x29)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 151, 0x2d)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 152, 0x2d)
app = self._get_application_code(port_num, '100GAUI-2 C2M (Annex 135G),100GBASE-CR2 (Clause 136)')
if app != 1:
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 145, (app << 4) | 0x01)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 146, (app << 4) | 0x01)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 147, (app << 4) | 0x05)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 148, (app << 4) | 0x05)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 149, (app << 4) | 0x09)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 150, (app << 4) | 0x09)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 151, (app << 4) | 0x0d)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 152, (app << 4) | 0x0d)
else:
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 145, 0x11)
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 146, 0x11)
Expand All @@ -450,7 +469,7 @@ def _init_cmis_module_custom(self, port_num, xcvr, hwsku):
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 143, 0xff)
# Initialize datapath
self._write_byte(port_num, self.IDENTITY_EEPROM_ADDR, 0x10, 128, 0x00)
time.sleep(0.5)
time.sleep(1)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The UT result shows that we need a few seconds for the actual configuration status correctly reflected on the register, but we don't really want to wait too long in the xcvrd, so I slightly enlarge the delay from 500ms to 1sec, hope this could reduce the redudant errors in the syslog

# Validate configuration status
buf = self._read_eeprom_devid(port_num, self.IDENTITY_EEPROM_ADDR, self._page_to_flat(202, 0x11), 4)
err = "".join(buf)
Expand All @@ -477,7 +496,15 @@ def _init_cmis_module(self, port_num):
#log_info("_init_cmis_module: p={0}, id='{1}', name='{2}', part='{3}'".format(port_num, buf[0], name, part))
xcvr = name.upper() + " " + part.upper()

return self._init_cmis_module_custom(port_num, xcvr, self.hwsku)
# Dispatch as per the CMIS revision id
buf = self._read_eeprom_devid(port_num, self.IDENTITY_EEPROM_ADDR, 0x0, 4)
rev = 0x30
if buf is not None:
rev = int(buf[1], 16)
if rev >= 0x40:
return self._init_cmis4_module(port_num, xcvr, self.hwsku)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of the CMIS3 modules we have are working fine with 4x100G in its default application mode, and hence this init sequence will only be performed on CMIS4 modules.

Note: The definition of the datapath initialiation slightly differs from CMIS3 to CMIS4, and hence we can't use _init_cmis4_module() on CMIS3 modules.

return True

def get_transceiver_change_event(self, timeout=0):
"""
Expand Down Expand Up @@ -524,7 +551,14 @@ def get_transceiver_change_event(self, timeout=0):
# initiate QSFP-DD software reset if failure counter > 2
if self.mod_failure[x] > 2:
self.mod_failure[x] = 0
self._write_byte(x, self.IDENTITY_EEPROM_ADDR, -1, 26, 0x08)
# perform a software reset if it's not a DAC (Passive Copper)
# Note:
# This is actually a hotfix for 'Eoptolink EOLD-134HG-02-M6, QSFPDD CMIS v3',
# its module state could get stuck at ModuleLowPwr upon reinsertion, and a software
# reset is necessary to get it recovered.
buf = self._read_eeprom_devid(x, self.IDENTITY_EEPROM_ADDR, 85, 1)
if (buf is not None) and (int(buf[0], 16) != 0x03):
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is to address the DAC failure below:
pmon#sfputil.py: write failed: [Errno 110] Connection timed out

Some of the DAC (Passive Copper) do not support software reset, and hence such a write attempt will fail

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case of DAC, should we even add failure counter and come here, See line 553:
self.mod_failure[x] += 1

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

another generic question, it seems we are keep monitoring all the settings and do reset/init etc all the time. It is a bit concerning if we that could be disruptive. Thinking maybe we could only do these steps when we got the optical insert event, and finish the bring up process once. of course, we will do it again if insert happens again. Let me know.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. We'll actually do nothing in the case of DAC, so I'm not sure why we need to keep the counter advanced? Do we want to have any other automatic recovery for DAC?

  2. The reason why this is not handled in the insertion event is that, the I2C read could fail and it was observed during the QA test last week. However, we could add a inited flag to make sure this is only performed once and only once, let me update the code logic

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1- that is exactly my point, we should not advance the counter and then avoid the write here, we should eliminate the counter advance in the code earlier so it won't even be a failure case.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

copy that, and thanks, I'll update the code accordingly

self._write_byte(x, self.IDENTITY_EEPROM_ADDR, -1, 26, 0x08)

# Monitoring the QSFP-DD initialization state, and reinitiate it if necessary
buf = self._read_eeprom_devid(x, self.IDENTITY_EEPROM_ADDR, self._page_to_flat(202, 0x11), 4)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1431,7 +1431,7 @@ static int smbus_access(struct i2c_adapter *adapter, u16 addr,
goto Done;
}

#if 1 /* 100 kHz */
#if 0 /* 100 kHz */
Copy link
Author

@ds952811 ds952811 Nov 25, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Honestly, this is unlikely to be the root cause of I2C failure reported yesterday, however it's the only thing under our control without having the Celestica engaged, that's why I did this.
Please note it's always working on our test unit no matter if high-speed (BIT6-0x40) is enabled or not.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we turn off the high speed mode, how slow would it be? I don't feel we should do this unless we feel it was necessary.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. We will have ~30% delays for each I2C operations when the BIT6 (high-speed) is cleared
  2. Based on the latest test report from Broadcom QA, slowing down the clock does not improve the transceiver compatibility.
    And hence, I'll revert this change.

Note: None of the QSFPDD optics is available at my side, so I'm counting on QA for the verifications

e.g.
Dante,

It appears Eoptolink 3.0 broke. It took ~17 minutes for link up after remove/insert
Please see the logs below

Regards,
-kevin

......
Nov 25 21:47:25.015491 sonic INFO pmon#sfputil.py: PORT 31: unable to read PAGE0
Nov 25 21:47:25.015491 sonic INFO pmon#sfputil.py: PORT 31: Unable to initialize the module
Nov 25 21:48:06.595530 sonic INFO pmon#sfputil.py: PORT 31: unable to read PAGE0
Nov 25 21:48:06.595530 sonic INFO pmon#sfputil.py: PORT 31: Unable to initialize the module
Nov 25 21:49:36.731519 sonic INFO pmon#sfputil.py: PORT 31: unable to read PAGE0
Nov 25 21:49:36.731519 sonic INFO pmon#sfputil.py: PORT 31: Unable to initialize the module
Nov 25 21:50:11.383533 sonic INFO pmon#sfputil.py: PORT 31: unable to read PAGE0
Nov 25 21:50:11.383533 sonic INFO pmon#sfputil.py: PORT 31: Unable to initialize the module
Nov 25 21:50:39.103534 sonic INFO pmon#sfputil.py: PORT 31: unable to read PAGE0
Nov 25 21:50:39.103534 sonic INFO pmon#sfputil.py: PORT 31: Unable to initialize the module
Nov 25 21:51:34.543512 sonic INFO pmon#sfputil.py: PORT 31: unable to read PAGE0

iowrite8(portid | 0x40, pci_bar + REG_ID0);
#else
iowrite8(portid, pci_bar + REG_ID0);
Expand Down Expand Up @@ -2297,4 +2297,4 @@ module_exit(silverstone_exit);
MODULE_AUTHOR("Celestica Inc.");
MODULE_DESCRIPTION("Celestica Silverstone platform driver");
MODULE_VERSION(MOD_VERSION);
MODULE_LICENSE("GPL");
MODULE_LICENSE("GPL");