Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox] Auto correct PSU voltage threshold (WA) #10394

Merged
merged 2 commits into from
Apr 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions device/mellanox/x86_64-mlnx_msn3700-r0/sensors.conf
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2 12V Rail Pwr (out)"
label curr1 "PSU-2 220V Rail Curr (in)"
label curr2 "PSU-2 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-1 220V Rail (in)"
ignore in2
Expand All @@ -99,6 +102,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1 12V Rail Pwr (out)"
label curr1 "PSU-1 220V Rail Curr (in)"
label curr2 "PSU-1 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952

# Chassis fans
chip "mlxreg_fan-isa-*"
Expand Down
6 changes: 6 additions & 0 deletions device/mellanox/x86_64-mlnx_msn3700c-r0/sensors.conf
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2 12V Rail Pwr (out)"
label curr1 "PSU-2 220V Rail Curr (in)"
label curr2 "PSU-2 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-1 220V Rail (in)"
ignore in2
Expand All @@ -99,6 +102,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1 12V Rail Pwr (out)"
label curr1 "PSU-1 220V Rail Curr (in)"
label curr2 "PSU-1 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952

# Chassis fans
chip "mlxreg_fan-isa-*"
Expand Down
6 changes: 6 additions & 0 deletions device/mellanox/x86_64-mlnx_msn3800-r0/sensors.conf
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2 12V Rail Pwr (out)"
label curr1 "PSU-2 220V Rail Curr (in)"
label curr2 "PSU-2 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-1 220V Rail (in)"
ignore in2
Expand All @@ -120,6 +123,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1 12V Rail Pwr (out)"
label curr1 "PSU-1 220V Rail Curr (in)"
label curr2 "PSU-1 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952

# Chassis fans
chip "mlxreg_fan-isa-*"
Expand Down
6 changes: 6 additions & 0 deletions device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1(L) 12V Rail Pwr (out)"
label curr1 "PSU-1(L) 220V Rail Curr (in)"
label curr2 "PSU-1(L) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-2(R) 220V Rail (in)"
ignore in2
Expand All @@ -181,6 +184,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2(R) 12V Rail Pwr (out)"
label curr1 "PSU-2(R) 220V Rail Curr (in)"
label curr2 "PSU-2(R) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952

# Chassis fans
chip "mlxreg_fan-isa-*"
Expand Down
6 changes: 6 additions & 0 deletions device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf.a1
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1(L) 12V Rail Pwr (out)"
label curr1 "PSU-1(L) 220V Rail Curr (in)"
label curr2 "PSU-1(L) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-2(R) 220V Rail (in)"
ignore in2
Expand All @@ -137,6 +140,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2(R) 12V Rail Pwr (out)"
label curr1 "PSU-2(R) 220V Rail Curr (in)"
label curr2 "PSU-2(R) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952

# Chassis fans
chip "mlxreg_fan-isa-*"
Expand Down
70 changes: 70 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/psu.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@

try:
import os
import time
from sonic_platform_base.psu_base import PsuBase
from sonic_py_common.logger import Logger
from .device_data import DeviceDataManager
from .led import PsuLed, SharedLed, ComponentFaultyIndicator
from . import utils
from .vpd_parser import VpdParser
Expand Down Expand Up @@ -411,6 +413,7 @@ def get_voltage_high_threshold(self):
capability = utils.read_str_from_file(self.psu_voltage_capability)
if 'max' in capability:
max_voltage = utils.read_int_from_file(self.psu_voltage_max, log_func=logger.log_info)
max_voltage = InvalidPsuVolWA.run(self, max_voltage, self.psu_voltage_max)
return float(max_voltage) / 1000

return None
Expand All @@ -431,6 +434,7 @@ def get_voltage_low_threshold(self):
capability = utils.read_str_from_file(self.psu_voltage_capability)
if 'min' in capability:
min_voltage = utils.read_int_from_file(self.psu_voltage_min, log_func=logger.log_info)
min_voltage = InvalidPsuVolWA.run(self, min_voltage, self.psu_voltage_min)
return float(min_voltage) / 1000

return None
Expand All @@ -448,3 +452,69 @@ def get_maximum_supplied_power(self):
return float(power_max) / 1000000
else:
return None


class InvalidPsuVolWA:
    """Workaround for a known hardware issue where a PSU voltage threshold reads as 127998.

    When a threshold file returns INVALID_VOLTAGE_VALUE, run() does the following:
      1. Check that the platform is one of EXPECT_PLATFORMS.
      2. Check that the PSU vendor and capacity read from VPD match the affected PSU
         (a value of 'N/A' is not treated as a mismatch).
      3. Run "sensors -s" so that the "set" statements of the sensors configuration are applied.
      4. Poll the threshold file until the value changes or WAIT_TIME seconds have passed.

    This issue is found on 3700, 3700c, 3800, 4600c
    """

    INVALID_VOLTAGE_VALUE = 127998
    EXPECT_VENDOR_NAME = 'DELTA'
    EXPECT_CAPACITY = '1100'
    EXPECT_PLATFORMS = ['x86_64-mlnx_msn3700-r0', 'x86_64-mlnx_msn3700c-r0', 'x86_64-mlnx_msn3800-r0', 'x86_64-mlnx_msn4600c-r0']
    MFR_FIELD = 'MFR_NAME'
    CAPACITY_FIELD = 'CAPACITY'
    # Number of one-second polling attempts made by wait_set_done()
    WAIT_TIME = 5
    # The lm-sensors binary is "sensors" (plural); "-s" evaluates the "set" statements
    SENSORS_SET_COMMAND = 'sensors -s'

    @classmethod
    def run(cls, psu, threshold_value, threshold_file):
        """Return a usable threshold value, applying the workaround when needed.

        Args:
            psu: PSU object; its ``index`` and ``vpd_parser`` attributes are used.
            threshold_value: Value that was already read from ``threshold_file``.
            threshold_file: Path of the file the threshold is re-read from.

        Returns:
            ``threshold_value`` unchanged when it is not the invalid value, or when the
            platform/vendor/capacity does not match the affected hardware. Otherwise the
            result of wait_set_done(), which is None if the value never recovers.
        """
        if threshold_value != cls.INVALID_VOLTAGE_VALUE:
            # If the threshold value is not an invalid value, just return
            return threshold_value

        platform_name = DeviceDataManager.get_platform_name()
        # Apply the WA to specified platforms
        if platform_name not in cls.EXPECT_PLATFORMS:
            # It is unlikely to go to this branch, so we log a warning here
            logger.log_warning('PSU {} threshold file {} value {}, but platform is {}'.format(psu.index, threshold_file, threshold_value, platform_name))
            return threshold_value

        # Check PSU vendor, make sure it is DELTA ('N/A' is let through)
        vendor_name = psu.vpd_parser.get_entry_value(cls.MFR_FIELD)
        if vendor_name != 'N/A' and vendor_name != cls.EXPECT_VENDOR_NAME:
            # It is unlikely to go to this branch, so we log a warning here
            logger.log_warning('PSU {} threshold file {} value {}, but its vendor is {}'.format(psu.index, threshold_file, threshold_value, vendor_name))
            return threshold_value

        # Check PSU capacity, make sure it is 1100 ('N/A' is let through)
        capacity = psu.vpd_parser.get_entry_value(cls.CAPACITY_FIELD)
        if capacity != 'N/A' and capacity != cls.EXPECT_CAPACITY:
            logger.log_warning('PSU {} threshold file {} value {}, but its capacity is {}'.format(psu.index, threshold_file, threshold_value, capacity))
            return threshold_value

        # Run "sensors -s" to trigger the hardware to expose the real threshold value
        utils.run_command(cls.SENSORS_SET_COMMAND)

        # Wait for the threshold value change
        return cls.wait_set_done(threshold_file)

    @classmethod
    def wait_set_done(cls, threshold_file):
        """Poll ``threshold_file`` once per second until it no longer holds the invalid value.

        Args:
            threshold_file: Path of the threshold file to read.

        Returns:
            The first value read that differs from INVALID_VOLTAGE_VALUE, or None when
            every one of the WAIT_TIME attempts still returned the invalid value.
            NOTE(review): a caller that passes the result straight to float() will raise
            TypeError on None - confirm that callers handle the timeout case.
        """
        for _ in range(cls.WAIT_TIME):
            value = utils.read_int_from_file(threshold_file, log_func=logger.log_info)
            if value != cls.INVALID_VOLTAGE_VALUE:
                return value

            time.sleep(1)

        logger.log_error('{} does not recover PSU threshold sensor after {} seconds'.format(cls.SENSORS_SET_COMMAND, cls.WAIT_TIME))
        return None
13 changes: 13 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,3 +194,16 @@ def _impl(*args, **kwargs):
return return_value
return _impl
return wrapper


def run_command(command):
    """
    Run a shell command and hand back what it wrote to standard output.

    The command is executed through the shell; standard error is captured
    and discarded. This is a best-effort helper: any exception raised while
    starting or reading from the process results in None.

    :param command: Shell command string.
    :return: Standard output of the command with surrounding whitespace
             stripped, or None if an exception occurred.
    """
    try:
        child = subprocess.Popen(
            command,
            shell=True,
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout_text, _ = child.communicate()
        return stdout_text.strip()
    except Exception:
        return None
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
SN_VPD_FIELD = "SN_VPD_FIELD"
PN_VPD_FIELD = "PN_VPD_FIELD"
REV_VPD_FIELD = "REV_VPD_FIELD"
MFR_VPD_FIELD = "MFR_NAME"


class VpdParser:
Expand Down Expand Up @@ -82,3 +83,17 @@ def get_revision(self):
logger.log_error("Fail to read revision: No key {} in VPD {}".format(REV_VPD_FIELD, self.vpd_file))
return 'N/A'
return self.vpd_data.get(REV_VPD_FIELD, 'N/A')

def get_entry_value(self, key):
    """
    Look up an arbitrary entry in the VPD data of the device

    Args:
        key: Name of the VPD entry to look up

    Returns:
        string: Value stored for the key, or 'N/A' when the key is absent.
        A warning is logged only when _get_data() reports success and the
        key is still missing.
    """
    if self._get_data():
        if key not in self.vpd_data:
            logger.log_warning("Fail to read vpd info: No key {} in VPD {}".format(key, self.vpd_file))
            return 'N/A'
    return self.vpd_data.get(key, 'N/A')


37 changes: 37 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_psu.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,40 @@ def test_psu_vpd(self):
assert psu.get_model() == 'MTEF-PSF-AC-C'
assert psu.get_serial() == 'MT1946X07684'
assert psu.get_revision() == 'A3'

assert psu.vpd_parser.get_entry_value('MFR_NAME') == 'DELTA'

@mock.patch('sonic_platform.utils.read_int_from_file', mock.MagicMock(return_value=9999))
@mock.patch('sonic_platform.utils.run_command')
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_platform_name')
@mock.patch('sonic_platform.vpd_parser.VpdParser.get_entry_value')
def test_psu_workaround(self, mock_get_entry_value, mock_get_platform_name, mock_run_command):
    """Exercise every gate of InvalidPsuVolWA.run and the recovery path.

    read_int_from_file is patched to return 9999, so once the workaround
    command has been issued the re-read threshold is 9999.
    """
    from sonic_platform.psu import InvalidPsuVolWA
    psu = Psu(0)
    invalid_value = InvalidPsuVolWA.INVALID_VOLTAGE_VALUE

    # Threshold value is not InvalidPsuVolWA.INVALID_VOLTAGE_VALUE
    assert InvalidPsuVolWA.run(psu, 9999, '') == 9999

    # Platform name is not in InvalidPsuVolWA.EXPECT_PLATFORMS
    mock_get_platform_name.return_value = 'some platform'
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == invalid_value

    # PSU vendor is not InvalidPsuVolWA.EXPECT_VENDOR_NAME
    vpd_info = {
        InvalidPsuVolWA.MFR_FIELD: 'some psu',
        InvalidPsuVolWA.CAPACITY_FIELD: 'some capacity'
    }

    def get_entry_value(key):
        return vpd_info[key]

    mock_get_entry_value.side_effect = get_entry_value
    mock_get_platform_name.return_value = 'x86_64-mlnx_msn3700-r0'
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == invalid_value

    # PSU capacity is not InvalidPsuVolWA.EXPECT_CAPACITY
    vpd_info[InvalidPsuVolWA.MFR_FIELD] = InvalidPsuVolWA.EXPECT_VENDOR_NAME
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == invalid_value

    # None of the rejected cases above may have touched the hardware
    mock_run_command.assert_not_called()

    # Normal
    vpd_info[InvalidPsuVolWA.CAPACITY_FIELD] = InvalidPsuVolWA.EXPECT_CAPACITY
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == 9999

    # Exactly one command is issued, and it requests the "set" operation (-s)
    mock_run_command.assert_called_once()
    issued_command = mock_run_command.call_args[0][0]
    assert issued_command.split()[-1] == '-s'
4 changes: 4 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,7 @@ def func():

assert func() == 100
assert mock_log.call_count == 1

def test_run_command(self):
    """utils.run_command must yield truthy output for a plain 'ls'."""
    assert utils.run_command('ls')