Skip to content

Commit

Permalink
[201911] DellEMC S6100 SSD Monitor (#6934)
Browse files Browse the repository at this point in the history
Why I did it
To monitor the SSD health condition on the DellEMC S6100 platform after an SSD firmware upgrade.

A daemon is introduced to monitor the SSD once every hour.

To check for SSD status at boot time and at the time of cold-reboot.

All these changes are supported only for newer SSD firmware.

Added a platform_reboot_pre_check script to prevent cold-reboot based on SSD status.
Depends on sonic-net/sonic-utilities#1472
DO NOT MERGE UNTIL ABOVE PR IS MERGED
  • Loading branch information
santhosh-kt authored Mar 13, 2021
1 parent 9b553d9 commit 140576d
Show file tree
Hide file tree
Showing 9 changed files with 208 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,20 @@ s6100/scripts/track_reboot_reason.sh usr/share/sonic/device/x86_64-dell_s6100_c2
s6100/scripts/warm-reboot_plugin usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
s6100/scripts/override.conf /etc/systemd/system/systemd-reboot.service.d
common/dell_lpc_mon.sh usr/local/bin
s6100/scripts/s6100_ssd_mon.sh usr/local/bin
s6100/scripts/s6100_ssd_upgrade_status.sh usr/local/bin
s6100/scripts/platform_sensors.py usr/local/bin
s6100/scripts/platform_reboot_pre_check usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
s6100/modules/sonic_platform-1.0-py2-none-any.whl usr/share/sonic/device/x86_64-dell_s6100_c2538-r0
s6100/scripts/platform_watchdog_enable.sh usr/local/bin
s6100/scripts/platform_watchdog_disable.sh usr/local/bin
s6100/scripts/sensors usr/bin
s6100/scripts/iSMART_64 usr/local/bin
s6100/systemd/platform-modules-s6100.service etc/systemd/system
s6100/systemd/s6100-lpc-monitor.service etc/systemd/system
s6100/systemd/s6100-ssd-monitor.service etc/systemd/system
s6100/systemd/s6100-ssd-monitor.timer etc/systemd/system
s6100/systemd/s6100-ssd-upgrade-status.service etc/systemd/system
s6100/systemd/s6100-reboot-cause.service etc/systemd/system
s6100/systemd/s6100-i2c-enumerate.service etc/systemd/system
tools/flashrom/flashrom usr/local/bin/
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
#
# platform_reboot_pre_check - decide whether a cold reboot may proceed,
# based on the SSD state markers kept under /host/ssd_fw_upgrade.
#
# Exit status:
#   0 - the GPIO7_high marker exists and iSMART currently reports a GPIO
#       value of 0x01
#   1 - in every other case (a fault/pending marker exists, the live GPIO
#       value is not 0x01, or no marker exists at all)

SSD_FW_UPGRADE="/host/ssd_fw_upgrade"
LOG_TAG="DELL_S6100_SSD_MON"

# Log the standard "cold reboot not supported" messages and refuse the reboot.
deny_cold_reboot() {
  logger -p user.crit -t "$LOG_TAG" "The SSD on this unit is faulty and does not support cold reboot."
  logger -p user.crit -t "$LOG_TAG" "Please perform a soft-/fast-/warm-reboot instead"
  exit 1
}

# Any of these markers blocks a cold reboot outright.
# The upgrade-status script writes the pending marker as
# GPIO7_pending_upgrade; the un-numbered GPIO_pending_upgrade name that this
# script used to test is still honoured in case something else creates it.
for marker in GPIO7_low GPIO7_error GPIO7_pending_upgrade GPIO_pending_upgrade; do
  if [[ -e "$SSD_FW_UPGRADE/$marker" ]]; then
    deny_cold_reboot
  fi
done

if [[ -e "$SSD_FW_UPGRADE/GPIO7_high" ]]; then
  iSMART="/usr/local/bin/iSMART_64"
  ismart_args=(-d /dev/sda)

  ismart_output=$("$iSMART" "${ismart_args[@]}")

  # Last field of the line(s) mentioning GPIO in the iSMART report.
  gpio_status=$(echo "$ismart_output" | grep GPIO | awk '{print $NF}')

  # Quoted comparison: an empty or unexpected value simply fails the match
  # and is refused, without a "unary operator expected" shell error.
  if [[ "$gpio_status" == "0x01" ]]; then
    exit 0
  fi
  deny_cold_reboot
fi

# No marker file at all: the SSD state is unknown, so refuse the cold reboot.
exit 1
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ if [[ "$1" == "init" ]]; then
/usr/local/bin/platform_watchdog_disable.sh
fi

systemctl start --no-block s6100-ssd-upgrade-status.service

is_fast_warm=$(cat /proc/cmdline | grep SONIC_BOOT_TYPE | wc -l)

if [[ "$is_fast_warm" == "1" ]]; then
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
#
# s6100_ssd_mon.sh - periodic SSD GPIO check.
#
# While the GPIO7_high marker exists, read the GPIO value reported by iSMART.
# If the value is anything other than 0x01, log critical messages, replace
# the GPIO7_* markers with GPIO7_low and stop the monitor timer.
# If the GPIO7_high marker does not exist, just stop the monitor timer.

SSD_FW_UPGRADE="/host/ssd_fw_upgrade"
MONITOR_TIMER="s6100-ssd-monitor.timer"
LOG_TAG="DELL_S6100_SSD_MON"

if [[ -e "$SSD_FW_UPGRADE/GPIO7_high" ]]; then
  iSMART="/usr/local/bin/iSMART_64"
  ismart_args=(-d /dev/sda)

  ismart_output=$("$iSMART" "${ismart_args[@]}")
  # Last field of the line(s) mentioning GPIO in the iSMART report.
  gpio_status=$(echo "$ismart_output" | grep GPIO | awk '{print $NF}')

  if [[ -z "$gpio_status" ]]; then
    # iSMART produced no GPIO line. Previously this case tripped a shell
    # error in the unquoted test and left the state untouched; keep the
    # state untouched, but say so explicitly and try again on the next run.
    logger -p user.warning -t "$LOG_TAG" "Unable to read the SSD GPIO status from iSMART; skipping this check."
  elif [[ "$gpio_status" != "0x01" ]]; then
    logger -p user.crit -t "$LOG_TAG" "The SSD on this unit is faulty and does not support cold reboot."
    logger -p user.crit -t "$LOG_TAG" "If a reboot is required, please perform a soft-/fast-/warm-reboot."
    # Swap whatever GPIO7_* marker is present for GPIO7_low.
    rm -rf "$SSD_FW_UPGRADE"/GPIO7_*
    touch "$SSD_FW_UPGRADE/GPIO7_low"
    # The marker is no longer GPIO7_high, so further polling is pointless.
    systemctl stop "$MONITOR_TIMER"
  fi
else
  systemctl stop "$MONITOR_TIMER"
fi
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/bin/bash
#
# s6100_ssd_upgrade_status.sh - classify the SSD after a firmware upgrade
# attempt and record the result as a marker file under /host/ssd_fw_upgrade.
#
# Marker files written (exactly one GPIO7_* marker is left in place):
#   GPIO7_high            - upgraded firmware, GPIO status 0x01 (or unreadable)
#   GPIO7_low             - SSD reported faulty / entered loader mode
#   GPIO7_error           - an upgrade-status register read back "2"
#   GPIO7_pending_upgrade - both upgrade-status registers read back "ff"
#
# Details of every run are appended to /host/ssd_fw_upgrade/upgrade.log and
# the two upgrade-status registers are cleared at the end.

SSD_FW_UPGRADE="/host/ssd_fw_upgrade"
MONITOR_TIMER="s6100-ssd-monitor.timer"
LOG_TAG="DELL_S6100_SSD_MON"

# Already classified as healthy: make sure the periodic monitor is running.
if [[ -e "$SSD_FW_UPGRADE/GPIO7_high" ]]; then
  systemctl start --no-block "$MONITOR_TIMER"
  exit 0
fi

# Already classified as faulty/error: nothing more to do.
if [[ -e "$SSD_FW_UPGRADE/GPIO7_low" || -e "$SSD_FW_UPGRADE/GPIO7_error" ]]; then
  exit 0
fi

[[ -d "$SSD_FW_UPGRADE" ]] || mkdir -p "$SSD_FW_UPGRADE"

SSD_UPGRADE_LOG="$SSD_FW_UPGRADE/upgrade.log"

# Append one "<script> <date> <message>" line to the upgrade log.
log_upgrade() {
  printf '%s %s %s\n' "$0" "$(date)" "$1" >> "$SSD_UPGRADE_LOG"
}

# Replace any existing GPIO7_* marker with the single marker named in $1.
set_marker() {
  rm -rf "$SSD_FW_UPGRADE"/GPIO7_*
  touch "$SSD_FW_UPGRADE/$1"
}

# Emit the two critical syslog messages used when the SSD is found faulty.
log_faulty_ssd() {
  logger -p user.crit -t "$LOG_TAG" "The SSD on this unit is faulty and does not support reboot."
  logger -p user.crit -t "$LOG_TAG" "If a reboot is required, please perform a soft-/fast-/warm-reboot."
}

# Select an upgrade-status register (write 06 to offset 210 and the selector
# in $1 to offset 211), read offset 212 and print the last field of the
# output.
read_upgrade_status() {
  {
    io_rd_wr.py --set --val 06 --offset 210
    io_rd_wr.py --set --val "$1" --offset 211
    io_rd_wr.py --get --offset 212
  } | awk '{print $NF}'
}

# Select the same register as read_upgrade_status and write ff to offset 213.
clear_upgrade_status() {
  io_rd_wr.py --set --val 06 --offset 210
  io_rd_wr.py --set --val "$1" --offset 211
  io_rd_wr.py --set --val ff --offset 213
}

smart_output=$(smartctl -a /dev/sda)

fw_version=$(echo "$smart_output" | grep "Firmware Version" | awk '{print $NF}')
ssd_model=$(echo "$smart_output" | grep "Device Model" | awk '{print $NF}')

if [[ -e "$SSD_FW_UPGRADE/GPIO7_pending_upgrade" ]]; then
  if [[ "$fw_version" == "S141002C" || "$fw_version" == "S16425c1" ]]; then
    # If SSD Firmware is not upgraded
    exit 0
  fi
fi

log_upgrade "SSD FW upgrade logs post reboot."

iSMART="/usr/local/bin/iSMART_64"
ismart_args=(-d /dev/sda)
ismart_output=$("$iSMART" "${ismart_args[@]}")

# Results of the first (selector 09) and second (selector 0A) mp_64 upgrade.
upgrade_status1=$(read_upgrade_status 09)
upgrade_status2=$(read_upgrade_status 0A)

if [[ "$upgrade_status1" == "2" ]]; then
  set_marker GPIO7_error
  log_upgrade "Upgraded to unknown version after first mp_64 upgrade."

elif [[ "$upgrade_status2" == "2" ]]; then
  set_marker GPIO7_error
  log_upgrade "Upgraded to unknown version after second mp_64 upgrade."

elif [[ "$fw_version" == "S141002G" || "$fw_version" == "S16425cG" ]]; then
  # If SSD Firmware is upgraded
  gpio_status=$(echo "$ismart_output" | grep GPIO | awk '{print $NF}')

  # An empty status (iSMART printed no GPIO line) is deliberately NOT treated
  # as faulty: GPIO7_low is permanent (this script exits early once it
  # exists), whereas GPIO7_high is re-validated against the live GPIO value
  # by the hourly monitor and by the reboot pre-check.
  if [[ -n "$gpio_status" && "$gpio_status" != "0x01" ]]; then
    log_faulty_ssd
    set_marker GPIO7_low
    log_upgrade "The SSD on this unit is faulty and does not support cold reboot."
    log_upgrade "If a reboot is required, please perform a soft-/fast-/warm-reboot."
  else
    set_marker GPIO7_high
  fi

  systemctl start --no-block "$MONITOR_TIMER"

  if [[ "$upgrade_status1" == "0" ]]; then
    if [[ "$ssd_model" == "3IE" ]]; then
      log_upgrade "SSD FW upgraded from S141002C to S141002G in first mp_64."
    else
      log_upgrade "SSD FW upgraded from S16425c1 to S16425cG in first mp_64."
    fi
  elif [[ "$upgrade_status2" == "1" ]]; then
    log_upgrade "SSD entered loader mode in first mp_64 and upgraded to latest version after second mp_64."
  fi

else
  if [[ "$upgrade_status1" == "ff" && "$upgrade_status2" == "ff" ]]; then
    set_marker GPIO7_pending_upgrade
    log_upgrade "SSD upgrade didn’t happened."

  elif [[ "$upgrade_status1" == "1" ]]; then
    set_marker GPIO7_low
    log_faulty_ssd
    log_upgrade "SSD entered loader mode in first mp_64 upgrade."

    if [[ "$upgrade_status2" == "0" ]]; then
      log_upgrade "SSD entered loader mode in first mp_64 and recovered back to older version in second mp_64."
    fi
  fi

fi

log_upgrade "SMF Register 1 = $upgrade_status1"
log_upgrade "SMF Register 2 = $upgrade_status2"
echo "$smart_output" >> "$SSD_UPGRADE_LOG"
echo "$ismart_output" >> "$SSD_UPGRADE_LOG"
sync
# Clearing the upgrade status
clear_upgrade_status 09
clear_upgrade_status 0A
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Service that runs the SSD monitoring script /usr/local/bin/s6100_ssd_mon.sh.
# NOTE(review): presumably installed as s6100-ssd-monitor.service and
# triggered by the timer unit of the same name - confirm against the
# install manifest.
[Unit]
Description=Dell S6100 SSD monitoring poller
# Opt out of the implicit default dependencies systemd adds to service units.
DefaultDependencies=no

[Service]
User=root
ExecStart=/usr/local/bin/s6100_ssd_mon.sh
# The unit is not kept "active" once the script has exited.
RemainAfterExit=no

[Install]
WantedBy=multi-user.target

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Timer that schedules the SSD monitoring poller.
# No Unit= is set, so systemd activates the service whose name matches this
# timer's name.
[Unit]
Description=Dell S6100 SSD monitoring poller timer
# Opt out of the implicit default dependencies systemd adds to timer units.
DefaultDependencies=no
After=pmon.service

[Timer]
# First activation 5 minutes after boot ...
OnBootSec=5min
# ... then again 60 minutes after each activation of the triggered unit.
OnUnitActiveSec=60min

[Install]
WantedBy=timers.target

Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# One-shot service that runs /usr/local/bin/s6100_ssd_upgrade_status.sh to
# check the SSD upgrade status.
[Unit]
Description= Checking Dell S6100 SSD upgrade status
# Ordered after pmon.service when both are started together.
After=pmon.service
# Opt out of the implicit default dependencies systemd adds to service units.
DefaultDependencies=no

[Service]
User=root
# oneshot: systemd waits for the script to exit before treating start-up as done.
Type=oneshot
ExecStart=/usr/local/bin/s6100_ssd_upgrade_status.sh
# The unit is not kept "active" once the script has exited.
RemainAfterExit=no

[Install]
WantedBy=multi-user.target

0 comments on commit 140576d

Please sign in to comment.