Skip to content

Commit

Permalink
[config reload] Config Reload Enhancement (sonic-net#45)
Browse files Browse the repository at this point in the history
Code changes for config reload Enhancement sonic-net/SONiC#1203.
Enhancing config reload to sequence the services and faster system initialization.

Immediately restart the critical services during config reload.
The non critical should be started only after all the ports are initialized.
Services can be configured to be started immediately or delayed. This can be using a field in FEATURE table.
The existing timers should be removed by this event driven approach.
This flow is applicable in case of all reboots (warm/fast/cold) as well as config reload.
  • Loading branch information
dgsudharsan committed May 26, 2023
1 parent 7ab0978 commit 95c494b
Show file tree
Hide file tree
Showing 8 changed files with 430 additions and 164 deletions.
160 changes: 110 additions & 50 deletions scripts/hostcfgd
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import syslog
import signal
import re
import jinja2
import threading
from sonic_py_common import device_info
from sonic_py_common.general import check_output_pipe
from swsscommon.swsscommon import ConfigDBConnector, DBConnector, Table, SonicDBConfig
Expand Down Expand Up @@ -61,6 +62,7 @@ CFG_DB = "CONFIG_DB"
STATE_DB = "STATE_DB"
HOSTCFGD_MAX_PRI = 10 # Used to enforce ordering b/w daemons under Hostcfgd
DEFAULT_SELECT_TIMEOUT = 1000
PORT_INIT_TIMEOUT_SEC = 180


def safe_eval(val, default_value=False):
Expand Down Expand Up @@ -156,11 +158,15 @@ class Feature(object):
feature_cfg (dict): Feature CONFIG_DB configuration
deviec_config (dict): DEVICE_METADATA section of CONFIG_DB
"""
if 'has_timer' in feature_cfg:
err_str = "Invalid obsolete field 'has_timer' in FEATURE table. Please update configuration schema version"
syslog.syslog(syslog.LOG_ERR, err_str)
raise ValueError(err_str)

self.name = feature_name
self.state = self._get_feature_table_key_render_value(feature_cfg.get('state'), device_config or {}, ['enabled', 'disabled', 'always_enabled', 'always_disabled'])
self.auto_restart = feature_cfg.get('auto_restart', 'disabled')
self.has_timer = safe_eval(feature_cfg.get('has_timer', 'False'))
self.delayed = safe_eval(feature_cfg.get('delayed', 'False'))
self.has_global_scope = safe_eval(feature_cfg.get('has_global_scope', 'True'))
self.has_per_asic_scope = safe_eval(self._get_feature_table_key_render_value(feature_cfg.get('has_per_asic_scope', 'False'), device_config or {}, ['True', 'False']))

Expand Down Expand Up @@ -204,12 +210,18 @@ class FeatureHandler(object):
FEATURE_STATE_DISABLED = "disabled"
FEATURE_STATE_FAILED = "failed"

def __init__(self, config_db, feature_state_table, device_config):
def __init__(self, config_db, feature_state_table, device_config, is_advanced_boot):
self._config_db = config_db
self._feature_state_table = feature_state_table
self._device_config = device_config
self._cached_config = {}
self.is_multi_npu = device_info.is_multi_npu()
self.enable_delayed_service = False
self.is_advanced_boot = is_advanced_boot
self.lock = threading.Lock()
self.port_listener_thread = threading.Thread(target=self._portListener, name='port_listener_thread')
self.port_listener_thread.daemon = True
self.port_listener_thread.start()
self._device_running_config = device_info.get_device_runtime_metadata()
self.ns_cfg_db = {}
self.ns_feature_state_tbl = {}
Expand All @@ -227,58 +239,101 @@ class FeatureHandler(object):
db_conn = DBConnector(STATE_DB, 0, False, ns);
self.ns_feature_state_tbl[ns] = Table(db_conn, 'FEATURE')

def _enableDelayedServices(self):
with self.lock:
self.enable_delayed_service = True
for feature_name in self._cached_config:
if self._cached_config[feature_name].delayed:
self.update_feature_state(self._cached_config[feature_name])

def _portListener(self):
syslog.syslog(syslog.LOG_INFO, "Starting port listener")
appl_db_connector = DBConnector("APPL_DB", 0)
subscribe_port = swsscommon.SubscriberStateTable(appl_db_connector, "PORT_TABLE")
sel = swsscommon.Select()
sel.addSelectable(subscribe_port)

if self.is_advanced_boot:
syslog.syslog(syslog.LOG_INFO, "Updating delayed features after warm/fast boot")
self._enableDelayedServices()

while True:
(state, selectableObj) = sel.select(PORT_INIT_TIMEOUT_SEC*1000)
# Continue if select is timeout or selectable object is not return
if state == swsscommon.Select.ERROR:
if not self.enable_delayed_service:
syslog.syslog(syslog.LOG_ERR, "Received unexpected error in waiting for port init. Restarting services")
self._enableDelayedServices()
continue
if state != swsscommon.Select.OBJECT:
if not self.enable_delayed_service:
syslog.syslog(syslog.LOG_INFO, "Updating delayed features after timeout")
self._enableDelayedServices()
continue

key, op, fvs = subscribe_port.pop()
if not key:
break
if op == 'SET' and key == 'PortInitDone':
syslog.syslog(syslog.LOG_INFO, "Updating delayed features after port initialization")
self._enableDelayedServices()

def handler(self, feature_name, op, feature_cfg):
if not feature_cfg:
syslog.syslog(syslog.LOG_INFO, "Deregistering feature {}".format(feature_name))
self._cached_config.pop(feature_name, None)
self._feature_state_table._del(feature_name)
return
with self.lock:
if not feature_cfg:
syslog.syslog(syslog.LOG_INFO, "Deregistering feature {}".format(feature_name))
self._cached_config.pop(feature_name, None)
self._feature_state_table._del(feature_name)
return

device_config = {}
device_config.update(self._device_config)
device_config.update(self._device_running_config)

feature = Feature(feature_name, feature_cfg, device_config)
self._cached_config.setdefault(feature_name, Feature(feature_name, {}))

# Change auto-restart configuration first.
# If service reached failed state before this configuration applies (e.g. on boot)
# the next called self.update_feature_state will start it again. If it will fail
# again the auto restart will kick-in. Another order may leave it in failed state
# and not auto restart.
if self._cached_config[feature_name].auto_restart != feature.auto_restart:
syslog.syslog(syslog.LOG_INFO, "Auto-restart status of feature '{}' is changed from '{}' to '{}' ..."
.format(feature_name, self._cached_config[feature_name].auto_restart, feature.auto_restart))
self.update_systemd_config(feature)
self._cached_config[feature_name].auto_restart = feature.auto_restart

# Enable/disable the container service if the feature state was changed from its previous state.
if self._cached_config[feature_name].state != feature.state:
if self.update_feature_state(feature):
self._cached_config[feature_name].state = feature.state
else:
self.resync_feature_state(self._cached_config[feature_name])
device_config = {}
device_config.update(self._device_config)
device_config.update(self._device_running_config)

feature = Feature(feature_name, feature_cfg, device_config)
self._cached_config.setdefault(feature_name, Feature(feature_name, {}))

# Change auto-restart configuration first.
# If service reached failed state before this configuration applies (e.g. on boot)
# the next called self.update_feature_state will start it again. If it will fail
# again the auto restart will kick-in. Another order may leave it in failed state
# and not auto restart.
if self._cached_config[feature_name].auto_restart != feature.auto_restart:
syslog.syslog(syslog.LOG_INFO, "Auto-restart status of feature '{}' is changed from '{}' to '{}' ..."
.format(feature_name, self._cached_config[feature_name].auto_restart, feature.auto_restart))
self.update_systemd_config(feature)
self._cached_config[feature_name].auto_restart = feature.auto_restart

# Enable/disable the container service if the feature state was changed from its previous state.
if self._cached_config[feature_name].state != feature.state:
if self.update_feature_state(feature):
self.sync_feature_asic_scope(feature)
self._cached_config[feature_name] = feature
else:
self.resync_feature_state(self._cached_config[feature_name])

def sync_state_field(self, feature_table):
"""
Summary:
Updates the state field in the FEATURE|* tables as the state field
might have to be rendered based on DEVICE_METADATA table and generated Device Running Metadata
"""
for feature_name in feature_table.keys():
if not feature_name:
syslog.syslog(syslog.LOG_WARNING, "Feature is None")
continue
with self.lock:
for feature_name in feature_table.keys():
if not feature_name:
syslog.syslog(syslog.LOG_WARNING, "Feature is None")
continue

device_config = {}
device_config.update(self._device_config)
device_config.update(self._device_running_config)
feature = Feature(feature_name, feature_table[feature_name], device_config)
device_config = {}
device_config.update(self._device_config)
device_config.update(self._device_running_config)
feature = Feature(feature_name, feature_table[feature_name], device_config)

self._cached_config.setdefault(feature_name, feature)
self.update_systemd_config(feature)
self.update_feature_state(feature)
self.resync_feature_state(feature)
self._cached_config.setdefault(feature_name, feature)
self.update_systemd_config(feature)
self.update_feature_state(feature)
self.sync_feature_asic_scope(feature)
self.resync_feature_state(feature)

def update_feature_state(self, feature):
cached_feature = self._cached_config[feature.name]
Expand Down Expand Up @@ -311,6 +366,10 @@ class FeatureHandler(object):
.format(feature.state, feature.name))
return False

if feature.delayed and not self.enable_delayed_service:
syslog.syslog(syslog.LOG_INFO, "Feature is {} delayed for port init".format(feature.name))
return True

if enable:
self.enable_feature(feature)
syslog.syslog(syslog.LOG_INFO, "Feature {} is enabled and started".format(feature.name))
Expand All @@ -319,8 +378,6 @@ class FeatureHandler(object):
self.disable_feature(feature)
syslog.syslog(syslog.LOG_INFO, "Feature {} is stopped and disabled".format(feature.name))

self.sync_feature_asic_scope(feature)

return True

def sync_feature_asic_scope(self, feature_config):
Expand Down Expand Up @@ -393,7 +450,7 @@ class FeatureHandler(object):
with open(feature_systemd_config_file_path, 'w') as feature_systemd_config_file_handler:
feature_systemd_config_file_handler.write(feature_systemd_config)

syslog.syslog(syslog.LOG_INFO, "Feautre '{}' systemd config file related to auto-restart is updated!"
syslog.syslog(syslog.LOG_INFO, "Feature '{}' systemd config file related to auto-restart is updated!"
.format(feature_name))

try:
Expand All @@ -415,7 +472,7 @@ class FeatureHandler(object):
syslog.syslog(syslog.LOG_ERR, "Feature '{}' service not available"
.format(feature.name))

feature_suffixes = ["service"] + (["timer"] if feature.has_timer else [])
feature_suffixes = ["service"]

return feature_names, feature_suffixes

Expand Down Expand Up @@ -1591,9 +1648,13 @@ class SyslogCfg:
return interval, burst
return interval, burst


class HostConfigDaemon:
def __init__(self):
self.state_db_conn = DBConnector(STATE_DB, 0)
self.advanced_boot = False
if swsscommon.RestartWaiter.isAdvancedBootInProgress(self.state_db_conn):
self.advanced_boot = True
swsscommon.RestartWaiter.waitAdvancedBootDone()
# Just a sanity check to verify if the CONFIG_DB has been initialized
# before moving forward
self.config_db = ConfigDBConnector()
Expand All @@ -1605,7 +1666,6 @@ class HostConfigDaemon:
self.device_config['DEVICE_METADATA'] = self.config_db.get_table('DEVICE_METADATA')

# Load feature state table
self.state_db_conn = DBConnector(STATE_DB, 0)
feature_state_table = Table(self.state_db_conn, 'FEATURE')

# Initialize KDump Config and set the config to default if nothing is provided
Expand All @@ -1615,7 +1675,7 @@ class HostConfigDaemon:
self.iptables = Iptables()

# Intialize Feature Handler
self.feature_handler = FeatureHandler(self.config_db, feature_state_table, self.device_config)
self.feature_handler = FeatureHandler(self.config_db, feature_state_table, self.device_config, self.advanced_boot)

# Initialize Ntp Config Handler
self.ntpcfg = NtpCfg()
Expand Down
74 changes: 74 additions & 0 deletions tests/common/mock_configdb.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import time
class MockConfigDb(object):
"""
Mock Config DB which responds to data tables requests and store updates to the data table
Expand Down Expand Up @@ -55,6 +56,79 @@ def listen(self, init_data_handler=None):
for e in MockConfigDb.event_queue:
self.handlers[e[0]](e[0], e[1], self.get_entry(e[0], e[1]))

class MockSelect():

event_queue = []
OBJECT = "OBJECT"
ERROR = ""

@staticmethod
def set_event_queue(Q):
MockSelect.event_queue = Q

@staticmethod
def get_event_queue():
return MockSelect.event_queue

@staticmethod
def reset_event_queue():
MockSelect.event_queue = []

def __init__(self):
self.sub_map = {}
self.TIMEOUT = "TIMEOUT"
self.ERROR = "ERROR"

def addSelectable(self, subscriber):
self.sub_map[subscriber.table] = subscriber

def select(self, TIMEOUT):
if not MockSelect.get_event_queue():
time.sleep(TIMEOUT/1000)
return "TIMEOUT", {}
table, key = MockSelect.get_event_queue().pop(0)
self.sub_map[table].nextKey(key)
self.reset_event_queue()
return "OBJECT", self.sub_map[table]


class MockSubscriberStateTable():

FD_INIT = 0

@staticmethod
def generate_fd():
curr = MockSubscriberStateTable.FD_INIT
MockSubscriberStateTable.FD_INIT = curr + 1
return curr

@staticmethod
def reset_fd():
MockSubscriberStateTable.FD_INIT = 0

def __init__(self, conn, table, pop=None, pri=None):
self.fd = MockSubscriberStateTable.generate_fd()
self.next_key = ''
self.table = table

def getFd(self):
return self.fd

def nextKey(self, key):
print("next key")
self.next_key = key

def pop(self):
table = MockConfigDb.CONFIG_DB.get(self.table, {})
print(self.next_key)
if self.next_key not in table:
op = "DEL"
fvs = {}
else:
op = "SET"
fvs = table.get(self.next_key, {})
return self.next_key, op, fvs


class MockDBConnector():
def __init__(self, db, val, tcpFlag=False, name=None):
Expand Down
25 changes: 25 additions & 0 deletions tests/common/mock_restart_waiter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
class MockRestartWaiter(object):
advancedReboot = False
"""
Mock Config DB which responds to data tables requests and store updates to the data table
"""
def waitAdvancedBootDone(maxWaitSec=180, dbTimeout=0, isTcpConn=False):
return True

def waitWarmBootDone(maxWaitSec=180, dbTimeout=0, isTcpConn=False):
return False

def waitFastBootDone(maxWaitSec=180, dbTimeout=0, isTcpConn=False):
return False

def isAdvancedBootInProgress(stateDb):
return MockRestartWaiter.advancedReboot

def isFastBootInProgress(stateDb):
return False

def isWarmBootInProgress(stateDb):
return False

def __init__(self):
pass
Loading

0 comments on commit 95c494b

Please sign in to comment.