diff --git a/config/plugins/auto_techsupport.py b/config/plugins/auto_techsupport.py new file mode 100644 index 000000000000..c2960646d450 --- /dev/null +++ b/config/plugins/auto_techsupport.py @@ -0,0 +1,350 @@ +""" +Autogenerated config CLI plugin. +""" + +import click +import utilities_common.cli as clicommon +import utilities_common.general as general +from config import config_mgmt + + +# Load sonic-cfggen from source since /usr/local/bin/sonic-cfggen does not have .py extension. +sonic_cfggen = general.load_module_from_source('sonic_cfggen', '/usr/local/bin/sonic-cfggen') + + +def exit_with_error(*args, **kwargs): + """ Print a message and abort CLI. """ + + click.secho(*args, **kwargs) + raise click.Abort() + + +def validate_config_or_raise(cfg): + """ Validate config db data using ConfigMgmt """ + + try: + cfg = sonic_cfggen.FormatConverter.to_serialized(cfg) + config_mgmt.ConfigMgmt().loadData(cfg) + except Exception as err: + raise Exception('Failed to validate configuration: {}'.format(err)) + + +def add_entry_validated(db, table, key, data): + """ Add new entry in table and validate configuration """ + + cfg = db.get_config() + cfg.setdefault(table, {}) + if key in cfg[table]: + raise Exception(f"{key} already exists") + + cfg[table][key] = data + + validate_config_or_raise(cfg) + db.set_entry(table, key, data) + + +def update_entry_validated(db, table, key, data, create_if_not_exists=False): + """ Update entry in table and validate configuration. + If attribute value in data is None, the attribute is deleted. + """ + + cfg = db.get_config() + cfg.setdefault(table, {}) + + if create_if_not_exists: + cfg[table].setdefault(key, {}) + + if key not in cfg[table]: + raise Exception(f"{key} does not exist") + + for attr, value in data.items(): + if value is None and attr in cfg[table][key]: + cfg[table][key].pop(attr) + else: + cfg[table][key][attr] = value + + validate_config_or_raise(cfg) + db.set_entry(table, key, cfg[table][key]) + + +def del_entry_validated(db, table, key): + """ Delete entry in table and validate configuration """ + + cfg = db.get_config() + cfg.setdefault(table, {}) + if key not in cfg[table]: + raise Exception(f"{key} does not exist") + + cfg[table].pop(key) + + validate_config_or_raise(cfg) + db.set_entry(table, key, None) + + +def add_list_entry_validated(db, table, key, attr, data): + """ Add new entry into list in table and validate configuration""" + + cfg = db.get_config() + cfg.setdefault(table, {}) + if key not in cfg[table]: + raise Exception(f"{key} does not exist") + cfg[table][key].setdefault(attr, []) + for entry in data: + if entry in cfg[table][key][attr]: + raise Exception(f"{entry} already exists") + cfg[table][key][attr].append(entry) + + validate_config_or_raise(cfg) + db.set_entry(table, key, cfg[table][key]) + + +def del_list_entry_validated(db, table, key, attr, data): + """ Delete entry from list in table and validate configuration""" + + cfg = db.get_config() + cfg.setdefault(table, {}) + if key not in cfg[table]: + raise Exception(f"{key} does not exist") + cfg[table][key].setdefault(attr, []) + for entry in data: + if entry not in cfg[table][key][attr]: + raise Exception(f"{entry} does not exist") + cfg[table][key][attr].remove(entry) + if not cfg[table][key][attr]: + cfg[table][key].pop(attr) + + validate_config_or_raise(cfg) + db.set_entry(table, key, cfg[table][key]) + + +def clear_list_entry_validated(db, table, key, attr): + """ Clear list in object and validate configuration""" + + update_entry_validated(db, table, key, {attr: 
None})


+@click.group(name="auto-techsupport",
+             cls=clicommon.AliasedGroup)
+def AUTO_TECHSUPPORT():
+    """ AUTO_TECHSUPPORT part of config_db.json """
+
+    pass
+
+
+@AUTO_TECHSUPPORT.group(name="global",
+                        cls=clicommon.AliasedGroup)
+@clicommon.pass_db
+def AUTO_TECHSUPPORT_GLOBAL(db):
+    """ """
+
+    pass
+
+
+@AUTO_TECHSUPPORT_GLOBAL.command(name="state")
+@click.argument(
+    "state",
+    nargs=1,
+    required=True,
+)
+@clicommon.pass_db
+def AUTO_TECHSUPPORT_GLOBAL_state(db, state):
+    """ Knob to make techsupport invocation event-driven based on core-dump generation """
+
+    table = "AUTO_TECHSUPPORT"
+    key = "GLOBAL"
+    data = {
+        "state": state,
+    }
+    try:
+        update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
+    except Exception as err:
+        exit_with_error(f"Error: {err}", fg="red")
+
+
+@AUTO_TECHSUPPORT_GLOBAL.command(name="rate-limit-interval")
+@click.argument(
+    "rate-limit-interval",
+    nargs=1,
+    required=True,
+)
+@clicommon.pass_db
+def AUTO_TECHSUPPORT_GLOBAL_rate_limit_interval(db, rate_limit_interval):
+    """ Minimum time in seconds between two successive techsupport invocations. Configure 0 to explicitly disable """
+
+    table = "AUTO_TECHSUPPORT"
+    key = "GLOBAL"
+    data = {
+        "rate_limit_interval": rate_limit_interval,
+    }
+    try:
+        update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
+    except Exception as err:
+        exit_with_error(f"Error: {err}", fg="red")
+
+
+@AUTO_TECHSUPPORT_GLOBAL.command(name="max-techsupport-limit")
+@click.argument(
+    "max-techsupport-limit",
+    nargs=1,
+    required=True,
+)
+@clicommon.pass_db
+def AUTO_TECHSUPPORT_GLOBAL_max_techsupport_limit(db, max_techsupport_limit):
+    """ Max limit in percentage for the cumulative size of ts dumps.
+    No cleanup is performed if the value isn't configured or is 0.0
+    """
+
+    table = "AUTO_TECHSUPPORT"
+    key = "GLOBAL"
+    data = {
+        "max_techsupport_limit": max_techsupport_limit,
+    }
+    try:
+        update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
+    except Exception as err:
+        exit_with_error(f"Error: {err}", fg="red")
+
+
+@AUTO_TECHSUPPORT_GLOBAL.command(name="max-core-limit")
+@click.argument(
+    "max-core-limit",
+    nargs=1,
+    required=True,
+)
+@clicommon.pass_db
+def AUTO_TECHSUPPORT_GLOBAL_max_core_limit(db, max_core_limit):
+    """ Max limit in percentage for the cumulative size of core dumps.
+    No cleanup is performed if the value isn't configured or is 0.0
+    """
+
+    table = "AUTO_TECHSUPPORT"
+    key = "GLOBAL"
+    data = {
+        "max_core_limit": max_core_limit,
+    }
+    try:
+        update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
+    except Exception as err:
+        exit_with_error(f"Error: {err}", fg="red")
+
+
+@AUTO_TECHSUPPORT_GLOBAL.command(name="since")
+@click.argument(
+    "since",
+    nargs=1,
+    required=True,
+)
+@clicommon.pass_db
+def AUTO_TECHSUPPORT_GLOBAL_since(db, since):
+    """ Only collect the logs & core-dumps generated since the time provided.
+ A default value of '2 days ago' is used if this value is not set explicitly or a non-valid string is provided """ + + table = "AUTO_TECHSUPPORT" + key = "GLOBAL" + data = { + "since": since, + } + try: + update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True) + except Exception as err: + exit_with_error(f"Error: {err}", fg="red") + + +@click.group(name="auto-techsupport-feature", + cls=clicommon.AliasedGroup) +def AUTO_TECHSUPPORT_FEATURE(): + """ AUTO_TECHSUPPORT_FEATURE part of config_db.json """ + pass + + +@AUTO_TECHSUPPORT_FEATURE.command(name="add") +@click.argument( + "feature-name", + nargs=1, + required=True, +) +@click.option( + "--state", + help="Enable auto techsupport invocation on the processes running inside this feature", +) +@click.option( + "--rate-limit-interval", + help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable", +) +@clicommon.pass_db +def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval): + """ Add object in AUTO_TECHSUPPORT_FEATURE. """ + + table = "AUTO_TECHSUPPORT_FEATURE" + key = feature_name + data = {} + if state is not None: + data["state"] = state + if rate_limit_interval is not None: + data["rate_limit_interval"] = rate_limit_interval + + try: + add_entry_validated(db.cfgdb, table, key, data) + except Exception as err: + exit_with_error(f"Error: {err}", fg="red") + + +@AUTO_TECHSUPPORT_FEATURE.command(name="update") +@click.argument( + "feature-name", + nargs=1, + required=True, +) +@click.option( + "--state", + help="Enable auto techsupport invocation on the processes running inside this feature", +) +@click.option( + "--rate-limit-interval", + help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable", +) +@clicommon.pass_db +def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval): + """ Add object in AUTO_TECHSUPPORT_FEATURE. """ + + table = "AUTO_TECHSUPPORT_FEATURE" + key = feature_name + data = {} + if state is not None: + data["state"] = state + if rate_limit_interval is not None: + data["rate_limit_interval"] = rate_limit_interval + + try: + update_entry_validated(db.cfgdb, table, key, data) + except Exception as err: + exit_with_error(f"Error: {err}", fg="red") + + +@AUTO_TECHSUPPORT_FEATURE.command(name="delete") +@click.argument( + "feature-name", + nargs=1, + required=True, +) +@clicommon.pass_db +def AUTO_TECHSUPPORT_FEATURE_delete(db, feature_name): + """ Delete object in AUTO_TECHSUPPORT_FEATURE. """ + + table = "AUTO_TECHSUPPORT_FEATURE" + key = feature_name + try: + del_entry_validated(db.cfgdb, table, key) + except Exception as err: + exit_with_error(f"Error: {err}", fg="red") + + +def register(cli): + cli_node = AUTO_TECHSUPPORT + if cli_node.name in cli.commands: + raise Exception(f"{cli_node.name} already exists in CLI") + cli.add_command(AUTO_TECHSUPPORT) + cli_node = AUTO_TECHSUPPORT_FEATURE + if cli_node.name in cli.commands: + raise Exception(f"{cli_node.name} already exists in CLI") + cli.add_command(AUTO_TECHSUPPORT_FEATURE) diff --git a/scripts/coredump-compress b/scripts/coredump-compress index 53381fc00ed4..667d5f8a58a6 100755 --- a/scripts/coredump-compress +++ b/scripts/coredump-compress @@ -7,7 +7,9 @@ while [[ $# > 1 ]]; do shift done +CONTAINER_ID="" if [ $# > 0 ]; then + CONTAINER_ID=$(xargs -0 -L1 -a /proc/${1}/cgroup | grep -oP "pids:/docker/\K\w+") ns=`xargs -0 -L1 -a /proc/${1}/environ | grep -e "^NAMESPACE_ID" | cut -f2 -d'='` if [ ! 
-z ${ns} ]; then PREFIX=${PREFIX}${ns}. @@ -15,3 +17,18 @@ if [ $# > 0 ]; then fi /bin/gzip -1 - > /var/core/${PREFIX}core.gz + +if [[ ! -z $CONTAINER_ID ]]; then + CONTAINER_NAME=$(docker inspect --format='{{.Name}}' ${CONTAINER_ID} | cut -c2-) + if [[ ! -z ${CONTAINER_NAME} ]]; then + # coredump_gen_handler invokes techsupport if all the other required conditions are met + # explicitly passing in the env vars because coredump-compress's namespace doesn't have these set by default + for path in $(find /usr/local/lib/python3*/dist-packages -maxdepth 0); do + PYTHONPATH=$PYTHONPATH:$path + done + setsid $(echo > /tmp/coredump_gen_handler.log; + export PYTHONPATH=$PYTHONPATH; + python3 /usr/local/bin/coredump_gen_handler.py ${PREFIX}core.gz ${CONTAINER_NAME} &>> /tmp/coredump_gen_handler.log) & + fi +fi + diff --git a/scripts/coredump_gen_handler.py b/scripts/coredump_gen_handler.py new file mode 100644 index 000000000000..895c22146a6b --- /dev/null +++ b/scripts/coredump_gen_handler.py @@ -0,0 +1,185 @@ +""" +coredump_gen_handler script. + This script is invoked by the coredump-compress script + for auto techsupport invocation and cleanup core dumps. + For more info, refer to the Event Driven TechSupport & CoreDump Mgmt HLD +""" +import os +import time +import argparse +import syslog +from swsscommon.swsscommon import SonicV2Connector +from utilities_common.auto_techsupport_helper import * + +# Explicity Pass this to the subprocess invoking techsupport +ENV_VAR = os.environ +PATH_PREV = ENV_VAR["PATH"] if "PATH" in ENV_VAR else "" +ENV_VAR["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" + PATH_PREV + + +def handle_coredump_cleanup(dump_name, db): + file_path = os.path.join(CORE_DUMP_DIR, dump_name) + if not verify_recent_file_creation(file_path): + return + + _, num_bytes = get_stats(os.path.join(CORE_DUMP_DIR, CORE_DUMP_PTRN)) + + if db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled": + msg = "coredump_cleanup is disabled. No cleanup is performed. current size occupied : {}" + syslog.syslog(syslog.LOG_NOTICE, msg.format(pretty_size(num_bytes))) + return + + core_usage = db.get(CFG_DB, AUTO_TS, CFG_CORE_USAGE) + try: + core_usage = float(core_usage) + except ValueError: + core_usage = 0.0 + + if not core_usage: + msg = "core-usage argument is not set. No cleanup is performed, current size occupied: {}" + syslog.syslog(syslog.LOG_NOTICE, msg.format(pretty_size(num_bytes))) + return + + cleanup_process(core_usage, CORE_DUMP_PTRN, CORE_DUMP_DIR) + + +class CriticalProcCoreDumpHandle(): + """ + Class to handle coredump creation event for critical processes + """ + + def __init__(self, core_name, container_name, db): + self.core_name = core_name + self.container = container_name + self.db = db + self.proc_mp = {} + self.core_ts_map = {} + self.curr_ts_list = [] + + def handle_core_dump_creation_event(self): + file_path = os.path.join(CORE_DUMP_DIR, self.core_name) + if not verify_recent_file_creation(file_path): + syslog.syslog(syslog.LOG_INFO, "Spurious Invocation. {} is not created within last {} sec".format(file_path, TIME_BUF)) + return + + if self.db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled": + syslog.syslog(syslog.LOG_NOTICE, "auto_invoke_ts is disabled. 
Techsupport invocation is skipped: core {}".format(self.core_name))
+            return
+
+        # Config made for the default instance applies to all the multi-ASIC instances
+        self.container = trim_masic_suffix(self.container)
+
+        FEATURE_KEY = FEATURE.format(self.container)
+        if self.db.get(CFG_DB, FEATURE_KEY, CFG_STATE) != "enabled":
+            msg = "auto-techsupport feature for {} is not enabled. Techsupport invocation is skipped. core: {}"
+            syslog.syslog(syslog.LOG_NOTICE, msg.format(self.container, self.core_name))
+            return
+
+        global_cooloff = self.db.get(CFG_DB, AUTO_TS, COOLOFF)
+        container_cooloff = self.db.get(CFG_DB, FEATURE_KEY, COOLOFF)
+
+        try:
+            global_cooloff = float(global_cooloff)
+        except ValueError:
+            global_cooloff = 0.0
+
+        try:
+            container_cooloff = float(container_cooloff)
+        except ValueError:
+            container_cooloff = 0.0
+
+        cooloff_passed = self.verify_rate_limit_intervals(global_cooloff, container_cooloff)
+        if cooloff_passed:
+            since_cfg = self.get_since_arg()
+            new_file = self.invoke_ts_cmd(since_cfg)
+            if new_file:
+                self.write_to_state_db(int(time.time()), new_file[0])
+
+    def write_to_state_db(self, timestamp, ts_dump):
+        name = strip_ts_ext(ts_dump)
+        key = TS_MAP + "|" + name
+        self.db.set(STATE_DB, key, CORE_DUMP, self.core_name)
+        self.db.set(STATE_DB, key, TIMESTAMP, str(timestamp))
+        self.db.set(STATE_DB, key, CONTAINER, self.container)
+
+    def get_since_arg(self):
+        since_cfg = self.db.get(CFG_DB, AUTO_TS, CFG_SINCE)
+        if not since_cfg:
+            return SINCE_DEFAULT
+        rc, _, stderr = subprocess_exec(["date", "--date='{}'".format(since_cfg)], env=ENV_VAR)
+        if rc == 0:
+            return since_cfg
+        return SINCE_DEFAULT
+
+    def invoke_ts_cmd(self, since_cfg):
+        since_cfg = "'" + since_cfg + "'"
+        cmd = " ".join(["show", "techsupport", "--since", since_cfg])
+        rc, _, stderr = subprocess_exec(["show", "techsupport", "--since", since_cfg], env=ENV_VAR)
+        if rc != 0:
+            syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr))
+        new_list = get_ts_dumps(True)
+        diff = list(set(new_list).difference(set(self.curr_ts_list)))
+        self.curr_ts_list = new_list
+        if not diff:
+            syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
+        else:
+            syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, diff))
+        return diff
+
+    def verify_rate_limit_intervals(self, global_cooloff, container_cooloff):
+        """Verify both the global and per-container rate_limit_intervals have passed"""
+        self.curr_ts_list = get_ts_dumps(True)
+        if global_cooloff and self.curr_ts_list:
+            last_ts_dump_creation = os.path.getmtime(self.curr_ts_list[-1])
+            if time.time() - last_ts_dump_creation < global_cooloff:
+                msg = "Global rate_limit_interval period has not passed. Techsupport invocation is skipped. Core: {}"
+                syslog.syslog(syslog.LOG_INFO, msg.format(self.core_name))
+                return False
+
+        self.parse_ts_map()
+        if container_cooloff and self.container in self.core_ts_map:
+            last_creation_time = self.core_ts_map[self.container][0][0]
+            if time.time() - last_creation_time < container_cooloff:
+                msg = "Per-container rate_limit_interval for {} has not passed. Techsupport invocation is skipped. 
Core: {}" + syslog.syslog(syslog.LOG_INFO, msg.format(self.container, self.core_name)) + return False + return True + + def parse_ts_map(self): + """Create proc_name, ts_dump & creation_time map""" + ts_keys = self.db.keys(STATE_DB, TS_MAP+"*") + if not ts_keys: + return + for ts_key in ts_keys: + data = self.db.get_all(STATE_DB, ts_key) + if not data: + continue + container_name = data.get(CONTAINER, "") + creation_time = data.get(TIMESTAMP, "") + try: + creation_time = int(creation_time) + except Exception: + continue # if the creation time is invalid, skip the entry + ts_dump = ts_key.split("|")[-1] + if container_name and container_name not in self.core_ts_map: + self.core_ts_map[container_name] = [] + self.core_ts_map[container_name].append((int(creation_time), ts_dump)) + for container_name in self.core_ts_map: + self.core_ts_map[container_name].sort() + +def main(): + parser = argparse.ArgumentParser(description='Auto Techsupport Invocation and CoreDump Mgmt Script') + parser.add_argument('name', type=str, help='Core Dump Name') + parser.add_argument('container', type=str, help='Container Name') + args = parser.parse_args() + syslog.openlog(logoption=syslog.LOG_PID) + db = SonicV2Connector(use_unix_socket_path=True) + db.connect(CFG_DB) + db.connect(STATE_DB) + cls = CriticalProcCoreDumpHandle(args.name, args.container, db) + cls.handle_core_dump_creation_event() + handle_coredump_cleanup(args.name, db) + + +if __name__ == "__main__": + main() diff --git a/scripts/generate_dump b/scripts/generate_dump index 63cf6e52cfac..d5f3bb67311e 100755 --- a/scripts/generate_dump +++ b/scripts/generate_dump @@ -1338,6 +1338,10 @@ main() { fi fi + # Invoke the TechSupport Cleanup Hook + setsid $(echo > /tmp/techsupport_cleanup.log; + python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &>> /tmp/techsupport_cleanup.log) & + echo ${TARFILE} if ! $SAVE_STDERR diff --git a/scripts/techsupport_cleanup.py b/scripts/techsupport_cleanup.py new file mode 100644 index 000000000000..53a10562e7a9 --- /dev/null +++ b/scripts/techsupport_cleanup.py @@ -0,0 +1,59 @@ +""" +techsupport_cleanup script. + This script is invoked by the generate_dump script for techsupport cleanup + For more info, refer to the Event Driven TechSupport & CoreDump Mgmt HLD +""" +import os +import argparse +import syslog +from swsscommon.swsscommon import SonicV2Connector +from utilities_common.auto_techsupport_helper import * + + +def clean_state_db_entries(removed_files, db): + if not removed_files: + return + for file in removed_files: + name = strip_ts_ext(file) + db.delete(STATE_DB, TS_MAP + "|" + name) + + +def handle_techsupport_creation_event(dump_name, db): + file_path = os.path.join(TS_DIR, dump_name) + if not verify_recent_file_creation(file_path): + return + _ , num_bytes = get_stats(os.path.join(TS_DIR, TS_PTRN)) + + if db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled": + msg = "techsupport_cleanup is disabled. No cleanup is performed. current size occupied : {}" + syslog.syslog(syslog.LOG_NOTICE, msg.format(pretty_size(num_bytes))) + return + + max_ts = db.get(CFG_DB, AUTO_TS, CFG_MAX_TS) + try: + max_ts = float(max_ts) + except ValueError: + max_ts = 0.0 + + if not max_ts: + msg = "max-techsupport-limit argument is not set. 
No cleanup is performed, current size occupied: {}"
+        syslog.syslog(syslog.LOG_NOTICE, msg.format(pretty_size(num_bytes)))
+        return
+
+    removed_files = cleanup_process(max_ts, TS_PTRN, TS_DIR)
+    clean_state_db_entries(removed_files, db)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Auto Techsupport Invocation and CoreDump Mgmt Script')
+    parser.add_argument('name', type=str, help='TechSupport Dump Name')
+    args = parser.parse_args()
+    syslog.openlog(logoption=syslog.LOG_PID)
+    db = SonicV2Connector(use_unix_socket_path=True)
+    db.connect(CFG_DB)
+    db.connect(STATE_DB)
+    handle_techsupport_creation_event(args.name, db)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index c269bb130b75..81dec3623a0c 100644
--- a/setup.py
+++ b/setup.py
@@ -138,6 +138,8 @@
         'scripts/sonic-kdump-config',
         'scripts/centralize_database',
         'scripts/null_route_helper',
+        'scripts/coredump_gen_handler.py',
+        'scripts/techsupport_cleanup.py',
         'scripts/check_db_integrity.py'
     ],
     entry_points={
diff --git a/show/plugins/auto_techsupport.py b/show/plugins/auto_techsupport.py
new file mode 100644
index 000000000000..9bcda1b04c07
--- /dev/null
+++ b/show/plugins/auto_techsupport.py
@@ -0,0 +1,159 @@
+"""
+Auto-generated show CLI plugin.
+Manually edited to add a show CLI for "show auto_techsupport history"
+"""
+
+import click
+import tabulate
+import natsort
+import utilities_common.cli as clicommon
+
+
+def format_attr_value(entry, attr):
+    """ Helper that formats attribute to be presented in the table output.
+
+    Args:
+        entry (Dict[str, str]): CONFIG DB entry configuration.
+        attr (Dict): Attribute metadata.
+
+    Returns:
+        str: formatted attribute value.
+    """
+
+    if attr["is-leaf-list"]:
+        return "\n".join(entry.get(attr["name"], []))
+    return entry.get(attr["name"], "N/A")
+
+
+def format_group_value(entry, attrs):
+    """ Helper that formats grouped attribute to be presented in the table output.
+
+    Args:
+        entry (Dict[str, str]): CONFIG DB entry configuration.
+        attrs (List[Dict]): Attributes metadata that belongs to the same group.
+
+    Returns:
+        str: formatted group attributes.
+    """
+
+    data = []
+    for attr in attrs:
+        if entry.get(attr["name"]):
+            data.append((attr["name"] + ":", format_attr_value(entry, attr)))
+    return tabulate.tabulate(data, tablefmt="plain")
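To make the behavior of these two helpers concrete, here is a standalone sketch (not part of the patch); the `dst_ip` field and its values are made up for illustration:

```python
def format_attr_value(entry, attr):
    # Same logic as the show plugin helper above.
    if attr["is-leaf-list"]:
        return "\n".join(entry.get(attr["name"], []))
    return entry.get(attr["name"], "N/A")

# A made-up CONFIG DB entry; the attribute metadata mirrors the dicts
# the autogenerated plugin passes in.
entry = {"state": "enabled", "dst_ip": ["10.0.0.1", "10.0.0.2"]}
print(format_attr_value(entry, {"name": "state", "is-leaf-list": False}))   # enabled
print(format_attr_value(entry, {"name": "since", "is-leaf-list": False}))   # N/A (missing attr)
print(format_attr_value(entry, {"name": "dst_ip", "is-leaf-list": True}))   # one IP per line
```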
+@click.group(name="auto-techsupport",
+             cls=clicommon.AliasedGroup)
+def AUTO_TECHSUPPORT():
+    """ AUTO_TECHSUPPORT part of config_db.json """
+
+    pass
+
+
+@AUTO_TECHSUPPORT.command(name="global")
+@clicommon.pass_db
+def AUTO_TECHSUPPORT_GLOBAL(db):
+    """ """
+
+    header = [
+        "STATE",
+        "RATE LIMIT INTERVAL (sec)",
+        "MAX TECHSUPPORT LIMIT (%)",
+        "MAX CORE LIMIT (%)",
+        "SINCE",
+    ]
+
+    body = []
+    table = db.cfgdb.get_table("AUTO_TECHSUPPORT")
+    entry = table.get("GLOBAL", {})
+    row = [
+        format_attr_value(
+            entry,
+            {'name': 'state', 'description': 'Knob to make techsupport invocation event-driven based on core-dump generation', 'is-leaf-list': False, 'is-mandatory': False, 'group': ''}
+        ),
+        format_attr_value(
+            entry,
+            {'name': 'rate_limit_interval', 'description': 'Minimum time in seconds between two successive techsupport invocations. Configure 0 to explicitly disable', 'is-leaf-list': False, 'is-mandatory': False, 'group': ''}
+        ),
+        format_attr_value(
+            entry,
+            {'name': 'max_techsupport_limit', 'description': 'Max limit in percentage for the cumulative size of ts dumps. No cleanup is performed if the value isn\'t configured or is 0.0', 'is-leaf-list': False, 'is-mandatory': False, 'group': ''}
+        ),
+        format_attr_value(
+            entry,
+            {'name': 'max_core_limit', 'description': 'Max limit in percentage for the cumulative size of core dumps. No cleanup is performed if the value isn\'t configured or is 0.0', 'is-leaf-list': False, 'is-mandatory': False, 'group': ''}
+        ),
+        format_attr_value(
+            entry,
+            {'name': 'since', 'description': "Only collect the logs & core-dumps generated since the time provided. A default value of '2 days ago' is used if this value is not set explicitly or an invalid string is provided", 'is-leaf-list': False, 'is-mandatory': False, 'group': ''}
+        ),
+    ]
+
+    body.append(row)
+    click.echo(tabulate.tabulate(body, header, numalign="left"))
+
+
+@AUTO_TECHSUPPORT.command(name="history")
+@clicommon.pass_db
+def AUTO_TECHSUPPORT_history(db):
+    keys = db.db.keys("STATE_DB", "AUTO_TECHSUPPORT_DUMP_INFO|*")
+    header = ["TECHSUPPORT DUMP", "TRIGGERED BY", "CORE DUMP"]
+    body = []
+    for key in keys:
+        dump = key.split("|")[-1]
+        fv_pairs = db.db.get_all("STATE_DB", key)
+        core_dump = fv_pairs.get("core_dump", "")
+        container = fv_pairs.get("container_name", "")
+        body.append([dump, container, core_dump])
+    click.echo(tabulate.tabulate(body, header, numalign="left"))
+
+
+@click.group(name="auto-techsupport-feature",
+             cls=clicommon.AliasedGroup,
+             invoke_without_command=True)
+@clicommon.pass_db
+def AUTO_TECHSUPPORT_FEATURE(db):
+    """ [Callable command group] """
+
+    header = [
+        "FEATURE NAME",
+        "STATE",
+        "RATE LIMIT INTERVAL (sec)",
+    ]
+
+    body = []
+
+    table = db.cfgdb.get_table("AUTO_TECHSUPPORT_FEATURE")
+    for key in natsort.natsorted(table):
+        entry = table[key]
+        if not isinstance(key, tuple):
+            key = (key,)
+
+        row = [*key] + [
+            format_attr_value(
+                entry,
+                {'name': 'state', 'description': 'Enable auto techsupport invocation on the processes running inside this feature', 'is-leaf-list': False, 'is-mandatory': False, 'group': ''}
+            ),
+            format_attr_value(
+                entry,
+                {'name': 'rate_limit_interval', 'description': 'Rate limit interval for the corresponding feature. 
Configure 0 to explicitly disable', 'is-leaf-list': False, 'is-mandatory': False, 'group': ''} + ), + ] + body.append(row) + click.echo(tabulate.tabulate(body, header, numalign="left")) + + +def register(cli): + cli_node = AUTO_TECHSUPPORT + if cli_node.name in cli.commands: + raise Exception(f"{cli_node.name} already exists in CLI") + cli.add_command(AUTO_TECHSUPPORT) + cli_node = AUTO_TECHSUPPORT_FEATURE + if cli_node.name in cli.commands: + raise Exception(f"{cli_node.name} already exists in CLI") + cli.add_command(AUTO_TECHSUPPORT_FEATURE) + cli_node = AUTO_TECHSUPPORT_history + if cli_node.name in cli.commands: + raise Exception(f"{cli_node.name} already exists in CLI") + cli.add_command(AUTO_TECHSUPPORT_history) diff --git a/tests/coredump_gen_handler_test.py b/tests/coredump_gen_handler_test.py new file mode 100644 index 000000000000..c742f09d06ac --- /dev/null +++ b/tests/coredump_gen_handler_test.py @@ -0,0 +1,383 @@ +import os +import time +import sys +import pyfakefs +import unittest +from pyfakefs.fake_filesystem_unittest import Patcher +from swsscommon import swsscommon +from utilities_common.general import load_module_from_source +from utilities_common.db import Db +from .mock_tables import dbconnector + +sys.path.append("scripts") +import coredump_gen_handler as cdump_mod + + +def set_auto_ts_cfg(redis_mock, state="disabled", + rate_limit_interval="0", + max_core_size="0.0", + since_cfg="None"): + redis_mock.set(cdump_mod.CFG_DB, cdump_mod.AUTO_TS, cdump_mod.CFG_STATE, state) + redis_mock.set(cdump_mod.CFG_DB, cdump_mod.AUTO_TS, cdump_mod.COOLOFF, rate_limit_interval) + redis_mock.set(cdump_mod.CFG_DB, cdump_mod.AUTO_TS, cdump_mod.CFG_CORE_USAGE, max_core_size) + redis_mock.set(cdump_mod.CFG_DB, cdump_mod.AUTO_TS, cdump_mod.CFG_SINCE, since_cfg) + + +def set_feature_table_cfg(redis_mock, state="disabled", rate_limit_interval="0", container_name="swss"): + redis_mock.set(cdump_mod.CFG_DB, cdump_mod.FEATURE.format(container_name), cdump_mod.CFG_STATE, state) + redis_mock.set(cdump_mod.CFG_DB, cdump_mod.FEATURE.format(container_name), cdump_mod.COOLOFF, rate_limit_interval) + + +def set_auto_ts_dump_info(redis_mock, ts_dump, core_dump, timestamp, container): + key = cdump_mod.TS_MAP + "|" + ts_dump + redis_mock.set(cdump_mod.STATE_DB, key, cdump_mod.CORE_DUMP, core_dump) + redis_mock.set(cdump_mod.STATE_DB, key, cdump_mod.TIMESTAMP, timestamp) + redis_mock.set(cdump_mod.STATE_DB, key, cdump_mod.CONTAINER, container) + + +def verify_post_exec_state(redis_mock, cdump_expect=[], cdumps_not_expect=[], container_mp={}): + final_state = redis_mock.keys(cdump_mod.STATE_DB, cdump_mod.TS_MAP+"*") + print(final_state) + for dump in cdump_expect: + assert cdump_mod.TS_MAP+"|"+dump in final_state + for dump in cdumps_not_expect: + assert cdump_mod.TS_MAP+"|"+dump not in final_state + for dump, container in container_mp.items(): + key = cdump_mod.TS_MAP+"|"+dump + assert container in redis_mock.get(cdump_mod.STATE_DB, key, cdump_mod.CONTAINER) + + +def populate_state_db(redis_mock, + ts_map={"sonic_dump_random1": "orchagent;1575985;swss", + "sonic_dump_random2": "syncd;1575988;syncd"}): + for dump, value in ts_map.items(): + core_dump, timestamp, container_name = value.split(";") + set_auto_ts_dump_info(redis_mock, dump, core_dump, timestamp, container_name) + print(redis_mock.keys(cdump_mod.STATE_DB, cdump_mod.TS_MAP+"*")) + + +class TestCoreDumpCreationEvent(unittest.TestCase): + + def setUp(self): + cdump_mod.TIME_BUF = 1 + cdump_mod.WAIT_BUFFER = 1 + + def 
test_invoc_ts_state_db_update(self): + """ + Scenario: CFG_STATE is enabled. CFG_CORE_CLEANUP is disabled and no rate_limit_interval is provided + Check if techsupport is invoked, file is created and State DB is updated + """ + db_wrap = Db() + redis_mock = db_wrap.db + set_auto_ts_cfg(redis_mock, state="enabled") + set_feature_table_cfg(redis_mock, state="enabled") + populate_state_db(redis_mock) + with Patcher() as patcher: + def mock_cmd(cmd, env): + cmd_str = " ".join(cmd) + if "show techsupport" in cmd_str: + patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + else: + return 1, "", "Command Not Found" + return 0, "", "" + cdump_mod.subprocess_exec = mock_cmd + patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") + patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") + patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") + cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock) + cls.handle_core_dump_creation_event() + cdump_mod.handle_coredump_cleanup("orchagent.12345.123.core.gz", redis_mock) + assert "sonic_dump_random1.tar.gz" in os.listdir(cdump_mod.TS_DIR) + assert "sonic_dump_random2.tar.gz" in os.listdir(cdump_mod.TS_DIR) + assert "sonic_dump_random3.tar.gz" in os.listdir(cdump_mod.TS_DIR) + cdump_expect = ["sonic_dump_random1", "sonic_dump_random2", "sonic_dump_random3"] + verify_post_exec_state(redis_mock, cdump_expect) + + def test_global_rate_limit_interval(self): + """ + Scenario: CFG_STATE is enabled. + Global rate_limit_interval is not passed yet. Check if techsupport isn't invoked. + """ + db_wrap = Db() + redis_mock = db_wrap.db + set_auto_ts_cfg(redis_mock, state="enabled", rate_limit_interval="1") + set_feature_table_cfg(redis_mock, state="enabled") + populate_state_db(redis_mock) + with Patcher() as patcher: + def mock_cmd(cmd, env): + cmd_str = " ".join(cmd) + if "show techsupport" in cmd_str: + patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + else: + return 1, "", "Command Not Found" + return 0, "", "" + cdump_mod.subprocess_exec = mock_cmd + patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") + patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") + patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") + cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock) + cls.handle_core_dump_creation_event() + cdump_mod.handle_coredump_cleanup("orchagent.12345.123.core.gz", redis_mock) + assert "sonic_dump_random1.tar.gz" in os.listdir(cdump_mod.TS_DIR) + assert "sonic_dump_random2.tar.gz" in os.listdir(cdump_mod.TS_DIR) + assert "sonic_dump_random3.tar.gz" not in os.listdir(cdump_mod.TS_DIR) + cdump_expect = ["sonic_dump_random1", "sonic_dump_random2"] + cdump_not_expect = ["sonic_dump_random3"] + verify_post_exec_state(redis_mock, cdump_expect, cdump_not_expect) + + def test_per_container_rate_limit_interval(self): + """ + Scenario: CFG_STATE is enabled. Global rate_limit_interval is passed + But Per container rate_limit_interval is not passed yet. 
Check if techsupport isn't invoked + """ + db_wrap = Db() + redis_mock = db_wrap.db + set_auto_ts_cfg(redis_mock, state="enabled", rate_limit_interval="0.25") + set_feature_table_cfg(redis_mock, state="enabled", rate_limit_interval="10") + populate_state_db(redis_mock, ts_map={"sonic_dump_random1": + "orchagent;{};swss".format(int(time.time()))}) + with Patcher() as patcher: + def mock_cmd(cmd, env): + cmd_str = " ".join(cmd) + if "show techsupport" in cmd_str: + patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + else: + return 1, "", "Command Not Found" + return 0, "", "" + cdump_mod.subprocess_exec = mock_cmd + patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") + patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") + cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock) + time.sleep(0.25) # wait for global rate_limit_interval to pass + cls.handle_core_dump_creation_event() + assert "sonic_dump_random1.tar.gz" in os.listdir(cdump_mod.TS_DIR) + assert "sonic_dump_random3.tar.gz" not in os.listdir(cdump_mod.TS_DIR) + verify_post_exec_state(redis_mock, ["sonic_dump_random1"], ["sonic_dump_random3"]) + + def test_invoc_ts_after_rate_limit_interval(self): + """ + Scenario: CFG_STATE is enabled. + All the rate_limit_interval's are passed. Check if techsupport is invoked + """ + db_wrap = Db() + redis_mock = db_wrap.db + set_auto_ts_cfg(redis_mock, state="enabled", rate_limit_interval="0.1") + set_feature_table_cfg(redis_mock, state="enabled", rate_limit_interval="0.25") + populate_state_db(redis_mock, ts_map={"sonic_dump_random1": + "orchagent;{};swss".format(int(time.time()))}) + with Patcher() as patcher: + def mock_cmd(cmd, env): + cmd_str = " ".join(cmd) + if "show techsupport" in cmd_str: + patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + else: + return 1, "", "Command Not Found" + return 0, "", "" + cdump_mod.subprocess_exec = mock_cmd + patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") + patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz") + patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz") + cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock) + time.sleep(0.25) # wait for all the rate_limit_interval's to pass + cls.handle_core_dump_creation_event() + assert "sonic_dump_random1.tar.gz" in os.listdir(cdump_mod.TS_DIR) + assert "sonic_dump_random3.tar.gz" in os.listdir(cdump_mod.TS_DIR) + ts_mp = {"sonic_dump_random3": "swss"} + verify_post_exec_state(redis_mock, ["sonic_dump_random1", "sonic_dump_random3"], [], ts_mp) + + def test_core_dump_with_invalid_container_name(self): + """ + Scenario: CFG_STATE is enabled. + Core Dump is found but no relevant exit_event entry is found in STATE_DB. 
+ """ + db_wrap = Db() + redis_mock = db_wrap.db + set_auto_ts_cfg(redis_mock, state="enabled") + set_feature_table_cfg(redis_mock, state="enabled", container_name="snmp") + populate_state_db(redis_mock, {}) + with Patcher() as patcher: + def mock_cmd(cmd, env): + cmd_str = " ".join(cmd) + if "show techsupport" in cmd_str: + patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + else: + return 1, "", "Command Not Found" + return 0, "", "" + cdump_mod.subprocess_exec = mock_cmd + patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") + patcher.fs.create_file("/var/core/snmpd.12345.123.core.gz") + cls = cdump_mod.CriticalProcCoreDumpHandle("snmpd.12345.123.core.gz", "whatevver", redis_mock) + cls.handle_core_dump_creation_event() + assert "sonic_dump_random3.tar.gz" not in os.listdir(cdump_mod.TS_DIR) + final_state = redis_mock.keys(cdump_mod.STATE_DB, cdump_mod.TS_MAP+"*") + assert not final_state + + def test_feature_table_not_set(self): + """ + Scenario: CFG_STATE is enabled. + The auto-techsupport in Feature table is not enabled for the core-dump generated + Check if techsupport is not invoked + """ + db_wrap = Db() + redis_mock = db_wrap.db + set_auto_ts_cfg(redis_mock, state="enabled") + set_feature_table_cfg(redis_mock, state="disabled", container_name="snmp") + populate_state_db(redis_mock, {}) + with Patcher() as patcher: + def mock_cmd(cmd, env): + cmd_str = " ".join(cmd) + if "show techsupport" in cmd_str: + patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz") + else: + return 1, "", "Command Not Found" + return 0, "", "" + cdump_mod.subprocess_exec = mock_cmd + patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz") + patcher.fs.create_file("/var/core/python3.12345.123.core.gz") + cls = cdump_mod.CriticalProcCoreDumpHandle("python3.12345.123.core.gz", "snmp", redis_mock) + cls.handle_core_dump_creation_event() + cdump_mod.handle_coredump_cleanup("python3.12345.123.core.gz", redis_mock) + assert "sonic_dump_random3.tar.gz" not in os.listdir(cdump_mod.TS_DIR) + + def test_since_argument(self): + """ + Scenario: CFG_STATE is enabled. 
+        Check if techsupport is invoked and the since argument is properly applied
+        """
+        db_wrap = Db()
+        redis_mock = db_wrap.db
+        set_auto_ts_cfg(redis_mock, state="enabled", since_cfg="4 days ago")
+        set_feature_table_cfg(redis_mock, state="enabled")
+        populate_state_db(redis_mock)
+        with Patcher() as patcher:
+            def mock_cmd(cmd, env):
+                cmd_str = " ".join(cmd)
+                if "show techsupport --since '4 days ago'" in cmd_str:
+                    patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
+                    return 0, "", ""
+                elif "date --date='4 days ago'" in cmd_str:
+                    return 0, "", ""
+                else:
+                    return 1, "", "Invalid Command"
+            cdump_mod.subprocess_exec = mock_cmd
+            patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
+            patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
+            patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
+            cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock)
+            cls.handle_core_dump_creation_event()
+            cdump_mod.handle_coredump_cleanup("orchagent.12345.123.core.gz", redis_mock)
+            assert "sonic_dump_random1.tar.gz" in os.listdir(cdump_mod.TS_DIR)
+            assert "sonic_dump_random2.tar.gz" in os.listdir(cdump_mod.TS_DIR)
+            assert "sonic_dump_random3.tar.gz" in os.listdir(cdump_mod.TS_DIR)
+            expect = ["sonic_dump_random1", "sonic_dump_random2", "sonic_dump_random3"]
+            ts_mp = {"sonic_dump_random3": "swss"}
+            verify_post_exec_state(redis_mock, expect, [], ts_mp)
+
+    def test_masic_core_dump(self):
+        """
+        Scenario: Dump is generated from the swss12 container. Config specified for swss should be applied
+        """
+        db_wrap = Db()
+        redis_mock = db_wrap.db
+        set_auto_ts_cfg(redis_mock, state="enabled")
+        set_feature_table_cfg(redis_mock, state="enabled")
+        populate_state_db(redis_mock)
+        with Patcher() as patcher:
+            def mock_cmd(cmd, env):
+                cmd_str = " ".join(cmd)
+                if "show techsupport" in cmd_str:
+                    patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
+                else:
+                    return 1, "", "Command Not Found"
+                return 0, "", ""
+            cdump_mod.subprocess_exec = mock_cmd
+            patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
+            patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
+            patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
+            cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss12", redis_mock)
+            cls.handle_core_dump_creation_event()
+            cdump_mod.handle_coredump_cleanup("orchagent.12345.123.core.gz", redis_mock)
+            assert "sonic_dump_random1.tar.gz" in os.listdir(cdump_mod.TS_DIR)
+            assert "sonic_dump_random2.tar.gz" in os.listdir(cdump_mod.TS_DIR)
+            assert "sonic_dump_random3.tar.gz" in os.listdir(cdump_mod.TS_DIR)
+            cdump_expect = ["sonic_dump_random1", "sonic_dump_random2", "sonic_dump_random3"]
+            verify_post_exec_state(redis_mock, cdump_expect)
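Worth keeping in mind for the next test: the `since` value is considered valid only if `date --date='<value>'` exits 0; anything else falls back to SINCE_DEFAULT. A distilled restatement of that decision in `get_since_arg()`, where `resolve_since` is a hypothetical stand-in and `date_rc` stands in for the mocked exit code:

```python
SINCE_DEFAULT = "2 days ago"

def resolve_since(since_cfg, date_rc):
    """date_rc stands in for the exit code of `date --date='<since_cfg>'`."""
    if not since_cfg:
        return SINCE_DEFAULT
    return since_cfg if date_rc == 0 else SINCE_DEFAULT

assert resolve_since("4 days ago", 0) == "4 days ago"   # valid string is kept
assert resolve_since("whatever", 1) == "2 days ago"     # invalid string -> default
```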
+    def test_invalid_since_argument(self):
+        """
+        Scenario: CFG_STATE is enabled.
+        Check that an invalid since argument is identified and the default of '2 days ago' is applied
+        """
+        db_wrap = Db()
+        redis_mock = db_wrap.db
+        set_auto_ts_cfg(redis_mock, state="enabled", since_cfg="whatever")
+        set_feature_table_cfg(redis_mock, state="enabled")
+        populate_state_db(redis_mock)
+        with Patcher() as patcher:
+            def mock_cmd(cmd, env):
+                cmd_str = " ".join(cmd)
+                if "show techsupport --since '2 days ago'" in cmd_str:
+                    patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
+                    return 0, "", ""
+                elif "date --date='whatever'" in cmd_str:
+                    return 1, "", "Invalid Date Format"
+                else:
+                    return 1, "", ""
+            cdump_mod.subprocess_exec = mock_cmd
+            patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
+            patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
+            patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
+            cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock)
+            cls.handle_core_dump_creation_event()
+            cdump_mod.handle_coredump_cleanup("orchagent.12345.123.core.gz", redis_mock)
+            assert "sonic_dump_random1.tar.gz" in os.listdir(cdump_mod.TS_DIR)
+            assert "sonic_dump_random2.tar.gz" in os.listdir(cdump_mod.TS_DIR)
+            assert "sonic_dump_random3.tar.gz" in os.listdir(cdump_mod.TS_DIR)
+            expect = ["sonic_dump_random1", "sonic_dump_random2", "sonic_dump_random3"]
+            ts_mp = {"sonic_dump_random3": "swss"}
+            verify_post_exec_state(redis_mock, expect, [], ts_mp)
+
+    def test_core_dump_cleanup(self):
+        """
+        Scenario: CFG_STATE is enabled. The core-dump size limit is crossed.
+        Verify that cleanup is performed
+        """
+        db_wrap = Db()
+        redis_mock = db_wrap.db
+        set_auto_ts_cfg(redis_mock, state="enabled", max_core_size="6.0")
+        with Patcher() as patcher:
+            patcher.fs.set_disk_usage(1000, path="/var/core/")
+            patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz", st_size=25)
+            patcher.fs.create_file("/var/core/lldpmgrd.12345.22.core.gz", st_size=25)
+            patcher.fs.create_file("/var/core/python3.12345.21.core.gz", st_size=25)
+            cdump_mod.handle_coredump_cleanup("python3.12345.21.core.gz", redis_mock)
+            current_fs = os.listdir(cdump_mod.CORE_DUMP_DIR)
+            assert len(current_fs) == 2
+            assert "orchagent.12345.123.core.gz" not in current_fs
+            assert "lldpmgrd.12345.22.core.gz" in current_fs
+            assert "python3.12345.21.core.gz" in current_fs
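The arithmetic behind this pair of cleanup tests is easy to miss: the configured limit is a percentage of the partition size, not an absolute byte count. A minimal sketch of the threshold computation in `cleanup_process()`, assuming pyfakefs's `set_disk_usage` sets the total that `shutil.disk_usage` reports:

```python
import math

def max_limit_bytes(limit_percent, disk_total):
    # Mirrors cleanup_process(): limit is a percentage of the partition size.
    return math.floor(limit_percent * disk_total / 100)

# test_core_dump_cleanup: 6% of a 1000-byte disk allows 60 bytes, but the
# three 25-byte cores occupy 75 -> the oldest core is removed.
assert max_limit_bytes(6.0, 1000) == 60
# test_max_core_size_limit_not_crossed (below): 5% of 2000 allows 100 bytes,
# only 75 are in use -> no cleanup.
assert max_limit_bytes(5.0, 2000) == 100
```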
+    def test_max_core_size_limit_not_crossed(self):
+        """
+        Scenario: CFG_STATE is enabled. The core-dump size limit is not crossed.
+        Verify that no cleanup is performed
+        """
+        db_wrap = Db()
+        redis_mock = db_wrap.db
+        set_auto_ts_cfg(redis_mock, state="enabled", max_core_size="5.0")
+        with Patcher() as patcher:
+            patcher.fs.set_disk_usage(2000, path="/var/core/")
+            patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz", st_size=25)
+            patcher.fs.create_file("/var/core/lldpmgrd.12345.22.core.gz", st_size=25)
+            patcher.fs.create_file("/var/core/python3.12345.21.core.gz", st_size=25)
+            cdump_mod.handle_coredump_cleanup("python3.12345.21.core.gz", redis_mock)
+            current_fs = os.listdir(cdump_mod.CORE_DUMP_DIR)
+            assert len(current_fs) == 3
+            assert "orchagent.12345.123.core.gz" in current_fs
+            assert "lldpmgrd.12345.22.core.gz" in current_fs
+            assert "python3.12345.21.core.gz" in current_fs
diff --git a/tests/techsupport_cleanup_test.py b/tests/techsupport_cleanup_test.py
new file mode 100644
index 000000000000..da1e2a772cb1
--- /dev/null
+++ b/tests/techsupport_cleanup_test.py
@@ -0,0 +1,115 @@
+import os
+import sys
+import pyfakefs
+import unittest
+from pyfakefs.fake_filesystem_unittest import Patcher
+from swsscommon import swsscommon
+from utilities_common.general import load_module_from_source
+from utilities_common.db import Db
+from .mock_tables import dbconnector
+
+sys.path.append("scripts")
+import techsupport_cleanup as ts_mod
+
+
+def set_auto_ts_cfg(redis_mock, auto_ts_state="disabled", max_ts="0"):
+    redis_mock.set(ts_mod.CFG_DB, ts_mod.AUTO_TS, ts_mod.CFG_STATE, auto_ts_state)
+    redis_mock.set(ts_mod.CFG_DB, ts_mod.AUTO_TS, ts_mod.CFG_MAX_TS, max_ts)
+
+
+def set_auto_ts_dump_info(redis_mock, ts_dump, core_dump, timestamp, container_name):
+    key = ts_mod.TS_MAP + "|" + ts_dump
+    redis_mock.set(ts_mod.STATE_DB, key, ts_mod.CORE_DUMP, core_dump)
+    redis_mock.set(ts_mod.STATE_DB, key, ts_mod.TIMESTAMP, timestamp)
+    redis_mock.set(ts_mod.STATE_DB, key, ts_mod.CONTAINER, container_name)
+
+
+class TestTechsupportCreationEvent(unittest.TestCase):
+
+    def test_no_cleanup_state_disabled(self):
+        """
+        Scenario: TS_CLEANUP is disabled. Check that no cleanup is performed,
+        even though the techsupport limit is already crossed
+        """
+        db_wrap = Db()
+        redis_mock = db_wrap.db
+        set_auto_ts_cfg(redis_mock, max_ts="5")
+        with Patcher() as patcher:
+            patcher.fs.set_disk_usage(1000, path="/var/dump/")
+            patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz", st_size=30)
+            patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz", st_size=30)
+            patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz", st_size=30)
+            ts_mod.handle_techsupport_creation_event("/var/dump/sonic_dump_random3.tar.gz", redis_mock)
+            current_fs = os.listdir(ts_mod.TS_DIR)
+            print(current_fs)
+            assert len(current_fs) == 3
+            assert "sonic_dump_random1.tar.gz" in current_fs
+            assert "sonic_dump_random2.tar.gz" in current_fs
+            assert "sonic_dump_random3.tar.gz" in current_fs
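The same percentage rule applies on the techsupport side, which is why the two no-cleanup tests differ only in which guard trips. A quick check of the numbers used above and below, under the same `set_disk_usage` assumption:

```python
used = 3 * 30  # three 30-byte dumps in a 1000-byte /var/dump
assert used <= 10 * 1000 / 100       # max_ts="10": under the 100-byte cap, nothing deleted
assert not used <= 5 * 1000 / 100    # max_ts="5": over the 50-byte cap, but the disabled
                                     # state above short-circuits before any deletion
```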
+    def test_no_cleanup_state_enabled(self):
+        """
+        Scenario: TS_CLEANUP is enabled.
+        Verify that no cleanup is performed, as the techsupport limit hasn't been crossed yet
+        """
+        db_wrap = Db()
+        redis_mock = db_wrap.db
+        set_auto_ts_cfg(redis_mock, auto_ts_state="enabled", max_ts="10")
+        with Patcher() as patcher:
+            patcher.fs.set_disk_usage(1000, path="/var/dump/")
+            patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz", st_size=30)
+            patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz", st_size=30)
+            patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz", st_size=30)
+            ts_mod.handle_techsupport_creation_event("/var/dump/sonic_dump_random3.tar.gz", redis_mock)
+            current_fs = os.listdir(ts_mod.TS_DIR)
+            print(current_fs)
+            assert len(current_fs) == 3
+            assert "sonic_dump_random1.tar.gz" in current_fs
+            assert "sonic_dump_random2.tar.gz" in current_fs
+            assert "sonic_dump_random3.tar.gz" in current_fs
+
+    def test_dump_cleanup(self):
+        """
+        Scenario: TS_CLEANUP is enabled. The techsupport size limit is crossed.
+        Verify that cleanup is performed
+        """
+        db_wrap = Db()
+        redis_mock = db_wrap.db
+        set_auto_ts_cfg(redis_mock, auto_ts_state="enabled", max_ts="5")
+        with Patcher() as patcher:
+            patcher.fs.set_disk_usage(1000, path="/var/dump/")
+            patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz", st_size=25)
+            patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz", st_size=25)
+            patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz", st_size=25)
+            ts_mod.handle_techsupport_creation_event("/var/dump/sonic_dump_random3.tar.gz", redis_mock)
+            current_fs = os.listdir(ts_mod.TS_DIR)
+            assert len(current_fs) == 2
+            assert "sonic_dump_random1.tar.gz" not in current_fs
+            assert "sonic_dump_random2.tar.gz" in current_fs
+            assert "sonic_dump_random3.tar.gz" in current_fs
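When a dump file is deleted, the matching STATE_DB entry is keyed by the dump name with its archive extension stripped; the next test asserts exactly that. A self-contained sketch of the `strip_ts_ext` name-to-key mapping:

```python
from os.path import basename, splitext

def strip_ts_ext(ts_path):
    # Same double-splitext trick as the helper module: peels ".tar.gz"
    # (or just ".tar") off the dump name.
    name, _ = splitext(splitext(basename(ts_path))[0])
    return name

assert strip_ts_ext("/var/dump/sonic_dump_random1.tar.gz") == "sonic_dump_random1"
# clean_state_db_entries() then deletes
# "AUTO_TECHSUPPORT_DUMP_INFO|sonic_dump_random1" from STATE_DB,
# which is what test_state_db_update below asserts.
```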
+    def test_state_db_update(self):
+        """
+        Scenario: TS_CLEANUP is enabled. The techsupport size limit is crossed.
+        Verify that cleanup is performed and that STATE_DB is updated
+        """
+        db_wrap = Db()
+        redis_mock = db_wrap.db
+        set_auto_ts_cfg(redis_mock, auto_ts_state="enabled", max_ts="5")
+        set_auto_ts_dump_info(redis_mock, "sonic_dump_random1", "orchagent", "1575985", "orchagent")
+        set_auto_ts_dump_info(redis_mock, "sonic_dump_random2", "syncd", "1575988", "syncd")
+        with Patcher() as patcher:
+            patcher.fs.set_disk_usage(1000, path="/var/dump/")
+            patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz", st_size=25)
+            patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz", st_size=25)
+            patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz", st_size=25)
+            ts_mod.handle_techsupport_creation_event("/var/dump/sonic_dump_random3.tar.gz", redis_mock)
+            current_fs = os.listdir(ts_mod.TS_DIR)
+            print(current_fs)
+            assert len(current_fs) == 2
+            assert "sonic_dump_random1.tar.gz" not in current_fs
+            assert "sonic_dump_random2.tar.gz" in current_fs
+            assert "sonic_dump_random3.tar.gz" in current_fs
+            final_state = redis_mock.keys(ts_mod.STATE_DB, ts_mod.TS_MAP + "*")
+            assert ts_mod.TS_MAP + "|sonic_dump_random2" in final_state
+            assert ts_mod.TS_MAP + "|sonic_dump_random1" not in final_state
diff --git a/utilities_common/auto_techsupport_helper.py b/utilities_common/auto_techsupport_helper.py
new file mode 100644
index 000000000000..b94b7828974b
--- /dev/null
+++ b/utilities_common/auto_techsupport_helper.py
@@ -0,0 +1,184 @@
+import os
+import glob
+import time
+import subprocess
+import shutil
+import math
+import syslog
+from os.path import basename, splitext
+
+__all__ = [  # Constants
+    "CORE_DUMP_DIR", "CORE_DUMP_PTRN", "TS_DIR", "TS_PTRN",
+    "CFG_DB", "AUTO_TS", "CFG_STATE", "CFG_MAX_TS", "COOLOFF",
+    "CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB",
+    "TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER",
+    "TIME_BUF", "SINCE_DEFAULT"
+    ] + [  # Methods
+    "verify_recent_file_creation",
+    "get_ts_dumps",
+    "strip_ts_ext",
+    "get_stats",
+    "pretty_size",
+    "cleanup_process",
+    "subprocess_exec",
+    "trim_masic_suffix"
+    ]
+
+
+# MISC
+CORE_DUMP_DIR = "/var/core"
+CORE_DUMP_PTRN = "*.core.gz"
+
+TS_DIR = "/var/dump"
+TS_PTRN = "sonic_dump_*.tar*"
+
+# CONFIG DB Attributes
+CFG_DB = "CONFIG_DB"
+
+# AUTO_TECHSUPPORT|GLOBAL table attributes
+AUTO_TS = "AUTO_TECHSUPPORT|GLOBAL"
+CFG_STATE = "state"
+CFG_MAX_TS = "max_techsupport_limit"
+COOLOFF = "rate_limit_interval"
+CFG_CORE_USAGE = "max_core_limit"
+CFG_SINCE = "since"
+
+# AUTO_TECHSUPPORT_FEATURE Table
+FEATURE = "AUTO_TECHSUPPORT_FEATURE|{}"
+
+# State DB Attributes
+STATE_DB = "STATE_DB"
+
+# AUTO_TECHSUPPORT_DUMP_INFO table info
+TS_MAP = "AUTO_TECHSUPPORT_DUMP_INFO"
+CORE_DUMP = "core_dump"
+TIMESTAMP = "timestamp"
+CONTAINER = "container_name"
+
+TIME_BUF = 20
+SINCE_DEFAULT = "2 days ago"
+
+
+# Helper methods
+def subprocess_exec(cmd, env=None):
+    output = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        env=env
+    )
+    return output.returncode, output.stdout, output.stderr
+
+
+def strip_ts_ext(ts_path):
+    """ Return the basename and strip the techsupport dump of any extensions """
+    base_name = basename(ts_path)
+    name, _ = splitext(splitext(base_name)[0])  # *.tar.gz
+    return name
+
+
+def get_ts_dumps(full_path=False):
+    """ Get the list of TS dumps in the TS_DIR, sorted by the creation time """
+    curr_list = glob.glob(os.path.join(TS_DIR, TS_PTRN))
+    curr_list.sort(key=os.path.getmtime)
+    if full_path:
+        return curr_list
+    return [os.path.basename(name) for name in curr_list]
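Callers such as `get_since_arg()` and `invoke_ts_cmd()` rely on the `(returncode, stdout, stderr)` triple that `subprocess_exec` returns. A standalone sketch of that contract, assuming GNU `date` is available on the box:

```python
import subprocess

def subprocess_exec(cmd, env=None):
    # Same shape as the helper above: (returncode, stdout, stderr).
    out = subprocess.run(cmd, capture_output=True, text=True, env=env)
    return out.returncode, out.stdout, out.stderr

rc, date_str, _ = subprocess_exec(["date", "--date=2 days ago"])
print(rc, date_str.strip())  # rc == 0 for a parsable date; a nonzero
                             # exit is what marks a 'since' value invalid
```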
+ +def verify_recent_file_creation(file_path, in_last_sec=TIME_BUF): + """ Verify if the file exists and is created within the last TIME_BUF sec """ + curr = time.time() + try: + was_created_on = os.path.getmtime(file_path) + except Exception: + return False + if curr - was_created_on < in_last_sec: + return True + else: + return False + + +def get_stats(ptrn, collect_stats=True): + """ + Returns the size of the files (matched by the ptrn) occupied. + Also returns the list of files Sorted by the Descending order of creation time & size + """ + files = glob.glob(ptrn) + file_stats = [] + total_size = 0 + for file in files: + file_size = os.path.getsize(file) + if collect_stats: + file_stats.append((os.path.getmtime(file), file_size, file)) + total_size += file_size + if collect_stats: + # Sort by the Descending order of file_creation_time, size_of_file + file_stats = sorted(file_stats, key=lambda sub: (-sub[0], sub[1], sub[2])) + return (file_stats, total_size) + + +def pretty_size(bytes): + """Get human-readable file sizes""" + UNITS_MAPPING = [ + (1 << 50, ' PB'), + (1 << 40, ' TB'), + (1 << 30, ' GB'), + (1 << 20, ' MB'), + (1 << 10, ' KB'), + (1, (' byte', ' bytes')), + ] + for factor, suffix in UNITS_MAPPING: + if bytes >= factor: + break + amount = int(bytes / factor) + + if isinstance(suffix, tuple): + singular, multiple = suffix + if amount == 1: + suffix = singular + else: + suffix = multiple + return str(amount) + suffix + + +def cleanup_process(limit, file_ptrn, dir): + """Deletes the oldest files incrementally until the size is under limit""" + if not(0 < limit and limit < 100): + syslog.syslog(syslog.LOG_ERR, "core_usage_limit can only be between 1 and 100, whereas the configured value is: {}".format(limit)) + return + + fs_stats, curr_size = get_stats(os.path.join(dir, file_ptrn)) + disk_stats = shutil.disk_usage(dir) + max_limit_bytes = math.floor((limit * disk_stats.total / 100)) + + if curr_size <= max_limit_bytes: + return + + num_bytes_to_del = curr_size - max_limit_bytes + num_deleted = 0 + removed_files = [] + # Preserve the latest file created + while num_deleted < num_bytes_to_del and len(fs_stats) > 1: + stat = fs_stats.pop() + try: + os.remove(stat[2]) + removed_files.append(stat[2]) + except OSError as error: + continue + num_deleted += stat[1] + syslog.syslog(syslog.LOG_INFO, "{} deleted from {}".format(pretty_size(num_deleted), dir)) + return removed_files + + +def trim_masic_suffix(container_name): + """ Trim any masic suffix i.e swss0 -> swss """ + arr = list(container_name) + index = len(arr) - 1 + while index >= 0: + if arr[-1].isdigit(): + arr.pop() + else: + break + index = index - 1 + return "".join(arr)
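For reference, the loop in `trim_masic_suffix` is equivalent to stripping trailing digits from the container name; a compact sketch (not the patch's code) that shows the mapping and its inherent corner case:

```python
def trim_masic_suffix(container_name):
    # Equivalent, more compact form of the helper above: drop the trailing
    # digits that multi-ASIC instances append (swss0, swss12 -> swss).
    return container_name.rstrip("0123456789")

assert trim_masic_suffix("swss12") == "swss"
assert trim_masic_suffix("syncd0") == "syncd"
assert trim_masic_suffix("snmp") == "snmp"
# Shared corner case with the loop-based version: a feature whose name
# legitimately ends in a digit (say, a hypothetical "ptf64" container)
# would also be trimmed, to "ptf".
```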