Skip to content

Commit

Permalink
Merge branch 'master' into SYSTEM_READY
Browse files Browse the repository at this point in the history
  • Loading branch information
sg893052 authored May 20, 2022
2 parents 86e3aea + 4fc09b1 commit 350cf66
Show file tree
Hide file tree
Showing 56 changed files with 3,673 additions and 339 deletions.
304 changes: 271 additions & 33 deletions config/main.py

Large diffs are not rendered by default.

60 changes: 58 additions & 2 deletions config/plugins/auto_techsupport.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,50 @@ def AUTO_TECHSUPPORT_GLOBAL_max_core_limit(db, max_core_limit):
exit_with_error(f"Error: {err}", fg="red")


@AUTO_TECHSUPPORT_GLOBAL.command(name="available-mem-threshold")
@click.argument(
    "available-mem-threshold",
    nargs=1,
    required=True,
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_available_mem_threshold(db, available_mem_threshold):
    """ Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.
    """

    # Persist the new threshold into the AUTO_TECHSUPPORT|GLOBAL entry,
    # creating the entry on first use; abort the CLI with a red error message
    # if YANG validation of the updated entry fails.
    new_values = {"available_mem_threshold": available_mem_threshold}
    try:
        update_entry_validated(
            db.cfgdb, "AUTO_TECHSUPPORT", "GLOBAL", new_values,
            create_if_not_exists=True,
        )
    except Exception as err:
        exit_with_error(f"Error: {err}", fg="red")


@AUTO_TECHSUPPORT_GLOBAL.command(name="min-available-mem")
@click.argument(
    "min-available-mem",
    nargs=1,
    required=True,
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_min_available_mem(db, min_available_mem):
    """ Minimum free memory amount in Kb when techsupport will be executed.
    """

    # Write the configured minimum into AUTO_TECHSUPPORT|GLOBAL, creating the
    # entry if it does not exist yet; validation failures terminate the CLI
    # with a red error message.
    new_values = {"min_available_mem": min_available_mem}
    try:
        update_entry_validated(
            db.cfgdb, "AUTO_TECHSUPPORT", "GLOBAL", new_values,
            create_if_not_exists=True,
        )
    except Exception as err:
        exit_with_error(f"Error: {err}", fg="red")


@AUTO_TECHSUPPORT_GLOBAL.command(name="since")
@click.argument(
"since",
Expand Down Expand Up @@ -271,8 +315,12 @@ def AUTO_TECHSUPPORT_FEATURE():
"--rate-limit-interval",
help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
)
@click.option(
"--available-mem-threshold",
help="Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.",
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval, available_mem_threshold):
""" Add object in AUTO_TECHSUPPORT_FEATURE. """

table = "AUTO_TECHSUPPORT_FEATURE"
Expand All @@ -282,6 +330,8 @@ def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
data["state"] = state
if rate_limit_interval is not None:
data["rate_limit_interval"] = rate_limit_interval
if available_mem_threshold is not None:
data["available_mem_threshold"] = available_mem_threshold

try:
add_entry_validated(db.cfgdb, table, key, data)
Expand All @@ -303,8 +353,12 @@ def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
"--rate-limit-interval",
help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
)
@click.option(
"--available-mem-threshold",
help="Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.",
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval):
def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval, available_mem_threshold):
""" Add object in AUTO_TECHSUPPORT_FEATURE. """

table = "AUTO_TECHSUPPORT_FEATURE"
Expand All @@ -314,6 +368,8 @@ def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval
data["state"] = state
if rate_limit_interval is not None:
data["rate_limit_interval"] = rate_limit_interval
if available_mem_threshold is not None:
data["available_mem_threshold"] = available_mem_threshold

try:
update_entry_validated(db.cfgdb, table, key, data)
Expand Down
3 changes: 3 additions & 0 deletions doc/Command-Reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -5281,8 +5281,11 @@ If vrf-name is also provided as part of the command, if the vrf is created it wi
default Vlan20
Vrf-red Vlan100
Loopback11
Eth0.100
Vrf-blue Loopback100
Loopback102
Ethernet0.10
PortChannel101
````

### VRF config commands
Expand Down
17 changes: 17 additions & 0 deletions dump/match_helper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
from dump.match_infra import MatchRequest
from dump.helper import handle_multiple_keys_matched_error

# Return dict helper methods

def check_error(ret):
    """Report whether a match request failed.

    Returns a ``(failed, error_string)`` pair: ``(True, <error>)`` when the
    response carries a truthy "error" field, ``(False, "")`` otherwise.
    """
    err = ret["error"]
    return (True, err) if err else (False, "")

def get_matched_keys(ret):
    """Extract the matched keys from a match-request response.

    Returns ``(keys, "")`` on success and ``([], <error>)`` when the
    response reports a failure.
    """
    had_error, reason = check_error(ret)
    if had_error:
        return [], reason
    return ret["keys"], ""

# Port Helper Methods

def fetch_port_oid(match_engine, port_name, ns):
Expand Down
4 changes: 4 additions & 0 deletions dump/match_infra.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,10 @@ def clear(self, namespace=None):
elif namespace in self.cache:
del self.cache[namespace]

def fill(self, ns, conn, connected_to):
    """Populate the cache entry for namespace *ns*.

    Stores the connector object together with the set of DB names it is
    connected to, overwriting any previous entry for that namespace.
    """
    entry = {'conn': conn, 'connected_to': set(connected_to)}
    self.cache[ns] = entry


class MatchEngine:
"""
Expand Down
38 changes: 37 additions & 1 deletion generic_config_updater/gu_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,33 @@ def validate_config_db_config(self, config_db_as_json):
sy.loadData(tmp_config_db_as_json)

sy.validate_data_tree()
return True, None

# TODO: modularize custom validations better or move directly to sonic-yang module
return self.validate_bgp_peer_group(config_db_as_json)
except sonic_yang.SonicYangException as ex:
return False, ex

def validate_bgp_peer_group(self, config_db):
    """Check BGP_PEER_RANGE for IPs duplicated across peer groups.

    Returns ``(True, None)`` when the table is absent or consistent,
    otherwise ``(False, <message naming the duplicated IP>)``.
    """
    if "BGP_PEER_RANGE" not in config_db:
        return True, None

    table = config_db["BGP_PEER_RANGE"]
    # Maps every IP seen so far to the peer group that declared it.
    owner_by_ip = {}
    for peer_group_name, peer_group in table.items():
        # TODO: convert string to IpAddress object for better handling of IPs
        # TODO: validate range intersection
        for ip in peer_group.get("ip_range", []):
            if ip in owner_by_ip:
                return False, f"{ip} is duplicated in BGP_PEER_RANGE: {set([peer_group_name, owner_by_ip[ip]])}"
            owner_by_ip[ip] = peer_group_name

    return True, None

def crop_tables_without_yang(self, config_db_as_json):
sy = self.create_sonic_yang_with_loaded_models()

Expand Down Expand Up @@ -727,6 +750,19 @@ def _get_path_tokens_from_leaf(self, model, token_index, xpath_tokens, config):
# leaf_list_name = match.group(1)
leaf_list_value = match.group(1)
list_config = config[leaf_list_name]
# Workaround for those fields who is defined as leaf-list in YANG model but have string value in config DB
# No need to lookup the item index in ConfigDb since the list is represented as a string, return path to string immediately
# Example:
# xpath: /sonic-buffer-port-egress-profile-list:sonic-buffer-port-egress-profile-list/BUFFER_PORT_EGRESS_PROFILE_LIST/BUFFER_PORT_EGRESS_PROFILE_LIST_LIST[port='Ethernet9']/profile_list[.='egress_lossy_profile']
# path: /BUFFER_PORT_EGRESS_PROFILE_LIST/Ethernet9/profile_list
if isinstance(list_config, str):
return [leaf_list_name]

if not isinstance(list_config, list):
raise ValueError(f"list_config is expected to be of type list or string. Found {type(list_config)}.\n " + \
f"model: {model}\n token_index: {token_index}\n " + \
f"xpath_tokens: {xpath_tokens}\n config: {config}")

list_idx = list_config.index(leaf_list_value)
return [leaf_list_name, list_idx]

Expand Down
117 changes: 2 additions & 115 deletions scripts/coredump_gen_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,11 @@
For more info, refer to the Event Driven TechSupport & CoreDump Mgmt HLD
"""
import os
import time
import argparse
import syslog
import re
from swsscommon.swsscommon import SonicV2Connector
from utilities_common.auto_techsupport_helper import *

# Explicity Pass this to the subprocess invoking techsupport
ENV_VAR = os.environ
PATH_PREV = ENV_VAR["PATH"] if "PATH" in ENV_VAR else ""
ENV_VAR["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" + PATH_PREV


def handle_coredump_cleanup(dump_name, db):
_, num_bytes = get_stats(os.path.join(CORE_DUMP_DIR, CORE_DUMP_PTRN))
Expand Down Expand Up @@ -49,8 +42,6 @@ def __init__(self, core_name, container_name, db):
self.core_name = core_name
self.container = container_name
self.db = db
self.proc_mp = {}
self.core_ts_map = {}

def handle_core_dump_creation_event(self):
if self.db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled":
Expand All @@ -66,112 +57,8 @@ def handle_core_dump_creation_event(self):
syslog.syslog(syslog.LOG_NOTICE, msg.format(self.container, self.core_name))
return

global_cooloff = self.db.get(CFG_DB, AUTO_TS, COOLOFF)
container_cooloff = self.db.get(CFG_DB, FEATURE_KEY, COOLOFF)

try:
global_cooloff = float(global_cooloff)
except ValueError:
global_cooloff = 0.0

try:
container_cooloff = float(container_cooloff)
except ValueError:
container_cooloff = 0.0

cooloff_passed = self.verify_rate_limit_intervals(global_cooloff, container_cooloff)
if cooloff_passed:
since_cfg = self.get_since_arg()
new_file = self.invoke_ts_cmd(since_cfg)
if new_file:
self.write_to_state_db(int(time.time()), new_file)

def write_to_state_db(self, timestamp, ts_dump):
    """Record a freshly created techsupport dump in STATE_DB.

    Stores the originating core file, the creation timestamp and the
    container name under the ``TS_MAP|<dump-name>`` key.
    """
    entry_key = "|".join((TS_MAP, strip_ts_ext(ts_dump)))
    for field, value in (
        (CORE_DUMP, self.core_name),
        (TIMESTAMP, str(timestamp)),
        (CONTAINER, self.container),
    ):
        self.db.set(STATE_DB, entry_key, field, value)

def get_since_arg(self):
    """Return the configured '--since' value for the techsupport invocation.

    Falls back to SINCE_DEFAULT when nothing is configured or when the
    configured string is not an expression `date` can parse.
    """
    since_cfg = self.db.get(CFG_DB, AUTO_TS, CFG_SINCE)
    if not since_cfg:
        return SINCE_DEFAULT
    # Validate the expression by asking the `date` utility to parse it.
    rc, _, _ = subprocess_exec(["date", "--date={}".format(since_cfg)], env=ENV_VAR)
    return since_cfg if rc == 0 else SINCE_DEFAULT

def parse_ts_dump_name(self, ts_stdout):
    """Extract the techsupport dump name from the command's stdout.

    Returns the last match of TS_PTRN, or "" (after logging an error)
    when stdout contains no dump name.
    """
    matches = re.findall(TS_PTRN, ts_stdout)
    if not matches:
        syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name")
        return ""
    return matches[-1]

def invoke_ts_cmd(self, since_cfg, num_retry=0):
    """Run `show techsupport --since <since_cfg>` and return the dump name.

    Recursively retries up to MAX_RETRY_LIMIT times when the command exits
    with EXT_RETRY; returns "" when no dump was produced.
    """
    cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg]
    cmd = " ".join(cmd_opts)
    rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR)
    new_dump = ""
    if rc == EXT_LOCKFAIL:
        # Another run already holds the techsupport lock — nothing to do here.
        syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr))
    elif rc == EXT_RETRY:
        if num_retry <= MAX_RETRY_LIMIT:
            return self.invoke_ts_cmd(since_cfg, num_retry+1)
        syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr))
    elif rc == EXT_SUCCESS:
        # Command succeeded — recover the dump file name from its stdout.
        new_dump = self.parse_ts_dump_name(stdout)
        if new_dump:
            syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
        else:
            syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
    else:
        syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr))
    return new_dump

def verify_rate_limit_intervals(self, global_cooloff, container_cooloff):
    """Verify both the global and per-proc rate_limit_intervals have passed"""
    existing_dumps = get_ts_dumps(True)
    # Global cooloff: measured against the mtime of the newest dump on disk.
    if global_cooloff and existing_dumps:
        newest_dump_mtime = os.path.getmtime(existing_dumps[-1])
        if time.time() - newest_dump_mtime < global_cooloff:
            msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped. Core: {}"
            syslog.syslog(syslog.LOG_INFO, msg.format(self.core_name))
            return False

    # Per-container cooloff: measured against the oldest recorded dump
    # for this container in the STATE_DB-derived map.
    self.parse_ts_map()
    if container_cooloff and self.container in self.core_ts_map:
        last_creation_time = self.core_ts_map[self.container][0][0]
        if time.time() - last_creation_time < container_cooloff:
            msg = "Per Container rate_limit_interval for {} has not passed. Techsupport Invocation is skipped. Core: {}"
            syslog.syslog(syslog.LOG_INFO, msg.format(self.container, self.core_name))
            return False
    return True

def parse_ts_map(self):
    """Build self.core_ts_map: container -> sorted [(creation_time, ts_dump), ...].

    Reads every TS_MAP|* entry from STATE_DB; entries with a missing/empty
    container name or a non-integer timestamp are skipped.
    """
    ts_keys = self.db.keys(STATE_DB, TS_MAP+"*")
    if not ts_keys:
        return
    for ts_key in ts_keys:
        data = self.db.get_all(STATE_DB, ts_key)
        if not data:
            continue
        container_name = data.get(CONTAINER, "")
        # Fix: previously an entry with an empty container name fell through
        # to the append and raised KeyError (the "" bucket was never created).
        # Such entries cannot participate in per-container rate limiting, so
        # skip them instead of crashing.
        if not container_name:
            continue
        creation_time = data.get(TIMESTAMP, "")
        try:
            creation_time = int(creation_time)
        except Exception:
            continue  # if the creation time is invalid, skip the entry
        ts_dump = ts_key.split("|")[-1]
        self.core_ts_map.setdefault(container_name, []).append((creation_time, ts_dump))
    # Oldest dump first, so [0][0] is the earliest creation time per container.
    for container_name in self.core_ts_map:
        self.core_ts_map[container_name].sort()
invoke_ts_command_rate_limited(self.db, EVENT_TYPE_CORE, {CORE_DUMP: self.core_name}, self.container)


def main():
parser = argparse.ArgumentParser(description='Auto Techsupport Invocation and CoreDump Mgmt Script')
Expand Down
Loading

0 comments on commit 350cf66

Please sign in to comment.