Skip to content

Commit

Permalink
Share image for gnmi and telemetry (sonic-net#16863)
Browse files Browse the repository at this point in the history
Why I did it
Share docker image to support gnmi container and telemetry container

Work item tracking
Microsoft ADO 25423918:
How I did it
Create telemetry image from gnmi docker image.
Enable gnmi container and disable telemetry container by default.

How to verify it
Run end to end test.
  • Loading branch information
ganglyu authored Nov 8, 2023
1 parent f5c0960 commit c71fb3a
Show file tree
Hide file tree
Showing 24 changed files with 334 additions and 30 deletions.
1 change: 1 addition & 0 deletions Makefile.work
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,7 @@ SONIC_BUILD_INSTRUCTION := $(MAKE) \
DOCKER_LOCKFILE_SAVE=$(DOCKER_LOCKFILE_SAVE) \
SONIC_CONFIG_USE_NATIVE_DOCKERD_FOR_BUILD=$(SONIC_CONFIG_USE_NATIVE_DOCKERD_FOR_BUILD) \
SONIC_INCLUDE_SYSTEM_TELEMETRY=$(INCLUDE_SYSTEM_TELEMETRY) \
SONIC_INCLUDE_SYSTEM_GNMI=$(INCLUDE_SYSTEM_GNMI) \
INCLUDE_DHCP_RELAY=$(INCLUDE_DHCP_RELAY) \
INCLUDE_DHCP_SERVER=$(INCLUDE_DHCP_SERVER) \
INCLUDE_MACSEC=$(INCLUDE_MACSEC) \
Expand Down
34 changes: 34 additions & 0 deletions dockers/docker-sonic-gnmi/Dockerfile.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{% from "dockers/dockerfile-macros.j2" import install_debian_packages, install_python_wheels, copy_files %}
FROM docker-config-engine-bullseye-{{DOCKER_USERNAME}}:{{DOCKER_USERTAG}}

ARG docker_container_name
ARG image_version

## Make apt-get non-interactive
ENV DEBIAN_FRONTEND=noninteractive

# Pass the image_version to container
ENV IMAGE_VERSION=$image_version

RUN apt-get update

{% if docker_sonic_gnmi_debs.strip() -%}
# Copy locally-built Debian package dependencies
{{ copy_files("debs/", docker_sonic_gnmi_debs.split(' '), "/debs/") }}

# Install locally-built Debian packages and implicitly install their dependencies
{{ install_debian_packages(docker_sonic_gnmi_debs.split(' ')) }}
{%- endif %}

RUN apt-get clean -y && \
apt-get autoclean - && \
apt-get autoremove -y && \
rm -rf /debs

COPY ["start.sh", "gnmi-native.sh", "dialout.sh", "/usr/bin/"]
COPY ["telemetry_vars.j2", "/usr/share/sonic/templates/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor"]

ENTRYPOINT ["/usr/local/bin/supervisord"]
5 changes: 5 additions & 0 deletions dockers/docker-sonic-gnmi/base_image_files/monit_gnmi
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
###############################################################################
## Monit configuration for telemetry container
###############################################################################
check program container_memory_gnmi with path "/usr/bin/memory_checker gnmi 419430400"
if status == 3 for 10 times within 20 cycles then exec "/usr/bin/restart_service gnmi" repeat every 2 cycles
1 change: 1 addition & 0 deletions dockers/docker-sonic-gnmi/critical_processes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
program:gnmi-native
6 changes: 6 additions & 0 deletions dockers/docker-sonic-gnmi/dialout.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash

# Start with default config
export CVL_SCHEMA_PATH=/usr/sbin/schema
exec /usr/sbin/dialout_client_cli -insecure -logtostderr -v 2

105 changes: 105 additions & 0 deletions dockers/docker-sonic-gnmi/gnmi-native.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env bash

EXIT_TELEMETRY_VARS_FILE_NOT_FOUND=1
INCORRECT_TELEMETRY_VALUE=2
TELEMETRY_VARS_FILE=/usr/share/sonic/templates/telemetry_vars.j2

if [ ! -f "$TELEMETRY_VARS_FILE" ]; then
echo "Telemetry vars template file not found"
exit $EXIT_TELEMETRY_VARS_FILE_NOT_FOUND
fi

# Try to read telemetry and certs config from ConfigDB.
# Use default value if no valid config exists
TELEMETRY_VARS=$(sonic-cfggen -d -t $TELEMETRY_VARS_FILE)
TELEMETRY_VARS=${TELEMETRY_VARS//[\']/\"}
X509=$(echo $TELEMETRY_VARS | jq -r '.x509')
GNMI=$(echo $TELEMETRY_VARS | jq -r '.gnmi')
CERTS=$(echo $TELEMETRY_VARS | jq -r '.certs')

TELEMETRY_ARGS=" -logtostderr"
export CVL_SCHEMA_PATH=/usr/sbin/schema

if [ -n "$CERTS" ]; then
SERVER_CRT=$(echo $CERTS | jq -r '.server_crt')
SERVER_KEY=$(echo $CERTS | jq -r '.server_key')
if [ -z $SERVER_CRT ] || [ -z $SERVER_KEY ]; then
TELEMETRY_ARGS+=" --insecure"
else
TELEMETRY_ARGS+=" --server_crt $SERVER_CRT --server_key $SERVER_KEY "
fi

CA_CRT=$(echo $CERTS | jq -r '.ca_crt')
if [ ! -z $CA_CRT ]; then
TELEMETRY_ARGS+=" --ca_crt $CA_CRT"
fi
elif [ -n "$X509" ]; then
SERVER_CRT=$(echo $X509 | jq -r '.server_crt')
SERVER_KEY=$(echo $X509 | jq -r '.server_key')
if [ -z $SERVER_CRT ] || [ -z $SERVER_KEY ]; then
TELEMETRY_ARGS+=" --insecure"
else
TELEMETRY_ARGS+=" --server_crt $SERVER_CRT --server_key $SERVER_KEY "
fi

CA_CRT=$(echo $X509 | jq -r '.ca_crt')
if [ ! -z $CA_CRT ]; then
TELEMETRY_ARGS+=" --ca_crt $CA_CRT"
fi
else
TELEMETRY_ARGS+=" --noTLS"
fi

# If no configuration entry exists for TELEMETRY, create one default port
if [ -z "$GNMI" ]; then
PORT=8080
else
PORT=$(echo $GNMI | jq -r '.port')
fi
TELEMETRY_ARGS+=" --port $PORT"

CLIENT_AUTH=$(echo $GNMI | jq -r '.client_auth')
if [ -z $CLIENT_AUTH ] || [ $CLIENT_AUTH == "false" ]; then
TELEMETRY_ARGS+=" --allow_no_client_auth"
fi

LOG_LEVEL=$(echo $GNMI | jq -r '.log_level')
if [[ $LOG_LEVEL =~ ^[0-9]+$ ]]; then
TELEMETRY_ARGS+=" -v=$LOG_LEVEL"
else
TELEMETRY_ARGS+=" -v=2"
fi

# Enable ZMQ for SmartSwitch
LOCALHOST_SUBTYPE=`sonic-db-cli CONFIG_DB hget localhost "subtype"`
if [[ x"${LOCALHOST_SUBTYPE}" == x"SmartSwitch" ]]; then
TELEMETRY_ARGS+=" -zmq_address=tcp://127.0.0.1:8100"
fi

# Server will handle threshold connections consecutively
THRESHOLD_CONNECTIONS=$(echo $GNMI | jq -r '.threshold')
if [[ $THRESHOLD_CONNECTIONS =~ ^[0-9]+$ ]]; then
TELEMETRY_ARGS+=" --threshold $THRESHOLD_CONNECTIONS"
else
if [ -z "$GNMI" ] || [[ $THRESHOLD_CONNECTIONS == "null" ]]; then
TELEMETRY_ARGS+=" --threshold 100"
else
echo "Incorrect threshold value, expecting positive integers" >&2
exit $INCORRECT_TELEMETRY_VALUE
fi
fi

# Close idle connections after certain duration (in seconds)
IDLE_CONN_DURATION=$(echo $GNMI | jq -r '.idle_conn_duration')
if [[ $IDLE_CONN_DURATION =~ ^[0-9]+$ ]]; then
TELEMETRY_ARGS+=" --idle_conn_duration $IDLE_CONN_DURATION"
else
if [ -z "$GNMI" ] || [[ $IDLE_CONN_DURATION == "null" ]]; then
TELEMETRY_ARGS+=" --idle_conn_duration 5"
else
echo "Incorrect idle_conn_duration value, expecting positive integers" >&2
exit $INCORRECT_TELEMETRY_VALUE
fi
fi

exec /usr/sbin/telemetry ${TELEMETRY_ARGS}
18 changes: 18 additions & 0 deletions dockers/docker-sonic-gnmi/start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

if [ "${RUNTIME_OWNER}" == "" ]; then
RUNTIME_OWNER="kube"
fi

CTR_SCRIPT="/usr/share/sonic/scripts/container_startup.py"
if test -f ${CTR_SCRIPT}
then
${CTR_SCRIPT} -f gnmi -o ${RUNTIME_OWNER} -v ${IMAGE_VERSION}
fi

mkdir -p /var/sonic
echo "# Config files managed by sonic-config-engine" > /var/sonic/config_status

TZ=$(cat /etc/timezone)
rm -rf /etc/localtime
ln -sf /usr/share/zoneinfo/$TZ /etc/localtime
60 changes: 60 additions & 0 deletions dockers/docker-sonic-gnmi/supervisord.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
[supervisord]
logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true

[eventlistener:dependent-startup]
command=python3 -m supervisord_dependent_startup
autostart=true
autorestart=unexpected
startretries=0
exitcodes=0,3
events=PROCESS_STATE
buffer_size=1024

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name gnmi
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=false
buffer_size=1024

[program:rsyslogd]
command=/usr/sbin/rsyslogd -n -iNONE
priority=1
autostart=false
autorestart=true
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true

[program:start]
command=/usr/bin/start.sh
priority=2
autostart=false
autorestart=false
startsecs=0
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=rsyslogd:running

[program:gnmi-native]
command=/usr/bin/gnmi-native.sh
priority=3
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=start:exited

[program:dialout]
command=/usr/bin/dialout.sh
priority=4
autostart=false
autorestart=false
stdout_logfile=syslog
stderr_logfile=syslog
dependent_startup=true
dependent_startup_wait_for=gnmi-native:running
5 changes: 5 additions & 0 deletions dockers/docker-sonic-gnmi/telemetry_vars.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"certs": {% if "certs" in GNMI.keys() %}{{ GNMI["certs"] }}{% else %}""{% endif %},
"gnmi" : {% if "gnmi" in GNMI.keys() %}{{ GNMI["gnmi"] }}{% else %}""{% endif %},
"x509" : {% if "x509" in DEVICE_METADATA.keys() %}{{ DEVICE_METADATA["x509"] }}{% else %}""{% endif %}
}
3 changes: 1 addition & 2 deletions dockers/docker-sonic-telemetry/Dockerfile.j2
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% from "dockers/dockerfile-macros.j2" import install_debian_packages, install_python_wheels, copy_files %}
FROM docker-config-engine-bullseye-{{DOCKER_USERNAME}}:{{DOCKER_USERTAG}}
FROM docker-sonic-gnmi-{{DOCKER_USERNAME}}:{{DOCKER_USERTAG}}

ARG docker_container_name
ARG image_version
Expand Down Expand Up @@ -28,7 +28,6 @@ RUN apt-get clean -y && \
COPY ["start.sh", "telemetry.sh", "dialout.sh", "/usr/bin/"]
COPY ["telemetry_vars.j2", "/usr/share/sonic/templates/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor"]

ENTRYPOINT ["/usr/local/bin/supervisord"]
12 changes: 3 additions & 9 deletions dockers/docker-sonic-telemetry/telemetry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,18 +70,12 @@ else
TELEMETRY_ARGS+=" -v=2"
fi

# Enable ZMQ for SmartSwitch
LOCALHOST_SUBTYPE=`sonic-db-cli CONFIG_DB hget localhost "subtype"`
if [[ x"${LOCALHOST_SUBTYPE}" == x"SmartSwitch" ]]; then
TELEMETRY_ARGS+=" -zmq_address=tcp://127.0.0.1:8100"
fi

# Server will handle threshold connections consecutively
THRESHOLD_CONNECTIONS=$(echo $GNMI | jq -r '.threshold')
if [[ $THRESHOLD_CONNECTIONS =~ ^[0-9]+$ ]]; then
TELEMETRY_ARGS+=" --threshold $THRESHOLD_CONNECTIONS"
else
if [ -z $GNMI ] || [[ $THRESHOLD_CONNECTIONS == "null" ]]; then
if [ -z "$GNMI" ] || [[ $THRESHOLD_CONNECTIONS == "null" ]]; then
TELEMETRY_ARGS+=" --threshold 100"
else
echo "Incorrect threshold value, expecting positive integers" >&2
Expand All @@ -94,13 +88,13 @@ IDLE_CONN_DURATION=$(echo $GNMI | jq -r '.idle_conn_duration')
if [[ $IDLE_CONN_DURATION =~ ^[0-9]+$ ]]; then
TELEMETRY_ARGS+=" --idle_conn_duration $IDLE_CONN_DURATION"
else
if [ -z $GNMI ] || [[ $IDLE_CONN_DURATION == "null" ]]; then
if [ -z "$GNMI" ] || [[ $IDLE_CONN_DURATION == "null" ]]; then
TELEMETRY_ARGS+=" --idle_conn_duration 5"
else
echo "Incorrect idle_conn_duration value, expecting positive integers" >&2
exit $INCORRECT_TELEMETRY_VALUE
fi
fi

TELEMETRY_ARGS+=" -gnmi_native_write=false"

exec /usr/sbin/telemetry ${TELEMETRY_ARGS}
16 changes: 16 additions & 0 deletions files/build_templates/gnmi.service.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[Unit]
Description=GNMI container
Requires=database.service
After=database.service swss.service syncd.service
Before=ntp-config.service
BindsTo=sonic.target
After=sonic.target
StartLimitIntervalSec=1200
StartLimitBurst=3

[Service]
User={{ sonicadmin_user }}
ExecStartPre=/usr/local/bin/{{docker_container_name}}.sh start
ExecStart=/usr/local/bin/{{docker_container_name}}.sh wait
ExecStop=/usr/local/bin/{{docker_container_name}}.sh stop
RestartSec=30
3 changes: 2 additions & 1 deletion files/build_templates/init_cfg.json.j2
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
{%- if include_restapi == "y" %}{% do features.append(("restapi", "enabled", false, "enabled")) %}{% endif %}
{%- if include_sflow == "y" %}{% do features.append(("sflow", "disabled", true, "enabled")) %}{% endif %}
{%- if include_macsec == "y" %}{% do features.append(("macsec", "{% if 'type' in DEVICE_METADATA['localhost'] and DEVICE_METADATA['localhost']['type'] == 'SpineRouter' and DEVICE_RUNTIME_METADATA['MACSEC_SUPPORTED'] %}enabled{% else %}disabled{% endif %}", false, "enabled")) %}{% endif %}
{%- if include_system_gnmi == "y" %}{% do features.append(("gnmi", "enabled", true, "enabled")) %}{% endif %}
{%- if include_system_telemetry == "y" %}{% do features.append(("telemetry", "enabled", true, "enabled")) %}{% endif %}
"FEATURE": {
{# delayed field if set, will start the feature systemd .timer unit instead of .service unit #}
Expand All @@ -76,7 +77,7 @@
"check_up_status" : "false",
{%- endif %}
{%- if include_kubernetes == "y" %}
{%- if feature in ["lldp", "pmon", "radv", "eventd", "snmp", "telemetry"] %}
{%- if feature in ["lldp", "pmon", "radv", "eventd", "snmp", "telemetry", "gnmi"] %}
"set_owner": "kube", {% else %}
"set_owner": "local", {% endif %} {% endif %}
"high_mem_alert": "disabled"
Expand Down
1 change: 1 addition & 0 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -909,6 +909,7 @@ sudo LANG=C cp $SCRIPTS_DIR/radv.sh $FILESYSTEM_ROOT/usr/local/bin/radv.sh
sudo LANG=C cp $SCRIPTS_DIR/database.sh $FILESYSTEM_ROOT/usr/local/bin/database.sh
sudo LANG=C cp $SCRIPTS_DIR/snmp.sh $FILESYSTEM_ROOT/usr/local/bin/snmp.sh
sudo LANG=C cp $SCRIPTS_DIR/telemetry.sh $FILESYSTEM_ROOT/usr/local/bin/telemetry.sh
sudo LANG=C cp $SCRIPTS_DIR/gnmi.sh $FILESYSTEM_ROOT/usr/local/bin/gnmi.sh
sudo LANG=C cp $SCRIPTS_DIR/mgmt-framework.sh $FILESYSTEM_ROOT/usr/local/bin/mgmt-framework.sh
sudo LANG=C cp $SCRIPTS_DIR/asic_status.sh $FILESYSTEM_ROOT/usr/local/bin/asic_status.sh
sudo LANG=C cp $SCRIPTS_DIR/asic_status.py $FILESYSTEM_ROOT/usr/local/bin/asic_status.py
Expand Down
1 change: 1 addition & 0 deletions files/image_config/logrotate/rsyslog.j2
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
/var/log/syslog
/var/log/teamd.log
/var/log/telemetry.log
/var/log/gnmi.log
/var/log/frr/bgpd.log
/var/log/frr/zebra.log
/var/log/swss/sairedis*.rec
Expand Down
8 changes: 4 additions & 4 deletions files/image_config/monit/container_checker
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ def get_expected_running_containers():
for container_name in feature_table.keys():
if feature_table[container_name]["state"] not in ["disabled", "always_disabled"]:
if multi_asic.is_multi_asic():
if feature_table[container_name]["has_global_scope"] == "True":
if feature_table[container_name].get("has_global_scope", "True") == "True":
expected_running_containers.add(container_name)
if feature_table[container_name]["has_per_asic_scope"] == "True":
if feature_table[container_name].get("has_per_asic_scope", "False") == "True":
num_asics = multi_asic.get_num_asics()
for asic_id in range(num_asics):
if asic_id in asics_id_presence or container_name in run_all_instance_list:
Expand All @@ -69,9 +69,9 @@ def get_expected_running_containers():
expected_running_containers.add(container_name)
if feature_table[container_name]["state"] == 'always_enabled':
if multi_asic.is_multi_asic():
if feature_table[container_name]["has_global_scope"] == "True":
if feature_table[container_name].get("has_global_scope", "True") == "True":
always_running_containers.add(container_name)
if feature_table[container_name]["has_per_asic_scope"] == "True":
if feature_table[container_name].get("has_per_asic_scope", "False") == "True":
num_asics = multi_asic.get_num_asics()
for asic_id in range(num_asics):
if asic_id in asics_id_presence or container_name in run_all_instance_list:
Expand Down
6 changes: 6 additions & 0 deletions files/image_config/rsyslog/rsyslog.d/00-sonic.conf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ if $programname contains "teamd_" then {
stop
}

## gnmi rules
if $msg startswith " gnmi-native" then {
/var/log/gnmi.log
stop
}

## telemetry rules
if $msg startswith " telemetry" or ($msg startswith " dialout" )then {
/var/log/telemetry.log
Expand Down
1 change: 1 addition & 0 deletions files/scripts/gnmi.sh
Loading

0 comments on commit c71fb3a

Please sign in to comment.