Skip to content

Commit

Permalink
Support Graceful Shutdown
Browse files Browse the repository at this point in the history
If the kubelet where the sriov pod is running has gracefulShutdown
configured, we'll delay in preStop for a while if and while /tmp/sriov-delay-shutdown
exists, up to a maximum wait of 10 minutes (less than the 15 minutes which is specified in
the daemonset pod's terminationGracePeriodSeconds).
  • Loading branch information
jerpeter1 committed May 10, 2023
1 parent d4709f6 commit f6f872a
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 10 deletions.
1 change: 1 addition & 0 deletions bindata/manifests/daemon/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ spec:
volumeMounts:
- name: cnibin
mountPath: /host/opt/cni/bin
terminationGracePeriodSeconds: 900
volumes:
- name: host
hostPath:
Expand Down
28 changes: 26 additions & 2 deletions bindata/scripts/clean-k8s-services.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,36 @@
#!/bin/bash

chroot_path="/proc/1/root"
delay_shutdown_path="$chroot_path/tmp/sriov-delay-shutdown"
kubelet_config_path="$chroot_path/etc/kubernetes/kubelet.conf"

# 10 minutes - this should be shorter than the time that is specified for the
# terminationGracePeriodSeconds in the daemonset's pod spec, so that everything
# else in the preStop hook has time to run and the Pod can be terminated properly.
wait_time=600

# Graceful-shutdown delay: when the delay marker file is present and the kubelet
# config has at least one shutdownGracePeriod line that does not contain "0s",
# hold here until the marker disappears or wait_time seconds have elapsed.
if [ -f "$delay_shutdown_path" ] && grep -e shutdownGracePeriod "$kubelet_config_path" | grep -qv '"0s"'; then
    start=$(date +%s)
    # Record that the delay has begun.
    touch "$chroot_path/var/log/sriov-delay-start"
    # Poll once per second; stop early as soon as the marker file is gone.
    while [ -f "$delay_shutdown_path" ] && [ $(( $(date +%s) - start )) -lt $wait_time ]; do
        sleep 1
    done
    # Timed out or not, clear the marker and record that the delay has ended.
    rm -f "$delay_shutdown_path"
    touch "$chroot_path/var/log/sriov-delay-end"
fi

if [ "$CLUSTER_TYPE" == "openshift" ]; then
echo "openshift cluster"
exit
fi

chroot_path="/host"

function clean_services() {
# Remove switchdev service files
rm -f $chroot_path/etc/systemd/system/switchdev-configuration-after-nm.service
Expand Down
44 changes: 36 additions & 8 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ const (
// maxUpdateBackoff is the maximum time to react to a change as we back off
// in the face of errors.
maxUpdateBackoff = 60 * time.Second

// the presence of this file indicates that the sriov shutdown should be delayed
delayShutdownPath = "/host/tmp/sriov-delay-shutdown"
)

type Message struct {
Expand Down Expand Up @@ -612,6 +615,22 @@ func (dn *Daemon) completeDrain() error {
glog.Errorf("completeDrain(): failed to annotate node: %v", err)
return err
}

if _, err := os.Stat(delayShutdownPath); err != nil {
if os.IsNotExist(err) {
// delayShutdownPath does not exist, so we don't need to do anything
return nil
}

glog.Errorf("completeDrain(): error checking file status %v: %v", delayShutdownPath, err)
return err
}

if err := os.Remove(delayShutdownPath); err != nil {
glog.Errorf("completeDrain(): failed to remove file %v: %v", delayShutdownPath, err)
return err
}

return nil
}

Expand Down Expand Up @@ -679,15 +698,16 @@ func rebootNode() {
glog.Errorf("rebootNode(): %v", err)
}
defer exit()
// creates a new transient systemd unit to reboot the system.
// We explicitly try to stop kubelet.service first, before anything else; this
// way we ensure the rest of system stays running, because kubelet may need
// to do "graceful" shutdown by e.g. de-registering with a load balancer.
// However note we use `;` instead of `&&` so we keep rebooting even
// if kubelet failed to shutdown - that way the machine will still eventually reboot
// as systemd will time out the stop invocation.
// Creates a new transient systemd unit that reboots the system using
// `systemctl reboot`.
// By shutting down the system this way instead of via `reboot`,
// when the kubelet is configured with a shutdownGracePeriod, pods will
// be given some time to run their preStop scripts and respond to
// SIGTERM by terminating gracefully before being forcefully killed via
// SIGKILL. Stopping the kubelet service and then immediately running
// `reboot` just results in all pods being immediately killed.
cmd := exec.Command("systemd-run", "--unit", "sriov-network-config-daemon-reboot",
"--description", "sriov-network-config-daemon reboot node", "/bin/sh", "-c", "systemctl stop kubelet.service; reboot")
"--description", "sriov-network-config-daemon reboot node", "/bin/sh", "-c", "systemctl reboot")

if err := cmd.Run(); err != nil {
glog.Errorf("failed to reboot node: %v", err)
Expand Down Expand Up @@ -933,6 +953,14 @@ func (dn *Daemon) drainNode() error {
return err
}
glog.Info("drainNode(): drain complete")

file, err := os.Create(delayShutdownPath)
if err != nil {
glog.Errorf("drainNode(): failed to create file %v %v", delayShutdownPath, err)
return err
}
defer file.Close()

return nil
}

Expand Down

0 comments on commit f6f872a

Please sign in to comment.