From e9854484338b7384eff761b0fb7765a289a33f57 Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Mon, 18 Apr 2022 16:06:50 -0700 Subject: [PATCH] Add systemd cgroup controller support Signed-off-by: Brad Davidson --- pkg/agent/containerd/config_linux.go | 8 +++-- pkg/agent/containerd/config_windows.go | 1 + pkg/agent/templates/templates.go | 1 + pkg/agent/templates/templates_linux.go | 3 ++ pkg/cgroups/cgroups_linux.go | 36 ++++++++++------------ pkg/cgroups/cgroups_windows.go | 5 ++-- pkg/daemons/agent/agent_linux.go | 41 +++++++++++++------------- pkg/daemons/config/types.go | 1 + 8 files changed, 52 insertions(+), 44 deletions(-) diff --git a/pkg/agent/containerd/config_linux.go b/pkg/agent/containerd/config_linux.go index 6dc2ca1b83b9..319119fe877d 100644 --- a/pkg/agent/containerd/config_linux.go +++ b/pkg/agent/containerd/config_linux.go @@ -45,18 +45,22 @@ func setupContainerdConfig(ctx context.Context, cfg *config.Node) error { } isRunningInUserNS := userns.RunningInUserNS() - _, _, hasCFS, hasPIDs := cgroups.CheckCgroups() + _, _, controllers := cgroups.CheckCgroups() // "/sys/fs/cgroup" is namespaced cgroupfsWritable := unix.Access("/sys/fs/cgroup", unix.W_OK) == nil - disableCgroup := isRunningInUserNS && (!hasCFS || !hasPIDs || !cgroupfsWritable) + disableCgroup := isRunningInUserNS && (!controllers["cpu"] || !controllers["pids"] || !cgroupfsWritable) if disableCgroup { logrus.Warn("cgroup v2 controllers are not delegated for rootless. 
Disabling cgroup.") } + systemdCgroup := controllers["cpuset"] && os.Getenv("NOTIFY_SOCKET") != "" + cfg.AgentConfig.Systemd = systemdCgroup + var containerdTemplate string containerdConfig := templates.ContainerdConfig{ NodeConfig: cfg, DisableCgroup: disableCgroup, + SystemdCgroup: systemdCgroup, IsRunningInUserNS: isRunningInUserNS, PrivateRegistryConfig: privRegistries.Registry, ExtraRuntimes: findNvidiaContainerRuntimes(os.DirFS(string(os.PathSeparator))), diff --git a/pkg/agent/containerd/config_windows.go b/pkg/agent/containerd/config_windows.go index 5720cd239514..892e9a3765d8 100644 --- a/pkg/agent/containerd/config_windows.go +++ b/pkg/agent/containerd/config_windows.go @@ -45,6 +45,7 @@ func setupContainerdConfig(ctx context.Context, cfg *config.Node) error { containerdConfig := templates.ContainerdConfig{ NodeConfig: cfg, DisableCgroup: true, + SystemdCgroup: false, IsRunningInUserNS: false, PrivateRegistryConfig: privRegistries.Registry, } diff --git a/pkg/agent/templates/templates.go b/pkg/agent/templates/templates.go index 5d27b09c46ad..f931c212d925 100644 --- a/pkg/agent/templates/templates.go +++ b/pkg/agent/templates/templates.go @@ -14,6 +14,7 @@ type ContainerdRuntimeConfig struct { type ContainerdConfig struct { NodeConfig *config.Node DisableCgroup bool + SystemdCgroup bool IsRunningInUserNS bool PrivateRegistryConfig *registries.Registry ExtraRuntimes map[string]ContainerdRuntimeConfig diff --git a/pkg/agent/templates/templates_linux.go b/pkg/agent/templates/templates_linux.go index c03432deef0b..829d110b3370 100644 --- a/pkg/agent/templates/templates_linux.go +++ b/pkg/agent/templates/templates_linux.go @@ -81,6 +81,9 @@ enable_keychain = true [plugins.cri.containerd.runtimes.runc] runtime_type = "io.containerd.runc.v2" +[plugins.cri.containerd.runtimes.runc.options] + SystemdCgroup = {{ .SystemdCgroup }} + {{ if .PrivateRegistryConfig }} {{ if .PrivateRegistryConfig.Mirrors }} [plugins.cri.registry.mirrors]{{end}} diff --git 
a/pkg/cgroups/cgroups_linux.go b/pkg/cgroups/cgroups_linux.go index 3b9b4b333e9c..2947c00060de 100644 --- a/pkg/cgroups/cgroups_linux.go +++ b/pkg/cgroups/cgroups_linux.go @@ -65,34 +65,30 @@ func validateCgroupsV2() error { return nil } -func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) { +func CheckCgroups() (kubeletRoot, runtimeRoot string, controllers map[string]bool) { cgroupsModeV2 := cgroups.Mode() == cgroups.Unified + controllers = make(map[string]bool) // For Unified (v2) cgroups we can directly check to see what controllers are mounted // under the unified hierarchy. if cgroupsModeV2 { m, err := cgroupsv2.LoadManager("/sys/fs/cgroup", "/") if err != nil { - return "", "", false, false + return } - controllers, err := m.Controllers() + enabledControllers, err := m.Controllers() if err != nil { - return "", "", false, false + return } // Intentionally using an expressionless switch to match the logic below - for _, controller := range controllers { - switch { - case controller == "cpu": - hasCFS = true - case controller == "pids": - hasPIDs = true - } + for _, controller := range enabledControllers { + controllers[controller] = true } } f, err := os.Open("/proc/self/cgroup") if err != nil { - return "", "", false, false + return } defer f.Close() @@ -102,10 +98,10 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) { if len(parts) < 3 { continue } - controllers := strings.Split(parts[1], ",") + enabledControllers := strings.Split(parts[1], ",") // For v1 or hybrid, controller can be a single value {"blkio"}, or a comounted set {"cpu","cpuacct"} - // For v2, controllers = {""} (only contains a single empty string) - for _, controller := range controllers { + // For v2, controllers = {""} (only contains a single empty string), so only the first case below (which also matches whenever cgroupsModeV2 is set) is selected; controllers are detected from the unified hierarchy above. 
+ for _, controller := range enabledControllers { switch { case controller == "name=systemd" || cgroupsModeV2: // If we detect that we are running under a `.scope` unit with systemd @@ -128,10 +124,10 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) { // can fail if we use the comma-separated name. Instead, we check for the controller using the symlink. p := filepath.Join("/sys/fs/cgroup", controller, parts[2], "cpu.cfs_period_us") if _, err := os.Stat(p); err == nil { - hasCFS = true + controllers[controller] = true } - case controller == "pids": - hasPIDs = true + default: + controllers[controller] = true } } } @@ -146,7 +142,7 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) { // a host PID scenario but we don't support this. g, err := os.Open("/proc/1/cgroup") if err != nil { - return "", "", false, false + return } defer g.Close() scan = bufio.NewScanner(g) @@ -170,5 +166,5 @@ func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) { } } } - return kubeletRoot, runtimeRoot, hasCFS, hasPIDs + return } diff --git a/pkg/cgroups/cgroups_windows.go b/pkg/cgroups/cgroups_windows.go index f5c11dd38f10..b38ba9fb6a3a 100644 --- a/pkg/cgroups/cgroups_windows.go +++ b/pkg/cgroups/cgroups_windows.go @@ -1,3 +1,4 @@ +//go:build windows // +build windows package cgroups @@ -6,6 +7,6 @@ func Validate() error { return nil } -func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) { - return "", "", false, false +func CheckCgroups() (kubeletRoot, runtimeRoot string, controllers map[string]bool) { + return } diff --git a/pkg/daemons/agent/agent_linux.go b/pkg/daemons/agent/agent_linux.go index d9b6ad60c539..812adde65022 100644 --- a/pkg/daemons/agent/agent_linux.go +++ b/pkg/daemons/agent/agent_linux.go @@ -18,17 +18,15 @@ import ( "k8s.io/kubernetes/pkg/kubeapiserver/authorizer/modes" ) -func createRootlessConfig(argsMap map[string]string, hasCFS, hasPIDs bool) { +func 
createRootlessConfig(argsMap map[string]string, controllers map[string]bool) { argsMap["feature-gates=KubeletInUserNamespace"] = "true" // "/sys/fs/cgroup" is namespaced cgroupfsWritable := unix.Access("/sys/fs/cgroup", unix.W_OK) == nil - if hasCFS && hasPIDs && cgroupfsWritable { + if controllers["cpu"] && controllers["pids"] && cgroupfsWritable { logrus.Info("cgroup v2 controllers are delegated for rootless.") - // cgroupfs v2, delegated for rootless by systemd - argsMap["cgroup-driver"] = "cgroupfs" - } else { - logrus.Fatal("delegated cgroup v2 controllers are required for rootless.") + return } + logrus.Fatal("delegated cgroup v2 controllers are required for rootless.") } func checkRuntimeEndpoint(cfg *config.Agent, argsMap map[string]string) { @@ -67,14 +65,13 @@ func kubeletArgs(cfg *config.Agent) map[string]string { bindAddress = "::1" } argsMap := map[string]string{ - "healthz-bind-address": bindAddress, - "read-only-port": "0", - "cluster-domain": cfg.ClusterDomain, - "kubeconfig": cfg.KubeConfigKubelet, - "eviction-hard": "imagefs.available<5%,nodefs.available<5%", - "eviction-minimum-reclaim": "imagefs.available=10%,nodefs.available=10%", - "fail-swap-on": "false", - //"cgroup-root": "/k3s", + "healthz-bind-address": bindAddress, + "read-only-port": "0", + "cluster-domain": cfg.ClusterDomain, + "kubeconfig": cfg.KubeConfigKubelet, + "eviction-hard": "imagefs.available<5%,nodefs.available<5%", + "eviction-minimum-reclaim": "imagefs.available=10%,nodefs.available=10%", + "fail-swap-on": "false", "cgroup-driver": "cgroupfs", "authentication-token-webhook": "true", "anonymous-auth": "false", @@ -138,13 +135,13 @@ func kubeletArgs(cfg *config.Agent) map[string]string { if err != nil || defaultIP.String() != cfg.NodeIP { argsMap["node-ip"] = cfg.NodeIP } - kubeletRoot, runtimeRoot, hasCFS, hasPIDs := cgroups.CheckCgroups() - if !hasCFS { - logrus.Warn("Disabling CPU quotas due to missing cpu.cfs_period_us") + kubeletRoot, runtimeRoot, controllers := 
cgroups.CheckCgroups() + if !controllers["cpu"] { + logrus.Warn("Disabling CPU quotas due to missing cpu controller or cpu.cfs_period_us") argsMap["cpu-cfs-quota"] = "false" } - if !hasPIDs { - logrus.Fatal("PIDS cgroup support not found") + if !controllers["pids"] { + logrus.Fatal("pids cgroup controller not found") } if kubeletRoot != "" { argsMap["kubelet-cgroups"] = kubeletRoot @@ -172,7 +169,11 @@ func kubeletArgs(cfg *config.Agent) map[string]string { } if cfg.Rootless { - createRootlessConfig(argsMap, hasCFS, hasCFS) + createRootlessConfig(argsMap, controllers) + } + + if cfg.Systemd { + argsMap["cgroup-driver"] = "systemd" } if cfg.ProtectKernelDefaults { diff --git a/pkg/daemons/config/types.go b/pkg/daemons/config/types.go index ae121ade834b..a3c6c66a97e1 100644 --- a/pkg/daemons/config/types.go +++ b/pkg/daemons/config/types.go @@ -90,6 +90,7 @@ type Agent struct { ExtraKubeProxyArgs []string PauseImage string Snapshotter string + Systemd bool CNIPlugin bool NodeTaints []string NodeLabels []string