From 25b9edae3c2e392816b422c57da8b410987b704e Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Mon, 22 Apr 2024 11:56:08 -0700 Subject: [PATCH] Add an extra parameter ExitCode to RemoveFailedPod Update README.md Fix README and test files Update README Address a7i's comments Update README --- README.md | 9 +- .../plugins/removefailedpods/failedpods.go | 23 ++++ .../removefailedpods/failedpods_test.go | 123 +++++++++++++++--- .../plugins/removefailedpods/types.go | 1 + .../removefailedpods/zz_generated.deepcopy.go | 5 + 5 files changed, 139 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 80c424ee3d..e99fe2f175 100644 --- a/README.md +++ b/README.md @@ -207,7 +207,7 @@ Balance Plugins: These plugins process all pods, or groups of pods, and determin | [RemovePodsViolatingTopologySpreadConstraint](#removepodsviolatingtopologyspreadconstraint) |Balance|Evicts pods violating TopologySpreadConstraints| | [RemovePodsHavingTooManyRestarts](#removepodshavingtoomanyrestarts) |Deschedule|Evicts pods having too many restarts| | [PodLifeTime](#podlifetime) |Deschedule|Evicts pods that have exceeded a specified age limit| -| [RemoveFailedPods](#removefailedpods) |Deschedule|Evicts pods with certain failed reasons| +| [RemoveFailedPods](#removefailedpods) |Deschedule|Evicts pods with certain failed reasons and exit codes| ### RemoveDuplicates @@ -698,10 +698,8 @@ profiles: ``` ### RemoveFailedPods - This strategy evicts pods that are in failed status phase. -You can provide an optional parameter to filter by failed `reasons`. -`reasons` can be expanded to include reasons of InitContainers as well by setting the optional parameter `includingInitContainers` to `true`. +You can provide optional parameters to filter by failed pods' and containters' `reasons`. and `exitCodes`. `exitCodes` apply to failed pods' containers with `terminated` state only. `reasons` and `exitCodes` can be expanded to include those of InitContainers as well by setting the optional parameter `includingInitContainers` to `true`. You can specify an optional parameter `minPodLifetimeSeconds` to evict pods that are older than specified seconds. Lastly, you can specify the optional parameter `excludeOwnerKinds` and if a pod has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considered for eviction. @@ -713,6 +711,7 @@ has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considere |`minPodLifetimeSeconds`|uint| |`excludeOwnerKinds`|list(string)| |`reasons`|list(string)| +|`exitCodes`|list(int32)| |`includingInitContainers`|bool| |`namespaces`|(see [namespace filtering](#namespace-filtering))| |`labelSelector`|(see [label filtering](#label-filtering))| @@ -729,6 +728,8 @@ profiles: args: reasons: - "NodeAffinity" + exitCodes: + - 1 includingInitContainers: true excludeOwnerKinds: - "Job" diff --git a/pkg/framework/plugins/removefailedpods/failedpods.go b/pkg/framework/plugins/removefailedpods/failedpods.go index 439a74c6b9..9eca25db47 100644 --- a/pkg/framework/plugins/removefailedpods/failedpods.go +++ b/pkg/framework/plugins/removefailedpods/failedpods.go @@ -148,6 +148,17 @@ func validateCanEvict(pod *v1.Pod, failedPodArgs *RemoveFailedPodsArgs) error { } } + if len(failedPodArgs.ExitCodes) > 0 { + exitCodes := getFailedContainerStatusExitCodes(pod.Status.ContainerStatuses) + if failedPodArgs.IncludingInitContainers { + exitCodes = append(exitCodes, getFailedContainerStatusExitCodes(pod.Status.InitContainerStatuses)...) + } + + if !sets.New(failedPodArgs.ExitCodes...).HasAny(exitCodes...) { + errs = append(errs, fmt.Errorf("pod does not match any of the exitCodes")) + } + } + return utilerrors.NewAggregate(errs) } @@ -165,3 +176,15 @@ func getFailedContainerStatusReasons(containerStatuses []v1.ContainerStatus) []s return reasons } + +func getFailedContainerStatusExitCodes(containerStatuses []v1.ContainerStatus) []int32 { + exitCodes := make([]int32, 0) + + for _, containerStatus := range containerStatuses { + if containerStatus.State.Terminated != nil { + exitCodes = append(exitCodes, containerStatus.State.Terminated.ExitCode) + } + } + + return exitCodes +} diff --git a/pkg/framework/plugins/removefailedpods/failedpods_test.go b/pkg/framework/plugins/removefailedpods/failedpods_test.go index 0179fbca5b..d052fc670e 100644 --- a/pkg/framework/plugins/removefailedpods/failedpods_test.go +++ b/pkg/framework/plugins/removefailedpods/failedpods_test.go @@ -41,12 +41,15 @@ var OneHourInSeconds uint = 3600 func TestRemoveFailedPods(t *testing.T) { createRemoveFailedPodsArgs := func( includingInitContainers bool, - reasons, excludeKinds []string, + reasons []string, + exitCodes []int32, + excludeKinds []string, minAgeSeconds *uint, ) RemoveFailedPodsArgs { return RemoveFailedPodsArgs{ IncludingInitContainers: includingInitContainers, Reasons: reasons, + ExitCodes: exitCodes, MinPodLifetimeSeconds: minAgeSeconds, ExcludeOwnerKinds: excludeKinds, } @@ -69,14 +72,14 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "0 failures, 0 evictions", - args: createRemoveFailedPodsArgs(false, nil, nil, nil), + args: createRemoveFailedPodsArgs(false, nil, nil, nil, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 0, pods: []*v1.Pod{}, // no pods come back with field selector phase=Failed }, { description: "1 container terminated with reason NodeAffinity, 1 eviction", - args: createRemoveFailedPodsArgs(false, nil, nil, nil), + args: createRemoveFailedPodsArgs(false, nil, nil, nil, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 1, pods: []*v1.Pod{ @@ -85,9 +88,70 @@ func TestRemoveFailedPods(t *testing.T) { }), nil), }, }, + { + description: "1 container terminated with exitCode 1, 1 eviction", + args: createRemoveFailedPodsArgs(false, nil, []int32{1}, nil, nil), + nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, + expectedEvictedPodCount: 1, + pods: []*v1.Pod{ + buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ExitCode: 1}, + }), nil), + }, + }, + { + description: "1 container terminated with exitCode 1, reason NodeAffinity, 1 eviction", + args: createRemoveFailedPodsArgs(false, nil, []int32{1}, nil, nil), + nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, + expectedEvictedPodCount: 1, + pods: []*v1.Pod{ + buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity", ExitCode: 1}, + }), nil), + }, + }, + { + description: "1 container terminated with exitCode 1, expected exitCode 2, 0 eviction", + args: createRemoveFailedPodsArgs(false, nil, []int32{2}, nil, nil), + nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, + expectedEvictedPodCount: 0, + pods: []*v1.Pod{ + buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ExitCode: 1}, + }), nil), + }, + }, + { + description: "2 containers terminated with exitCode 1 and 2 respectively, 2 evictions", + args: createRemoveFailedPodsArgs(false, nil, []int32{1, 2, 3}, nil, nil), + nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, + expectedEvictedPodCount: 2, + pods: []*v1.Pod{ + buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ExitCode: 1}, + }), nil), + buildTestPod("p2", "node1", newPodStatus("", "", nil, &v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ExitCode: 2}, + }), nil), + }, + }, + { + description: "2 containers terminated with exitCode 1 and 2 respectively, expected exitCode 1, 1 eviction", + args: createRemoveFailedPodsArgs(false, nil, []int32{1}, nil, nil), + nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, + expectedEvictedPodCount: 1, + pods: []*v1.Pod{ + buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ExitCode: 1}, + }), nil), + buildTestPod("p2", "node1", newPodStatus("", "", nil, &v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ExitCode: 2}, + }), nil), + }, + }, { description: "1 init container terminated with reason NodeAffinity, 1 eviction", - args: createRemoveFailedPodsArgs(true, nil, nil, nil), + args: createRemoveFailedPodsArgs(true, nil, nil, nil, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 1, pods: []*v1.Pod{ @@ -96,9 +160,31 @@ func TestRemoveFailedPods(t *testing.T) { }, nil), nil), }, }, + { + description: "1 init container terminated with exitCode 1, 1 eviction", + args: createRemoveFailedPodsArgs(true, nil, []int32{1}, nil, nil), + nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, + expectedEvictedPodCount: 1, + pods: []*v1.Pod{ + buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ExitCode: 1}, + }), nil), + }, + }, + { + description: "1 init container terminated with exitCode 2, expected 1, 0 eviction", + args: createRemoveFailedPodsArgs(true, nil, []int32{1}, nil, nil), + nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, + expectedEvictedPodCount: 0, + pods: []*v1.Pod{ + buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ExitCode: 2}, + }), nil), + }, + }, { description: "1 init container waiting with reason CreateContainerConfigError, 1 eviction", - args: createRemoveFailedPodsArgs(true, nil, nil, nil), + args: createRemoveFailedPodsArgs(true, nil, nil, nil, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 1, pods: []*v1.Pod{ @@ -109,7 +195,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "2 init container waiting with reason CreateContainerConfigError, 2 nodes, 2 evictions", - args: createRemoveFailedPodsArgs(true, nil, nil, nil), + args: createRemoveFailedPodsArgs(true, nil, nil, nil, nil), nodes: []*v1.Node{ test.BuildTestNode("node1", 2000, 3000, 10, nil), test.BuildTestNode("node2", 2000, 3000, 10, nil), @@ -126,7 +212,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "include reason=CreateContainerConfigError, 1 container terminated with reason CreateContainerConfigError, 1 eviction", - args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil), + args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 1, pods: []*v1.Pod{ @@ -137,7 +223,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "include reason=CreateContainerConfigError+NodeAffinity, 1 container terminated with reason CreateContainerConfigError, 1 eviction", - args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError", "NodeAffinity"}, nil, nil), + args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError", "NodeAffinity"}, nil, nil, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 1, pods: []*v1.Pod{ @@ -148,7 +234,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "include reason=CreateContainerConfigError, 1 container terminated with reason NodeAffinity, 0 eviction", - args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil), + args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 0, pods: []*v1.Pod{ @@ -159,7 +245,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "include init container=false, 1 init container waiting with reason CreateContainerConfigError, 0 eviction", - args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil), + args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 0, pods: []*v1.Pod{ @@ -170,7 +256,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "lifetime 1 hour, 1 container terminated with reason NodeAffinity, 0 eviction", - args: createRemoveFailedPodsArgs(false, nil, nil, &OneHourInSeconds), + args: createRemoveFailedPodsArgs(false, nil, nil, nil, &OneHourInSeconds), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 0, pods: []*v1.Pod{ @@ -181,7 +267,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "nodeFit=true, 1 unschedulable node, 1 container terminated with reason NodeAffinity, 0 eviction", - args: createRemoveFailedPodsArgs(false, nil, nil, nil), + args: createRemoveFailedPodsArgs(false, nil, nil, nil, nil), nodes: []*v1.Node{ test.BuildTestNode("node1", 2000, 3000, 10, nil), test.BuildTestNode("node2", 2000, 2000, 10, func(node *v1.Node) { @@ -198,7 +284,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "nodeFit=true, only available node does not have enough resources, 1 container terminated with reason CreateContainerConfigError, 0 eviction", - args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil), + args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 1, 1, 10, nil), test.BuildTestNode("node2", 0, 0, 10, nil)}, expectedEvictedPodCount: 0, pods: []*v1.Pod{ @@ -210,7 +296,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "excluded owner kind=ReplicaSet, 1 init container terminated with owner kind=ReplicaSet, 0 eviction", - args: createRemoveFailedPodsArgs(true, nil, []string{"ReplicaSet"}, nil), + args: createRemoveFailedPodsArgs(true, nil, nil, []string{"ReplicaSet"}, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 0, pods: []*v1.Pod{ @@ -221,7 +307,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "excluded owner kind=DaemonSet, 1 init container terminated with owner kind=ReplicaSet, 1 eviction", - args: createRemoveFailedPodsArgs(true, nil, []string{"DaemonSet"}, nil), + args: createRemoveFailedPodsArgs(true, nil, nil, []string{"DaemonSet"}, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 1, pods: []*v1.Pod{ @@ -232,7 +318,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "excluded owner kind=DaemonSet, 1 init container terminated with owner kind=ReplicaSet, 1 pod in termination; nothing should be moved", - args: createRemoveFailedPodsArgs(true, nil, []string{"DaemonSet"}, nil), + args: createRemoveFailedPodsArgs(true, nil, nil, []string{"DaemonSet"}, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 0, pods: []*v1.Pod{ @@ -243,7 +329,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "1 container terminated with reason ShutDown, 0 evictions", - args: createRemoveFailedPodsArgs(false, nil, nil, nil), + args: createRemoveFailedPodsArgs(false, nil, nil, nil, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 0, pods: []*v1.Pod{ @@ -253,7 +339,7 @@ func TestRemoveFailedPods(t *testing.T) { }, { description: "include reason=Shutdown, 2 containers terminated with reason ShutDown, 2 evictions", - args: createRemoveFailedPodsArgs(false, []string{"Shutdown"}, nil, nil), + args: createRemoveFailedPodsArgs(false, []string{"Shutdown"}, nil, nil, nil), nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)}, expectedEvictedPodCount: 2, pods: []*v1.Pod{ @@ -322,6 +408,7 @@ func TestRemoveFailedPods(t *testing.T) { plugin, err := New(&RemoveFailedPodsArgs{ Reasons: tc.args.Reasons, + ExitCodes: tc.args.ExitCodes, MinPodLifetimeSeconds: tc.args.MinPodLifetimeSeconds, IncludingInitContainers: tc.args.IncludingInitContainers, ExcludeOwnerKinds: tc.args.ExcludeOwnerKinds, diff --git a/pkg/framework/plugins/removefailedpods/types.go b/pkg/framework/plugins/removefailedpods/types.go index 313021a792..b9c952937f 100644 --- a/pkg/framework/plugins/removefailedpods/types.go +++ b/pkg/framework/plugins/removefailedpods/types.go @@ -30,5 +30,6 @@ type RemoveFailedPodsArgs struct { ExcludeOwnerKinds []string `json:"excludeOwnerKinds"` MinPodLifetimeSeconds *uint `json:"minPodLifetimeSeconds"` Reasons []string `json:"reasons"` + ExitCodes []int32 `json:"exitCodes"` IncludingInitContainers bool `json:"includingInitContainers"` } diff --git a/pkg/framework/plugins/removefailedpods/zz_generated.deepcopy.go b/pkg/framework/plugins/removefailedpods/zz_generated.deepcopy.go index 4e440d9d06..2c6016b9d1 100644 --- a/pkg/framework/plugins/removefailedpods/zz_generated.deepcopy.go +++ b/pkg/framework/plugins/removefailedpods/zz_generated.deepcopy.go @@ -56,6 +56,11 @@ func (in *RemoveFailedPodsArgs) DeepCopyInto(out *RemoveFailedPodsArgs) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.ExitCodes != nil { + in, out := &in.ExitCodes, &out.ExitCodes + *out = make([]int32, len(*in)) + copy(*out, *in) + } return }