Skip to content

Commit

Permalink
Add an extra parameter ExitCode to RemoveFailedPod
Browse files Browse the repository at this point in the history
Update README.md
  • Loading branch information
yuanchen8911 committed Apr 24, 2024
1 parent b953806 commit acfa396
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 21 deletions.
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ Balance Plugins: These plugins process all pods, or groups of pods, and determin
| [RemovePodsViolatingTopologySpreadConstraint](#removepodsviolatingtopologyspreadconstraint) |Balance|Evicts pods violating TopologySpreadConstraints|
| [RemovePodsHavingTooManyRestarts](#removepodshavingtoomanyrestarts) |Deschedule|Evicts pods having too many restarts|
| [PodLifeTime](#podlifetime) |Deschedule|Evicts pods that have exceeded a specified age limit|
| [RemoveFailedPods](#removefailedpods) |Deschedule|Evicts pods with certain failed reasons|
| [RemoveFailedPods](#removefailedpods) |Deschedule|Evicts pods with certain failed reasons and exit codes|


### RemoveDuplicates
Expand Down Expand Up @@ -700,8 +700,8 @@ profiles:
### RemoveFailedPods

This strategy evicts pods that are in failed status phase.
You can provide an optional parameter to filter by failed `reasons`.
`reasons` can be expanded to include reasons of InitContainers as well by setting the optional parameter `includingInitContainers` to `true`.
You can provide an optional parameter to filter by failed `reasons` and `exitCodes`.
`reasons` and `exitCodes` can be expanded to include reasons of InitContainers as well by setting the optional parameter `includingInitContainers` to `true`.
You can specify an optional parameter `minPodLifetimeSeconds` to evict pods that are older than specified seconds.
Lastly, you can specify the optional parameter `excludeOwnerKinds` and if a pod
has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considered for eviction.
Expand All @@ -713,6 +713,7 @@ has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considere
|`minPodLifetimeSeconds`|uint|
|`excludeOwnerKinds`|list(string)|
|`reasons`|list(string)|
|`exitCodes`|list(int32)|
|`includingInitContainers`|bool|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`labelSelector`|(see [label filtering](#label-filtering))|
Expand All @@ -729,6 +730,8 @@ profiles:
args:
reasons:
- "NodeAffinity"
exitCodes:
- 1
includingInitContainers: true
excludeOwnerKinds:
- "Job"
Expand Down
23 changes: 23 additions & 0 deletions pkg/framework/plugins/removefailedpods/failedpods.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,17 @@ func validateCanEvict(pod *v1.Pod, failedPodArgs *RemoveFailedPodsArgs) error {
}
}

if len(failedPodArgs.ExitCodes) > 0 && pod.Status.Phase == v1.PodFailed {
exitCodes := getFailedContainerStatusExitCodes(pod.Status.ContainerStatuses)
if failedPodArgs.IncludingInitContainers {
exitCodes = append(exitCodes, getFailedContainerStatusExitCodes(pod.Status.InitContainerStatuses)...)
}

if !sets.New(failedPodArgs.ExitCodes...).HasAny(exitCodes...) {
errs = append(errs, fmt.Errorf("pod does not match any of the exitCodes"))
}
}

return utilerrors.NewAggregate(errs)
}

Expand All @@ -165,3 +176,15 @@ func getFailedContainerStatusReasons(containerStatuses []v1.ContainerStatus) []s

return reasons
}

func getFailedContainerStatusExitCodes(containerStatuses []v1.ContainerStatus) []int32 {
exitCodes := make([]int32, 0)

for _, containerStatus := range containerStatuses {
if containerStatus.State.Terminated != nil {
exitCodes = append(exitCodes, containerStatus.State.Terminated.ExitCode)
}
}

return exitCodes
}
123 changes: 105 additions & 18 deletions pkg/framework/plugins/removefailedpods/failedpods_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,15 @@ var OneHourInSeconds uint = 3600
func TestRemoveFailedPods(t *testing.T) {
createRemoveFailedPodsArgs := func(
includingInitContainers bool,
reasons, excludeKinds []string,
reasons []string,
exitCodes []int32,
excludeKinds []string,
minAgeSeconds *uint,
) RemoveFailedPodsArgs {
return RemoveFailedPodsArgs{
IncludingInitContainers: includingInitContainers,
Reasons: reasons,
ExitCodes: exitCodes,
MinPodLifetimeSeconds: minAgeSeconds,
ExcludeOwnerKinds: excludeKinds,
}
Expand All @@ -69,14 +72,14 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "0 failures, 0 evictions",
args: createRemoveFailedPodsArgs(false, nil, nil, nil),
args: createRemoveFailedPodsArgs(false, nil, nil, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{}, // no pods come back with field selector phase=Failed
},
{
description: "1 container terminated with reason NodeAffinity, 1 eviction",
args: createRemoveFailedPodsArgs(false, nil, nil, nil),
args: createRemoveFailedPodsArgs(false, nil, nil, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
Expand All @@ -85,9 +88,70 @@ func TestRemoveFailedPods(t *testing.T) {
}), nil),
},
},
{
description: "1 container terminated with exitCode 1, 1 eviction",
args: createRemoveFailedPodsArgs(false, nil, []int32{1}, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{ExitCode: 1},
}), nil),
},
},
{
description: "1 container terminated with exitCode 1, reason NodeAffinity, 1 eviction",
args: createRemoveFailedPodsArgs(false, nil, []int32{1}, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity", ExitCode: 1},
}), nil),
},
},
{
description: "1 container terminated with exitCode 1, 0 eviction",
args: createRemoveFailedPodsArgs(false, nil, []int32{2}, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{ExitCode: 1},
}), nil),
},
},
{
description: "2 containers terminated with exitCode 1 and 2 respectively, 2 evictions",
args: createRemoveFailedPodsArgs(false, nil, []int32{1, 2, 3}, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 2,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{ExitCode: 1},
}), nil),
buildTestPod("p2", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{ExitCode: 2},
}), nil),
},
},
{
description: "2 containers terminated with exitCode 1 and 2 respectively, 1 eviction",
args: createRemoveFailedPodsArgs(false, nil, []int32{1}, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{ExitCode: 1},
}), nil),
buildTestPod("p2", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{ExitCode: 2},
}), nil),
},
},
{
description: "1 init container terminated with reason NodeAffinity, 1 eviction",
args: createRemoveFailedPodsArgs(true, nil, nil, nil),
args: createRemoveFailedPodsArgs(true, nil, nil, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
Expand All @@ -96,9 +160,31 @@ func TestRemoveFailedPods(t *testing.T) {
}, nil), nil),
},
},
{
description: "1 init container terminated with exitCode 1, 1 eviction",
args: createRemoveFailedPodsArgs(true, nil, []int32{1}, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{ExitCode: 1},
}), nil),
},
},
{
description: "1 init container terminated with exitCode 1, 0 eviction",
args: createRemoveFailedPodsArgs(true, nil, []int32{1}, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{ExitCode: 2},
}), nil),
},
},
{
description: "1 init container waiting with reason CreateContainerConfigError, 1 eviction",
args: createRemoveFailedPodsArgs(true, nil, nil, nil),
args: createRemoveFailedPodsArgs(true, nil, nil, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
Expand All @@ -109,7 +195,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "2 init container waiting with reason CreateContainerConfigError, 2 nodes, 2 evictions",
args: createRemoveFailedPodsArgs(true, nil, nil, nil),
args: createRemoveFailedPodsArgs(true, nil, nil, nil, nil),
nodes: []*v1.Node{
test.BuildTestNode("node1", 2000, 3000, 10, nil),
test.BuildTestNode("node2", 2000, 3000, 10, nil),
Expand All @@ -126,7 +212,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "include reason=CreateContainerConfigError, 1 container terminated with reason CreateContainerConfigError, 1 eviction",
args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil),
args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
Expand All @@ -137,7 +223,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "include reason=CreateContainerConfigError+NodeAffinity, 1 container terminated with reason CreateContainerConfigError, 1 eviction",
args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError", "NodeAffinity"}, nil, nil),
args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError", "NodeAffinity"}, nil, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
Expand All @@ -148,7 +234,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "include reason=CreateContainerConfigError, 1 container terminated with reason NodeAffinity, 0 eviction",
args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil),
args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
Expand All @@ -159,7 +245,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "include init container=false, 1 init container waiting with reason CreateContainerConfigError, 0 eviction",
args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil),
args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
Expand All @@ -170,7 +256,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "lifetime 1 hour, 1 container terminated with reason NodeAffinity, 0 eviction",
args: createRemoveFailedPodsArgs(false, nil, nil, &OneHourInSeconds),
args: createRemoveFailedPodsArgs(false, nil, nil, nil, &OneHourInSeconds),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
Expand All @@ -181,7 +267,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "nodeFit=true, 1 unschedulable node, 1 container terminated with reason NodeAffinity, 0 eviction",
args: createRemoveFailedPodsArgs(false, nil, nil, nil),
args: createRemoveFailedPodsArgs(false, nil, nil, nil, nil),
nodes: []*v1.Node{
test.BuildTestNode("node1", 2000, 3000, 10, nil),
test.BuildTestNode("node2", 2000, 2000, 10, func(node *v1.Node) {
Expand All @@ -198,7 +284,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "nodeFit=true, only available node does not have enough resources, 1 container terminated with reason CreateContainerConfigError, 0 eviction",
args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil),
args: createRemoveFailedPodsArgs(false, []string{"CreateContainerConfigError"}, nil, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 1, 1, 10, nil), test.BuildTestNode("node2", 0, 0, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
Expand All @@ -210,7 +296,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "excluded owner kind=ReplicaSet, 1 init container terminated with owner kind=ReplicaSet, 0 eviction",
args: createRemoveFailedPodsArgs(true, nil, []string{"ReplicaSet"}, nil),
args: createRemoveFailedPodsArgs(true, nil, nil, []string{"ReplicaSet"}, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
Expand All @@ -221,7 +307,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "excluded owner kind=DaemonSet, 1 init container terminated with owner kind=ReplicaSet, 1 eviction",
args: createRemoveFailedPodsArgs(true, nil, []string{"DaemonSet"}, nil),
args: createRemoveFailedPodsArgs(true, nil, nil, []string{"DaemonSet"}, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
Expand All @@ -232,7 +318,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "excluded owner kind=DaemonSet, 1 init container terminated with owner kind=ReplicaSet, 1 pod in termination; nothing should be moved",
args: createRemoveFailedPodsArgs(true, nil, []string{"DaemonSet"}, nil),
args: createRemoveFailedPodsArgs(true, nil, nil, []string{"DaemonSet"}, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
Expand All @@ -243,7 +329,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "1 container terminated with reason ShutDown, 0 evictions",
args: createRemoveFailedPodsArgs(false, nil, nil, nil),
args: createRemoveFailedPodsArgs(false, nil, nil, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
Expand All @@ -253,7 +339,7 @@ func TestRemoveFailedPods(t *testing.T) {
},
{
description: "include reason=Shutdown, 2 containers terminated with reason ShutDown, 2 evictions",
args: createRemoveFailedPodsArgs(false, []string{"Shutdown"}, nil, nil),
args: createRemoveFailedPodsArgs(false, []string{"Shutdown"}, nil, nil, nil),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 2,
pods: []*v1.Pod{
Expand Down Expand Up @@ -322,6 +408,7 @@ func TestRemoveFailedPods(t *testing.T) {

plugin, err := New(&RemoveFailedPodsArgs{
Reasons: tc.args.Reasons,
ExitCodes: tc.args.ExitCodes,
MinPodLifetimeSeconds: tc.args.MinPodLifetimeSeconds,
IncludingInitContainers: tc.args.IncludingInitContainers,
ExcludeOwnerKinds: tc.args.ExcludeOwnerKinds,
Expand Down
1 change: 1 addition & 0 deletions pkg/framework/plugins/removefailedpods/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,6 @@ type RemoveFailedPodsArgs struct {
ExcludeOwnerKinds []string `json:"excludeOwnerKinds"`
MinPodLifetimeSeconds *uint `json:"minPodLifetimeSeconds"`
Reasons []string `json:"reasons"`
ExitCodes []int32 `json:"exitCodes"`
IncludingInitContainers bool `json:"includingInitContainers"`
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit acfa396

Please sign in to comment.