diff --git a/go.mod b/go.mod index 11019bdff..3e5de7a7e 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/aws/aws-sdk-go v1.38.59 github.com/containerd/cgroups v1.0.1 github.com/kyokomi/emoji v2.2.4+incompatible - github.com/litmuschaos/chaos-operator v0.0.0-20230413151351-184224e2d2e9 + github.com/litmuschaos/chaos-operator v0.0.0-20230418072131-5ea32522f048 github.com/palantir/stacktrace v0.0.0-20161112013806-78658fd2d177 github.com/pkg/errors v0.9.1 github.com/sirupsen/logrus v1.8.1 diff --git a/go.sum b/go.sum index 68c53f747..e6301becd 100644 --- a/go.sum +++ b/go.sum @@ -354,8 +354,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kyokomi/emoji v2.2.4+incompatible h1:np0woGKwx9LiHAQmwZx79Oc0rHpNw3o+3evou4BEPv4= github.com/kyokomi/emoji v2.2.4+incompatible/go.mod h1:mZ6aGCD7yk8j6QY6KICwnZ2pxoszVseX1DNoGtU2tBA= -github.com/litmuschaos/chaos-operator v0.0.0-20230413151351-184224e2d2e9 h1:g7LFJ2VuERTpv68pR0HeStVtH2sB7rfCKJOjcCIGW34= -github.com/litmuschaos/chaos-operator v0.0.0-20230413151351-184224e2d2e9/go.mod h1:jRA6jKGed6ytLDJ7897yr2Kr2ygg+cuRXJqwvNmE4Bw= +github.com/litmuschaos/chaos-operator v0.0.0-20230418072131-5ea32522f048 h1:JcNUqCgjD+xHxYlTrhaSRLGYSN+0N3OfI++TmP0HXPk= +github.com/litmuschaos/chaos-operator v0.0.0-20230418072131-5ea32522f048/go.mod h1:jRA6jKGed6ytLDJ7897yr2Kr2ygg+cuRXJqwvNmE4Bw= github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= diff --git a/pkg/cerrors/custom_errors.go b/pkg/cerrors/custom_errors.go index fefd12461..143a099d0 100644 --- a/pkg/cerrors/custom_errors.go +++ b/pkg/cerrors/custom_errors.go @@ -28,6 +28,7 @@ const 
( FailureTypeHttpProbe ErrorType = "HTTP_PROBE_FAILURE" ErrorTypePromProbe ErrorType = "PROM_PROBE_ERROR" FailureTypePromProbe ErrorType = "PROM_PROBE_FAILURE" + ErrorTypeTimeout ErrorType = "TIMEOUT" ) type userFriendly interface { @@ -46,6 +47,10 @@ func GetErrorType(err error) ErrorType { if ufe, ok := err.(userFriendly); ok { return ufe.ErrorType() } + rootCause := stacktrace.RootCause(err) + if ufe, ok := rootCause.(userFriendly); ok { + return ufe.ErrorType() + } return ErrorTypeNonUserFriendly } diff --git a/pkg/probe/cmdprobe.go b/pkg/probe/cmdprobe.go index 1e2a34d08..6dcdf5ed0 100644 --- a/pkg/probe/cmdprobe.go +++ b/pkg/probe/cmdprobe.go @@ -54,6 +54,7 @@ func prepareCmdProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, // triggerInlineCmdProbe trigger the cmd probe and storing the output into the out buffer func triggerInlineCmdProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) var description string // It parses the templated command and return normal string @@ -68,8 +69,8 @@ func triggerInlineCmdProbe(probe v1alpha1.ProbeAttributes, resultDetails *types. // it contains a timeout per iteration of retry. if the timeout expires without success then it will go to next try // for a timeout, it will run the command, if it fails wait for the interval and again execute the command until timeout expires if err := retry.Times(uint(getAttempts(probe.RunProperties.Attempt, probe.RunProperties.Retry))). - Timeout(int64(probe.RunProperties.ProbeTimeout)). - Wait(time.Duration(probe.RunProperties.Interval) * time.Millisecond). + Timeout(probeTimeout.ProbeTimeout). + Wait(probeTimeout.Interval). TryWithTimeout(func(attempt uint) error { var out, stdErr bytes.Buffer // run the inline command probe @@ -98,7 +99,7 @@ func triggerInlineCmdProbe(probe v1alpha1.ProbeAttributes, resultDetails *types. 
resultDetails.ProbeArtifacts[probe.Name] = probes return nil }); err != nil { - return err + return checkProbeTimeoutError(probe.Name, cerrors.FailureTypeCmdProbe, err) } setProbeDescription(resultDetails, probe, description) @@ -108,6 +109,7 @@ func triggerInlineCmdProbe(probe v1alpha1.ProbeAttributes, resultDetails *types. // triggerSourceCmdProbe trigger the cmd probe inside the external pod func triggerSourceCmdProbe(probe v1alpha1.ProbeAttributes, execCommandDetails litmusexec.PodDetails, clients clients.ClientSets, resultDetails *types.ResultDetails) error { var description string + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) // It parses the templated command and return normal string // if command doesn't have template, it will return the same command @@ -121,8 +123,8 @@ func triggerSourceCmdProbe(probe v1alpha1.ProbeAttributes, execCommandDetails li // it contains a timeout per iteration of retry. if the timeout expires without success then it will go to next try // for a timeout, it will run the command, if it fails wait for the interval and again execute the command until timeout expires if err := retry.Times(uint(getAttempts(probe.RunProperties.Attempt, probe.RunProperties.Retry))). - Timeout(int64(probe.RunProperties.ProbeTimeout)). - Wait(time.Duration(probe.RunProperties.Interval) * time.Millisecond). + Timeout(probeTimeout.ProbeTimeout). + Wait(probeTimeout.Interval). 
TryWithTimeout(func(attempt uint) error { command := append([]string{"/bin/sh", "-c"}, probe.CmdProbeInputs.Command) // exec inside the external pod to get the o/p of given command @@ -148,7 +150,7 @@ func triggerSourceCmdProbe(probe v1alpha1.ProbeAttributes, execCommandDetails li resultDetails.ProbeArtifacts[probe.Name] = probes return nil }); err != nil { - return err + return checkProbeTimeoutError(probe.Name, cerrors.FailureTypeCmdProbe, err) } setProbeDescription(resultDetails, probe, description) @@ -320,11 +322,13 @@ func deleteProbePod(chaosDetails *types.ChaosDetails, clients clients.ClientSets // triggerInlineContinuousCmdProbe trigger the inline continuous cmd probes func triggerInlineContinuousCmdProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) { + probeTimeout := getProbeTimeouts(probe.Name, chaosresult.ProbeDetails) + var isExperimentFailed bool // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v duration before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // it trigger the inline cmd probe for the entire duration of chaos and it fails, if any err encounter @@ -346,7 +350,7 @@ loop: } } // waiting for the probe polling interval - time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second) + time.Sleep(probeTimeout.ProbePollingInterval) } // if experiment fails and stopOnfailure is provided as true then it will patch the chaosengine for abort // if experiment fails but stopOnfailure is provided as false then it will continue the execution @@ -360,13 +364,15 @@ loop: // triggerInlineOnChaosCmdProbe trigger the inline 
onchaos cmd probes func triggerInlineOnChaosCmdProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) { + probeTimeout := getProbeTimeouts(probe.Name, chaosresult.ProbeDetails) + var isExperimentFailed bool duration := chaosDetails.ChaosDuration // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) - duration = math.Maximum(0, duration-probe.RunProperties.InitialDelaySeconds) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) + duration = math.Maximum(0, duration-int(probeTimeout.InitialDelay.Seconds())) } var endTime <-chan time.Time @@ -397,7 +403,7 @@ loop: } } // waiting for the probe polling interval - time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second) + time.Sleep(probeTimeout.ProbePollingInterval) } } // if experiment fails and stopOnfailure is provided as true then it will patch the chaosengine for abort @@ -412,14 +418,15 @@ loop: // triggerSourceOnChaosCmdProbe trigger the onchaos cmd probes having need some external source image func triggerSourceOnChaosCmdProbe(probe v1alpha1.ProbeAttributes, execCommandDetails litmusexec.PodDetails, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) { + probeTimeout := getProbeTimeouts(probe.Name, chaosresult.ProbeDetails) var isExperimentFailed bool duration := chaosDetails.ChaosDuration // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * 
time.Second) - duration = math.Maximum(0, duration-probe.RunProperties.InitialDelaySeconds) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) + duration = math.Maximum(0, duration-int(probeTimeout.InitialDelay.Seconds())) } endTime := time.After(time.Duration(duration) * time.Second) @@ -448,7 +455,7 @@ loop: } } // waiting for the probe polling interval - time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second) + time.Sleep(probeTimeout.ProbePollingInterval) } } // if experiment fails and stopOnfailure is provided as true then it will patch the chaosengine for abort @@ -464,12 +471,13 @@ loop: // triggerSourceContinuousCmdProbe trigger the continuous cmd probes having need some external source image func triggerSourceContinuousCmdProbe(probe v1alpha1.ProbeAttributes, execCommandDetails litmusexec.PodDetails, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) { + probeTimeout := getProbeTimeouts(probe.Name, chaosresult.ProbeDetails) var isExperimentFailed bool // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // it trigger the cmd probe for the entire duration of chaos and it fails, if any err encounter @@ -491,7 +499,7 @@ loop: } } // waiting for the probe polling interval - time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second) + time.Sleep(probeTimeout.ProbePollingInterval) } // if experiment fails and stopOnfailure is provided as true then it will patch the chaosengine 
for abort // if experiment fails but stopOnfailure is provided as false then it will continue the execution @@ -535,6 +543,7 @@ func validateResult(comparator v1alpha1.ComparatorInfo, probeName, cmdOutput str // preChaosCmdProbe trigger the cmd probe for prechaos phase func preChaosCmdProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) switch probe.Mode { case "SOT", "Edge": @@ -551,9 +560,9 @@ func preChaosCmdProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resul }) // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // triggering the cmd probe for the inline mode @@ -625,6 +634,7 @@ func preChaosCmdProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resul // postChaosCmdProbe trigger cmd probe for post chaos phase func postChaosCmdProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) switch probe.Mode { case "EOT", "Edge": @@ -641,9 +651,9 @@ func postChaosCmdProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resu }) // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + 
log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // triggering the cmd probe for the inline mode diff --git a/pkg/probe/comparator/integer.go b/pkg/probe/comparator/integer.go index 8eb2298b0..81cc7fdc1 100644 --- a/pkg/probe/comparator/integer.go +++ b/pkg/probe/comparator/integer.go @@ -40,7 +40,7 @@ func (model Model) CompareInt(errorCode cerrors.ErrorType) error { } case "==": if !obj.isEqual() { - return cerrors.Error{ErrorCode: errorCode, Target: model.probeName, Reason: fmt.Sprintf("Actual value: %v. Expected value: should not be equal to %v", obj.a, obj.b)} + return cerrors.Error{ErrorCode: errorCode, Target: model.probeName, Reason: fmt.Sprintf("Actual value: %v. Expected value: should be equal to %v", obj.a, obj.b)} } case "!=": if !obj.isNotEqual() { diff --git a/pkg/probe/httpprobe.go b/pkg/probe/httpprobe.go index e87d22895..e76a24297 100644 --- a/pkg/probe/httpprobe.go +++ b/pkg/probe/httpprobe.go @@ -33,7 +33,7 @@ func prepareHTTPProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets return err } case "postchaos": - if err := postChaosHTTPProbe(probe, resultDetails, clients, chaosDetails); err != nil { + if err := postChaosHTTPProbe(probe, resultDetails); err != nil { return err } case "duringchaos": @@ -46,6 +46,7 @@ func prepareHTTPProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets // triggerHTTPProbe run the http probe command func triggerHTTPProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) // It parses the templated url and return normal string // if command doesn't have template, it will return the same command @@ -58,14 +59,13 @@ func triggerHTTPProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resul method := getHTTPMethodType(probe.HTTPProbeInputs.Method) // initialize simple http client with default attributes - 
timeout := time.Duration(probe.RunProperties.ProbeTimeout) * time.Millisecond - client := &http.Client{Timeout: timeout} + client := &http.Client{Timeout: probeTimeout.ProbeTimeout} // impose properties to http client with cert check disabled if probe.HTTPProbeInputs.InsecureSkipVerify { transCfg := &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, } - client = &http.Client{Transport: transCfg, Timeout: timeout} + client = &http.Client{Transport: transCfg, Timeout: probeTimeout.ProbeTimeout} } switch method { @@ -103,13 +103,14 @@ func getHTTPMethodType(httpMethod v1alpha1.HTTPMethod) string { // httpGet send the http Get request to the given URL and verify the response code to follow the specified criteria func httpGet(probe v1alpha1.ProbeAttributes, client *http.Client, resultDetails *types.ResultDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) var description string // it will retry for some retry count, in each iteration of try it contains following things // it contains a timeout per iteration of retry. if the timeout expires without success then it will go to next try // for a timeout, it will run the command, if it fails wait for the interval and again execute the command until timeout expires if err := retry.Times(uint(getAttempts(probe.RunProperties.Attempt, probe.RunProperties.Retry))). - Wait(time.Duration(probe.RunProperties.Interval) * time.Millisecond). + Wait(probeTimeout.Interval). 
Try(func(attempt uint) error { // getting the response from the given url resp, err := client.Get(probe.HTTPProbeInputs.URL) @@ -141,6 +142,7 @@ func httpGet(probe v1alpha1.ProbeAttributes, client *http.Client, resultDetails // httpPost send the http post request to the given URL func httpPost(probe v1alpha1.ProbeAttributes, client *http.Client, resultDetails *types.ResultDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) body, err := getHTTPBody(probe.HTTPProbeInputs.Method.Post, probe.Name) if err != nil { return err @@ -152,7 +154,7 @@ func httpPost(probe v1alpha1.ProbeAttributes, client *http.Client, resultDetails // it contains a timeout per iteration of retry. if the timeout expires without success then it will go to next try // for a timeout, it will run the command, if it fails wait for the interval and again execute the command until timeout expires if err := retry.Times(uint(getAttempts(probe.RunProperties.Attempt, probe.RunProperties.Retry))). - Wait(time.Duration(probe.RunProperties.Interval) * time.Millisecond). + Wait(probeTimeout.Interval). 
Try(func(attempt uint) error { resp, err := client.Post(probe.HTTPProbeInputs.URL, probe.HTTPProbeInputs.Method.Post.ContentType, strings.NewReader(body)) if err != nil { @@ -210,11 +212,12 @@ func getHTTPBody(httpBody v1alpha1.PostMethod, probeName string) (string, error) // triggerContinuousHTTPProbe trigger the continuous http probes func triggerContinuousHTTPProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) { + probeTimeout := getProbeTimeouts(probe.Name, chaosresult.ProbeDetails) var isExperimentFailed bool // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // it triggers the http probe for the entire duration of chaos and it fails, if any error encounter @@ -236,7 +239,7 @@ loop: } } // waiting for the probe polling interval - time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second) + time.Sleep(probeTimeout.ProbePollingInterval) } // if experiment fails and stopOnfailure is provided as true then it will patch the chaosengine for abort // if experiment fails but stopOnfailure is provided as false then it will continue the execution @@ -250,6 +253,7 @@ loop: // preChaosHTTPProbe trigger the http probe for prechaos phase func preChaosHTTPProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) switch probe.Mode { case "SOT", "Edge": @@ -264,9 +268,9 @@ func preChaosHTTPProbe(probe v1alpha1.ProbeAttributes, 
resultDetails *types.Resu }) // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // trigger the http probe if err = triggerHTTPProbe(probe, resultDetails); err != nil && cerrors.GetErrorType(err) != cerrors.FailureTypeHttpProbe { @@ -294,7 +298,8 @@ func preChaosHTTPProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resu } // postChaosHTTPProbe trigger the http probe for postchaos phase -func postChaosHTTPProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error { +func postChaosHTTPProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) switch probe.Mode { case "EOT", "Edge": @@ -309,9 +314,9 @@ func postChaosHTTPProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Res }) // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // trigger the http probe @@ -339,14 +344,15 @@ func postChaosHTTPProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Res // triggerOnChaosHTTPProbe trigger the onchaos http probes func triggerOnChaosHTTPProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, 
chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) { + probeTimeout := getProbeTimeouts(probe.Name, chaosresult.ProbeDetails) var isExperimentFailed bool duration := chaosDetails.ChaosDuration // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) - duration = math.Maximum(0, duration-probe.RunProperties.InitialDelaySeconds) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) + duration = math.Maximum(0, duration-int(probeTimeout.InitialDelay.Seconds())) } endTime := time.After(time.Duration(duration) * time.Second) @@ -376,7 +382,7 @@ loop: } // waiting for the probe polling interval - time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second) + time.Sleep(probeTimeout.ProbePollingInterval) } } // if experiment fails and stopOnfailure is provided as true then it will patch the chaosengine for abort diff --git a/pkg/probe/k8sprobe.go b/pkg/probe/k8sprobe.go index f6e9fdd57..91b420586 100644 --- a/pkg/probe/k8sprobe.go +++ b/pkg/probe/k8sprobe.go @@ -44,6 +44,7 @@ func prepareK8sProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Result // triggerK8sProbe run the k8s probe command func triggerK8sProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, resultDetails *types.ResultDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) inputs := probe.K8sProbeInputs @@ -78,8 +79,8 @@ func triggerK8sProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, // it contains a timeout per iteration of retry. 
if the timeout expires without success then it will go to next try // for a timeout, it will run the command, if it fails wait for the interval and again execute the command until timeout expires if err := retry.Times(uint(getAttempts(probe.RunProperties.Attempt, probe.RunProperties.Retry))). - Timeout(int64(probe.RunProperties.ProbeTimeout)). - Wait(time.Duration(probe.RunProperties.Interval) * time.Millisecond). + Timeout(probeTimeout.ProbeTimeout). + Wait(probeTimeout.Interval). TryWithTimeout(func(attempt uint) error { //defining the gvr for the requested resource gvr := schema.GroupVersionResource{ @@ -115,7 +116,7 @@ func triggerK8sProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, description = fmt.Sprintf("Probe successfully performed the '%s' operation on the specified Kubernetes resource", probe.K8sProbeInputs.Operation) return nil }); err != nil { - return err + return checkProbeTimeoutError(probe.Name, cerrors.FailureTypeK8sProbe, err) } setProbeDescription(resultDetails, probe, description) @@ -124,11 +125,13 @@ func triggerK8sProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, // triggerContinuousK8sProbe trigger the continuous k8s probes func triggerContinuousK8sProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) { + probeTimeout := getProbeTimeouts(probe.Name, chaosresult.ProbeDetails) + var isExperimentFailed bool // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // it triggers the k8s probe for the entire duration of chaos and it fails, if any error 
encounter @@ -150,7 +153,7 @@ loop: } } // waiting for the probe polling interval - time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second) + time.Sleep(probeTimeout.ProbePollingInterval) } // if experiment fails and stopOnfailure is provided as true then it will patch the chaosengine for abort // if experiment fails but stopOnfailure is provided as false then it will continue the execution @@ -279,6 +282,7 @@ func resourcesAbsent(probe v1alpha1.ProbeAttributes, gvr schema.GroupVersionReso // preChaosK8sProbe trigger the k8s probe for prechaos phase func preChaosK8sProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) switch strings.ToLower(probe.Mode) { case "sot", "edge": @@ -292,9 +296,9 @@ func preChaosK8sProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resul "Phase": "PreChaos", }) // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // triggering the k8s probe if err = triggerK8sProbe(probe, clients, resultDetails); err != nil && cerrors.GetErrorType(err) != cerrors.FailureTypeK8sProbe { @@ -323,6 +327,7 @@ func preChaosK8sProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resul // postChaosK8sProbe trigger the k8s probe for postchaos phase func postChaosK8sProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) 
switch strings.ToLower(probe.Mode) { case "eot", "edge": @@ -336,9 +341,9 @@ func postChaosK8sProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resu "Phase": "PostChaos", }) // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // triggering the k8s probe if err = triggerK8sProbe(probe, clients, resultDetails); err != nil && cerrors.GetErrorType(err) != cerrors.FailureTypeK8sProbe { @@ -384,14 +389,15 @@ func onChaosK8sProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Result // triggerOnChaosK8sProbe trigger the onchaos k8s probes func triggerOnChaosK8sProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) { + probeTimeout := getProbeTimeouts(probe.Name, chaosresult.ProbeDetails) var isExperimentFailed bool duration := chaosDetails.ChaosDuration // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) - duration = math.Maximum(0, duration-probe.RunProperties.InitialDelaySeconds) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) + duration = math.Maximum(0, duration-int(probeTimeout.InitialDelay.Seconds())) } endTime := time.After(time.Duration(duration) * time.Second) @@ -420,7 +426,7 @@ loop: } } // waiting for the probe polling interval - 
time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second) + time.Sleep(probeTimeout.ProbePollingInterval) } } // if experiment fails and stopOnfailure is provided as true then it will patch the chaosengine for abort diff --git a/pkg/probe/probe.go b/pkg/probe/probe.go index 8e9d296d0..d3e1ac41c 100644 --- a/pkg/probe/probe.go +++ b/pkg/probe/probe.go @@ -234,6 +234,14 @@ func getProbeByName(name string, probeDetails []*types.ProbeDetails) *types.Prob return nil } +func getProbeTimeouts(name string, probeDetails []*types.ProbeDetails) types.ProbeTimeouts { + probe := getProbeByName(name, probeDetails) + if probe != nil { + return probe.Timeouts + } + return types.ProbeTimeouts{} +} + func getDescription(err error) string { rootCause := stacktrace.RootCause(err) if error, ok := rootCause.(cerrors.Error); ok { @@ -354,3 +362,15 @@ func IsProbeFailed(reason string) bool { } return false } + +func checkProbeTimeoutError(name string, code cerrors.ErrorType, probeErr error) error { + log.Infof("name: %s, err: %v", name, probeErr) + if cerrors.GetErrorType(probeErr) == cerrors.ErrorTypeTimeout { + return cerrors.Error{ + ErrorCode: code, + Target: fmt.Sprintf("{name: %s}", name), + Reason: "probe failed due to timeout", + } + } + return probeErr +} diff --git a/pkg/probe/promProbe.go b/pkg/probe/promProbe.go index c95b43e10..5e8d553f8 100644 --- a/pkg/probe/promProbe.go +++ b/pkg/probe/promProbe.go @@ -43,6 +43,7 @@ func preparePromProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets // preChaosPromProbe trigger the prometheus probe for prechaos phase func preChaosPromProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) switch strings.ToLower(probe.Mode) { case "sot", "edge": @@ -59,9 +60,9 @@ func preChaosPromProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resu 
}) // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // triggering the prom probe and storing the output into the out buffer @@ -97,6 +98,7 @@ func preChaosPromProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resu // postChaosPromProbe trigger the prometheus probe for postchaos phase func postChaosPromProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) switch strings.ToLower(probe.Mode) { case "eot", "edge": @@ -113,9 +115,9 @@ func postChaosPromProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Res }) // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // triggering the prom probe and storing the output into the out buffer @@ -170,14 +172,16 @@ func onChaosPromProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resul // triggerPromProbe trigger the prometheus probe inside the external pod func triggerPromProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.ResultDetails) error { + probeTimeout := getProbeTimeouts(probe.Name, resultDetails.ProbeDetails) + var description string // running the prom probe 
command and matching the output // it will retry for some retry count, in each iteration of try it contains following things // it contains a timeout per iteration of retry. if the timeout expires without success then it will go to next try // for a timeout, it will run the command, if it fails wait for the interval and again execute the command until timeout expires if err := retry.Times(uint(getAttempts(probe.RunProperties.Attempt, probe.RunProperties.Retry))). - Timeout(int64(probe.RunProperties.ProbeTimeout)). - Wait(time.Duration(probe.RunProperties.Interval) * time.Millisecond). + Timeout(probeTimeout.ProbeTimeout). + Wait(probeTimeout.Interval). TryWithTimeout(func(attempt uint) error { var command string // It will use query or queryPath to get the prometheus metrics @@ -219,7 +223,7 @@ func triggerPromProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resul description = fmt.Sprintf("Obtained the specified prometheus metrics. Actual value: %s. Expected value: %s", value, probe.PromProbeInputs.Comparator.Value) return nil }); err != nil { - return err + return checkProbeTimeoutError(probe.Name, cerrors.FailureTypePromProbe, err) } setProbeDescription(resultDetails, probe, description) return nil @@ -227,12 +231,13 @@ func triggerPromProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resul // triggerContinuousPromProbe trigger the continuous prometheus probe func triggerContinuousPromProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) { + probeTimeout := getProbeTimeouts(probe.Name, chaosresult.ProbeDetails) var isExperimentFailed bool // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: 
Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) } // it trigger the prom probe for the entire duration of chaos and it fails, if any err encounter @@ -254,7 +259,7 @@ loop: } } // waiting for the probe polling interval - time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second) + time.Sleep(probeTimeout.ProbePollingInterval) } // if experiment fails and stopOnfailure is provided as true then it will patch the chaosengine for abort // if experiment fails but stopOnfailure is provided as false then it will continue the execution @@ -268,14 +273,15 @@ loop: // triggerOnChaosPromProbe trigger the onchaos prom probe func triggerOnChaosPromProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) { + probeTimeout := getProbeTimeouts(probe.Name, chaosresult.ProbeDetails) var isExperimentFailed bool duration := chaosDetails.ChaosDuration // waiting for initial delay - if probe.RunProperties.InitialDelaySeconds != 0 { - log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds) - time.Sleep(time.Duration(probe.RunProperties.InitialDelaySeconds) * time.Second) - duration = math.Maximum(0, duration-probe.RunProperties.InitialDelaySeconds) + if probeTimeout.InitialDelay != 0 { + log.Infof("[Wait]: Waiting for %v before probe execution", probe.RunProperties.InitialDelay) + time.Sleep(probeTimeout.InitialDelay) + duration = math.Maximum(0, duration-int(probeTimeout.InitialDelay)) } endTime := time.After(time.Duration(duration) * time.Second) @@ -304,7 +310,7 @@ loop: } } // waiting for the probe polling interval - time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second) + time.Sleep(probeTimeout.ProbePollingInterval) } } // if experiment fails and stopOnfailure is provided as true then it will patch the chaosengine for abort diff --git 
a/pkg/types/types.go b/pkg/types/types.go index 3616e28b2..569a3d44c 100644 --- a/pkg/types/types.go +++ b/pkg/types/types.go @@ -87,8 +87,16 @@ type ProbeDetails struct { IsProbeFailedWithError error Failed bool RunID string - RunCount int Stopped bool + Timeouts ProbeTimeouts +} + +type ProbeTimeouts struct { + ProbeTimeout time.Duration + Interval time.Duration + ProbePollingInterval time.Duration + InitialDelay time.Duration + EvaluationTimeout time.Duration } // EventDetails is for collecting all the events-related details @@ -305,7 +313,9 @@ func GetValuesFromChaosEngine(chaosDetails *ChaosDetails, clients clients.Client // get all the probes defined inside chaosengine for the corresponding experiment for _, experiment := range engine.Spec.Experiments { if experiment.Name == chaosDetails.ExperimentName { - InitializeProbesInChaosResultDetails(chaosresult, experiment.Spec.Probe) + if err := InitializeProbesInChaosResultDetails(chaosresult, experiment.Spec.Probe); err != nil { + return stacktrace.Propagate(err, "could not initialize probe") + } InitializeSidecarDetails(chaosDetails, engine, experiment.Spec.Components.ENV) } } @@ -376,7 +386,7 @@ func getEnvSource(apiVersion string, fieldPath string) *corev1.EnvVarSource { return &downwardENV } -func InitializeProbesInChaosResultDetails(chaosresult *ResultDetails, probes []v1alpha1.ProbeAttributes) { +func InitializeProbesInChaosResultDetails(chaosresult *ResultDetails, probes []v1alpha1.ProbeAttributes) error { var probeDetails []*ProbeDetails // set the probe details for k8s probe @@ -385,13 +395,57 @@ func InitializeProbesInChaosResultDetails(chaosresult *ResultDetails, probes []v tempProbe.Name = probe.Name tempProbe.Type = probe.Type tempProbe.Mode = probe.Mode - tempProbe.RunCount = 0 tempProbe.Status = v1alpha1.ProbeStatus{ Verdict: "Awaited", } + tempProbe.Timeouts, err = parseProbeTimeouts(probe) + if err != nil { + return err + } probeDetails = append(probeDetails, tempProbe) } chaosresult.ProbeDetails 
= probeDetails chaosresult.ProbeArtifacts = map[string]ProbeArtifact{} + return nil +} + +func parseProbeTimeouts(probe v1alpha1.ProbeAttributes) (ProbeTimeouts, error) { + var timeout ProbeTimeouts + timeout.ProbeTimeout, err = parseDuration(probe.RunProperties.ProbeTimeout) + if err != nil { + return timeout, generateError(probe.Name, probe.Type, "ProbeTimeout", err) + } + timeout.Interval, err = parseDuration(probe.RunProperties.Interval) + if err != nil { + return timeout, generateError(probe.Name, probe.Type, "Interval", err) + } + timeout.ProbePollingInterval, err = parseDuration(probe.RunProperties.ProbePollingInterval) + if err != nil { + return timeout, generateError(probe.Name, probe.Type, "ProbePollingInterval", err) + } + timeout.InitialDelay, err = parseDuration(probe.RunProperties.InitialDelay) + if err != nil { + return timeout, generateError(probe.Name, probe.Type, "InitialDelay", err) + } + timeout.EvaluationTimeout, err = parseDuration(probe.RunProperties.EvaluationTimeout) + if err != nil { + return timeout, generateError(probe.Name, probe.Type, "EvaluationTimeout", err) + } + return timeout, nil +} + +func parseDuration(duration string) (time.Duration, error) { + if strings.TrimSpace(duration) == "" { + return 0, nil + } + return time.ParseDuration(duration) +} + +func generateError(probeName, probeType, field string, err error) error { + return cerrors.Error{ + ErrorCode: cerrors.ErrorTypeGeneric, + Reason: fmt.Sprintf("Invalid probe runProperties field '%s': %s", field, err.Error()), + Target: fmt.Sprintf("{probeName: %s, type: %s}", probeName, probeType), + } } diff --git a/pkg/utils/retry/retry.go b/pkg/utils/retry/retry.go index 9fb2cc7d9..8d2b0bc39 100644 --- a/pkg/utils/retry/retry.go +++ b/pkg/utils/retry/retry.go @@ -15,7 +15,7 @@ type Action func(attempt uint) error type Model struct { retry uint waitTime time.Duration - timeout int64 + timeout time.Duration } // Times is used to define the retry count @@ -48,14 +48,14 @@ func (model 
*Model) Wait(waitTime time.Duration) *Model { // Timeout is used to define the timeout duration for each iteration of retry // it will run if the instance of model is not present before -func Timeout(timeout int64) *Model { +func Timeout(timeout time.Duration) *Model { model := Model{} return model.Timeout(timeout) } // Timeout is used to define the timeout duration for each iteration of retry // it will run if the instance of model is already present -func (model *Model) Timeout(timeout int64) *Model { +func (model *Model) Timeout(timeout time.Duration) *Model { model.timeout = timeout return model } @@ -67,7 +67,7 @@ func (model Model) Try(action Action) error { } var err error - for attempt := uint(0); (attempt == 0 || err != nil) && attempt <= model.retry; attempt++ { + for attempt := uint(0); (attempt == 0 || err != nil) && attempt < model.retry; attempt++ { err = action(attempt) if model.waitTime > 0 { time.Sleep(model.waitTime) @@ -91,10 +91,10 @@ func (model Model) TryWithTimeout(action Action) error { for attempt := uint(0); (attempt == 0 || err != nil) && attempt < model.retry; { startTime := time.Now().UnixMilli() err = action(attempt) - if err == nil && time.Now().UnixMilli()-startTime >= model.timeout { + if err == nil && time.Now().UnixMilli()-startTime >= model.timeout.Milliseconds() { err = cerrors.Error{ - ErrorCode: cerrors.ErrorTypeGeneric, - Reason: "probe is failed due to timeout", + ErrorCode: cerrors.ErrorTypeTimeout, + Reason: "action timeout", } } attempt++