diff --git a/CHANGELOG.md b/CHANGELOG.md index 95ea7ea8aa8..44329558173 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,7 +38,7 @@ * [BUGFIX] Ring: fix bug where instances may appear unhealthy in the hash ring web UI even though they are not. #1933 * [BUGFIX] API: gzip is now enforced when identity encoding is explicitly rejected. #1864 * [BUGFIX] Fix panic at startup when Mimir is running in monolithic mode and query sharding is enabled. #2036 -* [BUGFIX] Ruler: report failed evaluation metric for any 5xx status code returned by the query-frontend when remote operational mode is enabled. #2053 +* [BUGFIX] Ruler: report `cortex_ruler_queries_failed_total` metric for any remote query error except 4xx when remote operational mode is enabled. #2053 #2143 * [BUGFIX] Ingester: fix slow rollout when using `-ingester.ring.unregister-on-shutdown=false` with long `-ingester.ring.heartbeat-period`. #2085 * [BUGFIX] Ruler: add timeout for remote rule evaluation queries to prevent rule group evaluations getting stuck indefinitely. The duration is configurable with (`-ruler.query-frontend.timeout` (default `2m`). #2090 * [BUGFIX] Limits: Active series custom tracker configuration has been named back from `active_series_custom_trackers_config` to `active_series_custom_trackers`. For backwards compatibility both version is going to be supported for until Mimir v2.4. When both fields are specified, `active_series_custom_trackers_config` takes precedence over `active_series_custom_trackers`. #2101 diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 7776e7db637..1b4197a99fa 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -150,9 +150,9 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun return result, origErr } else if err != nil { - // When remote querier enabled, only consider failed queries those returning a 5xx status code. + // When remote querier enabled, consider anything an error except those with 4xx status code. st, ok := status.FromError(err) - if ok && st.Code()/100 == 5 { + if !(ok && st.Code()/100 == 4) { failedQueries.Inc() } } diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 7fb3718c75a..1d9f4cdb56e 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -153,6 +153,7 @@ func TestPusherErrors(t *testing.T) { func TestMetricsQueryFuncErrors(t *testing.T) { for name, tc := range map[string]struct { returnedError error + expectedError error expectedQueries int expectedFailedQueries int }{ @@ -161,44 +162,58 @@ func TestMetricsQueryFuncErrors(t *testing.T) { expectedFailedQueries: 0, }, - "400 error": { + "httpgrpc 400 error": { returnedError: httpgrpc.Errorf(http.StatusBadRequest, "test error"), + expectedError: httpgrpc.Errorf(http.StatusBadRequest, "test error"), expectedQueries: 1, expectedFailedQueries: 0, // 400 errors not reported as failures. }, - "500 error": { + "httpgrpc 500 error": { returnedError: httpgrpc.Errorf(http.StatusInternalServerError, "test error"), + expectedError: httpgrpc.Errorf(http.StatusInternalServerError, "test error"), expectedQueries: 1, expectedFailedQueries: 1, // 500 errors are failures }, + "unknown but non-queryable error": { + returnedError: errors.New("test error"), + expectedError: errors.New("test error"), + expectedQueries: 1, + expectedFailedQueries: 1, // Any other error should always be reported. + }, + "promql.ErrStorage": { - returnedError: promql.ErrStorage{Err: errors.New("test error")}, + returnedError: WrapQueryableErrors(promql.ErrStorage{Err: errors.New("test error")}), + expectedError: promql.ErrStorage{Err: errors.New("test error")}, expectedQueries: 1, expectedFailedQueries: 1, }, "promql.ErrQueryCanceled": { - returnedError: promql.ErrQueryCanceled("test error"), + returnedError: WrapQueryableErrors(promql.ErrQueryCanceled("test error")), + expectedError: promql.ErrQueryCanceled("test error"), expectedQueries: 1, expectedFailedQueries: 0, // Not interesting. }, "promql.ErrQueryTimeout": { - returnedError: promql.ErrQueryTimeout("test error"), + returnedError: WrapQueryableErrors(promql.ErrQueryTimeout("test error")), + expectedError: promql.ErrQueryTimeout("test error"), expectedQueries: 1, expectedFailedQueries: 0, // Not interesting. }, "promql.ErrTooManySamples": { - returnedError: promql.ErrTooManySamples("test error"), + returnedError: WrapQueryableErrors(promql.ErrTooManySamples("test error")), + expectedError: promql.ErrTooManySamples("test error"), expectedQueries: 1, expectedFailedQueries: 0, // Not interesting. }, "unknown error": { - returnedError: errors.New("test error"), + returnedError: WrapQueryableErrors(errors.New("test error")), + expectedError: errors.New("test error"), expectedQueries: 1, expectedFailedQueries: 1, // unknown errors are not 400, so they are reported. }, @@ -208,12 +223,12 @@ func TestMetricsQueryFuncErrors(t *testing.T) { failures := prometheus.NewCounter(prometheus.CounterOpts{}) mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { - return promql.Vector{}, WrapQueryableErrors(tc.returnedError) + return promql.Vector{}, tc.returnedError } qf := MetricsQueryFunc(mockFunc, queries, failures) _, err := qf(context.Background(), "test", time.Now()) - require.Equal(t, tc.returnedError, err) + require.Equal(t, tc.expectedError, err) require.Equal(t, tc.expectedQueries, int(testutil.ToFloat64(queries))) require.Equal(t, tc.expectedFailedQueries, int(testutil.ToFloat64(failures)))