Skip to content

Commit

Permalink
Ruler: Report wider range of errors from remote rule evaluation queri…
Browse files Browse the repository at this point in the history
…es. (grafana#2143)

The `cortex_ruler_queries_failed_total` metric should increment whenever a
rule evaluation query fails. Before this change, errors would be ignored
unless they are specifically of type`httpgrpc`. This means that many errors
get lost, for example, connectivity errors coming from gRPC.
  • Loading branch information
stevesg authored and mason committed Jul 11, 2022
1 parent 92da3a0 commit 3bfb352
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 12 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
* [BUGFIX] Ring: fix bug where instances may appear unhealthy in the hash ring web UI even though they are not. #1933
* [BUGFIX] API: gzip is now enforced when identity encoding is explicitly rejected. #1864
* [BUGFIX] Fix panic at startup when Mimir is running in monolithic mode and query sharding is enabled. #2036
* [BUGFIX] Ruler: report failed evaluation metric for any 5xx status code returned by the query-frontend when remote operational mode is enabled. #2053
* [BUGFIX] Ruler: report `cortex_ruler_queries_failed_total` metric for any remote query error except 4xx when remote operational mode is enabled. #2053 #2143
* [BUGFIX] Ingester: fix slow rollout when using `-ingester.ring.unregister-on-shutdown=false` with long `-ingester.ring.heartbeat-period`. #2085
* [BUGFIX] Ruler: add timeout for remote rule evaluation queries to prevent rule group evaluations getting stuck indefinitely. The duration is configurable with (`-ruler.query-frontend.timeout` (default `2m`). #2090
* [BUGFIX] Limits: Active series custom tracker configuration has been named back from `active_series_custom_trackers_config` to `active_series_custom_trackers`. For backwards compatibility both version is going to be supported for until Mimir v2.4. When both fields are specified, `active_series_custom_trackers_config` takes precedence over `active_series_custom_trackers`. #2101
Expand Down
4 changes: 2 additions & 2 deletions pkg/ruler/compat.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,9 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun
return result, origErr

} else if err != nil {
// When remote querier enabled, only consider failed queries those returning a 5xx status code.
// When remote querier enabled, consider anything an error except those with 4xx status code.
st, ok := status.FromError(err)
if ok && st.Code()/100 == 5 {
if !(ok && st.Code()/100 == 4) {
failedQueries.Inc()
}
}
Expand Down
33 changes: 24 additions & 9 deletions pkg/ruler/compat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ func TestPusherErrors(t *testing.T) {
func TestMetricsQueryFuncErrors(t *testing.T) {
for name, tc := range map[string]struct {
returnedError error
expectedError error
expectedQueries int
expectedFailedQueries int
}{
Expand All @@ -161,44 +162,58 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
expectedFailedQueries: 0,
},

"400 error": {
"httpgrpc 400 error": {
returnedError: httpgrpc.Errorf(http.StatusBadRequest, "test error"),
expectedError: httpgrpc.Errorf(http.StatusBadRequest, "test error"),
expectedQueries: 1,
expectedFailedQueries: 0, // 400 errors not reported as failures.
},

"500 error": {
"httpgrpc 500 error": {
returnedError: httpgrpc.Errorf(http.StatusInternalServerError, "test error"),
expectedError: httpgrpc.Errorf(http.StatusInternalServerError, "test error"),
expectedQueries: 1,
expectedFailedQueries: 1, // 500 errors are failures
},

"unknown but non-queryable error": {
returnedError: errors.New("test error"),
expectedError: errors.New("test error"),
expectedQueries: 1,
expectedFailedQueries: 1, // Any other error should always be reported.
},

"promql.ErrStorage": {
returnedError: promql.ErrStorage{Err: errors.New("test error")},
returnedError: WrapQueryableErrors(promql.ErrStorage{Err: errors.New("test error")}),
expectedError: promql.ErrStorage{Err: errors.New("test error")},
expectedQueries: 1,
expectedFailedQueries: 1,
},

"promql.ErrQueryCanceled": {
returnedError: promql.ErrQueryCanceled("test error"),
returnedError: WrapQueryableErrors(promql.ErrQueryCanceled("test error")),
expectedError: promql.ErrQueryCanceled("test error"),
expectedQueries: 1,
expectedFailedQueries: 0, // Not interesting.
},

"promql.ErrQueryTimeout": {
returnedError: promql.ErrQueryTimeout("test error"),
returnedError: WrapQueryableErrors(promql.ErrQueryTimeout("test error")),
expectedError: promql.ErrQueryTimeout("test error"),
expectedQueries: 1,
expectedFailedQueries: 0, // Not interesting.
},

"promql.ErrTooManySamples": {
returnedError: promql.ErrTooManySamples("test error"),
returnedError: WrapQueryableErrors(promql.ErrTooManySamples("test error")),
expectedError: promql.ErrTooManySamples("test error"),
expectedQueries: 1,
expectedFailedQueries: 0, // Not interesting.
},

"unknown error": {
returnedError: errors.New("test error"),
returnedError: WrapQueryableErrors(errors.New("test error")),
expectedError: errors.New("test error"),
expectedQueries: 1,
expectedFailedQueries: 1, // unknown errors are not 400, so they are reported.
},
Expand All @@ -208,12 +223,12 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
failures := prometheus.NewCounter(prometheus.CounterOpts{})

mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
return promql.Vector{}, WrapQueryableErrors(tc.returnedError)
return promql.Vector{}, tc.returnedError
}
qf := MetricsQueryFunc(mockFunc, queries, failures)

_, err := qf(context.Background(), "test", time.Now())
require.Equal(t, tc.returnedError, err)
require.Equal(t, tc.expectedError, err)

require.Equal(t, tc.expectedQueries, int(testutil.ToFloat64(queries)))
require.Equal(t, tc.expectedFailedQueries, int(testutil.ToFloat64(failures)))
Expand Down

0 comments on commit 3bfb352

Please sign in to comment.