Skip to content

Commit

Permalink
add configurable 'job_selectors' and 'cluster_selectors'
Browse files Browse the repository at this point in the history
Replaces hard-coded usage of [namespace, job] or [namespace, cluster]
in alerts and recording rules to support different configurations
of cluster and job unique identifier labels
  • Loading branch information
kevinschoonover committed Sep 6, 2021
1 parent 075d3d1 commit f2ae32b
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 39 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
* [ENHANCEMENT] Add `gen index` and `gen bloom` commands to tempo-cli. [#903](https://github.com/grafana/tempo/pull/903) (@annanay25)
* [ENHANCEMENT] Implement trace comparison in Vulture [#904](https://github.com/grafana/tempo/pull/904) (@zalegrala)
* [CHANGE] Renamed CLI flag from `--storage.trace.maintenance-cycle` to `--storage.trace.blocklist_poll`. This is a **breaking change** [#897](https://github.com/grafana/tempo/pull/897) (@mritunjaysharma394)
* [CHANGE] Update jsonnet alerts and recording rules to use `job_selectors` and `cluster_selectors` for configurable unique identifier labels [#935](https://github.com/grafana/tempo/pull/935) (@kevinschoonover)

## v1.1.0 / 2021-08-26
* [CHANGE] Upgrade Cortex from v1.9.0 to v1.9.0-131-ga4bf10354 [#841](https://github.com/grafana/tempo/pull/841) (@aknuds1)
Expand Down
74 changes: 37 additions & 37 deletions operations/tempo-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
{
alert: 'TempoRequestErrors',
expr: |||
100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[1m])) by (%(group_by_job)s, route)
/
sum(rate(tempo_request_duration_seconds_count[1m])) by (namespace, job, route)
sum(rate(tempo_request_duration_seconds_count[1m])) by (%(group_by_job)s, route)
> 10
|||,
||| % $._config,
'for': '15m',
labels: {
severity: 'critical',
Expand All @@ -20,14 +20,14 @@
message: |||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
|||,
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestErrors'
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestErrors',
},
},
{
alert: 'TempoRequestLatency',
expr: |||
namespace_job_route:tempo_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process"} > 3
|||,
%(group_prefix_jobs)s_route:tempo_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process"} > 3
||| % $._config,
'for': '15m',
labels: {
severity: 'critical',
Expand All @@ -36,123 +36,123 @@
message: |||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
|||,
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestLatency'
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestLatency',
},
},
{
alert: 'TempoCompactorUnhealthy',
'for': '15m',
expr: |||
max by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="compactor"}) > 0
|||,
max by (%s) (cortex_ring_members{state="Unhealthy", name="%s"}) > 0
||| % [$._config.group_by_cluster, $._config.jobs.compactor],
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
message: 'There are {{ printf "%f" $value }} unhealthy compactor(s).',
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorUnhealthy'
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorUnhealthy',
},
},
{
alert: 'TempoDistributorUnhealthy',
'for': '15m',
expr: |||
max by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="distributor"}) > 0
|||,
max by (%s) (cortex_ring_members{state="Unhealthy", name="%s"}) > 0
||| % [$._config.group_by_cluster, $._config.jobs.distributor],
labels: {
severity: 'warning',
},
annotations: {
message: 'There are {{ printf "%f" $value }} unhealthy distributor(s).',
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoDistributorUnhealthy'
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoDistributorUnhealthy',
},
},
{
alert: 'TempoCompactionsFailing',
expr: |||
sum by (cluster, namespace) (increase(tempodb_compaction_errors_total{}[1h])) > %s and
sum by (cluster, namespace) (increase(tempodb_compaction_errors_total{}[5m])) > 0
||| % $._config.alerts.compactions_per_hour_failed,
sum by (%s) (increase(tempodb_compaction_errors_total{}[1h])) > %s and
sum by (%s) (increase(tempodb_compaction_errors_total{}[5m])) > 0
||| % [$._config.group_by_cluster, $._config.alerts.compactions_per_hour_failed, $._config.group_by_cluster],
labels: {
severity: 'critical',
},
annotations: {
message: 'Greater than %s compactions have failed in the past hour.' % $._config.alerts.compactions_per_hour_failed,
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactionsFailing'
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactionsFailing',
},
},
{
alert: 'TempoIngesterFlushesFailing',
expr: |||
sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[1h])) > %s and
sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[5m])) > 0
||| % $._config.alerts.flushes_per_hour_failed,
sum by (%s) (increase(tempo_ingester_failed_flushes_total{}[1h])) > %s and
sum by (%s) (increase(tempo_ingester_failed_flushes_total{}[5m])) > 0
||| % [$._config.group_by_cluster, $._config.alerts.flushes_per_hour_failed, $._config.group_by_cluster],
labels: {
severity: 'critical',
},
annotations: {
message: 'Greater than %s flushes have failed in the past hour.' % $._config.alerts.flushes_per_hour_failed,
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterFlushesFailing'
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterFlushesFailing',
},
},
{
alert: 'TempoPollsFailing',
expr: |||
sum by (cluster, namespace) (increase(tempodb_blocklist_poll_errors_total{}[1h])) > %s and
sum by (cluster, namespace) (increase(tempodb_blocklist_poll_errors_total{}[5m])) > 0
||| % $._config.alerts.polls_per_hour_failed,
sum by (%s) (increase(tempodb_blocklist_poll_errors_total{}[1h])) > %s and
sum by (%s) (increase(tempodb_blocklist_poll_errors_total{}[5m])) > 0
||| % [$._config.group_by_cluster, $._config.alerts.polls_per_hour_failed, $._config.group_by_cluster],
labels: {
severity: 'critical',
},
annotations: {
message: 'Greater than %s polls have failed in the past hour.' % $._config.alerts.polls_per_hour_failed,
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPollsFailing'
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPollsFailing',
},
},
{
alert: 'TempoTenantIndexFailures',
expr: |||
sum by (cluster, namespace) (increase(tempodb_blocklist_tenant_index_errors_total{}[1h])) > %s and
sum by (cluster, namespace) (increase(tempodb_blocklist_tenant_index_errors_total{}[5m])) > 0
||| % $._config.alerts.polls_per_hour_failed,
sum by (%s) (increase(tempodb_blocklist_tenant_index_errors_total{}[1h])) > %s and
sum by (%s) (increase(tempodb_blocklist_tenant_index_errors_total{}[5m])) > 0
||| % [$._config.group_by_cluster, $._config.alerts.polls_per_hour_failed, $._config.group_by_cluster],
labels: {
severity: 'critical',
},
annotations: {
message: 'Greater than %s tenant index failures in the past hour.' % $._config.alerts.polls_per_hour_failed,
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexFailures'
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexFailures',
},
},
{
alert: 'TempoNoTenantIndexBuilders',
expr: |||
sum by (cluster, namespace) (tempodb_blocklist_tenant_index_builder{}) == 0
|||,
sum by (%(group_by_cluster)s) (tempodb_blocklist_tenant_index_builder{}) == 0
||| % $._config,
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
message: 'No tenant index builders. Tenant index is out of date.',
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoNoTenantIndexBuilders'
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoNoTenantIndexBuilders',
},
},
{
alert: 'TempoTenantIndexTooOld',
expr: |||
max by (cluster, namespace) (tempodb_blocklist_tenant_index_age_seconds{}) > %s
||| % $._config.alerts.max_tenant_index_age_seconds,
max by (%s) (tempodb_blocklist_tenant_index_age_seconds{}) > %s
||| % [$._config.group_by_cluster, $._config.alerts.max_tenant_index_age_seconds],
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
message: 'Tenant index age is %s seconds old.' % $._config.alerts.max_tenant_index_age_seconds,
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexTooOld'
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexTooOld',
},
},
],
},
],
},
}
}
17 changes: 16 additions & 1 deletion operations/tempo-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
{
local makePrefix(groups) = std.join('_', groups),
local makeGroupBy(groups) = std.join(', ', groups),

_config+:: {
http_api_prefix: '',
jobs: {
Expand All @@ -15,5 +18,17 @@
polls_per_hour_failed: 2,
max_tenant_index_age_seconds: 600,
},

// Label groups used to uniquely identify and group by {jobs, clusters}
cluster_selectors: ['cluster', 'namespace'],
job_selectors: ['namespace', 'job'],

// Each group prefix is composed of `_`-separated labels
group_prefix_jobs: makePrefix($._config.job_selectors),
group_prefix_clusters: makePrefix($._config.cluster_selectors),

// Each group-by label list is `, `-separated and uniquely identifies a job or cluster
group_by_job: makeGroupBy($._config.job_selectors),
group_by_cluster: makeGroupBy($._config.cluster_selectors),
},
}
}
2 changes: 1 addition & 1 deletion operations/tempo-mixin/rules.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
groups+: [{
name: 'tempo_rules',
rules:
utils.histogramRules('tempo_request_duration_seconds', ['namespace', 'job', 'route']),
// Aggregate by the configurable job identifier labels (`_config.job_selectors`) plus `route`.
// `_config` defines `job_selectors`, not `job_labels`; referencing the latter fails at evaluation.
utils.histogramRules('tempo_request_duration_seconds', $._config.job_selectors + ['route']),
}],
},
}

0 comments on commit f2ae32b

Please sign in to comment.