Skip to content

Commit

Permalink
Merge pull request #319 from grafana/darrenjaneczek/config-job-aggreg…
Browse files Browse the repository at this point in the history
…ation

refactor: config for job aggregation strings
  • Loading branch information
pracucci authored Jun 10, 2021
2 parents e7cbfe4 + a03451c commit 8c82746
Show file tree
Hide file tree
Showing 9 changed files with 77 additions and 21 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
* [CHANGE] Store-gateway: increased `-blocks-storage.bucket-store.max-chunk-pool-bytes` from 2GB (default) to 12GB. #322
* [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317
* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
* [CHANGE] Dashboards: added overridable `job_labels` and `cluster_labels` to the configuration object as label lists to uniquely identify jobs and clusters in the metric names and group-by lists in dashboards. #319
* [CHANGE] Dashboards: `alert_aggregation_labels` has been removed from the default configuration, and overriding it is deprecated. The labels are now derived from the `cluster_labels` list, which should be overridden instead. #319

## 1.9.0 / 2021-05-18

Expand Down
2 changes: 1 addition & 1 deletion cortex-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@
(import 'alerts/compactor.libsonnet')
else {}) +

{ _config:: $._config },
{ _config:: $._config + $._group_config },
}
2 changes: 1 addition & 1 deletion cortex-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
{
alert: 'CortexRequestLatency',
expr: |||
cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"}
%(group_prefix_jobs)s_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"}
>
%(cortex_p99_latency_threshold_seconds)s
||| % $._config,
Expand Down
7 changes: 4 additions & 3 deletions cortex-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,10 @@
compactor: 'compactor.*', // Match also custom compactor deployments.
},

// Labels used in alert aggregations - should uniquely identify
// a single Cortex cluster.
alert_aggregation_labels: 'cluster, namespace',
// Grouping labels, to uniquely identify and group by {jobs, clusters}
job_labels: ['cluster', 'namespace', 'job'],
cluster_labels: ['cluster', 'namespace'],

cortex_p99_latency_threshold_seconds: 2.5,

// Whether resources dashboards are enabled (based on cAdvisor metrics).
Expand Down
2 changes: 1 addition & 1 deletion cortex-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,5 @@
(import 'dashboards/writes-resources.libsonnet') +
(import 'dashboards/alertmanager-resources.libsonnet')) +

{ _config:: $._config },
{ _config:: $._config + $._group_config },
}
15 changes: 11 additions & 4 deletions cortex-mixin/dashboards/writes.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
})
.addPanel(
$.panel('Samples / s') +
$.statPanel('sum(cluster_namespace_job:cortex_distributor_received_samples:rate5m{%s})' % $.jobMatcher($._config.job_names.distributor), format='reqps')
$.statPanel(
'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % (
$._config {
job: $.jobMatcher($._config.job_names.distributor),
}
),
format='reqps'
)
)
.addPanel(
$.panel('Active Series') +
$.statPanel(|||
sum(cortex_ingester_memory_series{%(ingester)s}
/ on(namespace) group_left
max by (namespace) (cortex_distributor_replication_factor{%(distributor)s}))
||| % {
/ on(%(group_by_cluster)s) group_left
max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}))
||| % ($._config) {
ingester: $.jobMatcher($._config.job_names.ingester),
distributor: $.jobMatcher($._config.job_names.distributor),
}, format='short')
Expand Down
45 changes: 45 additions & 0 deletions cortex-mixin/groups.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Derives the aggregation strings used in metric names and PromQL group-by
// clauses from the `job_labels` and `cluster_labels` lists in `$._config`.
{
// Joins labels with `_`, e.g. ['cluster', 'namespace', 'job'] -> 'cluster_namespace_job'.
local makePrefix(groups) = std.join('_', groups),
// Joins labels with `, `, e.g. ['cluster', 'namespace'] -> 'cluster, namespace'.
local makeGroupBy(groups) = std.join(', ', groups),

// Computed once because it is used both in `_group_config` and as the
// fallback value of the deprecated `alert_aggregation_labels` below.
local group_by_cluster = makeGroupBy($._config.cluster_labels),

// Hidden field (`::`) holding the derived strings; `+` merges with any
// `_group_config` defined earlier in the mixin chain.
_group_config+:: {
// Each group prefix is composed of `_`-separated labels
group_prefix_jobs: makePrefix($._config.job_labels),
group_prefix_clusters: makePrefix($._config.cluster_labels),

// Each group-by label list is `, `-separated and uniquely identifies a job or cluster
group_by_job: makeGroupBy($._config.job_labels),
group_by_cluster: group_by_cluster,
},

// The following works around the deprecation of `$._config.alert_aggregation_labels`
// - If an override of that value is detected, a warning will be printed
// - If no override was detected, it will be set to the `group_by_cluster` value,
// which will replace it altogether in the future.
//
// Detection: start from an object whose field is null and layer `super._config`
// on top; the result stays null unless `super._config` defines the field.
// NOTE(review): `super` only covers objects mixed in before this one, so an
// override added after this object would take effect without the warning —
// confirm this is intended.
local alert_aggregation_labels_override = (
{
alert_aggregation_labels: null,
} + super._config
).alert_aggregation_labels,

_config+:: {
// Keeps `alert_aggregation_labels` available to existing consumers:
// the detected override if there is one, otherwise the `cluster_labels`-derived string.
alert_aggregation_labels:
if alert_aggregation_labels_override != null
// std.trace prints its first argument and evaluates to its second,
// so the overridden value is kept while the deprecation notice is emitted.
then std.trace(
|||
Deprecated: _config.alert_aggregation_labels
This field has been explicitly overridden to "%s".
Instead, express the override in terms of _config.cluster_labels.
E.g., cluster_labels: %s will automatically convert to "%s".
||| % [
alert_aggregation_labels_override,
$._config.cluster_labels,
group_by_cluster,
],
alert_aggregation_labels_override
)
else group_by_cluster,
},
}
1 change: 1 addition & 0 deletions cortex-mixin/mixin.libsonnet
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
(import 'config.libsonnet') +
(import 'groups.libsonnet') +
(import 'dashboards.libsonnet') +
(import 'alerts.libsonnet') +
(import 'recording_rules.libsonnet')
22 changes: 11 additions & 11 deletions cortex-mixin/recording_rules.libsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
local utils = import 'mixin-utils/utils.libsonnet';

{
local _config = {
max_series_per_ingester: 1.5e6,
max_samples_per_sec_per_ingester: 80e3,
max_samples_per_sec_per_distributor: 240e3,
limit_utilisation_target: 0.6,
} + $._config + $._group_config,
prometheusRules+:: {
groups+: [
{
Expand Down Expand Up @@ -51,20 +57,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
name: 'cortex_received_samples',
rules: [
{
record: 'cluster_namespace_job:cortex_distributor_received_samples:rate5m',
record: '%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m' % _config,
expr: |||
sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))
|||,
sum by (%(group_by_job)s) (rate(cortex_distributor_received_samples_total[5m]))
||| % _config,
},
],
},
{
local _config = {
max_series_per_ingester: 1.5e6,
max_samples_per_sec_per_ingester: 80e3,
max_samples_per_sec_per_distributor: 240e3,
limit_utilisation_target: 0.6,
},
name: 'cortex_scaling_rules',
rules: [
{
Expand All @@ -89,7 +89,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
ceil(
quantile_over_time(0.99,
sum by (cluster, namespace) (
cluster_namespace_job:cortex_distributor_received_samples:rate5m
%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m
)[24h:]
)
/ %(max_samples_per_sec_per_distributor)s
Expand Down Expand Up @@ -123,7 +123,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
ceil(
quantile_over_time(0.99,
sum by (cluster, namespace) (
cluster_namespace_job:cortex_distributor_received_samples:rate5m
%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m
)[24h:]
)
* 3 / %(max_samples_per_sec_per_ingester)s
Expand Down

0 comments on commit 8c82746

Please sign in to comment.