Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add function to customize compactor statefulset #287

Merged
merged 2 commits into from
Apr 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## master / unreleased

* [ENHANCEMENT] Added `newCompactorStatefulSet()` function to create a custom statefulset for the compactor. #287
* [ENHANCEMENT] Added an option to configure the compactor job name used in dashboards and alerts. #287

## 1.8.0 / 2021-03-25

* [CHANGE] Updated the trunk branch from `master` to `main`. You need to run the following in your local fork: #265
Expand Down
10 changes: 5 additions & 5 deletions cortex-mixin/alerts/compactor.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@
alert: 'CortexCompactorHasNotUploadedBlocks',
'for': '15m',
expr: |||
(time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 60 * 60 * 24)
(time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} > 60 * 60 * 24)
and
(thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 0)
|||,
(thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} > 0)
||| % $._config.job_names,
labels: {
severity: 'critical',
},
Expand All @@ -54,8 +54,8 @@
alert: 'CortexCompactorHasNotUploadedBlocksSinceStart',
'for': '24h',
expr: |||
thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} == 0
|||,
thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} == 0
||| % $._config.job_names,
labels: {
severity: 'critical',
},
Expand Down
1 change: 1 addition & 0 deletions cortex-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
table_manager: '(table-manager|cortex$)',
store_gateway: '(store-gateway|cortex$)',
gateway: 'cortex-gw',
compactor: 'compactor.*', // Match also custom compactor deployments.
},

// Labels used in alert aggregations - should uniquely identify
Expand Down
2 changes: 1 addition & 1 deletion cortex-mixin/dashboards/compactor-resources.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.containerMemoryWorkingSetPanel('Memory (workingset)', 'compactor'),
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No change here because this function expects the container name, not the job name (the same applies to the CPU panel). The container name doesn't change when you customize the statefulset.

)
.addPanel(
$.goHeapInUsePanel('Memory (go heap inuse)', 'compactor'),
$.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.compactor),
)
)
.addRow(
Expand Down
28 changes: 14 additions & 14 deletions cortex-mixin/dashboards/compactor.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addPanel(
$.startedCompletedFailedPanel(
'Per-instance runs / sec',
'sum(rate(cortex_compactor_runs_started_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'),
'sum(rate(cortex_compactor_runs_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'),
'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')
'sum(rate(cortex_compactor_runs_started_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor),
'sum(rate(cortex_compactor_runs_completed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor),
'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor)
) +
$.bars +
{ yaxes: $.yaxes('ops') },
Expand All @@ -30,7 +30,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
cortex_compactor_tenants_processing_failed{%s} +
cortex_compactor_tenants_skipped{%s}
) / cortex_compactor_tenants_discovered{%s}
||| % [$.jobMatcher('compactor'), $.jobMatcher('compactor'), $.jobMatcher('compactor'), $.jobMatcher('compactor')], '{{%s}}' % $._config.per_instance_label) +
||| % [$.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor)], '{{%s}}' % $._config.per_instance_label) +
{ yaxes: $.yaxes({ format: 'percentunit', max: 1 }) },
)
)
Expand All @@ -44,12 +44,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addPanel(
$.panel('Compacted blocks / sec') +
$.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), 'blocks') +
$.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') +
{ yaxes: $.yaxes('ops') },
)
.addPanel(
$.panel('Per-block compaction duration') +
$.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher('compactor'))
$.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor))
)
)
.addRow(
Expand All @@ -62,27 +62,27 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addPanel(
$.panel('Average blocks / tenant') +
$.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher('compactor'), 'avg'),
$.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), 'avg'),
)
.addPanel(
$.panel('Tenants with largest number of blocks') +
$.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher('compactor'), '{{user}}'),
$.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}'),
)
)
.addRow(
$.row('Garbage Collector')
.addPanel(
$.panel('Blocks marked for deletion / sec') +
$.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), 'blocks') +
$.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') +
{ yaxes: $.yaxes('ops') },
)
.addPanel(
$.successFailurePanel(
'Blocks deletions / sec',
// The cortex_compactor_blocks_cleaned_total tracks the number of successfully
// deleted blocks.
'sum(rate(cortex_compactor_blocks_cleaned_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'),
'sum(rate(cortex_compactor_block_cleanup_failures_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'),
'sum(rate(cortex_compactor_blocks_cleaned_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor),
'sum(rate(cortex_compactor_block_cleanup_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor),
) + { yaxes: $.yaxes('ops') }
)
)
Expand All @@ -93,14 +93,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
'Metadata Syncs / sec',
// The cortex_compactor_meta_syncs_total metric is incremented each time a per-tenant
// metadata sync is triggered.
'sum(rate(cortex_compactor_meta_syncs_total{%s}[$__rate_interval])) - sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher('compactor'), $.jobMatcher('compactor')],
'sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'),
'sum(rate(cortex_compactor_meta_syncs_total{%s}[$__rate_interval])) - sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor)],
'sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor),
) + { yaxes: $.yaxes('ops') }
)
.addPanel(
$.panel('Metadata Sync Duration') +
// This metric tracks the duration of a per-tenant metadata sync.
$.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher('compactor')),
$.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)),
)
)
.addRow($.objectStorePanels1('Object Store', 'compactor'))
Expand Down
15 changes: 9 additions & 6 deletions cortex/tsdb.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -173,13 +173,13 @@
$.util.readinessProbe +
$.jaeger_mixin,

compactor_statefulset:
statefulSet.new('compactor', 1, [$.compactor_container], compactor_data_pvc) +
statefulSet.mixin.spec.withServiceName('compactor') +
newCompactorStatefulSet(name, container)::
statefulSet.new(name, 1, [container], compactor_data_pvc) +
statefulSet.mixin.spec.withServiceName(name) +
statefulSet.mixin.metadata.withNamespace($._config.namespace) +
statefulSet.mixin.metadata.withLabels({ name: 'compactor' }) +
statefulSet.mixin.spec.template.metadata.withLabels({ name: 'compactor' }) +
statefulSet.mixin.spec.selector.withMatchLabels({ name: 'compactor' }) +
statefulSet.mixin.metadata.withLabels({ name: name }) +
statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) +
statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) +
statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) +
statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') +
statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) +
Expand All @@ -189,6 +189,9 @@
// ready).
statefulSet.mixin.spec.withPodManagementPolicy('Parallel'),

compactor_statefulset:
$.newCompactorStatefulSet('compactor', $.compactor_container),

// The store-gateway runs a statefulset.
local store_gateway_data_pvc =
pvc.new() +
Expand Down