Merge branch 'main' into darrenjaneczek/dashboard-descriptions-reads-writes
pracucci authored Jun 22, 2021
2 parents 8984245 + 46a8a0e commit edecc3d
Showing 4 changed files with 112 additions and 53 deletions.
21 changes: 6 additions & 15 deletions jsonnet/mimir-mixin/alerts/alerts.libsonnet
@@ -92,39 +92,30 @@
},
},
{
alert: 'CortexInconsistentConfig',
alert: 'CortexInconsistentRuntimeConfig',
expr: |||
count(count by(%s, job, sha256) (cortex_config_hash)) without(sha256) > 1
count(count by(%s, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1
||| % $._config.alert_aggregation_labels,
'for': '1h',
labels: {
severity: 'warning',
severity: 'critical',
},
annotations: {
message: |||
An inconsistent config file hash is used across cluster {{ $labels.job }}.
An inconsistent runtime config file is used across cluster {{ $labels.job }}.
|||,
},
},
{
// As of https://github.com/cortexproject/cortex/pull/2092, this metric is
// only exposed when it is supposed to be non-zero, so we don't need to do
// any special filtering on the job label.
// The metric itself was renamed in
// https://github.com/cortexproject/cortex/pull/2874
//
// TODO: Remove deprecated metric name of
// cortex_overrides_last_reload_successful in the future
alert: 'CortexBadRuntimeConfig',
expr: |||
# The metric value is reset to 0 on error while reloading the config at runtime.
cortex_runtime_config_last_reload_successful == 0
or
cortex_overrides_last_reload_successful == 0
|||,
// Alert quicker for human errors.
'for': '5m',
labels: {
severity: 'warning',
severity: 'critical',
},
annotations: {
message: |||
30 changes: 14 additions & 16 deletions jsonnet/mimir-mixin/alerts/compactor.libsonnet
@@ -47,6 +47,19 @@
message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.',
},
},
{
// Alert if compactor failed to run 2 consecutive compactions.
alert: 'CortexCompactorHasNotSuccessfullyRunCompaction',
expr: |||
increase(cortex_compactor_runs_failed_total[2h]) >= 2
|||,
labels: {
severity: 'critical',
},
annotations: {
message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} failed to run 2 consecutive compactions.',
},
},
{
// Alert if the compactor has not uploaded anything in the last 24h.
alert: 'CortexCompactorHasNotUploadedBlocks',
@@ -65,7 +78,7 @@
},
{
// Alert if the compactor has not uploaded anything since its start.
alert: 'CortexCompactorHasNotUploadedBlocksSinceStart',
alert: 'CortexCompactorHasNotUploadedBlocks',
'for': '24h',
expr: |||
thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} == 0
@@ -77,21 +90,6 @@
message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.',
},
},
{
// Alert if compactor fails.
alert: 'CortexCompactorRunFailed',
expr: |||
increase(cortex_compactor_runs_failed_total[2h]) >= 2
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} failed to run compaction.
|||,
},
},
],
},
],
2 changes: 1 addition & 1 deletion jsonnet/mimir-mixin/config.libsonnet
@@ -34,7 +34,7 @@
query_scheduler: 'query-scheduler', // Not part of single-binary.
table_manager: '(table-manager|cortex$)',
store_gateway: '(store-gateway|cortex$)',
gateway: '(gateway|cortex-gw)',
gateway: '(gateway|cortex-gw|cortex-gw-internal)',
compactor: 'compactor.*', // Match also custom compactor deployments.
},

112 changes: 91 additions & 21 deletions jsonnet/mimir-mixin/docs/playbooks.md
@@ -26,11 +26,63 @@ If nothing obvious from the above, check for increased load:

### CortexIngesterReachingSeriesLimit

_TODO: this playbook has not been written yet._
This alert fires when the `max_series` per-ingester instance limit is enabled and the actual number of in-memory series in an ingester is reaching the limit. Once the limit is reached, writes to the ingester will fail (5xx) for new series, while appending samples to existing ones will continue to succeed.

In case of **emergency**:
- If the actual number of series is very close to the limit or has already hit it, you can increase the limit via the runtime config to gain some time
- Increasing the limit will increase the ingesters' memory utilization. Please monitor it via the `Cortex / Writes Resources` dashboard

How the limit is **configured**:
- The limit can be configured either via the CLI flag (`-ingester.instance-limits.max-series`) or in the runtime config:
```
ingester_limits:
max_series: <int>
```
- The mixin configures the limit in the runtime config and can be fine-tuned via:
```
_config+:: {
ingester_instance_limits+:: {
max_series: <int>
}
}
```
- When configured in the runtime config, changes are applied live, without requiring an ingester restart
- The configured limit can be queried via `cortex_ingester_instance_limits{limit="max_series"}` (see the example query below)
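
To check how close each ingester is to the limit, a sketch of a query comparing the in-memory series with the configured limit (assuming the limit is enabled and exported via `cortex_ingester_instance_limits`, as described above):
```
# Per-ingester utilization of the max_series limit (1.0 = limit reached).
cortex_ingester_memory_series
  / ignoring(limit)
  cortex_ingester_instance_limits{limit="max_series"}
```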

How to **fix**:
1. **Scale up ingesters**<br />
   Scaling up ingesters will lower the number of series per ingester. However, the effect of this change will take up to 4h, because after the scale-up we need to wait until all stale series are dropped from memory as an effect of TSDB head compaction (with the default config, TSDB keeps in-memory series up to 3h old and the head gets compacted every 2h).
2. **Temporarily increase the limit**<br />
   If the actual number of series is very close to the limit or has already hit it, or if you foresee the ingester will hit the limit before the stale series are dropped as an effect of the scale-up, you should also temporarily increase the limit.

### CortexIngesterReachingTenantsLimit

_TODO: this playbook has not been written yet._
This alert fires when the `max_tenants` per-ingester instance limit is enabled and the actual number of tenants in an ingester is reaching the limit. Once the limit is reached, writes to the ingester will fail (5xx) for new tenants, while they will continue to succeed for previously existing ones.

In case of **emergency**:
- If the actual number of tenants is very close to the limit or has already hit it, you can increase the limit via the runtime config to gain some time
- Increasing the limit will increase the ingesters' memory utilization. Please monitor it via the `Cortex / Writes Resources` dashboard

How the limit is **configured**:
- The limit can be configured either via the CLI flag (`-ingester.instance-limits.max-tenants`) or in the runtime config:
```
ingester_limits:
max_tenants: <int>
```
- The mixin configures the limit in the runtime config and can be fine-tuned via:
```
_config+:: {
ingester_instance_limits+:: {
max_tenants: <int>
}
}
```
- When configured in the runtime config, changes are applied live, without requiring an ingester restart
- The configured limit can be queried via `cortex_ingester_instance_limits{limit="max_tenants"}` (see the example query below)
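
The same check can be done for the tenants limit. A sketch, assuming `cortex_ingester_memory_users` tracks the number of tenants with in-memory series (verify the metric name in your Cortex version):
```
# Per-ingester utilization of the max_tenants limit (1.0 = limit reached).
cortex_ingester_memory_users
  / ignoring(limit)
  cortex_ingester_instance_limits{limit="max_tenants"}
```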

How to **fix**:
1. Ensure shuffle-sharding is enabled in the Cortex cluster
1. Assuming shuffle-sharding is enabled, scaling up ingesters will lower the number of tenants per ingester. However, the effect of this change will be visible only after the `-blocks-storage.tsdb.close-idle-tsdb-timeout` period, so you may have to temporarily increase the limit

### CortexRequestLatency
First establish if the alert is for read or write latency. The alert should say.
@@ -220,11 +272,21 @@ Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHasNotSuccessfullyCleanedUpBlocks).
This alert fires when a Cortex compactor has not uploaded any compacted blocks to the storage for a long time.

How to **investigate**:
- If the alert `CortexCompactorHasNotSuccessfullyRun` or `CortexCompactorHasNotSuccessfullyRunSinceStart` have fired as well, then investigate that issue first
- If the alert `CortexCompactorHasNotSuccessfullyRunCompaction` has fired as well, then investigate that issue first
- If the alert `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` has fired as well, then investigate that issue first
- Ensure ingesters are successfully shipping blocks to the storage
- Look for any error in the compactor logs

### CortexCompactorHasNotSuccessfullyRunCompaction

This alert fires if the compactor is not able to successfully compact all discovered compactable blocks (across all tenants).

When this alert fires, the compactor may still have successfully compacted some blocks but, for some reason, the compaction of other blocks is consistently failing. A common case is when the compactor is trying to compact a corrupted block for a single tenant: in this case the compaction of blocks for other tenants keeps working, but compaction for the affected tenant is blocked by the corrupted block.

How to **investigate**:
- Look for any error in the compactor logs
- Corruption: [`not healthy index found`](#compactor-is-failing-because-of-not-healthy-index-found)
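
A sketch of a query to find which compactor replicas are currently failing, based on the same metric used by the alert:
```
# Number of failed compaction runs per replica over the last 2h.
increase(cortex_compactor_runs_failed_total[2h]) > 0
```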

#### Compactor is failing because of `not healthy index found`

The compactor may fail to compact blocks due to a corrupted block index found in one of the source blocks:
@@ -249,18 +311,6 @@ To rename a block stored on GCS you can use the `gsutil` CLI:
gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK
```

### CortexCompactorHasNotUploadedBlocksSinceStart

Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks).

### CortexCompactorHasNotSuccessfullyRunCompaction

_TODO: this playbook has not been written yet._

### CortexCompactorRunFailed

_TODO: this playbook has not been written yet._

### CortexBucketIndexNotUpdated

This alert fires when the bucket index, for a given tenant, has not been updated for a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost up-to-date view over the bucket store.
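
A sketch of a query to check the bucket index staleness per tenant, assuming the compactor exports `cortex_bucket_index_last_successful_update_timestamp_seconds` (verify the metric name in your Cortex version):
```
# Seconds since the bucket index was last updated, per tenant.
time() - cortex_bucket_index_last_successful_update_timestamp_seconds
```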
@@ -317,13 +367,33 @@ _TODO: this playbook has not been written yet._

_TODO: this playbook has not been written yet._

### CortexInconsistentConfig
### CortexInconsistentRuntimeConfig

_TODO: this playbook has not been written yet._
This alert fires if multiple replicas of the same Cortex service are using a different runtime config for a prolonged period of time.

The Cortex runtime config is a config file which gets live-reloaded by Cortex at runtime. In order for Cortex to work properly, the loaded config is expected to be exactly the same across multiple replicas of the same Cortex service (e.g. distributors, ingesters, ...). When the config changes, there may be short periods of time during which some replicas have loaded the new config and others are still running on the previous one, but it shouldn't last for more than a few minutes.

How to **investigate**:
- Check how many different config file versions (hashes) are reported
```
count by (sha256) (cortex_runtime_config_hash{namespace="<namespace>"})
```
- Check which replicas are running a different version
```
cortex_runtime_config_hash{namespace="<namespace>",sha256="<unexpected>"}
```
- Check if the runtime config has been updated on the affected replicas' filesystem. Check the `-runtime-config.file` command-line argument to find the location of the file.
- Check the affected replicas' logs and look for any errors loading the runtime config

### CortexBadRuntimeConfig

_TODO: this playbook has not been written yet._
This alert fires if Cortex is unable to reload the runtime config.

This typically means an invalid runtime config was deployed. Cortex keeps running with the previous (valid) version of the runtime config; running Cortex replicas and the system availability shouldn't be affected, but new replicas won't be able to start up until the runtime config is fixed.

How to **investigate**:
- Check the latest runtime config update (it's likely to be broken)
- Check Cortex logs to get more details about what's wrong with the config
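
A sketch of a query to find the replicas failing to reload the runtime config, based on the same metric used by the alert:
```
# Replicas whose last runtime config reload failed.
cortex_runtime_config_last_reload_successful == 0
```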

### CortexQuerierCapacityFull

@@ -347,15 +417,15 @@ _TODO: this playbook has not been written yet._

### CortexCheckpointCreationFailed

_TODO: this playbook has not been written yet._
_This alert applies to Cortex chunks storage only._

### CortexCheckpointDeletionFailed

_TODO: this playbook has not been written yet._
_This alert applies to Cortex chunks storage only._

### CortexProvisioningMemcachedTooSmall

_TODO: this playbook has not been written yet._
_This alert applies to Cortex chunks storage only._

### CortexProvisioningTooManyActiveSeries
