diff --git a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/lib/evaluate_alert.ts b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/lib/evaluate_alert.ts index c7c1eb5454d1de7..45eef3cc85a57d2 100644 --- a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/lib/evaluate_alert.ts +++ b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/lib/evaluate_alert.ts @@ -64,6 +64,7 @@ export const evaluateAlert = { const { criteria, groupBy, filterQuery, shouldDropPartialBuckets } = params; @@ -91,21 +92,53 @@ export const evaluateAlert = { - if (isTooManyBucketsPreviewException(points)) throw points; - return { - ...criterion, - metric: criterion.metric ?? DOCUMENT_COUNT_I18N, - currentValue: Array.isArray(points) ? last(points)?.value : NaN, - timestamp: Array.isArray(points) ? last(points)?.key : NaN, - shouldFire: pointsEvaluator(points, threshold, comparator), - shouldWarn: pointsEvaluator(points, warningThreshold, warningComparator), - isNoData: Array.isArray(points) - ? points.map((point) => point?.value === null || point === null) - : [points === null], - isError: isNaN(Array.isArray(points) ? last(points)?.value : points), - }; - }); + // If any previous groups are no longer being reported, backfill them with null values + const currentGroups = Object.keys(currentValues); + + const missingGroups = prevGroups.filter((g) => !currentGroups.includes(g)); + if (currentGroups.length === 0 && missingGroups.length === 0) { + missingGroups.push(UNGROUPED_FACTORY_KEY); + } + const backfillTimestamp = + last(last(Object.values(currentValues)))?.key ?? new Date().toISOString(); + const backfilledPrevGroups: Record< + string, + Array<{ key: string; value: number }> + > = missingGroups.reduce( + (result, group) => ({ + ...result, + [group]: [ + { + key: backfillTimestamp, + value: criterion.aggType === Aggregators.COUNT ? 0 : null, + }, + ], + }), + {} + ); + const currentValuesWithBackfilledPrevGroups = { + ...currentValues, + ...backfilledPrevGroups, + }; + + return mapValues( + currentValuesWithBackfilledPrevGroups, + (points: any[] | typeof NaN | null) => { + if (isTooManyBucketsPreviewException(points)) throw points; + return { + ...criterion, + metric: criterion.metric ?? DOCUMENT_COUNT_I18N, + currentValue: Array.isArray(points) ? last(points)?.value : NaN, + timestamp: Array.isArray(points) ? last(points)?.key : NaN, + shouldFire: pointsEvaluator(points, threshold, comparator), + shouldWarn: pointsEvaluator(points, warningThreshold, warningComparator), + isNoData: Array.isArray(points) + ? points.map((point) => point?.value === null || point === null) + : [points === null], + isError: isNaN(Array.isArray(points) ? last(points)?.value : points), + }; + } + ); }) ); }; @@ -119,7 +152,7 @@ const getMetric: ( filterQuery: string | undefined, timeframe?: { start?: number; end: number }, shouldDropPartialBuckets?: boolean -) => Promise> = async function ( +) => Promise>> = async function ( esClient, params, index, diff --git a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.test.ts b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.test.ts index 8eb19ad58205766..869d0afd5236713 100644 --- a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.test.ts +++ b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.test.ts @@ -37,10 +37,13 @@ let persistAlertInstances = false; // eslint-disable-line prefer-const type TestRuleState = Record & { aRuleStateKey: string; + groups: string[]; + groupBy?: string | string[]; }; const initialRuleState: TestRuleState = { aRuleStateKey: 'INITIAL_RULE_STATE_VALUE', + groups: [], }; const mockOptions = { @@ -90,6 +93,7 @@ const mockOptions = { describe('The metric threshold alert type', () => { describe('querying the entire infrastructure', () => { + afterAll(() => clearInstances()); const instanceID = '*'; const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') => executor({ @@ -157,20 +161,29 @@ describe('The metric threshold alert type', () => { }); describe('querying with a groupBy parameter', () => { - const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') => + afterAll(() => clearInstances()); + const execute = ( + comparator: Comparator, + threshold: number[], + groupBy: string[] = ['something'], + metric?: string, + state?: any + ) => executor({ ...mockOptions, services, params: { - groupBy: 'something', + groupBy, criteria: [ { ...baseNonCountCriterion, comparator, threshold, + metric: metric ?? baseNonCountCriterion.metric, }, ], }, + state: state ?? mockOptions.state.wrapped, }); const instanceIdA = 'a'; const instanceIdB = 'b'; @@ -194,9 +207,35 @@ describe('The metric threshold alert type', () => { expect(mostRecentAction(instanceIdA).action.group).toBe('a'); expect(mostRecentAction(instanceIdB).action.group).toBe('b'); }); + test('reports previous groups and the groupBy parameter in its state', async () => { + const stateResult = await execute(Comparator.GT, [0.75]); + expect(stateResult.groups).toEqual(expect.arrayContaining(['a', 'b'])); + expect(stateResult.groupBy).toEqual(['something']); + }); + test('persists previous groups that go missing, until the groupBy param changes', async () => { + const stateResult1 = await execute(Comparator.GT, [0.75], ['something'], 'test.metric.2'); + expect(stateResult1.groups).toEqual(expect.arrayContaining(['a', 'b', 'c'])); + const stateResult2 = await execute( + Comparator.GT, + [0.75], + ['something'], + 'test.metric.1', + stateResult1 + ); + expect(stateResult2.groups).toEqual(expect.arrayContaining(['a', 'b', 'c'])); + const stateResult3 = await execute( + Comparator.GT, + [0.75], + ['something', 'something-else'], + 'test.metric.1', + stateResult2 + ); + expect(stateResult3.groups).toEqual(expect.arrayContaining(['a', 'b'])); + }); }); describe('querying with multiple criteria', () => { + afterAll(() => clearInstances()); const execute = ( comparator: Comparator, thresholdA: number[], @@ -257,6 +296,7 @@ describe('The metric threshold alert type', () => { }); }); describe('querying with the count aggregator', () => { + afterAll(() => clearInstances()); const instanceID = '*'; const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') => executor({ @@ -279,8 +319,47 @@ describe('The metric threshold alert type', () => { await execute(Comparator.LT, [0.5]); expect(mostRecentAction(instanceID)).toBe(undefined); }); + describe('with a groupBy parameter', () => { + const executeGroupBy = ( + comparator: Comparator, + threshold: number[], + sourceId: string = 'default', + state?: any + ) => + executor({ + ...mockOptions, + services, + params: { + sourceId, + groupBy: 'something', + criteria: [ + { + ...baseCountCriterion, + comparator, + threshold, + }, + ], + }, + state: state ?? mockOptions.state.wrapped, + }); + const instanceIdA = 'a'; + const instanceIdB = 'b'; + + test('successfully detects and alerts on a document count of 0', async () => { + const resultState = await executeGroupBy(Comparator.LT_OR_EQ, [0]); + expect(mostRecentAction(instanceIdA)).toBe(undefined); + expect(mostRecentAction(instanceIdB)).toBe(undefined); + await executeGroupBy(Comparator.LT_OR_EQ, [0], 'empty-response', resultState); + expect(mostRecentAction(instanceIdA).id).toBe(FIRED_ACTIONS.id); + expect(mostRecentAction(instanceIdB).id).toBe(FIRED_ACTIONS.id); + await executeGroupBy(Comparator.LT_OR_EQ, [0]); + expect(mostRecentAction(instanceIdA)).toBe(undefined); + expect(mostRecentAction(instanceIdB)).toBe(undefined); + }); + }); }); describe('querying with the p99 aggregator', () => { + afterAll(() => clearInstances()); const instanceID = '*'; const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') => executor({ @@ -306,6 +385,7 @@ describe('The metric threshold alert type', () => { }); }); describe('querying with the p95 aggregator', () => { + afterAll(() => clearInstances()); const instanceID = '*'; const execute = (comparator: Comparator, threshold: number[], sourceId: string = 'default') => executor({ @@ -332,6 +412,7 @@ describe('The metric threshold alert type', () => { }); }); describe("querying a metric that hasn't reported data", () => { + afterAll(() => clearInstances()); const instanceID = '*'; const execute = (alertOnNoData: boolean, sourceId: string = 'default') => executor({ @@ -360,7 +441,51 @@ describe('The metric threshold alert type', () => { }); }); + describe('querying a groupBy alert that starts reporting no data, and then later reports data', () => { + afterAll(() => clearInstances()); + const instanceID = '*'; + const instanceIdA = 'a'; + const instanceIdB = 'b'; + const execute = (metric: string, state?: any) => + executor({ + ...mockOptions, + services, + params: { + groupBy: 'something', + sourceId: 'default', + criteria: [ + { + ...baseNonCountCriterion, + comparator: Comparator.GT, + threshold: [0], + metric, + }, + ], + alertOnNoData: true, + }, + state: state ?? mockOptions.state.wrapped, + }); + const resultState: any[] = []; + test('first sends a No Data alert with the * group, but then reports groups when data is available', async () => { + resultState.push(await execute('test.metric.3')); + expect(mostRecentAction(instanceID).id).toBe(FIRED_ACTIONS.id); + resultState.push(await execute('test.metric.3', resultState.pop())); + expect(mostRecentAction(instanceID).id).toBe(FIRED_ACTIONS.id); + resultState.push(await execute('test.metric.1', resultState.pop())); + expect(mostRecentAction(instanceID)).toBe(undefined); + expect(mostRecentAction(instanceIdA).id).toBe(FIRED_ACTIONS.id); + expect(mostRecentAction(instanceIdB).id).toBe(FIRED_ACTIONS.id); + }); + test('sends No Data alerts for the previously detected groups when they stop reporting data, but not the * group', async () => { + await execute('test.metric.3', resultState.pop()); + expect(mostRecentAction(instanceID)).toBe(undefined); + expect(mostRecentAction(instanceIdA).id).toBe(FIRED_ACTIONS.id); + expect(mostRecentAction(instanceIdB).id).toBe(FIRED_ACTIONS.id); + }); + }); + describe("querying a rate-aggregated metric that hasn't reported data", () => { + afterAll(() => clearInstances()); const instanceID = '*'; const execute = (sourceId: string = 'default') => executor({ @@ -439,6 +564,7 @@ describe('The metric threshold alert type', () => { */ describe('querying a metric with a percentage metric', () => { + afterAll(() => clearInstances()); const instanceID = '*'; const execute = () => executor({ @@ -497,7 +623,15 @@ const services: AlertServicesMock & services.scopedClusterClient.asCurrentUser.search.mockImplementation((params?: any): any => { const from = params?.body.query.bool.filter[0]?.range['@timestamp'].gte; if (params.index === 'alternatebeat-*') return mocks.changedSourceIdResponse(from); + if (params.index === 'empty-response') return mocks.emptyMetricResponse; const metric = params?.body.query.bool.filter[1]?.exists.field; + if (metric === 'test.metric.3') { + return elasticsearchClientMock.createSuccessTransportRequestPromise( + params?.body.aggs.aggregatedIntervals?.aggregations.aggregatedValueMax + ? mocks.emptyRateResponse + : mocks.emptyMetricResponse + ); + } if (params?.body.aggs.groupings) { if (params?.body.aggs.groupings.composite.after) { return elasticsearchClientMock.createSuccessTransportRequestPromise( @@ -517,12 +651,6 @@ services.scopedClusterClient.asCurrentUser.search.mockImplementation((params?: a return elasticsearchClientMock.createSuccessTransportRequestPromise( mocks.alternateMetricResponse() ); - } else if (metric === 'test.metric.3') { - return elasticsearchClientMock.createSuccessTransportRequestPromise( - params?.body.aggs.aggregatedIntervals.aggregations.aggregatedValueMax - ? mocks.emptyRateResponse - : mocks.emptyMetricResponse - ); } return elasticsearchClientMock.createSuccessTransportRequestPromise(mocks.basicMetricResponse()); }); @@ -534,6 +662,13 @@ services.savedObjectsClient.get.mockImplementation(async (type: string, sourceId type, references: [], }; + if (sourceId === 'empty-response') + return { + id: 'empty', + attributes: { metricAlias: 'empty-response' }, + type, + references: [], + }; return { id: 'default', attributes: { metricAlias: 'metricbeat-*' }, type, references: [] }; }); @@ -561,7 +696,13 @@ services.alertInstanceFactory.mockImplementation((instanceID: string) => { }); function mostRecentAction(id: string) { - return alertInstances.get(id)!.actionQueue.pop(); + const instance = alertInstances.get(id); + if (!instance) return undefined; + return instance.actionQueue.pop(); +} + +function clearInstances() { + alertInstances.clear(); } const baseNonCountCriterion: Pick< diff --git a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.ts b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.ts index 9c99ad6bf49e2b5..f49b281909f4bea 100644 --- a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.ts +++ b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/metric_threshold_executor.ts @@ -5,7 +5,7 @@ * 2.0. */ -import { first, last } from 'lodash'; +import { first, last, isEqual } from 'lodash'; import { i18n } from '@kbn/i18n'; import moment from 'moment'; import { ALERT_REASON } from '@kbn/rule-data-utils'; @@ -24,12 +24,16 @@ import { // buildRecoveredAlertReason, stateToAlertMessage, } from '../common/messages'; +import { UNGROUPED_FACTORY_KEY } from '../common/utils'; import { createFormatter } from '../../../../common/formatters'; import { AlertStates, Comparator } from './types'; import { evaluateAlert, EvaluatedAlertParams } from './lib/evaluate_alert'; export type MetricThresholdAlertTypeParams = Record; -export type MetricThresholdAlertTypeState = AlertTypeState; // no specific state used +export type MetricThresholdAlertTypeState = AlertTypeState & { + groups: string[]; + groupBy?: string | string[]; +}; export type MetricThresholdAlertInstanceState = AlertInstanceState; // no specific instace state used export type MetricThresholdAlertInstanceContext = AlertInstanceContext; // no specific instace state used @@ -58,7 +62,7 @@ export const createMetricThresholdExecutor = (libs: InfraBackendLibs) => MetricThresholdAlertInstanceContext, MetricThresholdAllowedActionGroups >(async function (options) { - const { services, params } = options; + const { services, params, state } = options; const { criteria } = params; if (criteria.length === 0) throw new Error('Cannot execute an alert with 0 conditions'); const { alertWithLifecycle, savedObjectsClient } = services; @@ -80,14 +84,28 @@ export const createMetricThresholdExecutor = (libs: InfraBackendLibs) => sourceId || 'default' ); const config = source.configuration; + + const previousGroupBy = state.groupBy; + const prevGroups = isEqual(previousGroupBy, params.groupBy) + ? // Filter out the * key from the previous groups, only include it if it's one of + // the current groups. In case of a groupBy alert that starts out with no data and no + // groups, we don't want to persist the existence of the * alert instance + state.groups?.filter((g) => g !== UNGROUPED_FACTORY_KEY) ?? [] + : []; + const alertResults = await evaluateAlert( services.scopedClusterClient.asCurrentUser, params as EvaluatedAlertParams, - config + config, + prevGroups ); // Because each alert result has the same group definitions, just grab the groups from the first one. - const groups = Object.keys(first(alertResults)!); + const resultGroups = Object.keys(first(alertResults)!); + // Merge the list of currently fetched groups and previous groups, and uniquify them. This is necessary for reporting + // no data results on groups that get removed + const groups = [...new Set([...prevGroups, ...resultGroups])]; + for (const group of groups) { // AND logic; all criteria must be across the threshold const shouldAlertFire = alertResults.every((result) => @@ -169,6 +187,8 @@ export const createMetricThresholdExecutor = (libs: InfraBackendLibs) => }); } } + + return { groups, groupBy: params.groupBy }; }); export const FIRED_ACTIONS = { diff --git a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/test_mocks.ts b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/test_mocks.ts index b1173f2d611c81d..db6b771e91784a5 100644 --- a/x-pack/plugins/infra/server/lib/alerting/metric_threshold/test_mocks.ts +++ b/x-pack/plugins/infra/server/lib/alerting/metric_threshold/test_mocks.ts @@ -199,12 +199,20 @@ export const alternateCompositeResponse = (from: number) => ({ buckets: bucketsA(from), }, }, + { + key: { + groupBy0: 'c', + }, + aggregatedIntervals: { + buckets: bucketsC(from), + }, + }, ], }, }, hits: { total: { - value: 2, + value: 3, }, }, }); diff --git a/x-pack/plugins/infra/server/utils/get_all_composite_data.ts b/x-pack/plugins/infra/server/utils/get_all_composite_data.ts index df97c91aacd0458..1ab290796e36d23 100644 --- a/x-pack/plugins/infra/server/utils/get_all_composite_data.ts +++ b/x-pack/plugins/infra/server/utils/get_all_composite_data.ts @@ -24,7 +24,7 @@ export const getAllCompositeData = async < const { body: response } = await esClientSearch(options); // Nothing available, return the previous buckets. - if (response.hits.total.value === 0) { + if (response.hits?.total.value === 0) { return previousBuckets; } diff --git a/x-pack/test/api_integration/apis/metrics_ui/constants.ts b/x-pack/test/api_integration/apis/metrics_ui/constants.ts index f0ba9b4c368d597..2ca89f2f9ab87f5 100644 --- a/x-pack/test/api_integration/apis/metrics_ui/constants.ts +++ b/x-pack/test/api_integration/apis/metrics_ui/constants.ts @@ -31,7 +31,8 @@ export const DATES = { 'alert-test-data': { gauge: { min: 1609459200000, // '2022-01-01T00:00:00Z' - max: 1609462800000, // '2021-01-01T01:00:00Z' + max: 1609462800000, // '2021-01-01T01:00:00Z', + midpoint: 1609461000000, // '2021-01-01T00:30:00Z' }, rate: { min: 1609545600000, // '2021-01-02T00:00:00Z' diff --git a/x-pack/test/api_integration/apis/metrics_ui/metric_threshold_alert.ts b/x-pack/test/api_integration/apis/metrics_ui/metric_threshold_alert.ts index 28910bbc6b0c870..66c40e2e6e92d82 100644 --- a/x-pack/test/api_integration/apis/metrics_ui/metric_threshold_alert.ts +++ b/x-pack/test/api_integration/apis/metrics_ui/metric_threshold_alert.ts @@ -100,7 +100,7 @@ export default function ({ getService }: FtrProviderContext) { ], }; const timeFrame = { end: gauge.max }; - const results = await evaluateAlert(esClient, params, configuration, timeFrame); + const results = await evaluateAlert(esClient, params, configuration, [], timeFrame); expect(results).to.eql([ { '*': { @@ -123,7 +123,7 @@ export default function ({ getService }: FtrProviderContext) { it('should alert on the last value when the end date is the same as the last event', async () => { const params = { ...baseParams }; const timeFrame = { end: gauge.max }; - const results = await evaluateAlert(esClient, params, configuration, timeFrame); + const results = await evaluateAlert(esClient, params, configuration, [], timeFrame); expect(results).to.eql([ { '*': { @@ -160,7 +160,7 @@ export default function ({ getService }: FtrProviderContext) { ], }; const timeFrame = { end: gauge.max }; - const results = await evaluateAlert(esClient, params, configuration, timeFrame); + const results = await evaluateAlert(esClient, params, configuration, [], timeFrame); expect(results).to.eql([ { dev: { @@ -200,7 +200,7 @@ export default function ({ getService }: FtrProviderContext) { groupBy: ['env'], }; const timeFrame = { end: gauge.max }; - const results = await evaluateAlert(esClient, params, configuration, timeFrame); + const results = await evaluateAlert(esClient, params, configuration, [], timeFrame); expect(results).to.eql([ { dev: { @@ -234,6 +234,53 @@ export default function ({ getService }: FtrProviderContext) { }, ]); }); + + it('should report no data when one of the groups has a data gap', async () => { + const params = { + ...baseParams, + groupBy: ['env'], + }; + const timeFrame = { end: gauge.midpoint }; + const results = await evaluateAlert( + esClient, + params, + configuration, + ['dev', 'prod'], + timeFrame + ); + expect(results).to.eql([ + { + dev: { + timeSize: 5, + timeUnit: 'm', + threshold: [1], + comparator: '>=', + aggType: 'sum', + metric: 'value', + currentValue: null, + timestamp: '2021-01-01T00:25:00.000Z', + shouldFire: [false], + shouldWarn: [false], + isNoData: [true], + isError: false, + }, + prod: { + timeSize: 5, + timeUnit: 'm', + threshold: [1], + comparator: '>=', + aggType: 'sum', + metric: 'value', + currentValue: 0, + timestamp: '2021-01-01T00:25:00.000Z', + shouldFire: [false], + shouldWarn: [false], + isNoData: [false], + isError: false, + }, + }, + ]); + }); }); }); @@ -254,7 +301,7 @@ export default function ({ getService }: FtrProviderContext) { ], }; const timeFrame = { end: rate.max }; - const results = await evaluateAlert(esClient, params, configuration, timeFrame); + const results = await evaluateAlert(esClient, params, configuration, [], timeFrame); expect(results).to.eql([ { '*': { @@ -294,7 +341,7 @@ export default function ({ getService }: FtrProviderContext) { ], }; const timeFrame = { end: rate.max }; - const results = await evaluateAlert(esClient, params, configuration, timeFrame); + const results = await evaluateAlert(esClient, params, configuration, [], timeFrame); expect(results).to.eql([ { dev: { diff --git a/x-pack/test/functional/es_archives/infra/alerts_test_data/data.json.gz b/x-pack/test/functional/es_archives/infra/alerts_test_data/data.json.gz index 1c76205f4caa2a1..92815ba80a3a5ea 100644 Binary files a/x-pack/test/functional/es_archives/infra/alerts_test_data/data.json.gz and b/x-pack/test/functional/es_archives/infra/alerts_test_data/data.json.gz differ