Skip to content

Commit

Permalink
Make TaskManager health status error/warning reasons more visible (el…
Browse files Browse the repository at this point in the history
…astic#154045)

Resolves: elastic#152289

With this PR we make some of the `debug` logs `warn` and return the
message to the health API as reason to add in the status summary
message.
  • Loading branch information
ersin-erdal authored and nikitaindik committed Apr 25, 2023
1 parent 3aff9fb commit ae87656
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 85 deletions.

This file was deleted.

24 changes: 16 additions & 8 deletions x-pack/plugins/task_manager/server/lib/calculate_health_status.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ export function calculateHealthStatus(
config: TaskManagerConfig,
shouldRunTasks: boolean,
logger: Logger
): HealthStatus {
): { status: HealthStatus; reason?: string } {
const now = Date.now();

// if "hot" health stats are any more stale than monitored_stats_required_freshness
Expand All @@ -28,27 +28,35 @@ export function calculateHealthStatus(
const requiredColdStatsFreshness: number = config.monitored_aggregated_stats_refresh_rate * 1.5;

if (hasStatus(summarizedStats.stats, HealthStatus.Error)) {
return HealthStatus.Error;
return {
status: HealthStatus.Error,
reason: summarizedStats.stats.capacity_estimation?.reason,
};
}

// Hot timestamps look at runtime stats which are not available when tasks are not running
if (shouldRunTasks) {
if (hasExpiredHotTimestamps(summarizedStats, now, requiredHotStatsFreshness)) {
logger.debug('setting HealthStatus.Error because of expired hot timestamps');
return HealthStatus.Error;
const reason = 'setting HealthStatus.Error because of expired hot timestamps';
logger.warn(reason);
return { status: HealthStatus.Error, reason };
}
}

if (hasExpiredColdTimestamps(summarizedStats, now, requiredColdStatsFreshness)) {
logger.debug('setting HealthStatus.Error because of expired cold timestamps');
return HealthStatus.Error;
const reason = 'setting HealthStatus.Error because of expired cold timestamps';
logger.warn(reason);
return { status: HealthStatus.Error, reason };
}

if (hasStatus(summarizedStats.stats, HealthStatus.Warning)) {
return HealthStatus.Warning;
return {
status: HealthStatus.Warning,
reason: summarizedStats.stats.capacity_estimation?.reason,
};
}

return HealthStatus.OK;
return { status: HealthStatus.OK };
}

function hasStatus(stats: RawMonitoringStats['stats'], status: HealthStatus): boolean {
Expand Down
40 changes: 30 additions & 10 deletions x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,30 @@ describe('logHealthMetrics', () => {
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');

// We must change from OK to Warning
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.OK,
}));
logHealthMetrics(health, logger, config, true, docLinks);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
() => HealthStatus.Warning
);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.Warning,
}));
logHealthMetrics(health, logger, config, true, docLinks);
// We must change from OK to Error
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.OK,
}));
logHealthMetrics(health, logger, config, true, docLinks);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.Error,
}));
logHealthMetrics(health, logger, config, true, docLinks);

const debugCalls = (logger as jest.Mocked<Logger>).debug.mock.calls;
Expand Down Expand Up @@ -175,9 +189,11 @@ describe('logHealthMetrics', () => {
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
() => HealthStatus.Warning
);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.Warning,
}));

logHealthMetrics(health, logger, config, true, docLinks);

Expand All @@ -201,7 +217,11 @@ describe('logHealthMetrics', () => {
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
(
calculateHealthStatus as jest.Mock<{ status: HealthStatus; reason?: string }>
).mockImplementation(() => ({
status: HealthStatus.Error,
}));

logHealthMetrics(health, logger, config, true, docLinks);

Expand Down
9 changes: 3 additions & 6 deletions x-pack/plugins/task_manager/server/lib/log_health_metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,9 @@ export function logHealthMetrics(
capacity_estimation: undefined,
},
};
const statusWithoutCapacity = calculateHealthStatus(
healthWithoutCapacity,
config,
shouldRunTasks,
logger
);
const healthStatus = calculateHealthStatus(healthWithoutCapacity, config, shouldRunTasks, logger);

const statusWithoutCapacity = healthStatus?.status;
if (statusWithoutCapacity === HealthStatus.Warning) {
logLevel = LogLevel.Warn;
} else if (statusWithoutCapacity === HealthStatus.Error && !isEmpty(monitoredHealth.stats)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,13 +184,14 @@ export function estimateCapacity(
averageCapacityUsedByNonRecurringAndEphemeralTasksPerKibana +
averageRecurringRequiredPerMinute / assumedKibanaInstances;

const status = getHealthStatus(logger, {
const { status, reason } = getHealthStatus(logger, {
assumedRequiredThroughputPerMinutePerKibana,
assumedAverageRecurringRequiredThroughputPerMinutePerKibana,
capacityPerMinutePerKibana,
});
return {
status,
reason,
timestamp: new Date().toISOString(),
value: {
observed: mapValues(
Expand Down Expand Up @@ -231,27 +232,28 @@ interface GetHealthStatusParams {
capacityPerMinutePerKibana: number;
}

function getHealthStatus(logger: Logger, params: GetHealthStatusParams): HealthStatus {
function getHealthStatus(
logger: Logger,
params: GetHealthStatusParams
): { status: HealthStatus; reason?: string } {
const {
assumedRequiredThroughputPerMinutePerKibana,
assumedAverageRecurringRequiredThroughputPerMinutePerKibana,
capacityPerMinutePerKibana,
} = params;
if (assumedRequiredThroughputPerMinutePerKibana < capacityPerMinutePerKibana) {
return HealthStatus.OK;
return { status: HealthStatus.OK };
}

if (assumedAverageRecurringRequiredThroughputPerMinutePerKibana < capacityPerMinutePerKibana) {
logger.debug(
`setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) < capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`
);
return HealthStatus.Warning;
const reason = `setting HealthStatus.Warning because assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) < capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`;
logger.warn(reason);
return { status: HealthStatus.Warning, reason };
}

logger.debug(
`setting HealthStatus.Error because assumedRequiredThroughputPerMinutePerKibana (${assumedRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana}) AND assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`
);
return HealthStatus.Error;
const reason = `setting HealthStatus.Error because assumedRequiredThroughputPerMinutePerKibana (${assumedRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana}) AND assumedAverageRecurringRequiredThroughputPerMinutePerKibana (${assumedAverageRecurringRequiredThroughputPerMinutePerKibana}) >= capacityPerMinutePerKibana (${capacityPerMinutePerKibana})`;
logger.warn(reason);
return { status: HealthStatus.Error, reason };
}

export function withCapacityEstimate(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ export interface MonitoredStat<T> {
}
export type RawMonitoredStat<T extends JsonObject> = MonitoredStat<T> & {
status: HealthStatus;
reason?: string;
};

export interface RawMonitoringStats {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -375,24 +375,24 @@ describe('Task Run Statistics', () => {
{ Success: 40, RetryScheduled: 40, Failed: 20, status: 'OK' },
]);

expect(logger.debug).toHaveBeenCalledTimes(5);
expect(logger.debug).toHaveBeenNthCalledWith(
expect(logger.warn).toHaveBeenCalledTimes(5);
expect(logger.warn).toHaveBeenNthCalledWith(
1,
'Health Status warn threshold has been exceeded, resultFrequencySummary.Failed (40) is greater than warn_threshold (39)'
);
expect(logger.debug).toHaveBeenNthCalledWith(
expect(logger.warn).toHaveBeenNthCalledWith(
2,
'Health Status error threshold has been exceeded, resultFrequencySummary.Failed (60) is greater than error_threshold (59)'
);
expect(logger.debug).toHaveBeenNthCalledWith(
expect(logger.warn).toHaveBeenNthCalledWith(
3,
'Health Status error threshold has been exceeded, resultFrequencySummary.Failed (60) is greater than error_threshold (59)'
);
expect(logger.debug).toHaveBeenNthCalledWith(
expect(logger.warn).toHaveBeenNthCalledWith(
4,
'Health Status error threshold has been exceeded, resultFrequencySummary.Failed (60) is greater than error_threshold (59)'
);
expect(logger.debug).toHaveBeenNthCalledWith(
expect(logger.warn).toHaveBeenNthCalledWith(
5,
'Health Status warn threshold has been exceeded, resultFrequencySummary.Failed (40) is greater than warn_threshold (39)'
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -433,11 +433,11 @@ function getHealthStatus(
): HealthStatus {
if (resultFrequencySummary.Failed > executionErrorThreshold.warn_threshold) {
if (resultFrequencySummary.Failed > executionErrorThreshold.error_threshold) {
logger.debug(
logger.warn(
`Health Status error threshold has been exceeded, resultFrequencySummary.Failed (${resultFrequencySummary.Failed}) is greater than error_threshold (${executionErrorThreshold.error_threshold})`
);
} else {
logger.debug(
logger.warn(
`Health Status warn threshold has been exceeded, resultFrequencySummary.Failed (${resultFrequencySummary.Failed}) is greater than warn_threshold (${executionErrorThreshold.warn_threshold})`
);
}
Expand Down
Loading

0 comments on commit ae87656

Please sign in to comment.