From 619db365912b752552edc2f97995529e2cc26328 Mon Sep 17 00:00:00 2001 From: Gidi Meir Morris Date: Thu, 11 Feb 2021 14:46:14 +0000 Subject: [PATCH] [Task manager] Adds support for limited concurrency tasks (#90365) Adds support for limited concurrency on a Task Type. --- x-pack/plugins/task_manager/README.md | 8 +- .../server/buffered_task_store.test.ts | 10 +- .../server/buffered_task_store.ts | 4 - .../task_manager/server/lib/fill_pool.test.ts | 56 +- .../task_manager/server/lib/fill_pool.ts | 132 +- .../monitoring/task_run_statistics.test.ts | 1 + .../server/monitoring/task_run_statistics.ts | 56 +- .../task_manager/server/plugin.test.ts | 9 + x-pack/plugins/task_manager/server/plugin.ts | 10 +- .../polling/delay_on_claim_conflicts.test.ts | 61 + .../polling/delay_on_claim_conflicts.ts | 12 +- .../server/polling_lifecycle.test.ts | 151 +- .../task_manager/server/polling_lifecycle.ts | 126 +- .../mark_available_tasks_as_claimed.test.ts | 97 +- .../mark_available_tasks_as_claimed.ts | 70 +- .../server/queries/task_claiming.mock.ts | 33 + .../server/queries/task_claiming.test.ts | 1516 +++++++++++++ .../server/queries/task_claiming.ts | 488 +++++ x-pack/plugins/task_manager/server/task.ts | 10 + .../task_manager/server/task_events.ts | 16 +- .../task_manager/server/task_pool.test.ts | 2 + .../plugins/task_manager/server/task_pool.ts | 54 +- .../server/task_running/task_runner.test.ts | 1915 +++++++++-------- .../server/task_running/task_runner.ts | 191 +- .../server/task_scheduling.test.ts | 105 +- .../task_manager/server/task_scheduling.ts | 29 +- .../task_manager/server/task_store.mock.ts | 17 +- .../task_manager/server/task_store.test.ts | 1098 +--------- .../plugins/task_manager/server/task_store.ts | 240 +-- .../server/task_type_dictionary.ts | 4 + .../sample_task_plugin/server/init_routes.ts | 10 +- .../sample_task_plugin/server/plugin.ts | 14 + .../test_suites/task_manager/health_route.ts | 15 +- .../task_manager/task_management.ts | 207 +- 34 files changed, 4163 insertions(+), 2604 deletions(-) create mode 100644 x-pack/plugins/task_manager/server/queries/task_claiming.mock.ts create mode 100644 x-pack/plugins/task_manager/server/queries/task_claiming.test.ts create mode 100644 x-pack/plugins/task_manager/server/queries/task_claiming.ts diff --git a/x-pack/plugins/task_manager/README.md b/x-pack/plugins/task_manager/README.md index 9be3be14ea3fca..c20bc4b29bcc84 100644 --- a/x-pack/plugins/task_manager/README.md +++ b/x-pack/plugins/task_manager/README.md @@ -85,10 +85,10 @@ export class Plugin { // This defaults to what is configured at the task manager level. maxAttempts: 5, - // The clusterMonitoring task occupies 2 workers, so if the system has 10 worker slots, - // 5 clusterMonitoring tasks could run concurrently per Kibana instance. This value is - // overridden by the `override_num_workers` config value, if specified. - numWorkers: 2, + // The maximum number tasks of this type that can be run concurrently per Kibana instance. + // Setting this value will force Task Manager to poll for this task type seperatly from other task types which + // can add significant load to the ES cluster, so please use this configuration only when absolutly necesery. + maxConcurrency: 1, // The createTaskRunner function / method returns an object that is responsible for // performing the work of the task. context: { taskInstance }, is documented below. diff --git a/x-pack/plugins/task_manager/server/buffered_task_store.test.ts b/x-pack/plugins/task_manager/server/buffered_task_store.test.ts index 70d24b235d8805..45607713a31287 100644 --- a/x-pack/plugins/task_manager/server/buffered_task_store.test.ts +++ b/x-pack/plugins/task_manager/server/buffered_task_store.test.ts @@ -13,19 +13,17 @@ import { TaskStatus } from './task'; describe('Buffered Task Store', () => { test('proxies the TaskStore for `maxAttempts` and `remove`', async () => { - const taskStore = taskStoreMock.create({ maxAttempts: 10 }); + const taskStore = taskStoreMock.create(); taskStore.bulkUpdate.mockResolvedValue([]); const bufferedStore = new BufferedTaskStore(taskStore, {}); - expect(bufferedStore.maxAttempts).toEqual(10); - bufferedStore.remove('1'); expect(taskStore.remove).toHaveBeenCalledWith('1'); }); describe('update', () => { test("proxies the TaskStore's `bulkUpdate`", async () => { - const taskStore = taskStoreMock.create({ maxAttempts: 10 }); + const taskStore = taskStoreMock.create(); const bufferedStore = new BufferedTaskStore(taskStore, {}); const task = mockTask(); @@ -37,7 +35,7 @@ describe('Buffered Task Store', () => { }); test('handles partially successfull bulkUpdates resolving each call appropriately', async () => { - const taskStore = taskStoreMock.create({ maxAttempts: 10 }); + const taskStore = taskStoreMock.create(); const bufferedStore = new BufferedTaskStore(taskStore, {}); const tasks = [mockTask(), mockTask(), mockTask()]; @@ -61,7 +59,7 @@ describe('Buffered Task Store', () => { }); test('handles multiple items with the same id', async () => { - const taskStore = taskStoreMock.create({ maxAttempts: 10 }); + const taskStore = taskStoreMock.create(); const bufferedStore = new BufferedTaskStore(taskStore, {}); const duplicateIdTask = mockTask(); diff --git a/x-pack/plugins/task_manager/server/buffered_task_store.ts b/x-pack/plugins/task_manager/server/buffered_task_store.ts index 4e4a533303867f..ca735dd6f36389 100644 --- a/x-pack/plugins/task_manager/server/buffered_task_store.ts +++ b/x-pack/plugins/task_manager/server/buffered_task_store.ts @@ -26,10 +26,6 @@ export class BufferedTaskStore implements Updatable { ); } - public get maxAttempts(): number { - return this.taskStore.maxAttempts; - } - public async update(doc: ConcreteTaskInstance): Promise { return unwrapPromise(this.bufferedUpdate(doc)); } diff --git a/x-pack/plugins/task_manager/server/lib/fill_pool.test.ts b/x-pack/plugins/task_manager/server/lib/fill_pool.test.ts index 79a0d2f6900429..8e0396a453b3d9 100644 --- a/x-pack/plugins/task_manager/server/lib/fill_pool.test.ts +++ b/x-pack/plugins/task_manager/server/lib/fill_pool.test.ts @@ -10,27 +10,32 @@ import sinon from 'sinon'; import { fillPool, FillPoolResult } from './fill_pool'; import { TaskPoolRunResult } from '../task_pool'; import { asOk, Result } from './result_type'; -import { ClaimOwnershipResult } from '../task_store'; import { ConcreteTaskInstance, TaskStatus } from '../task'; import { TaskManagerRunner } from '../task_running/task_runner'; +import { from, Observable } from 'rxjs'; +import { ClaimOwnershipResult } from '../queries/task_claiming'; jest.mock('../task_running/task_runner'); describe('fillPool', () => { function mockFetchAvailableTasks( tasksToMock: number[][] - ): () => Promise> { - const tasks: ConcreteTaskInstance[][] = tasksToMock.map((ids) => mockTaskInstances(ids)); - let index = 0; - return async () => - asOk({ - stats: { - tasksUpdated: tasks[index + 1]?.length ?? 0, - tasksConflicted: 0, - tasksClaimed: 0, - }, - docs: tasks[index++] || [], - }); + ): () => Observable> { + const claimCycles: ConcreteTaskInstance[][] = tasksToMock.map((ids) => mockTaskInstances(ids)); + return () => + from( + claimCycles.map((tasks) => + asOk({ + stats: { + tasksUpdated: tasks?.length ?? 0, + tasksConflicted: 0, + tasksClaimed: 0, + tasksRejected: 0, + }, + docs: tasks, + }) + ) + ); } const mockTaskInstances = (ids: number[]): ConcreteTaskInstance[] => @@ -51,7 +56,7 @@ describe('fillPool', () => { ownerId: null, })); - test('stops filling when pool runs all claimed tasks, even if there is more capacity', async () => { + test('fills task pool with all claimed tasks until fetchAvailableTasks stream closes', async () => { const tasks = [ [1, 2, 3], [4, 5], @@ -62,21 +67,7 @@ describe('fillPool', () => { await fillPool(fetchAvailableTasks, converter, run); - expect(_.flattenDeep(run.args)).toEqual(mockTaskInstances([1, 2, 3])); - }); - - test('stops filling when the pool has no more capacity', async () => { - const tasks = [ - [1, 2, 3], - [4, 5], - ]; - const fetchAvailableTasks = mockFetchAvailableTasks(tasks); - const run = sinon.spy(async () => TaskPoolRunResult.RanOutOfCapacity); - const converter = _.identity; - - await fillPool(fetchAvailableTasks, converter, run); - - expect(_.flattenDeep(run.args)).toEqual(mockTaskInstances([1, 2, 3])); + expect(_.flattenDeep(run.args)).toEqual(mockTaskInstances([1, 2, 3, 4, 5])); }); test('calls the converter on the records prior to running', async () => { @@ -91,7 +82,7 @@ describe('fillPool', () => { await fillPool(fetchAvailableTasks, converter, run); - expect(_.flattenDeep(run.args)).toEqual(['1', '2', '3']); + expect(_.flattenDeep(run.args)).toEqual(['1', '2', '3', '4', '5']); }); describe('error handling', () => { @@ -101,7 +92,10 @@ describe('fillPool', () => { (instance.id as unknown) as TaskManagerRunner; try { - const fetchAvailableTasks = async () => Promise.reject('fetch is not working'); + const fetchAvailableTasks = () => + new Observable>((obs) => + obs.error('fetch is not working') + ); await fillPool(fetchAvailableTasks, converter, run); } catch (err) { diff --git a/x-pack/plugins/task_manager/server/lib/fill_pool.ts b/x-pack/plugins/task_manager/server/lib/fill_pool.ts index 45a33081bde51e..c9050ebb75d69f 100644 --- a/x-pack/plugins/task_manager/server/lib/fill_pool.ts +++ b/x-pack/plugins/task_manager/server/lib/fill_pool.ts @@ -6,12 +6,14 @@ */ import { performance } from 'perf_hooks'; +import { Observable } from 'rxjs'; +import { concatMap, last } from 'rxjs/operators'; +import { ClaimOwnershipResult } from '../queries/task_claiming'; import { ConcreteTaskInstance } from '../task'; import { WithTaskTiming, startTaskTimer } from '../task_events'; import { TaskPoolRunResult } from '../task_pool'; import { TaskManagerRunner } from '../task_running'; -import { ClaimOwnershipResult } from '../task_store'; -import { Result, map } from './result_type'; +import { Result, map as mapResult, asErr, asOk } from './result_type'; export enum FillPoolResult { Failed = 'Failed', @@ -22,6 +24,17 @@ export enum FillPoolResult { PoolFilled = 'PoolFilled', } +type FillPoolAndRunResult = Result< + { + result: TaskPoolRunResult; + stats?: ClaimOwnershipResult['stats']; + }, + { + result: FillPoolResult; + stats?: ClaimOwnershipResult['stats']; + } +>; + export type ClaimAndFillPoolResult = Partial> & { result: FillPoolResult; }; @@ -40,52 +53,81 @@ export type TimedFillPoolResult = WithTaskTiming; * @param converter - a function that converts task records to the appropriate task runner */ export async function fillPool( - fetchAvailableTasks: () => Promise>, + fetchAvailableTasks: () => Observable>, converter: (taskInstance: ConcreteTaskInstance) => TaskManagerRunner, run: (tasks: TaskManagerRunner[]) => Promise ): Promise { performance.mark('fillPool.start'); - const stopTaskTimer = startTaskTimer(); - const augmentTimingTo = ( - result: FillPoolResult, - stats?: ClaimOwnershipResult['stats'] - ): TimedFillPoolResult => ({ - result, - stats, - timing: stopTaskTimer(), - }); - return map>( - await fetchAvailableTasks(), - async ({ docs, stats }) => { - if (!docs.length) { - performance.mark('fillPool.bailNoTasks'); - performance.measure( - 'fillPool.activityDurationUntilNoTasks', - 'fillPool.start', - 'fillPool.bailNoTasks' - ); - return augmentTimingTo(FillPoolResult.NoTasksClaimed, stats); - } - - const tasks = docs.map(converter); - - switch (await run(tasks)) { - case TaskPoolRunResult.RanOutOfCapacity: - performance.mark('fillPool.bailExhaustedCapacity'); - performance.measure( - 'fillPool.activityDurationUntilExhaustedCapacity', - 'fillPool.start', - 'fillPool.bailExhaustedCapacity' + return new Promise((resolve, reject) => { + const stopTaskTimer = startTaskTimer(); + const augmentTimingTo = ( + result: FillPoolResult, + stats?: ClaimOwnershipResult['stats'] + ): TimedFillPoolResult => ({ + result, + stats, + timing: stopTaskTimer(), + }); + fetchAvailableTasks() + .pipe( + // each ClaimOwnershipResult will be sequencially consumed an ran using the `run` handler + concatMap(async (res) => + mapResult>( + res, + async ({ docs, stats }) => { + if (!docs.length) { + performance.mark('fillPool.bailNoTasks'); + performance.measure( + 'fillPool.activityDurationUntilNoTasks', + 'fillPool.start', + 'fillPool.bailNoTasks' + ); + return asOk({ result: TaskPoolRunResult.NoTaskWereRan, stats }); + } + return asOk( + await run(docs.map(converter)).then((runResult) => ({ + result: runResult, + stats, + })) + ); + }, + async (fillPoolResult) => asErr({ result: fillPoolResult }) + ) + ), + // when the final call to `run` completes, we'll complete the stream and emit the + // final accumulated result + last() + ) + .subscribe( + (claimResults) => { + resolve( + mapResult( + claimResults, + ({ result, stats }) => { + switch (result) { + case TaskPoolRunResult.RanOutOfCapacity: + performance.mark('fillPool.bailExhaustedCapacity'); + performance.measure( + 'fillPool.activityDurationUntilExhaustedCapacity', + 'fillPool.start', + 'fillPool.bailExhaustedCapacity' + ); + return augmentTimingTo(FillPoolResult.RanOutOfCapacity, stats); + case TaskPoolRunResult.RunningAtCapacity: + performance.mark('fillPool.cycle'); + return augmentTimingTo(FillPoolResult.RunningAtCapacity, stats); + case TaskPoolRunResult.NoTaskWereRan: + return augmentTimingTo(FillPoolResult.NoTasksClaimed, stats); + default: + performance.mark('fillPool.cycle'); + return augmentTimingTo(FillPoolResult.PoolFilled, stats); + } + }, + ({ result, stats }) => augmentTimingTo(result, stats) + ) ); - return augmentTimingTo(FillPoolResult.RanOutOfCapacity, stats); - case TaskPoolRunResult.RunningAtCapacity: - performance.mark('fillPool.cycle'); - return augmentTimingTo(FillPoolResult.RunningAtCapacity, stats); - default: - performance.mark('fillPool.cycle'); - return augmentTimingTo(FillPoolResult.PoolFilled, stats); - } - }, - async (result) => augmentTimingTo(result) - ); + }, + (err) => reject(err) + ); + }); } diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts index 5c32c3e7225c43..7040d5acd4eaf3 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts @@ -537,6 +537,7 @@ describe('Task Run Statistics', () => { asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed, timing })) ); events$.next(asTaskManagerStatEvent('pollingDelay', asOk(0))); + events$.next(asTaskManagerStatEvent('claimDuration', asOk(10))); events$.next( asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed, timing })) ); diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts index 4b7bdf595f1f55..3185d3c449c32c 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts @@ -19,6 +19,7 @@ import { RanTask, TaskTiming, isTaskManagerStatEvent, + TaskManagerStat, } from '../task_events'; import { isOk, Ok, unwrap } from '../lib/result_type'; import { ConcreteTaskInstance } from '../task'; @@ -39,6 +40,7 @@ interface FillPoolStat extends JsonObject { last_successful_poll: string; last_polling_delay: string; duration: number[]; + claim_duration: number[]; claim_conflicts: number[]; claim_mismatches: number[]; result_frequency_percent_as_number: FillPoolResult[]; @@ -51,6 +53,7 @@ interface ExecutionStat extends JsonObject { export interface TaskRunStat extends JsonObject { drift: number[]; + drift_by_type: Record; load: number[]; execution: ExecutionStat; polling: Omit & @@ -125,6 +128,7 @@ export function createTaskRunAggregator( const resultFrequencyQueue = createRunningAveragedStat(runningAverageWindowSize); const pollingDurationQueue = createRunningAveragedStat(runningAverageWindowSize); + const claimDurationQueue = createRunningAveragedStat(runningAverageWindowSize); const claimConflictsQueue = createRunningAveragedStat(runningAverageWindowSize); const claimMismatchesQueue = createRunningAveragedStat(runningAverageWindowSize); const taskPollingEvents$: Observable> = combineLatest([ @@ -168,10 +172,26 @@ export function createTaskRunAggregator( ), map(() => new Date().toISOString()) ), + // get duration of task claim stage in polling + taskPollingLifecycle.events.pipe( + filter( + (taskEvent: TaskLifecycleEvent) => + isTaskManagerStatEvent(taskEvent) && + taskEvent.id === 'claimDuration' && + isOk(taskEvent.event) + ), + map((claimDurationEvent) => { + const duration = ((claimDurationEvent as TaskManagerStat).event as Ok).value; + return { + claimDuration: duration ? claimDurationQueue(duration) : claimDurationQueue(), + }; + }) + ), ]).pipe( - map(([{ polling }, pollingDelay]) => ({ + map(([{ polling }, pollingDelay, { claimDuration }]) => ({ polling: { last_polling_delay: pollingDelay, + claim_duration: claimDuration, ...polling, }, })) @@ -179,13 +199,18 @@ export function createTaskRunAggregator( return combineLatest([ taskRunEvents$.pipe( - startWith({ drift: [], execution: { duration: {}, result_frequency_percent_as_number: {} } }) + startWith({ + drift: [], + drift_by_type: {}, + execution: { duration: {}, result_frequency_percent_as_number: {} }, + }) ), taskManagerLoadStatEvents$.pipe(startWith({ load: [] })), taskPollingEvents$.pipe( startWith({ polling: { duration: [], + claim_duration: [], claim_conflicts: [], claim_mismatches: [], result_frequency_percent_as_number: [], @@ -218,6 +243,7 @@ function hasTiming(taskEvent: TaskLifecycleEvent) { function createTaskRunEventToStat(runningAverageWindowSize: number) { const driftQueue = createRunningAveragedStat(runningAverageWindowSize); + const driftByTaskQueue = createMapOfRunningAveragedStats(runningAverageWindowSize); const taskRunDurationQueue = createMapOfRunningAveragedStats(runningAverageWindowSize); const resultFrequencyQueue = createMapOfRunningAveragedStats( runningAverageWindowSize @@ -226,13 +252,17 @@ function createTaskRunEventToStat(runningAverageWindowSize: number) { task: ConcreteTaskInstance, timing: TaskTiming, result: TaskRunResult - ): Omit => ({ - drift: driftQueue(timing!.start - task.runAt.getTime()), - execution: { - duration: taskRunDurationQueue(task.taskType, timing!.stop - timing!.start), - result_frequency_percent_as_number: resultFrequencyQueue(task.taskType, result), - }, - }); + ): Omit => { + const drift = timing!.start - task.runAt.getTime(); + return { + drift: driftQueue(drift), + drift_by_type: driftByTaskQueue(task.taskType, drift), + execution: { + duration: taskRunDurationQueue(task.taskType, timing!.stop - timing!.start), + result_frequency_percent_as_number: resultFrequencyQueue(task.taskType, result), + }, + }; + }; } const DEFAULT_TASK_RUN_FREQUENCIES = { @@ -258,11 +288,15 @@ export function summarizeTaskRunStat( // eslint-disable-next-line @typescript-eslint/naming-convention last_polling_delay, duration: pollingDuration, + // eslint-disable-next-line @typescript-eslint/naming-convention + claim_duration, result_frequency_percent_as_number: pollingResultFrequency, claim_conflicts: claimConflicts, claim_mismatches: claimMismatches, }, drift, + // eslint-disable-next-line @typescript-eslint/naming-convention + drift_by_type, load, execution: { duration, result_frequency_percent_as_number: executionResultFrequency }, }: TaskRunStat, @@ -273,6 +307,9 @@ export function summarizeTaskRunStat( polling: { ...(last_successful_poll ? { last_successful_poll } : {}), ...(last_polling_delay ? { last_polling_delay } : {}), + ...(claim_duration + ? { claim_duration: calculateRunningAverage(claim_duration as number[]) } + : {}), duration: calculateRunningAverage(pollingDuration as number[]), claim_conflicts: calculateRunningAverage(claimConflicts as number[]), claim_mismatches: calculateRunningAverage(claimMismatches as number[]), @@ -282,6 +319,7 @@ export function summarizeTaskRunStat( }, }, drift: calculateRunningAverage(drift), + drift_by_type: mapValues(drift_by_type, (typedDrift) => calculateRunningAverage(typedDrift)), load: calculateRunningAverage(load), execution: { duration: mapValues(duration, (typedDurations) => calculateRunningAverage(typedDurations)), diff --git a/x-pack/plugins/task_manager/server/plugin.test.ts b/x-pack/plugins/task_manager/server/plugin.test.ts index 0a879ce92cba6e..45db18a3e83857 100644 --- a/x-pack/plugins/task_manager/server/plugin.test.ts +++ b/x-pack/plugins/task_manager/server/plugin.test.ts @@ -70,6 +70,15 @@ describe('TaskManagerPlugin', () => { const setupApi = await taskManagerPlugin.setup(coreMock.createSetup()); + // we only start a poller if we have task types that we support and we track + // phases (moving from Setup to Start) based on whether the poller is working + setupApi.registerTaskDefinitions({ + setupTimeType: { + title: 'setupTimeType', + createTaskRunner: () => ({ async run() {} }), + }, + }); + await taskManagerPlugin.start(coreMock.createStart()); expect(() => diff --git a/x-pack/plugins/task_manager/server/plugin.ts b/x-pack/plugins/task_manager/server/plugin.ts index 149d111b08f02a..507a021214a904 100644 --- a/x-pack/plugins/task_manager/server/plugin.ts +++ b/x-pack/plugins/task_manager/server/plugin.ts @@ -16,13 +16,12 @@ import { ServiceStatusLevels, CoreStatus, } from '../../../../src/core/server'; -import { TaskDefinition } from './task'; import { TaskPollingLifecycle } from './polling_lifecycle'; import { TaskManagerConfig } from './config'; import { createInitialMiddleware, addMiddlewareToChain, Middleware } from './lib/middleware'; import { removeIfExists } from './lib/remove_if_exists'; import { setupSavedObjects } from './saved_objects'; -import { TaskTypeDictionary } from './task_type_dictionary'; +import { TaskDefinitionRegistry, TaskTypeDictionary } from './task_type_dictionary'; import { FetchResult, SearchOpts, TaskStore } from './task_store'; import { createManagedConfiguration } from './lib/create_managed_configuration'; import { TaskScheduling } from './task_scheduling'; @@ -100,7 +99,7 @@ export class TaskManagerPlugin this.assertStillInSetup('add Middleware'); this.middleware = addMiddlewareToChain(this.middleware, middleware); }, - registerTaskDefinitions: (taskDefinition: Record) => { + registerTaskDefinitions: (taskDefinition: TaskDefinitionRegistry) => { this.assertStillInSetup('register task definitions'); this.definitions.registerTaskDefinitions(taskDefinition); }, @@ -110,12 +109,12 @@ export class TaskManagerPlugin public start({ savedObjects, elasticsearch }: CoreStart): TaskManagerStartContract { const savedObjectsRepository = savedObjects.createInternalRepository(['task']); + const serializer = savedObjects.createSerializer(); const taskStore = new TaskStore({ - serializer: savedObjects.createSerializer(), + serializer, savedObjectsRepository, esClient: elasticsearch.createClient('taskManager').asInternalUser, index: this.config!.index, - maxAttempts: this.config!.max_attempts, definitions: this.definitions, taskManagerId: `kibana:${this.taskManagerId!}`, }); @@ -151,6 +150,7 @@ export class TaskManagerPlugin taskStore, middleware: this.middleware, taskPollingLifecycle: this.taskPollingLifecycle, + definitions: this.definitions, }); return { diff --git a/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.test.ts b/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.test.ts index d4617d6549d60d..f3af6f50336eae 100644 --- a/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.test.ts +++ b/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.test.ts @@ -64,6 +64,7 @@ describe('delayOnClaimConflicts', () => { tasksUpdated: 0, tasksConflicted: 8, tasksClaimed: 0, + tasksRejected: 0, }, docs: [], }) @@ -79,6 +80,63 @@ describe('delayOnClaimConflicts', () => { }) ); + test( + 'emits delay only once, no mater how many subscribers there are', + fakeSchedulers(async () => { + const taskLifecycleEvents$ = new Subject(); + + const delays$ = delayOnClaimConflicts(of(10), of(100), taskLifecycleEvents$, 80, 2); + + const firstSubscriber$ = delays$.pipe(take(2), bufferCount(2)).toPromise(); + const secondSubscriber$ = delays$.pipe(take(2), bufferCount(2)).toPromise(); + + taskLifecycleEvents$.next( + asTaskPollingCycleEvent( + asOk({ + result: FillPoolResult.PoolFilled, + stats: { + tasksUpdated: 0, + tasksConflicted: 8, + tasksClaimed: 0, + tasksRejected: 0, + }, + docs: [], + }) + ) + ); + + const thirdSubscriber$ = delays$.pipe(take(2), bufferCount(2)).toPromise(); + + taskLifecycleEvents$.next( + asTaskPollingCycleEvent( + asOk({ + result: FillPoolResult.PoolFilled, + stats: { + tasksUpdated: 0, + tasksConflicted: 10, + tasksClaimed: 0, + tasksRejected: 0, + }, + docs: [], + }) + ) + ); + + // should get the initial value of 0 delay + const [initialDelay, firstRandom] = await firstSubscriber$; + // should get the 0 delay (as a replay), which was the last value plus the first random value + const [initialDelayInSecondSub, firstRandomInSecondSub] = await secondSubscriber$; + // should get the first random value (as a replay) and the next random value + const [firstRandomInThirdSub, secondRandomInThirdSub] = await thirdSubscriber$; + + expect(initialDelay).toEqual(0); + expect(initialDelayInSecondSub).toEqual(0); + expect(firstRandom).toEqual(firstRandomInSecondSub); + expect(firstRandomInSecondSub).toEqual(firstRandomInThirdSub); + expect(secondRandomInThirdSub).toBeGreaterThanOrEqual(0); + }) + ); + test( 'doesnt emit a new delay when conflicts have reduced', fakeSchedulers(async () => { @@ -107,6 +165,7 @@ describe('delayOnClaimConflicts', () => { tasksUpdated: 0, tasksConflicted: 8, tasksClaimed: 0, + tasksRejected: 0, }, docs: [], }) @@ -127,6 +186,7 @@ describe('delayOnClaimConflicts', () => { tasksUpdated: 0, tasksConflicted: 7, tasksClaimed: 0, + tasksRejected: 0, }, docs: [], }) @@ -145,6 +205,7 @@ describe('delayOnClaimConflicts', () => { tasksUpdated: 0, tasksConflicted: 9, tasksClaimed: 0, + tasksRejected: 0, }, docs: [], }) diff --git a/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.ts b/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.ts index 73e7052b65a69e..6d7cb77625b580 100644 --- a/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.ts +++ b/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.ts @@ -11,7 +11,7 @@ import stats from 'stats-lite'; import { isNumber, random } from 'lodash'; -import { merge, of, Observable, combineLatest } from 'rxjs'; +import { merge, of, Observable, combineLatest, ReplaySubject } from 'rxjs'; import { filter, map } from 'rxjs/operators'; import { Option, none, some, isSome, Some } from 'fp-ts/lib/Option'; import { isOk } from '../lib/result_type'; @@ -32,7 +32,9 @@ export function delayOnClaimConflicts( runningAverageWindowSize: number ): Observable { const claimConflictQueue = createRunningAveragedStat(runningAverageWindowSize); - return merge( + // return a subject to allow multicast and replay the last value to new subscribers + const multiCastDelays$ = new ReplaySubject(1); + merge( of(0), combineLatest([ maxWorkersConfiguration$, @@ -70,5 +72,9 @@ export function delayOnClaimConflicts( return random(pollInterval * 0.25, pollInterval * 0.75, false); }) ) - ); + ).subscribe((delay) => { + multiCastDelays$.next(delay); + }); + + return multiCastDelays$; } diff --git a/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts b/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts index 9f794450702379..63d7f6de81801f 100644 --- a/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts +++ b/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts @@ -7,17 +7,30 @@ import _ from 'lodash'; import sinon from 'sinon'; -import { of, Subject } from 'rxjs'; +import { Observable, of, Subject } from 'rxjs'; import { TaskPollingLifecycle, claimAvailableTasks } from './polling_lifecycle'; import { createInitialMiddleware } from './lib/middleware'; import { TaskTypeDictionary } from './task_type_dictionary'; import { taskStoreMock } from './task_store.mock'; import { mockLogger } from './test_utils'; +import { taskClaimingMock } from './queries/task_claiming.mock'; +import { TaskClaiming, ClaimOwnershipResult } from './queries/task_claiming'; +import type { TaskClaiming as TaskClaimingClass } from './queries/task_claiming'; +import { asOk, Err, isErr, isOk, Result } from './lib/result_type'; +import { FillPoolResult } from './lib/fill_pool'; + +let mockTaskClaiming = taskClaimingMock.create({}); +jest.mock('./queries/task_claiming', () => { + return { + TaskClaiming: jest.fn().mockImplementation(() => { + return mockTaskClaiming; + }), + }; +}); describe('TaskPollingLifecycle', () => { let clock: sinon.SinonFakeTimers; - const taskManagerLogger = mockLogger(); const mockTaskStore = taskStoreMock.create({}); const taskManagerOpts = { @@ -50,8 +63,9 @@ describe('TaskPollingLifecycle', () => { }; beforeEach(() => { + mockTaskClaiming = taskClaimingMock.create({}); + (TaskClaiming as jest.Mock).mockClear(); clock = sinon.useFakeTimers(); - taskManagerOpts.definitions = new TaskTypeDictionary(taskManagerLogger); }); afterEach(() => clock.restore()); @@ -60,17 +74,58 @@ describe('TaskPollingLifecycle', () => { test('begins polling once the ES and SavedObjects services are available', () => { const elasticsearchAndSOAvailability$ = new Subject(); new TaskPollingLifecycle({ - elasticsearchAndSOAvailability$, ...taskManagerOpts, + elasticsearchAndSOAvailability$, }); clock.tick(150); - expect(mockTaskStore.claimAvailableTasks).not.toHaveBeenCalled(); + expect(mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable).not.toHaveBeenCalled(); elasticsearchAndSOAvailability$.next(true); clock.tick(150); - expect(mockTaskStore.claimAvailableTasks).toHaveBeenCalled(); + expect(mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable).toHaveBeenCalled(); + }); + + test('provides TaskClaiming with the capacity available', () => { + const elasticsearchAndSOAvailability$ = new Subject(); + const maxWorkers$ = new Subject(); + taskManagerOpts.definitions.registerTaskDefinitions({ + report: { + title: 'report', + maxConcurrency: 1, + createTaskRunner: jest.fn(), + }, + quickReport: { + title: 'quickReport', + maxConcurrency: 5, + createTaskRunner: jest.fn(), + }, + }); + + new TaskPollingLifecycle({ + ...taskManagerOpts, + elasticsearchAndSOAvailability$, + maxWorkersConfiguration$: maxWorkers$, + }); + + const taskClaimingGetCapacity = (TaskClaiming as jest.Mock).mock + .calls[0][0].getCapacity; + + maxWorkers$.next(20); + expect(taskClaimingGetCapacity()).toEqual(20); + expect(taskClaimingGetCapacity('report')).toEqual(1); + expect(taskClaimingGetCapacity('quickReport')).toEqual(5); + + maxWorkers$.next(30); + expect(taskClaimingGetCapacity()).toEqual(30); + expect(taskClaimingGetCapacity('report')).toEqual(1); + expect(taskClaimingGetCapacity('quickReport')).toEqual(5); + + maxWorkers$.next(2); + expect(taskClaimingGetCapacity()).toEqual(2); + expect(taskClaimingGetCapacity('report')).toEqual(1); + expect(taskClaimingGetCapacity('quickReport')).toEqual(2); }); }); @@ -85,13 +140,13 @@ describe('TaskPollingLifecycle', () => { elasticsearchAndSOAvailability$.next(true); clock.tick(150); - expect(mockTaskStore.claimAvailableTasks).toHaveBeenCalled(); + expect(mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable).toHaveBeenCalled(); elasticsearchAndSOAvailability$.next(false); - mockTaskStore.claimAvailableTasks.mockClear(); + mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable.mockClear(); clock.tick(150); - expect(mockTaskStore.claimAvailableTasks).not.toHaveBeenCalled(); + expect(mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable).not.toHaveBeenCalled(); }); test('restarts polling once the ES and SavedObjects services become available again', () => { @@ -104,68 +159,64 @@ describe('TaskPollingLifecycle', () => { elasticsearchAndSOAvailability$.next(true); clock.tick(150); - expect(mockTaskStore.claimAvailableTasks).toHaveBeenCalled(); + expect(mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable).toHaveBeenCalled(); elasticsearchAndSOAvailability$.next(false); - mockTaskStore.claimAvailableTasks.mockClear(); + mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable.mockClear(); clock.tick(150); - expect(mockTaskStore.claimAvailableTasks).not.toHaveBeenCalled(); + expect(mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable).not.toHaveBeenCalled(); elasticsearchAndSOAvailability$.next(true); clock.tick(150); - expect(mockTaskStore.claimAvailableTasks).toHaveBeenCalled(); + expect(mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable).toHaveBeenCalled(); }); }); describe('claimAvailableTasks', () => { - test('should claim Available Tasks when there are available workers', () => { - const logger = mockLogger(); - const claim = jest.fn(() => - Promise.resolve({ - docs: [], - stats: { tasksUpdated: 0, tasksConflicted: 0, tasksClaimed: 0 }, - }) - ); - - const availableWorkers = 1; - - claimAvailableTasks([], claim, availableWorkers, logger); - - expect(claim).toHaveBeenCalledTimes(1); - }); - - test('should not claim Available Tasks when there are no available workers', () => { + test('should claim Available Tasks when there are available workers', async () => { const logger = mockLogger(); - const claim = jest.fn(() => - Promise.resolve({ - docs: [], - stats: { tasksUpdated: 0, tasksConflicted: 0, tasksClaimed: 0 }, - }) + const taskClaiming = taskClaimingMock.create({}); + taskClaiming.claimAvailableTasksIfCapacityIsAvailable.mockImplementation(() => + of( + asOk({ + docs: [], + stats: { tasksUpdated: 0, tasksConflicted: 0, tasksClaimed: 0, tasksRejected: 0 }, + }) + ) ); - const availableWorkers = 0; + expect( + isOk(await getFirstAsPromise(claimAvailableTasks([], taskClaiming, logger))) + ).toBeTruthy(); - claimAvailableTasks([], claim, availableWorkers, logger); - - expect(claim).not.toHaveBeenCalled(); + expect(taskClaiming.claimAvailableTasksIfCapacityIsAvailable).toHaveBeenCalledTimes(1); }); /** * This handles the case in which Elasticsearch has had inline script disabled. * This is achieved by setting the `script.allowed_types` flag on Elasticsearch to `none` */ - test('handles failure due to inline scripts being disabled', () => { + test('handles failure due to inline scripts being disabled', async () => { const logger = mockLogger(); - const claim = jest.fn(() => { - throw Object.assign(new Error(), { - response: - '{"error":{"root_cause":[{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts"}],"type":"search_phase_execution_exception","reason":"all shards failed","phase":"query","grouped":true,"failed_shards":[{"shard":0,"index":".kibana_task_manager_1","node":"24A4QbjHSK6prvtopAKLKw","reason":{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts"}}],"caused_by":{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts","caused_by":{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts"}}},"status":400}', - }); - }); + const taskClaiming = taskClaimingMock.create({}); + taskClaiming.claimAvailableTasksIfCapacityIsAvailable.mockImplementation( + () => + new Observable>((observer) => { + observer.error( + Object.assign(new Error(), { + response: + '{"error":{"root_cause":[{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts"}],"type":"search_phase_execution_exception","reason":"all shards failed","phase":"query","grouped":true,"failed_shards":[{"shard":0,"index":".kibana_task_manager_1","node":"24A4QbjHSK6prvtopAKLKw","reason":{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts"}}],"caused_by":{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts","caused_by":{"type":"illegal_argument_exception","reason":"cannot execute [inline] scripts"}}},"status":400}', + }) + ); + }) + ); + + const err = await getFirstAsPromise(claimAvailableTasks([], taskClaiming, logger)); - claimAvailableTasks([], claim, 10, logger); + expect(isErr(err)).toBeTruthy(); + expect((err as Err).error).toEqual(FillPoolResult.Failed); expect(logger.warn).toHaveBeenCalledTimes(1); expect(logger.warn).toHaveBeenCalledWith( @@ -174,3 +225,9 @@ describe('TaskPollingLifecycle', () => { }); }); }); + +function getFirstAsPromise(obs$: Observable): Promise { + return new Promise((resolve, reject) => { + obs$.subscribe(resolve, reject); + }); +} diff --git a/x-pack/plugins/task_manager/server/polling_lifecycle.ts b/x-pack/plugins/task_manager/server/polling_lifecycle.ts index db8eeaaf78dee5..260f5ccc70f53c 100644 --- a/x-pack/plugins/task_manager/server/polling_lifecycle.ts +++ b/x-pack/plugins/task_manager/server/polling_lifecycle.ts @@ -6,15 +6,12 @@ */ import { Subject, Observable, Subscription } from 'rxjs'; - -import { performance } from 'perf_hooks'; - import { pipe } from 'fp-ts/lib/pipeable'; import { Option, some, map as mapOptional } from 'fp-ts/lib/Option'; import { tap } from 'rxjs/operators'; import { Logger } from '../../../../src/core/server'; -import { Result, asErr, mapErr, asOk, map } from './lib/result_type'; +import { Result, asErr, mapErr, asOk, map, mapOk } from './lib/result_type'; import { ManagedConfiguration } from './lib/create_managed_configuration'; import { TaskManagerConfig } from './config'; @@ -41,11 +38,12 @@ import { } from './polling'; import { TaskPool } from './task_pool'; import { TaskManagerRunner, TaskRunner } from './task_running'; -import { TaskStore, OwnershipClaimingOpts, ClaimOwnershipResult } from './task_store'; +import { TaskStore } from './task_store'; import { identifyEsError } from './lib/identify_es_error'; import { BufferedTaskStore } from './buffered_task_store'; import { TaskTypeDictionary } from './task_type_dictionary'; import { delayOnClaimConflicts } from './polling'; +import { TaskClaiming, ClaimOwnershipResult } from './queries/task_claiming'; export type TaskPollingLifecycleOpts = { logger: Logger; @@ -71,6 +69,7 @@ export class TaskPollingLifecycle { private definitions: TaskTypeDictionary; private store: TaskStore; + private taskClaiming: TaskClaiming; private bufferedStore: BufferedTaskStore; private logger: Logger; @@ -106,8 +105,6 @@ export class TaskPollingLifecycle { this.store = taskStore; const emitEvent = (event: TaskLifecycleEvent) => this.events$.next(event); - // pipe store events into the lifecycle event stream - this.store.events.subscribe(emitEvent); this.bufferedStore = new BufferedTaskStore(this.store, { bufferMaxOperations: config.max_workers, @@ -120,6 +117,26 @@ export class TaskPollingLifecycle { }); this.pool.load.subscribe(emitEvent); + this.taskClaiming = new TaskClaiming({ + taskStore, + maxAttempts: config.max_attempts, + definitions, + logger: this.logger, + getCapacity: (taskType?: string) => + taskType && this.definitions.get(taskType)?.maxConcurrency + ? Math.max( + Math.min( + this.pool.availableWorkers, + this.definitions.get(taskType)!.maxConcurrency! - + this.pool.getOccupiedWorkersByType(taskType) + ), + 0 + ) + : this.pool.availableWorkers, + }); + // pipe taskClaiming events into the lifecycle event stream + this.taskClaiming.events.subscribe(emitEvent); + const { max_poll_inactivity_cycles: maxPollInactivityCycles, poll_interval: pollInterval, @@ -199,6 +216,7 @@ export class TaskPollingLifecycle { beforeRun: this.middleware.beforeRun, beforeMarkRunning: this.middleware.beforeMarkRunning, onTaskEvent: this.emitEvent, + defaultMaxAttempts: this.taskClaiming.maxAttempts, }); }; @@ -212,9 +230,18 @@ export class TaskPollingLifecycle { () => claimAvailableTasks( tasksToClaim.splice(0, this.pool.availableWorkers), - this.store.claimAvailableTasks, - this.pool.availableWorkers, + this.taskClaiming, this.logger + ).pipe( + tap( + mapOk(({ timing }: ClaimOwnershipResult) => { + if (timing) { + this.emitEvent( + asTaskManagerStatEvent('claimDuration', asOk(timing.stop - timing.start)) + ); + } + }) + ) ), // wrap each task in a Task Runner this.createTaskRunnerForTask, @@ -252,59 +279,40 @@ export class TaskPollingLifecycle { } } -export async function claimAvailableTasks( +export function claimAvailableTasks( claimTasksById: string[], - claim: (opts: OwnershipClaimingOpts) => Promise, - availableWorkers: number, + taskClaiming: TaskClaiming, logger: Logger -): Promise> { - if (availableWorkers > 0) { - performance.mark('claimAvailableTasks_start'); - - try { - const claimResult = await claim({ - size: availableWorkers, +): Observable> { + return new Observable((observer) => { + taskClaiming + .claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: intervalFromNow('30s')!, claimTasksById, - }); - const { - docs, - stats: { tasksClaimed }, - } = claimResult; - - if (tasksClaimed === 0) { - performance.mark('claimAvailableTasks.noTasks'); - } - performance.mark('claimAvailableTasks_stop'); - performance.measure( - 'claimAvailableTasks', - 'claimAvailableTasks_start', - 'claimAvailableTasks_stop' + }) + .subscribe( + (claimResult) => { + observer.next(claimResult); + }, + (ex) => { + // if the `taskClaiming` stream errors out we want to catch it and see if + // we can identify the reason + // if we can - we emit an FillPoolResult error rather than erroring out the wrapping Observable + // returned by `claimAvailableTasks` + if (identifyEsError(ex).includes('cannot execute [inline] scripts')) { + logger.warn( + `Task Manager cannot operate when inline scripts are disabled in Elasticsearch` + ); + observer.next(asErr(FillPoolResult.Failed)); + observer.complete(); + } else { + // as we could't identify the reason - we'll error out the wrapping Observable too + observer.error(ex); + } + }, + () => { + observer.complete(); + } ); - - if (docs.length !== tasksClaimed) { - logger.warn( - `[Task Ownership error]: ${tasksClaimed} tasks were claimed by Kibana, but ${ - docs.length - } task(s) were fetched (${docs.map((doc) => doc.id).join(', ')})` - ); - } - return asOk(claimResult); - } catch (ex) { - if (identifyEsError(ex).includes('cannot execute [inline] scripts')) { - logger.warn( - `Task Manager cannot operate when inline scripts are disabled in Elasticsearch` - ); - return asErr(FillPoolResult.Failed); - } else { - throw ex; - } - } - } else { - performance.mark('claimAvailableTasks.noAvailableWorkers'); - logger.debug( - `[Task Ownership]: Task Manager has skipped Claiming Ownership of available tasks at it has ran out Available Workers.` - ); - return asErr(FillPoolResult.NoAvailableWorkers); - } + }); } diff --git a/x-pack/plugins/task_manager/server/queries/mark_available_tasks_as_claimed.test.ts b/x-pack/plugins/task_manager/server/queries/mark_available_tasks_as_claimed.test.ts index 75b9b2cdfa9779..57a4ab320367d4 100644 --- a/x-pack/plugins/task_manager/server/queries/mark_available_tasks_as_claimed.test.ts +++ b/x-pack/plugins/task_manager/server/queries/mark_available_tasks_as_claimed.test.ts @@ -52,6 +52,7 @@ describe('mark_available_tasks_as_claimed', () => { fieldUpdates, claimTasksById || [], definitions.getAllTypes(), + [], Array.from(definitions).reduce((accumulator, [type, { maxAttempts }]) => { return { ...accumulator, [type]: maxAttempts || defaultMaxAttempts }; }, {}) @@ -116,18 +117,23 @@ if (doc['task.runAt'].size()!=0) { seq_no_primary_term: true, script: { source: ` - if (params.registeredTaskTypes.contains(ctx._source.task.taskType)) { - if (ctx._source.task.schedule != null || ctx._source.task.attempts < params.taskMaxAttempts[ctx._source.task.taskType] || params.claimTasksById.contains(ctx._id)) { + if (params.claimableTaskTypes.contains(ctx._source.task.taskType)) { + if (ctx._source.task.schedule != null || ctx._source.task.attempts < params.taskMaxAttempts[ctx._source.task.taskType] || params.claimTasksById.contains(ctx._id)) { + ctx._source.task.status = "claiming"; ${Object.keys(fieldUpdates) + .map((field) => `ctx._source.task.${field}=params.fieldUpdates.${field};`) + .join(' ')} + } else { + ctx._source.task.status = "failed"; + } + } else if (params.skippedTaskTypes.contains(ctx._source.task.taskType) && params.claimTasksById.contains(ctx._id)) { ctx._source.task.status = "claiming"; ${Object.keys(fieldUpdates) .map((field) => `ctx._source.task.${field}=params.fieldUpdates.${field};`) .join(' ')} + } else if (!params.skippedTaskTypes.contains(ctx._source.task.taskType)) { + ctx._source.task.status = "unrecognized"; } else { - ctx._source.task.status = "failed"; - } - } else { - ctx._source.task.status = "unrecognized"; - } - `, + ctx.op = "noop"; + }`, lang: 'painless', params: { fieldUpdates: { @@ -135,7 +141,8 @@ if (doc['task.runAt'].size()!=0) { retryAt: claimOwnershipUntil, }, claimTasksById: [], - registeredTaskTypes: ['sampleTask', 'otherTask'], + claimableTaskTypes: ['sampleTask', 'otherTask'], + skippedTaskTypes: [], taskMaxAttempts: { sampleTask: 5, otherTask: 1, @@ -144,4 +151,76 @@ if (doc['task.runAt'].size()!=0) { }, }); }); + + describe(`script`, () => { + test('it supports claiming specific tasks by id', async () => { + const taskManagerId = '3478fg6-82374f6-83467gf5-384g6f'; + const claimOwnershipUntil = '2019-02-12T21:01:22.479Z'; + const fieldUpdates = { + ownerId: taskManagerId, + retryAt: claimOwnershipUntil, + }; + + const claimTasksById = [ + '33c6977a-ed6d-43bd-98d9-3f827f7b7cd8', + 'a208b22c-14ec-4fb4-995f-d2ff7a3b03b8', + ]; + + expect( + updateFieldsAndMarkAsFailed(fieldUpdates, claimTasksById, ['foo', 'bar'], [], { + foo: 5, + bar: 2, + }) + ).toMatchObject({ + source: ` + if (params.claimableTaskTypes.contains(ctx._source.task.taskType)) { + if (ctx._source.task.schedule != null || ctx._source.task.attempts < params.taskMaxAttempts[ctx._source.task.taskType] || params.claimTasksById.contains(ctx._id)) { + ctx._source.task.status = "claiming"; ${Object.keys(fieldUpdates) + .map((field) => `ctx._source.task.${field}=params.fieldUpdates.${field};`) + .join(' ')} + } else { + ctx._source.task.status = "failed"; + } + } else if (params.skippedTaskTypes.contains(ctx._source.task.taskType) && params.claimTasksById.contains(ctx._id)) { + ctx._source.task.status = "claiming"; ${Object.keys(fieldUpdates) + .map((field) => `ctx._source.task.${field}=params.fieldUpdates.${field};`) + .join(' ')} + } else if (!params.skippedTaskTypes.contains(ctx._source.task.taskType)) { + ctx._source.task.status = "unrecognized"; + } else { + ctx.op = "noop"; + }`, + lang: 'painless', + params: { + fieldUpdates, + claimTasksById: [ + '33c6977a-ed6d-43bd-98d9-3f827f7b7cd8', + 'a208b22c-14ec-4fb4-995f-d2ff7a3b03b8', + ], + claimableTaskTypes: ['foo', 'bar'], + skippedTaskTypes: [], + taskMaxAttempts: { + foo: 5, + bar: 2, + }, + }, + }); + }); + + test('it marks the update as a noop if the type is skipped', async () => { + const taskManagerId = '3478fg6-82374f6-83467gf5-384g6f'; + const claimOwnershipUntil = '2019-02-12T21:01:22.479Z'; + const fieldUpdates = { + ownerId: taskManagerId, + retryAt: claimOwnershipUntil, + }; + + expect( + updateFieldsAndMarkAsFailed(fieldUpdates, [], ['foo', 'bar'], [], { + foo: 5, + bar: 2, + }).source + ).toMatch(/ctx.op = "noop"/); + }); + }); }); diff --git a/x-pack/plugins/task_manager/server/queries/mark_available_tasks_as_claimed.ts b/x-pack/plugins/task_manager/server/queries/mark_available_tasks_as_claimed.ts index 067de5a92adb7b..8598980a4e2363 100644 --- a/x-pack/plugins/task_manager/server/queries/mark_available_tasks_as_claimed.ts +++ b/x-pack/plugins/task_manager/server/queries/mark_available_tasks_as_claimed.ts @@ -14,6 +14,8 @@ import { mustBeAllOf, MustCondition, BoolClauseWithAnyCondition, + ShouldCondition, + FilterCondition, } from './query_clauses'; export const TaskWithSchedule: ExistsFilter = { @@ -39,14 +41,26 @@ export function taskWithLessThanMaxAttempts( }; } -export function tasksClaimedByOwner(taskManagerId: string) { +export function tasksOfType(taskTypes: string[]): ShouldCondition { + return { + bool: { + should: [...taskTypes].map((type) => ({ term: { 'task.taskType': type } })), + }, + }; +} + +export function tasksClaimedByOwner( + taskManagerId: string, + ...taskFilters: Array | ShouldCondition> +) { return mustBeAllOf( { term: { 'task.ownerId': taskManagerId, }, }, - { term: { 'task.status': 'claiming' } } + { term: { 'task.status': 'claiming' } }, + ...taskFilters ); } @@ -107,27 +121,35 @@ export const updateFieldsAndMarkAsFailed = ( [field: string]: string | number | Date; }, claimTasksById: string[], - registeredTaskTypes: string[], + claimableTaskTypes: string[], + skippedTaskTypes: string[], taskMaxAttempts: { [field: string]: number } -): ScriptClause => ({ - source: ` - if (params.registeredTaskTypes.contains(ctx._source.task.taskType)) { - if (ctx._source.task.schedule != null || ctx._source.task.attempts < params.taskMaxAttempts[ctx._source.task.taskType] || params.claimTasksById.contains(ctx._id)) { - ctx._source.task.status = "claiming"; ${Object.keys(fieldUpdates) - .map((field) => `ctx._source.task.${field}=params.fieldUpdates.${field};`) - .join(' ')} +): ScriptClause => { + const markAsClaimingScript = `ctx._source.task.status = "claiming"; ${Object.keys(fieldUpdates) + .map((field) => `ctx._source.task.${field}=params.fieldUpdates.${field};`) + .join(' ')}`; + return { + source: ` + if (params.claimableTaskTypes.contains(ctx._source.task.taskType)) { + if (ctx._source.task.schedule != null || ctx._source.task.attempts < params.taskMaxAttempts[ctx._source.task.taskType] || params.claimTasksById.contains(ctx._id)) { + ${markAsClaimingScript} + } else { + ctx._source.task.status = "failed"; + } + } else if (params.skippedTaskTypes.contains(ctx._source.task.taskType) && params.claimTasksById.contains(ctx._id)) { + ${markAsClaimingScript} + } else if (!params.skippedTaskTypes.contains(ctx._source.task.taskType)) { + ctx._source.task.status = "unrecognized"; } else { - ctx._source.task.status = "failed"; - } - } else { - ctx._source.task.status = "unrecognized"; - } - `, - lang: 'painless', - params: { - fieldUpdates, - claimTasksById, - registeredTaskTypes, - taskMaxAttempts, - }, -}); + ctx.op = "noop"; + }`, + lang: 'painless', + params: { + fieldUpdates, + claimTasksById, + claimableTaskTypes, + skippedTaskTypes, + taskMaxAttempts, + }, + }; +}; diff --git a/x-pack/plugins/task_manager/server/queries/task_claiming.mock.ts b/x-pack/plugins/task_manager/server/queries/task_claiming.mock.ts new file mode 100644 index 00000000000000..38f02780c485e9 --- /dev/null +++ b/x-pack/plugins/task_manager/server/queries/task_claiming.mock.ts @@ -0,0 +1,33 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { Observable, Subject } from 'rxjs'; +import { TaskClaim } from '../task_events'; + +import { TaskClaiming } from './task_claiming'; + +interface TaskClaimingOptions { + maxAttempts?: number; + taskManagerId?: string; + events?: Observable; +} +export const taskClaimingMock = { + create({ + maxAttempts = 0, + taskManagerId = '', + events = new Subject(), + }: TaskClaimingOptions) { + const mocked = ({ + claimAvailableTasks: jest.fn(), + claimAvailableTasksIfCapacityIsAvailable: jest.fn(), + maxAttempts, + taskManagerId, + events, + } as unknown) as jest.Mocked; + return mocked; + }, +}; diff --git a/x-pack/plugins/task_manager/server/queries/task_claiming.test.ts b/x-pack/plugins/task_manager/server/queries/task_claiming.test.ts new file mode 100644 index 00000000000000..bd1171d7fd2f82 --- /dev/null +++ b/x-pack/plugins/task_manager/server/queries/task_claiming.test.ts @@ -0,0 +1,1516 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import _ from 'lodash'; +import uuid from 'uuid'; +import { filter, take, toArray } from 'rxjs/operators'; +import { some, none } from 'fp-ts/lib/Option'; + +import { TaskStatus, ConcreteTaskInstance } from '../task'; +import { SearchOpts, StoreOpts, UpdateByQueryOpts, UpdateByQuerySearchOpts } from '../task_store'; +import { asTaskClaimEvent, ClaimTaskErr, TaskClaimErrorType, TaskEvent } from '../task_events'; +import { asOk, asErr } from '../lib/result_type'; +import { TaskTypeDictionary } from '../task_type_dictionary'; +import { BoolClauseWithAnyCondition, TermFilter } from '../queries/query_clauses'; +import { mockLogger } from '../test_utils'; +import { TaskClaiming, OwnershipClaimingOpts, TaskClaimingOpts } from './task_claiming'; +import { Observable } from 'rxjs'; +import { taskStoreMock } from '../task_store.mock'; + +const taskManagerLogger = mockLogger(); + +beforeEach(() => jest.resetAllMocks()); + +const mockedDate = new Date('2019-02-12T21:01:22.479Z'); +// eslint-disable-next-line @typescript-eslint/no-explicit-any +(global as any).Date = class Date { + constructor() { + return mockedDate; + } + static now() { + return mockedDate.getTime(); + } +}; + +const taskDefinitions = new TaskTypeDictionary(taskManagerLogger); +taskDefinitions.registerTaskDefinitions({ + report: { + title: 'report', + createTaskRunner: jest.fn(), + }, + dernstraight: { + title: 'dernstraight', + createTaskRunner: jest.fn(), + }, + yawn: { + title: 'yawn', + createTaskRunner: jest.fn(), + }, +}); + +describe('TaskClaiming', () => { + test(`should log when a certain task type is skipped due to having a zero concurency configuration`, () => { + const definitions = new TaskTypeDictionary(mockLogger()); + definitions.registerTaskDefinitions({ + unlimited: { + title: 'unlimited', + createTaskRunner: jest.fn(), + }, + anotherUnlimited: { + title: 'anotherUnlimited', + createTaskRunner: jest.fn(), + }, + limitedToZero: { + title: 'limitedToZero', + maxConcurrency: 0, + createTaskRunner: jest.fn(), + }, + limitedToOne: { + title: 'limitedToOne', + maxConcurrency: 1, + createTaskRunner: jest.fn(), + }, + anotherLimitedToZero: { + title: 'anotherLimitedToZero', + maxConcurrency: 0, + createTaskRunner: jest.fn(), + }, + limitedToTwo: { + title: 'limitedToTwo', + maxConcurrency: 2, + createTaskRunner: jest.fn(), + }, + }); + + new TaskClaiming({ + logger: taskManagerLogger, + definitions, + taskStore: taskStoreMock.create({ taskManagerId: '' }), + maxAttempts: 2, + getCapacity: () => 10, + }); + + expect(taskManagerLogger.info).toHaveBeenCalledTimes(1); + expect(taskManagerLogger.info.mock.calls[0][0]).toMatchInlineSnapshot( + `"Task Manager will never claim tasks of the following types as their \\"maxConcurrency\\" is set to 0: limitedToZero, anotherLimitedToZero"` + ); + }); + + describe('claimAvailableTasks', () => { + function initialiseTestClaiming({ + storeOpts = {}, + taskClaimingOpts = {}, + hits = [generateFakeTasks(1)], + versionConflicts = 2, + }: { + storeOpts: Partial; + taskClaimingOpts: Partial; + hits?: ConcreteTaskInstance[][]; + versionConflicts?: number; + }) { + const definitions = storeOpts.definitions ?? taskDefinitions; + const store = taskStoreMock.create({ taskManagerId: storeOpts.taskManagerId }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + if (hits.length === 1) { + store.fetch.mockResolvedValue({ docs: hits[0] }); + store.updateByQuery.mockResolvedValue({ + updated: hits[0].length, + version_conflicts: versionConflicts, + total: hits[0].length, + }); + } else { + for (const docs of hits) { + store.fetch.mockResolvedValueOnce({ docs }); + store.updateByQuery.mockResolvedValueOnce({ + updated: docs.length, + version_conflicts: versionConflicts, + total: docs.length, + }); + } + } + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + definitions, + taskStore: store, + maxAttempts: taskClaimingOpts.maxAttempts ?? 2, + getCapacity: taskClaimingOpts.getCapacity ?? (() => 10), + ...taskClaimingOpts, + }); + + return { taskClaiming, store }; + } + + async function testClaimAvailableTasks({ + storeOpts = {}, + taskClaimingOpts = {}, + claimingOpts, + hits = [generateFakeTasks(1)], + versionConflicts = 2, + }: { + storeOpts: Partial; + taskClaimingOpts: Partial; + claimingOpts: Omit; + hits?: ConcreteTaskInstance[][]; + versionConflicts?: number; + }) { + const getCapacity = taskClaimingOpts.getCapacity ?? (() => 10); + const { taskClaiming, store } = initialiseTestClaiming({ + storeOpts, + taskClaimingOpts, + hits, + versionConflicts, + }); + + const results = await getAllAsPromise(taskClaiming.claimAvailableTasks(claimingOpts)); + + expect(store.updateByQuery.mock.calls[0][1]).toMatchObject({ + max_docs: getCapacity(), + }); + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: getCapacity() }); + return results.map((result, index) => ({ + result, + args: { + search: store.fetch.mock.calls[index][0] as SearchOpts & { + query: BoolClauseWithAnyCondition; + }, + updateByQuery: store.updateByQuery.mock.calls[index] as [ + UpdateByQuerySearchOpts, + UpdateByQueryOpts + ], + }, + })); + } + + test('it filters claimed tasks down by supported types, maxAttempts, status, and runAt', async () => { + const maxAttempts = _.random(2, 43); + const customMaxAttempts = _.random(44, 100); + + const definitions = new TaskTypeDictionary(mockLogger()); + definitions.registerTaskDefinitions({ + foo: { + title: 'foo', + createTaskRunner: jest.fn(), + }, + bar: { + title: 'bar', + maxAttempts: customMaxAttempts, + createTaskRunner: jest.fn(), + }, + }); + + const [ + { + args: { + updateByQuery: [{ query, sort }], + }, + }, + ] = await testClaimAvailableTasks({ + storeOpts: { + definitions, + }, + taskClaimingOpts: { + maxAttempts, + }, + claimingOpts: { + claimOwnershipUntil: new Date(), + }, + }); + expect(query).toMatchObject({ + bool: { + must: [ + { + bool: { + should: [ + { + bool: { + must: [ + { term: { 'task.status': 'idle' } }, + { range: { 'task.runAt': { lte: 'now' } } }, + ], + }, + }, + { + bool: { + must: [ + { + bool: { + should: [ + { term: { 'task.status': 'running' } }, + { term: { 'task.status': 'claiming' } }, + ], + }, + }, + { range: { 'task.retryAt': { lte: 'now' } } }, + ], + }, + }, + ], + }, + }, + ], + filter: [ + { + bool: { + must_not: [ + { + bool: { + should: [ + { term: { 'task.status': 'running' } }, + { term: { 'task.status': 'claiming' } }, + ], + must: { range: { 'task.retryAt': { gt: 'now' } } }, + }, + }, + ], + }, + }, + ], + }, + }); + expect(sort).toMatchObject([ + { + _script: { + type: 'number', + order: 'asc', + script: { + lang: 'painless', + source: ` +if (doc['task.retryAt'].size()!=0) { + return doc['task.retryAt'].value.toInstant().toEpochMilli(); +} +if (doc['task.runAt'].size()!=0) { + return doc['task.runAt'].value.toInstant().toEpochMilli(); +} + `, + }, + }, + }, + ]); + }); + + test('it supports claiming specific tasks by id', async () => { + const maxAttempts = _.random(2, 43); + const customMaxAttempts = _.random(44, 100); + const definitions = new TaskTypeDictionary(mockLogger()); + const taskManagerId = uuid.v1(); + const fieldUpdates = { + ownerId: taskManagerId, + retryAt: new Date(Date.now()), + }; + definitions.registerTaskDefinitions({ + foo: { + title: 'foo', + createTaskRunner: jest.fn(), + }, + bar: { + title: 'bar', + maxAttempts: customMaxAttempts, + createTaskRunner: jest.fn(), + }, + }); + const [ + { + args: { + updateByQuery: [{ query, script, sort }], + }, + }, + ] = await testClaimAvailableTasks({ + storeOpts: { + taskManagerId, + definitions, + }, + taskClaimingOpts: { + maxAttempts, + }, + claimingOpts: { + claimOwnershipUntil: new Date(), + claimTasksById: [ + '33c6977a-ed6d-43bd-98d9-3f827f7b7cd8', + 'a208b22c-14ec-4fb4-995f-d2ff7a3b03b8', + ], + }, + }); + + expect(query).toMatchObject({ + bool: { + must: [ + { + pinned: { + ids: [ + 'task:33c6977a-ed6d-43bd-98d9-3f827f7b7cd8', + 'task:a208b22c-14ec-4fb4-995f-d2ff7a3b03b8', + ], + organic: { + bool: { + must: [ + { + bool: { + should: [ + { + bool: { + must: [ + { term: { 'task.status': 'idle' } }, + { range: { 'task.runAt': { lte: 'now' } } }, + ], + }, + }, + { + bool: { + must: [ + { + bool: { + should: [ + { term: { 'task.status': 'running' } }, + { term: { 'task.status': 'claiming' } }, + ], + }, + }, + { range: { 'task.retryAt': { lte: 'now' } } }, + ], + }, + }, + ], + }, + }, + ], + }, + }, + }, + }, + ], + filter: [ + { + bool: { + must_not: [ + { + bool: { + should: [ + { term: { 'task.status': 'running' } }, + { term: { 'task.status': 'claiming' } }, + ], + must: { range: { 'task.retryAt': { gt: 'now' } } }, + }, + }, + ], + }, + }, + ], + }, + }); + + expect(script).toMatchObject({ + source: expect.any(String), + lang: 'painless', + params: { + fieldUpdates, + claimTasksById: [ + 'task:33c6977a-ed6d-43bd-98d9-3f827f7b7cd8', + 'task:a208b22c-14ec-4fb4-995f-d2ff7a3b03b8', + ], + claimableTaskTypes: ['foo', 'bar'], + skippedTaskTypes: [], + taskMaxAttempts: { + bar: customMaxAttempts, + foo: maxAttempts, + }, + }, + }); + + expect(sort).toMatchObject([ + '_score', + { + _script: { + type: 'number', + order: 'asc', + script: { + lang: 'painless', + source: ` +if (doc['task.retryAt'].size()!=0) { + return doc['task.retryAt'].value.toInstant().toEpochMilli(); +} +if (doc['task.runAt'].size()!=0) { + return doc['task.runAt'].value.toInstant().toEpochMilli(); +} + `, + }, + }, + }, + ]); + }); + + test('it should claim in batches partitioned by maxConcurrency', async () => { + const maxAttempts = _.random(2, 43); + const definitions = new TaskTypeDictionary(mockLogger()); + const taskManagerId = uuid.v1(); + const fieldUpdates = { + ownerId: taskManagerId, + retryAt: new Date(Date.now()), + }; + definitions.registerTaskDefinitions({ + unlimited: { + title: 'unlimited', + createTaskRunner: jest.fn(), + }, + limitedToZero: { + title: 'limitedToZero', + maxConcurrency: 0, + createTaskRunner: jest.fn(), + }, + anotherUnlimited: { + title: 'anotherUnlimited', + createTaskRunner: jest.fn(), + }, + finalUnlimited: { + title: 'finalUnlimited', + createTaskRunner: jest.fn(), + }, + limitedToOne: { + title: 'limitedToOne', + maxConcurrency: 1, + createTaskRunner: jest.fn(), + }, + anotherLimitedToOne: { + title: 'anotherLimitedToOne', + maxConcurrency: 1, + createTaskRunner: jest.fn(), + }, + limitedToTwo: { + title: 'limitedToTwo', + maxConcurrency: 2, + createTaskRunner: jest.fn(), + }, + }); + const results = await testClaimAvailableTasks({ + storeOpts: { + taskManagerId, + definitions, + }, + taskClaimingOpts: { + maxAttempts, + getCapacity: (type) => { + switch (type) { + case 'limitedToOne': + case 'anotherLimitedToOne': + return 1; + case 'limitedToTwo': + return 2; + default: + return 10; + } + }, + }, + claimingOpts: { + claimOwnershipUntil: new Date(), + claimTasksById: [ + '33c6977a-ed6d-43bd-98d9-3f827f7b7cd8', + 'a208b22c-14ec-4fb4-995f-d2ff7a3b03b8', + ], + }, + }); + + expect(results.length).toEqual(4); + + expect(results[0].args.updateByQuery[1].max_docs).toEqual(10); + expect(results[0].args.updateByQuery[0].script).toMatchObject({ + source: expect.any(String), + lang: 'painless', + params: { + fieldUpdates, + claimTasksById: [ + 'task:33c6977a-ed6d-43bd-98d9-3f827f7b7cd8', + 'task:a208b22c-14ec-4fb4-995f-d2ff7a3b03b8', + ], + claimableTaskTypes: ['unlimited', 'anotherUnlimited', 'finalUnlimited'], + skippedTaskTypes: [ + 'limitedToZero', + 'limitedToOne', + 'anotherLimitedToOne', + 'limitedToTwo', + ], + taskMaxAttempts: { + unlimited: maxAttempts, + }, + }, + }); + + expect(results[1].args.updateByQuery[1].max_docs).toEqual(1); + expect(results[1].args.updateByQuery[0].script).toMatchObject({ + source: expect.any(String), + lang: 'painless', + params: { + fieldUpdates, + claimTasksById: [], + claimableTaskTypes: ['limitedToOne'], + skippedTaskTypes: [ + 'unlimited', + 'limitedToZero', + 'anotherUnlimited', + 'finalUnlimited', + 'anotherLimitedToOne', + 'limitedToTwo', + ], + taskMaxAttempts: { + limitedToOne: maxAttempts, + }, + }, + }); + + expect(results[2].args.updateByQuery[1].max_docs).toEqual(1); + expect(results[2].args.updateByQuery[0].script).toMatchObject({ + source: expect.any(String), + lang: 'painless', + params: { + fieldUpdates, + claimTasksById: [], + claimableTaskTypes: ['anotherLimitedToOne'], + skippedTaskTypes: [ + 'unlimited', + 'limitedToZero', + 'anotherUnlimited', + 'finalUnlimited', + 'limitedToOne', + 'limitedToTwo', + ], + taskMaxAttempts: { + anotherLimitedToOne: maxAttempts, + }, + }, + }); + + expect(results[3].args.updateByQuery[1].max_docs).toEqual(2); + expect(results[3].args.updateByQuery[0].script).toMatchObject({ + source: expect.any(String), + lang: 'painless', + params: { + fieldUpdates, + claimTasksById: [], + claimableTaskTypes: ['limitedToTwo'], + skippedTaskTypes: [ + 'unlimited', + 'limitedToZero', + 'anotherUnlimited', + 'finalUnlimited', + 'limitedToOne', + 'anotherLimitedToOne', + ], + taskMaxAttempts: { + limitedToTwo: maxAttempts, + }, + }, + }); + }); + + test('it should reduce the available capacity from batch to batch', async () => { + const maxAttempts = _.random(2, 43); + const definitions = new TaskTypeDictionary(mockLogger()); + const taskManagerId = uuid.v1(); + definitions.registerTaskDefinitions({ + unlimited: { + title: 'unlimited', + createTaskRunner: jest.fn(), + }, + limitedToFive: { + title: 'limitedToFive', + maxConcurrency: 5, + createTaskRunner: jest.fn(), + }, + limitedToTwo: { + title: 'limitedToTwo', + maxConcurrency: 2, + createTaskRunner: jest.fn(), + }, + }); + const results = await testClaimAvailableTasks({ + storeOpts: { + taskManagerId, + definitions, + }, + taskClaimingOpts: { + maxAttempts, + getCapacity: (type) => { + switch (type) { + case 'limitedToTwo': + return 2; + case 'limitedToFive': + return 5; + default: + return 10; + } + }, + }, + hits: [ + [ + // 7 returned by unlimited query + mockInstance({ + taskType: 'unlimited', + }), + mockInstance({ + taskType: 'unlimited', + }), + mockInstance({ + taskType: 'unlimited', + }), + mockInstance({ + taskType: 'unlimited', + }), + mockInstance({ + taskType: 'unlimited', + }), + mockInstance({ + taskType: 'unlimited', + }), + mockInstance({ + taskType: 'unlimited', + }), + ], + // 2 returned by limitedToFive query + [ + mockInstance({ + taskType: 'limitedToFive', + }), + mockInstance({ + taskType: 'limitedToFive', + }), + ], + // 1 reterned by limitedToTwo query + [ + mockInstance({ + taskType: 'limitedToTwo', + }), + ], + ], + claimingOpts: { + claimOwnershipUntil: new Date(), + claimTasksById: [], + }, + }); + + expect(results.length).toEqual(3); + + expect(results[0].args.updateByQuery[1].max_docs).toEqual(10); + + // only capacity for 3, even though 5 are allowed + expect(results[1].args.updateByQuery[1].max_docs).toEqual(3); + + // only capacity for 1, even though 2 are allowed + expect(results[2].args.updateByQuery[1].max_docs).toEqual(1); + }); + + test('it shuffles the types claimed in batches to ensure no type starves another', async () => { + const maxAttempts = _.random(2, 43); + const definitions = new TaskTypeDictionary(mockLogger()); + const taskManagerId = uuid.v1(); + definitions.registerTaskDefinitions({ + unlimited: { + title: 'unlimited', + createTaskRunner: jest.fn(), + }, + anotherUnlimited: { + title: 'anotherUnlimited', + createTaskRunner: jest.fn(), + }, + finalUnlimited: { + title: 'finalUnlimited', + createTaskRunner: jest.fn(), + }, + limitedToOne: { + title: 'limitedToOne', + maxConcurrency: 1, + createTaskRunner: jest.fn(), + }, + anotherLimitedToOne: { + title: 'anotherLimitedToOne', + maxConcurrency: 1, + createTaskRunner: jest.fn(), + }, + limitedToTwo: { + title: 'limitedToTwo', + maxConcurrency: 2, + createTaskRunner: jest.fn(), + }, + }); + + const { taskClaiming, store } = initialiseTestClaiming({ + storeOpts: { + taskManagerId, + definitions, + }, + taskClaimingOpts: { + maxAttempts, + getCapacity: (type) => { + switch (type) { + case 'limitedToOne': + case 'anotherLimitedToOne': + return 1; + case 'limitedToTwo': + return 2; + default: + return 10; + } + }, + }, + }); + + async function getUpdateByQueryScriptParams() { + return ( + await getAllAsPromise( + taskClaiming.claimAvailableTasks({ + claimOwnershipUntil: new Date(), + }) + ) + ).map( + (result, index) => + (store.updateByQuery.mock.calls[index][0] as { + query: BoolClauseWithAnyCondition; + size: number; + sort: string | string[]; + script: { + params: { + claimableTaskTypes: string[]; + }; + }; + }).script.params.claimableTaskTypes + ); + } + + const firstCycle = await getUpdateByQueryScriptParams(); + store.updateByQuery.mockClear(); + const secondCycle = await getUpdateByQueryScriptParams(); + + expect(firstCycle.length).toEqual(4); + expect(secondCycle.length).toEqual(4); + expect(firstCycle).not.toMatchObject(secondCycle); + }); + + test('it claims tasks by setting their ownerId, status and retryAt', async () => { + const taskManagerId = uuid.v1(); + const claimOwnershipUntil = new Date(Date.now()); + const fieldUpdates = { + ownerId: taskManagerId, + retryAt: claimOwnershipUntil, + }; + const [ + { + args: { + updateByQuery: [{ script }], + }, + }, + ] = await testClaimAvailableTasks({ + storeOpts: { + taskManagerId, + }, + taskClaimingOpts: {}, + claimingOpts: { + claimOwnershipUntil, + }, + }); + expect(script).toMatchObject({ + source: expect.any(String), + lang: 'painless', + params: { + fieldUpdates, + claimableTaskTypes: ['report', 'dernstraight', 'yawn'], + skippedTaskTypes: [], + taskMaxAttempts: { + dernstraight: 2, + report: 2, + yawn: 2, + }, + }, + }); + }); + + test('it filters out running tasks', async () => { + const taskManagerId = uuid.v1(); + const claimOwnershipUntil = new Date(Date.now()); + const runAt = new Date(); + const tasks = [ + mockInstance({ + id: 'aaa', + runAt, + taskType: 'foo', + schedule: undefined, + attempts: 0, + status: TaskStatus.Claiming, + params: { hello: 'world' }, + state: { baby: 'Henhen' }, + user: 'jimbo', + scope: ['reporting'], + ownerId: taskManagerId, + }), + ]; + const [ + { + result: { docs }, + args: { + search: { query }, + }, + }, + ] = await testClaimAvailableTasks({ + storeOpts: { + taskManagerId, + }, + taskClaimingOpts: {}, + claimingOpts: { + claimOwnershipUntil, + }, + hits: [tasks], + }); + + expect(query).toMatchObject({ + bool: { + must: [ + { + term: { + 'task.ownerId': taskManagerId, + }, + }, + { term: { 'task.status': 'claiming' } }, + { + bool: { + should: [ + { + term: { + 'task.taskType': 'report', + }, + }, + { + term: { + 'task.taskType': 'dernstraight', + }, + }, + { + term: { + 'task.taskType': 'yawn', + }, + }, + ], + }, + }, + ], + }, + }); + + expect(docs).toMatchObject([ + { + attempts: 0, + id: 'aaa', + schedule: undefined, + params: { hello: 'world' }, + runAt, + scope: ['reporting'], + state: { baby: 'Henhen' }, + status: 'claiming', + taskType: 'foo', + user: 'jimbo', + ownerId: taskManagerId, + }, + ]); + }); + + test('it returns task objects', async () => { + const taskManagerId = uuid.v1(); + const claimOwnershipUntil = new Date(Date.now()); + const runAt = new Date(); + const tasks = [ + mockInstance({ + id: 'aaa', + runAt, + taskType: 'foo', + schedule: undefined, + attempts: 0, + status: TaskStatus.Claiming, + params: { hello: 'world' }, + state: { baby: 'Henhen' }, + user: 'jimbo', + scope: ['reporting'], + ownerId: taskManagerId, + }), + mockInstance({ + id: 'bbb', + runAt, + taskType: 'bar', + schedule: { interval: '5m' }, + attempts: 2, + status: TaskStatus.Claiming, + params: { shazm: 1 }, + state: { henry: 'The 8th' }, + user: 'dabo', + scope: ['reporting', 'ceo'], + ownerId: taskManagerId, + }), + ]; + const [ + { + result: { docs }, + args: { + search: { query }, + }, + }, + ] = await testClaimAvailableTasks({ + storeOpts: { + taskManagerId, + }, + taskClaimingOpts: {}, + claimingOpts: { + claimOwnershipUntil, + }, + hits: [tasks], + }); + + expect(query).toMatchObject({ + bool: { + must: [ + { + term: { + 'task.ownerId': taskManagerId, + }, + }, + { term: { 'task.status': 'claiming' } }, + { + bool: { + should: [ + { + term: { + 'task.taskType': 'report', + }, + }, + { + term: { + 'task.taskType': 'dernstraight', + }, + }, + { + term: { + 'task.taskType': 'yawn', + }, + }, + ], + }, + }, + ], + }, + }); + + expect(docs).toMatchObject([ + { + attempts: 0, + id: 'aaa', + schedule: undefined, + params: { hello: 'world' }, + runAt, + scope: ['reporting'], + state: { baby: 'Henhen' }, + status: 'claiming', + taskType: 'foo', + user: 'jimbo', + ownerId: taskManagerId, + }, + { + attempts: 2, + id: 'bbb', + schedule: { interval: '5m' }, + params: { shazm: 1 }, + runAt, + scope: ['reporting', 'ceo'], + state: { henry: 'The 8th' }, + status: 'claiming', + taskType: 'bar', + user: 'dabo', + ownerId: taskManagerId, + }, + ]); + }); + + test('it returns version_conflicts that do not include conflicts that were proceeded against', async () => { + const taskManagerId = uuid.v1(); + const claimOwnershipUntil = new Date(Date.now()); + const runAt = new Date(); + const tasks = [ + mockInstance({ + runAt, + taskType: 'foo', + schedule: undefined, + attempts: 0, + status: TaskStatus.Claiming, + params: { hello: 'world' }, + state: { baby: 'Henhen' }, + user: 'jimbo', + scope: ['reporting'], + ownerId: taskManagerId, + }), + mockInstance({ + runAt, + taskType: 'bar', + schedule: { interval: '5m' }, + attempts: 2, + status: TaskStatus.Claiming, + params: { shazm: 1 }, + state: { henry: 'The 8th' }, + user: 'dabo', + scope: ['reporting', 'ceo'], + ownerId: taskManagerId, + }), + ]; + const maxDocs = 10; + const [ + { + result: { + stats: { tasksUpdated, tasksConflicted, tasksClaimed }, + }, + }, + ] = await testClaimAvailableTasks({ + storeOpts: { + taskManagerId, + }, + taskClaimingOpts: { getCapacity: () => maxDocs }, + claimingOpts: { + claimOwnershipUntil, + }, + hits: [tasks], + // assume there were 20 version conflists, but thanks to `conflicts="proceed"` + // we proceeded to claim tasks + versionConflicts: 20, + }); + + expect(tasksUpdated).toEqual(2); + // ensure we only count conflicts that *may* have counted against max_docs, no more than that + expect(tasksConflicted).toEqual(10 - tasksUpdated!); + expect(tasksClaimed).toEqual(2); + }); + }); + + describe('task events', () => { + function generateTasks(taskManagerId: string) { + const runAt = new Date(); + const tasks = [ + { + id: 'claimed-by-id', + runAt, + taskType: 'foo', + schedule: undefined, + attempts: 0, + status: TaskStatus.Claiming, + params: { hello: 'world' }, + state: { baby: 'Henhen' }, + user: 'jimbo', + scope: ['reporting'], + ownerId: taskManagerId, + startedAt: null, + retryAt: null, + scheduledAt: new Date(), + }, + { + id: 'claimed-by-schedule', + runAt, + taskType: 'bar', + schedule: { interval: '5m' }, + attempts: 2, + status: TaskStatus.Claiming, + params: { shazm: 1 }, + state: { henry: 'The 8th' }, + user: 'dabo', + scope: ['reporting', 'ceo'], + ownerId: taskManagerId, + startedAt: null, + retryAt: null, + scheduledAt: new Date(), + }, + { + id: 'already-running', + runAt, + taskType: 'bar', + schedule: { interval: '5m' }, + attempts: 2, + status: TaskStatus.Running, + params: { shazm: 1 }, + state: { henry: 'The 8th' }, + user: 'dabo', + scope: ['reporting', 'ceo'], + ownerId: taskManagerId, + startedAt: null, + retryAt: null, + scheduledAt: new Date(), + }, + ]; + + return { taskManagerId, runAt, tasks }; + } + + function instantiateStoreWithMockedApiResponses({ + taskManagerId = uuid.v4(), + definitions = taskDefinitions, + getCapacity = () => 10, + tasksClaimed, + }: Partial> & { + taskManagerId?: string; + tasksClaimed?: ConcreteTaskInstance[][]; + } = {}) { + const { runAt, tasks: generatedTasks } = generateTasks(taskManagerId); + const taskCycles = tasksClaimed ?? [generatedTasks]; + + const taskStore = taskStoreMock.create({ taskManagerId }); + taskStore.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + for (const docs of taskCycles) { + taskStore.fetch.mockResolvedValueOnce({ docs }); + taskStore.updateByQuery.mockResolvedValueOnce({ + updated: docs.length, + version_conflicts: 0, + total: docs.length, + }); + } + + taskStore.fetch.mockResolvedValue({ docs: [] }); + taskStore.updateByQuery.mockResolvedValue({ + updated: 0, + version_conflicts: 0, + total: 0, + }); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + definitions, + taskStore, + maxAttempts: 2, + getCapacity, + }); + + return { taskManagerId, runAt, taskClaiming }; + } + + test('emits an event when a task is succesfully claimed by id', async () => { + const { taskManagerId, runAt, taskClaiming } = instantiateStoreWithMockedApiResponses(); + + const promise = taskClaiming.events + .pipe( + filter( + (event: TaskEvent) => event.id === 'claimed-by-id' + ), + take(1) + ) + .toPromise(); + + await getFirstAsPromise( + taskClaiming.claimAvailableTasks({ + claimTasksById: ['claimed-by-id'], + claimOwnershipUntil: new Date(), + }) + ); + + const event = await promise; + expect(event).toMatchObject( + asTaskClaimEvent( + 'claimed-by-id', + asOk({ + id: 'claimed-by-id', + runAt, + taskType: 'foo', + schedule: undefined, + attempts: 0, + status: 'claiming' as TaskStatus, + params: { hello: 'world' }, + state: { baby: 'Henhen' }, + user: 'jimbo', + scope: ['reporting'], + ownerId: taskManagerId, + startedAt: null, + retryAt: null, + scheduledAt: new Date(), + }) + ) + ); + }); + + test('emits an event when a task is succesfully claimed by id by is rejected as it would exceed maxCapacity of its taskType', async () => { + const definitions = new TaskTypeDictionary(mockLogger()); + definitions.registerTaskDefinitions({ + unlimited: { + title: 'unlimited', + createTaskRunner: jest.fn(), + }, + limitedToOne: { + title: 'limitedToOne', + maxConcurrency: 1, + createTaskRunner: jest.fn(), + }, + anotherLimitedToOne: { + title: 'anotherLimitedToOne', + maxConcurrency: 1, + createTaskRunner: jest.fn(), + }, + }); + + const taskManagerId = uuid.v4(); + const { runAt, taskClaiming } = instantiateStoreWithMockedApiResponses({ + taskManagerId, + definitions, + getCapacity: (type) => { + switch (type) { + case 'limitedToOne': + // return 0 as there's already a `limitedToOne` task running + return 0; + default: + return 10; + } + }, + tasksClaimed: [ + // find on first claim cycle + [ + { + id: 'claimed-by-id-limited-concurrency', + runAt: new Date(), + taskType: 'limitedToOne', + schedule: undefined, + attempts: 0, + status: TaskStatus.Claiming, + params: { hello: 'world' }, + state: { baby: 'Henhen' }, + user: 'jimbo', + scope: ['reporting'], + ownerId: taskManagerId, + startedAt: null, + retryAt: null, + scheduledAt: new Date(), + }, + ], + // second cycle + [ + { + id: 'claimed-by-schedule-unlimited', + runAt: new Date(), + taskType: 'unlimited', + schedule: undefined, + attempts: 0, + status: TaskStatus.Claiming, + params: { hello: 'world' }, + state: { baby: 'Henhen' }, + user: 'jimbo', + scope: ['reporting'], + ownerId: taskManagerId, + startedAt: null, + retryAt: null, + scheduledAt: new Date(), + }, + ], + ], + }); + + const promise = taskClaiming.events + .pipe( + filter( + (event: TaskEvent) => + event.id === 'claimed-by-id-limited-concurrency' + ), + take(1) + ) + .toPromise(); + + const [firstCycleResult, secondCycleResult] = await getAllAsPromise( + taskClaiming.claimAvailableTasks({ + claimTasksById: ['claimed-by-id-limited-concurrency'], + claimOwnershipUntil: new Date(), + }) + ); + + expect(firstCycleResult.stats.tasksClaimed).toEqual(0); + expect(firstCycleResult.stats.tasksRejected).toEqual(1); + expect(firstCycleResult.stats.tasksUpdated).toEqual(1); + + // values accumulate from cycle to cycle + expect(secondCycleResult.stats.tasksClaimed).toEqual(0); + expect(secondCycleResult.stats.tasksRejected).toEqual(1); + expect(secondCycleResult.stats.tasksUpdated).toEqual(1); + + const event = await promise; + expect(event).toMatchObject( + asTaskClaimEvent( + 'claimed-by-id-limited-concurrency', + asErr({ + task: some({ + id: 'claimed-by-id-limited-concurrency', + runAt, + taskType: 'limitedToOne', + schedule: undefined, + attempts: 0, + status: 'claiming' as TaskStatus, + params: { hello: 'world' }, + state: { baby: 'Henhen' }, + user: 'jimbo', + scope: ['reporting'], + ownerId: taskManagerId, + startedAt: null, + retryAt: null, + scheduledAt: new Date(), + }), + errorType: TaskClaimErrorType.CLAIMED_BY_ID_OUT_OF_CAPACITY, + }) + ) + ); + }); + + test('emits an event when a task is succesfully by scheduling', async () => { + const { taskManagerId, runAt, taskClaiming } = instantiateStoreWithMockedApiResponses(); + + const promise = taskClaiming.events + .pipe( + filter( + (event: TaskEvent) => + event.id === 'claimed-by-schedule' + ), + take(1) + ) + .toPromise(); + + await getFirstAsPromise( + taskClaiming.claimAvailableTasks({ + claimTasksById: ['claimed-by-id'], + claimOwnershipUntil: new Date(), + }) + ); + + const event = await promise; + expect(event).toMatchObject( + asTaskClaimEvent( + 'claimed-by-schedule', + asOk({ + id: 'claimed-by-schedule', + runAt, + taskType: 'bar', + schedule: { interval: '5m' }, + attempts: 2, + status: 'claiming' as TaskStatus, + params: { shazm: 1 }, + state: { henry: 'The 8th' }, + user: 'dabo', + scope: ['reporting', 'ceo'], + ownerId: taskManagerId, + startedAt: null, + retryAt: null, + scheduledAt: new Date(), + }) + ) + ); + }); + + test('emits an event when the store fails to claim a required task by id', async () => { + const { taskManagerId, runAt, taskClaiming } = instantiateStoreWithMockedApiResponses(); + + const promise = taskClaiming.events + .pipe( + filter( + (event: TaskEvent) => event.id === 'already-running' + ), + take(1) + ) + .toPromise(); + + await getFirstAsPromise( + taskClaiming.claimAvailableTasks({ + claimTasksById: ['already-running'], + claimOwnershipUntil: new Date(), + }) + ); + + const event = await promise; + expect(event).toMatchObject( + asTaskClaimEvent( + 'already-running', + asErr({ + task: some({ + id: 'already-running', + runAt, + taskType: 'bar', + schedule: { interval: '5m' }, + attempts: 2, + status: 'running' as TaskStatus, + params: { shazm: 1 }, + state: { henry: 'The 8th' }, + user: 'dabo', + scope: ['reporting', 'ceo'], + ownerId: taskManagerId, + startedAt: null, + retryAt: null, + scheduledAt: new Date(), + }), + errorType: TaskClaimErrorType.CLAIMED_BY_ID_NOT_IN_CLAIMING_STATUS, + }) + ) + ); + }); + + test('emits an event when the store fails to find a task which was required by id', async () => { + const { taskClaiming } = instantiateStoreWithMockedApiResponses(); + + const promise = taskClaiming.events + .pipe( + filter( + (event: TaskEvent) => event.id === 'unknown-task' + ), + take(1) + ) + .toPromise(); + + await getFirstAsPromise( + taskClaiming.claimAvailableTasks({ + claimTasksById: ['unknown-task'], + claimOwnershipUntil: new Date(), + }) + ); + + const event = await promise; + expect(event).toMatchObject( + asTaskClaimEvent( + 'unknown-task', + asErr({ + task: none, + errorType: TaskClaimErrorType.CLAIMED_BY_ID_NOT_RETURNED, + }) + ) + ); + }); + }); +}); + +function generateFakeTasks(count: number = 1) { + return _.times(count, (index) => mockInstance({ id: `task:id-${index}` })); +} + +function mockInstance(instance: Partial = {}) { + return Object.assign( + { + id: uuid.v4(), + taskType: 'bar', + sequenceNumber: 32, + primaryTerm: 32, + runAt: new Date(), + scheduledAt: new Date(), + startedAt: null, + retryAt: null, + attempts: 0, + params: {}, + scope: ['reporting'], + state: {}, + status: 'idle', + user: 'example', + ownerId: null, + }, + instance + ); +} + +function getFirstAsPromise(obs$: Observable): Promise { + return new Promise((resolve, reject) => { + obs$.subscribe(resolve, reject); + }); +} +function getAllAsPromise(obs$: Observable): Promise { + return new Promise((resolve, reject) => { + obs$.pipe(toArray()).subscribe(resolve, reject); + }); +} diff --git a/x-pack/plugins/task_manager/server/queries/task_claiming.ts b/x-pack/plugins/task_manager/server/queries/task_claiming.ts new file mode 100644 index 00000000000000..b4e11dbf81eb10 --- /dev/null +++ b/x-pack/plugins/task_manager/server/queries/task_claiming.ts @@ -0,0 +1,488 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/* + * This module contains helpers for managing the task manager storage layer. + */ +import apm from 'elastic-apm-node'; +import { Subject, Observable, from, of } from 'rxjs'; +import { map, mergeScan } from 'rxjs/operators'; +import { difference, partition, groupBy, mapValues, countBy, pick } from 'lodash'; +import { some, none } from 'fp-ts/lib/Option'; + +import { Logger } from '../../../../../src/core/server'; + +import { asOk, asErr, Result } from '../lib/result_type'; +import { ConcreteTaskInstance, TaskStatus } from '../task'; +import { + TaskClaim, + asTaskClaimEvent, + TaskClaimErrorType, + startTaskTimer, + TaskTiming, +} from '../task_events'; + +import { + asUpdateByQuery, + shouldBeOneOf, + mustBeAllOf, + filterDownBy, + asPinnedQuery, + matchesClauses, + SortOptions, +} from './query_clauses'; + +import { + updateFieldsAndMarkAsFailed, + IdleTaskWithExpiredRunAt, + InactiveTasks, + RunningOrClaimingTaskWithExpiredRetryAt, + SortByRunAtAndRetryAt, + tasksClaimedByOwner, + tasksOfType, +} from './mark_available_tasks_as_claimed'; +import { TaskTypeDictionary } from '../task_type_dictionary'; +import { + correctVersionConflictsForContinuation, + TaskStore, + UpdateByQueryResult, +} from '../task_store'; +import { FillPoolResult } from '../lib/fill_pool'; + +export interface TaskClaimingOpts { + logger: Logger; + definitions: TaskTypeDictionary; + taskStore: TaskStore; + maxAttempts: number; + getCapacity: (taskType?: string) => number; +} + +export interface OwnershipClaimingOpts { + claimOwnershipUntil: Date; + claimTasksById?: string[]; + size: number; + taskTypes: Set; +} +export type IncrementalOwnershipClaimingOpts = OwnershipClaimingOpts & { + precedingQueryResult: UpdateByQueryResult; +}; +export type IncrementalOwnershipClaimingReduction = ( + opts: IncrementalOwnershipClaimingOpts +) => Promise; + +export interface FetchResult { + docs: ConcreteTaskInstance[]; +} + +export interface ClaimOwnershipResult { + stats: { + tasksUpdated: number; + tasksConflicted: number; + tasksClaimed: number; + tasksRejected: number; + }; + docs: ConcreteTaskInstance[]; + timing?: TaskTiming; +} + +enum BatchConcurrency { + Unlimited, + Limited, +} + +type TaskClaimingBatches = Array; +interface TaskClaimingBatch { + concurrency: Concurrency; + tasksTypes: TaskType; +} +type UnlimitedBatch = TaskClaimingBatch>; +type LimitedBatch = TaskClaimingBatch; + +export class TaskClaiming { + public readonly errors$ = new Subject(); + public readonly maxAttempts: number; + + private definitions: TaskTypeDictionary; + private events$: Subject; + private taskStore: TaskStore; + private getCapacity: (taskType?: string) => number; + private logger: Logger; + private readonly taskClaimingBatchesByType: TaskClaimingBatches; + private readonly taskMaxAttempts: Record; + + /** + * Constructs a new TaskStore. + * @param {TaskClaimingOpts} opts + * @prop {number} maxAttempts - The maximum number of attempts before a task will be abandoned + * @prop {TaskDefinition} definition - The definition of the task being run + */ + constructor(opts: TaskClaimingOpts) { + this.definitions = opts.definitions; + this.maxAttempts = opts.maxAttempts; + this.taskStore = opts.taskStore; + this.getCapacity = opts.getCapacity; + this.logger = opts.logger; + this.taskClaimingBatchesByType = this.partitionIntoClaimingBatches(this.definitions); + this.taskMaxAttempts = Object.fromEntries(this.normalizeMaxAttempts(this.definitions)); + + this.events$ = new Subject(); + } + + private partitionIntoClaimingBatches(definitions: TaskTypeDictionary): TaskClaimingBatches { + const { + limitedConcurrency, + unlimitedConcurrency, + skippedTypes, + } = groupBy(definitions.getAllDefinitions(), (definition) => + definition.maxConcurrency + ? 'limitedConcurrency' + : definition.maxConcurrency === 0 + ? 'skippedTypes' + : 'unlimitedConcurrency' + ); + + if (skippedTypes?.length) { + this.logger.info( + `Task Manager will never claim tasks of the following types as their "maxConcurrency" is set to 0: ${skippedTypes + .map(({ type }) => type) + .join(', ')}` + ); + } + return [ + ...(unlimitedConcurrency + ? [asUnlimited(new Set(unlimitedConcurrency.map(({ type }) => type)))] + : []), + ...(limitedConcurrency ? limitedConcurrency.map(({ type }) => asLimited(type)) : []), + ]; + } + + private normalizeMaxAttempts(definitions: TaskTypeDictionary) { + return new Map( + [...definitions].map(([type, { maxAttempts }]) => [type, maxAttempts || this.maxAttempts]) + ); + } + + private claimingBatchIndex = 0; + private getClaimingBatches() { + // return all batches, starting at index and cycling back to where we began + const batch = [ + ...this.taskClaimingBatchesByType.slice(this.claimingBatchIndex), + ...this.taskClaimingBatchesByType.slice(0, this.claimingBatchIndex), + ]; + // shift claimingBatchIndex by one so that next cycle begins at the next index + this.claimingBatchIndex = (this.claimingBatchIndex + 1) % this.taskClaimingBatchesByType.length; + return batch; + } + + public get events(): Observable { + return this.events$; + } + + private emitEvents = (events: TaskClaim[]) => { + events.forEach((event) => this.events$.next(event)); + }; + + public claimAvailableTasksIfCapacityIsAvailable( + claimingOptions: Omit + ): Observable> { + if (this.getCapacity()) { + return this.claimAvailableTasks(claimingOptions).pipe( + map((claimResult) => asOk(claimResult)) + ); + } + this.logger.debug( + `[Task Ownership]: Task Manager has skipped Claiming Ownership of available tasks at it has ran out Available Workers.` + ); + return of(asErr(FillPoolResult.NoAvailableWorkers)); + } + + public claimAvailableTasks({ + claimOwnershipUntil, + claimTasksById = [], + }: Omit): Observable { + const initialCapacity = this.getCapacity(); + return from(this.getClaimingBatches()).pipe( + mergeScan( + (accumulatedResult, batch) => { + const stopTaskTimer = startTaskTimer(); + const capacity = Math.min( + initialCapacity - accumulatedResult.stats.tasksClaimed, + isLimited(batch) ? this.getCapacity(batch.tasksTypes) : this.getCapacity() + ); + // if we have no more capacity, short circuit here + if (capacity <= 0) { + return of(accumulatedResult); + } + return from( + this.executClaimAvailableTasks({ + claimOwnershipUntil, + claimTasksById: claimTasksById.splice(0, capacity), + size: capacity, + taskTypes: isLimited(batch) ? new Set([batch.tasksTypes]) : batch.tasksTypes, + }).then((result) => { + const { stats, docs } = accumulateClaimOwnershipResults(accumulatedResult, result); + stats.tasksConflicted = correctVersionConflictsForContinuation( + stats.tasksClaimed, + stats.tasksConflicted, + initialCapacity + ); + return { stats, docs, timing: stopTaskTimer() }; + }) + ); + }, + // initialise the accumulation with no results + accumulateClaimOwnershipResults(), + // only run one batch at a time + 1 + ) + ); + } + + private executClaimAvailableTasks = async ({ + claimOwnershipUntil, + claimTasksById = [], + size, + taskTypes, + }: OwnershipClaimingOpts): Promise => { + const claimTasksByIdWithRawIds = this.taskStore.convertToSavedObjectIds(claimTasksById); + const { + updated: tasksUpdated, + version_conflicts: tasksConflicted, + } = await this.markAvailableTasksAsClaimed({ + claimOwnershipUntil, + claimTasksById: claimTasksByIdWithRawIds, + size, + taskTypes, + }); + + const docs = + tasksUpdated > 0 + ? await this.sweepForClaimedTasks(claimTasksByIdWithRawIds, taskTypes, size) + : []; + + const [documentsReturnedById, documentsClaimedBySchedule] = partition(docs, (doc) => + claimTasksById.includes(doc.id) + ); + + const [documentsClaimedById, documentsRequestedButNotClaimed] = partition( + documentsReturnedById, + // we filter the schduled tasks down by status is 'claiming' in the esearch, + // but we do not apply this limitation on tasks claimed by ID so that we can + // provide more detailed error messages when we fail to claim them + (doc) => doc.status === TaskStatus.Claiming + ); + + // count how many tasks we've claimed by ID and validate we have capacity for them to run + const remainingCapacityOfClaimByIdByType = mapValues( + // This means we take the tasks that were claimed by their ID and count them by their type + countBy(documentsClaimedById, (doc) => doc.taskType), + (count, type) => this.getCapacity(type) - count + ); + + const [documentsClaimedByIdWithinCapacity, documentsClaimedByIdOutOfCapacity] = partition( + documentsClaimedById, + (doc) => { + // if we've exceeded capacity, we reject this task + if (remainingCapacityOfClaimByIdByType[doc.taskType] < 0) { + // as we're rejecting this task we can inc the count so that we know + // to keep the next one returned by ID of the same type + remainingCapacityOfClaimByIdByType[doc.taskType]++; + return false; + } + return true; + } + ); + + const documentsRequestedButNotReturned = difference( + claimTasksById, + documentsReturnedById.map((doc) => doc.id) + ); + + this.emitEvents([ + ...documentsClaimedByIdWithinCapacity.map((doc) => asTaskClaimEvent(doc.id, asOk(doc))), + ...documentsClaimedByIdOutOfCapacity.map((doc) => + asTaskClaimEvent( + doc.id, + asErr({ + task: some(doc), + errorType: TaskClaimErrorType.CLAIMED_BY_ID_OUT_OF_CAPACITY, + }) + ) + ), + ...documentsClaimedBySchedule.map((doc) => asTaskClaimEvent(doc.id, asOk(doc))), + ...documentsRequestedButNotClaimed.map((doc) => + asTaskClaimEvent( + doc.id, + asErr({ + task: some(doc), + errorType: TaskClaimErrorType.CLAIMED_BY_ID_NOT_IN_CLAIMING_STATUS, + }) + ) + ), + ...documentsRequestedButNotReturned.map((id) => + asTaskClaimEvent( + id, + asErr({ task: none, errorType: TaskClaimErrorType.CLAIMED_BY_ID_NOT_RETURNED }) + ) + ), + ]); + + const stats = { + tasksUpdated, + tasksConflicted, + tasksRejected: documentsClaimedByIdOutOfCapacity.length, + tasksClaimed: documentsClaimedByIdWithinCapacity.length + documentsClaimedBySchedule.length, + }; + + if (docs.length !== stats.tasksClaimed + stats.tasksRejected) { + this.logger.warn( + `[Task Ownership error]: ${stats.tasksClaimed} tasks were claimed by Kibana, but ${ + docs.length + } task(s) were fetched (${docs.map((doc) => doc.id).join(', ')})` + ); + } + + return { + stats, + docs: [...documentsClaimedByIdWithinCapacity, ...documentsClaimedBySchedule], + }; + }; + + private async markAvailableTasksAsClaimed({ + claimOwnershipUntil, + claimTasksById, + size, + taskTypes, + }: OwnershipClaimingOpts): Promise { + const { taskTypesToSkip = [], taskTypesToClaim = [] } = groupBy( + this.definitions.getAllTypes(), + (type) => (taskTypes.has(type) ? 'taskTypesToClaim' : 'taskTypesToSkip') + ); + + const queryForScheduledTasks = mustBeAllOf( + // Either a task with idle status and runAt <= now or + // status running or claiming with a retryAt <= now. + shouldBeOneOf(IdleTaskWithExpiredRunAt, RunningOrClaimingTaskWithExpiredRetryAt) + ); + + // The documents should be sorted by runAt/retryAt, unless there are pinned + // tasks being queried, in which case we want to sort by score first, and then + // the runAt/retryAt. That way we'll get the pinned tasks first. Note that + // the score seems to favor newer documents rather than older documents, so + // if there are not pinned tasks being queried, we do NOT want to sort by score + // at all, just by runAt/retryAt. + const sort: SortOptions = [SortByRunAtAndRetryAt]; + if (claimTasksById && claimTasksById.length) { + sort.unshift('_score'); + } + + const apmTrans = apm.startTransaction(`taskManager markAvailableTasksAsClaimed`, 'taskManager'); + const result = await this.taskStore.updateByQuery( + asUpdateByQuery({ + query: matchesClauses( + claimTasksById && claimTasksById.length + ? mustBeAllOf(asPinnedQuery(claimTasksById, queryForScheduledTasks)) + : queryForScheduledTasks, + filterDownBy(InactiveTasks) + ), + update: updateFieldsAndMarkAsFailed( + { + ownerId: this.taskStore.taskManagerId, + retryAt: claimOwnershipUntil, + }, + claimTasksById || [], + taskTypesToClaim, + taskTypesToSkip, + pick(this.taskMaxAttempts, taskTypesToClaim) + ), + sort, + }), + { + max_docs: size, + } + ); + + if (apmTrans) apmTrans.end(); + return result; + } + + /** + * Fetches tasks from the index, which are owned by the current Kibana instance + */ + private async sweepForClaimedTasks( + claimTasksById: OwnershipClaimingOpts['claimTasksById'], + taskTypes: Set, + size: number + ): Promise { + const claimedTasksQuery = tasksClaimedByOwner( + this.taskStore.taskManagerId, + tasksOfType([...taskTypes]) + ); + const { docs } = await this.taskStore.fetch({ + query: + claimTasksById && claimTasksById.length + ? asPinnedQuery(claimTasksById, claimedTasksQuery) + : claimedTasksQuery, + size, + sort: SortByRunAtAndRetryAt, + seq_no_primary_term: true, + }); + + return docs; + } +} + +const emptyClaimOwnershipResult = () => { + return { + stats: { + tasksUpdated: 0, + tasksConflicted: 0, + tasksClaimed: 0, + tasksRejected: 0, + }, + docs: [], + }; +}; + +function accumulateClaimOwnershipResults( + prev: ClaimOwnershipResult = emptyClaimOwnershipResult(), + next?: ClaimOwnershipResult +) { + if (next) { + const { stats, docs, timing } = next; + const res = { + stats: { + tasksUpdated: stats.tasksUpdated + prev.stats.tasksUpdated, + tasksConflicted: stats.tasksConflicted + prev.stats.tasksConflicted, + tasksClaimed: stats.tasksClaimed + prev.stats.tasksClaimed, + tasksRejected: stats.tasksRejected + prev.stats.tasksRejected, + }, + docs, + timing, + }; + return res; + } + return prev; +} + +function isLimited( + batch: TaskClaimingBatch +): batch is LimitedBatch { + return batch.concurrency === BatchConcurrency.Limited; +} +function asLimited(tasksType: string): LimitedBatch { + return { + concurrency: BatchConcurrency.Limited, + tasksTypes: tasksType, + }; +} +function asUnlimited(tasksTypes: Set): UnlimitedBatch { + return { + concurrency: BatchConcurrency.Unlimited, + tasksTypes, + }; +} diff --git a/x-pack/plugins/task_manager/server/task.ts b/x-pack/plugins/task_manager/server/task.ts index 04589d696427af..4b86943ff8eca2 100644 --- a/x-pack/plugins/task_manager/server/task.ts +++ b/x-pack/plugins/task_manager/server/task.ts @@ -127,6 +127,16 @@ export const taskDefinitionSchema = schema.object( min: 1, }) ), + /** + * The maximum number tasks of this type that can be run concurrently per Kibana instance. + * Setting this value will force Task Manager to poll for this task type seperatly from other task types + * which can add significant load to the ES cluster, so please use this configuration only when absolutly necesery. + */ + maxConcurrency: schema.maybe( + schema.number({ + min: 0, + }) + ), }, { validate({ timeout }) { diff --git a/x-pack/plugins/task_manager/server/task_events.ts b/x-pack/plugins/task_manager/server/task_events.ts index d3fb68aa367c1b..aecf7c9a2b7e89 100644 --- a/x-pack/plugins/task_manager/server/task_events.ts +++ b/x-pack/plugins/task_manager/server/task_events.ts @@ -23,6 +23,12 @@ export enum TaskEventType { TASK_MANAGER_STAT = 'TASK_MANAGER_STAT', } +export enum TaskClaimErrorType { + CLAIMED_BY_ID_OUT_OF_CAPACITY = 'CLAIMED_BY_ID_OUT_OF_CAPACITY', + CLAIMED_BY_ID_NOT_RETURNED = 'CLAIMED_BY_ID_NOT_RETURNED', + CLAIMED_BY_ID_NOT_IN_CLAIMING_STATUS = 'CLAIMED_BY_ID_NOT_IN_CLAIMING_STATUS', +} + export interface TaskTiming { start: number; stop: number; @@ -47,14 +53,18 @@ export interface RanTask { export type ErroredTask = RanTask & { error: Error; }; +export interface ClaimTaskErr { + task: Option; + errorType: TaskClaimErrorType; +} export type TaskMarkRunning = TaskEvent; export type TaskRun = TaskEvent; -export type TaskClaim = TaskEvent>; +export type TaskClaim = TaskEvent; export type TaskRunRequest = TaskEvent; export type TaskPollingCycle = TaskEvent>; -export type TaskManagerStats = 'load' | 'pollingDelay'; +export type TaskManagerStats = 'load' | 'pollingDelay' | 'claimDuration'; export type TaskManagerStat = TaskEvent; export type OkResultOf = EventType extends TaskEvent @@ -92,7 +102,7 @@ export function asTaskRunEvent( export function asTaskClaimEvent( id: string, - event: Result>, + event: Result, timing?: TaskTiming ): TaskClaim { return { diff --git a/x-pack/plugins/task_manager/server/task_pool.test.ts b/x-pack/plugins/task_manager/server/task_pool.test.ts index 6f82c477dca9e2..05eb7bd1b43e10 100644 --- a/x-pack/plugins/task_manager/server/task_pool.test.ts +++ b/x-pack/plugins/task_manager/server/task_pool.test.ts @@ -15,6 +15,7 @@ import { asOk } from './lib/result_type'; import { SavedObjectsErrorHelpers } from '../../../../src/core/server'; import moment from 'moment'; import uuid from 'uuid'; +import { TaskRunningStage } from './task_running'; describe('TaskPool', () => { test('occupiedWorkers are a sum of running tasks', async () => { @@ -370,6 +371,7 @@ describe('TaskPool', () => { cancel: async () => undefined, markTaskAsRunning: jest.fn(async () => true), run: mockRun(), + stage: TaskRunningStage.PENDING, toString: () => `TaskType "shooooo"`, get expiration() { return new Date(); diff --git a/x-pack/plugins/task_manager/server/task_pool.ts b/x-pack/plugins/task_manager/server/task_pool.ts index e30f9ef3154b2a..14c0c4581a15bb 100644 --- a/x-pack/plugins/task_manager/server/task_pool.ts +++ b/x-pack/plugins/task_manager/server/task_pool.ts @@ -25,6 +25,8 @@ interface Opts { } export enum TaskPoolRunResult { + // This mean we have no Run Result becuse no tasks were Ran in this cycle + NoTaskWereRan = 'NoTaskWereRan', // This means we're running all the tasks we claimed RunningAllClaimedTasks = 'RunningAllClaimedTasks', // This means we're running all the tasks we claimed and we're at capacity @@ -40,7 +42,7 @@ const VERSION_CONFLICT_MESSAGE = 'Task has been claimed by another Kibana servic */ export class TaskPool { private maxWorkers: number = 0; - private running = new Set(); + private tasksInPool = new Map(); private logger: Logger; private load$ = new Subject(); @@ -68,7 +70,7 @@ export class TaskPool { * Gets how many workers are currently in use. */ public get occupiedWorkers() { - return this.running.size; + return this.tasksInPool.size; } /** @@ -93,6 +95,16 @@ export class TaskPool { return this.maxWorkers - this.occupiedWorkers; } + /** + * Gets how many workers are currently in use by type. + */ + public getOccupiedWorkersByType(type: string) { + return [...this.tasksInPool.values()].reduce( + (count, runningTask) => (runningTask.definition.type === type ? ++count : count), + 0 + ); + } + /** * Attempts to run the specified list of tasks. Returns true if it was able * to start every task in the list, false if there was not enough capacity @@ -106,9 +118,11 @@ export class TaskPool { if (tasksToRun.length) { performance.mark('attemptToRun_start'); await Promise.all( - tasksToRun.map( - async (taskRunner) => - await taskRunner + tasksToRun + .filter((taskRunner) => !this.tasksInPool.has(taskRunner.id)) + .map(async (taskRunner) => { + this.tasksInPool.set(taskRunner.id, taskRunner); + return taskRunner .markTaskAsRunning() .then((hasTaskBeenMarkAsRunning: boolean) => hasTaskBeenMarkAsRunning @@ -118,8 +132,8 @@ export class TaskPool { message: VERSION_CONFLICT_MESSAGE, }) ) - .catch((err) => this.handleFailureOfMarkAsRunning(taskRunner, err)) - ) + .catch((err) => this.handleFailureOfMarkAsRunning(taskRunner, err)); + }) ); performance.mark('attemptToRun_stop'); @@ -139,13 +153,12 @@ export class TaskPool { public cancelRunningTasks() { this.logger.debug('Cancelling running tasks.'); - for (const task of this.running) { + for (const task of this.tasksInPool.values()) { this.cancelTask(task); } } private handleMarkAsRunning(taskRunner: TaskRunner) { - this.running.add(taskRunner); taskRunner .run() .catch((err) => { @@ -161,26 +174,31 @@ export class TaskPool { this.logger.warn(errorLogLine); } }) - .then(() => this.running.delete(taskRunner)); + .then(() => this.tasksInPool.delete(taskRunner.id)); } private handleFailureOfMarkAsRunning(task: TaskRunner, err: Error) { + this.tasksInPool.delete(task.id); this.logger.error(`Failed to mark Task ${task.toString()} as running: ${err.message}`); } private cancelExpiredTasks() { - for (const task of this.running) { - if (task.isExpired) { + for (const taskRunner of this.tasksInPool.values()) { + if (taskRunner.isExpired) { this.logger.warn( - `Cancelling task ${task.toString()} as it expired at ${task.expiration.toISOString()}${ - task.startedAt + `Cancelling task ${taskRunner.toString()} as it expired at ${taskRunner.expiration.toISOString()}${ + taskRunner.startedAt ? ` after running for ${durationAsString( - moment.duration(moment(new Date()).utc().diff(task.startedAt)) + moment.duration(moment(new Date()).utc().diff(taskRunner.startedAt)) )}` : `` - }${task.definition.timeout ? ` (with timeout set at ${task.definition.timeout})` : ``}.` + }${ + taskRunner.definition.timeout + ? ` (with timeout set at ${taskRunner.definition.timeout})` + : `` + }.` ); - this.cancelTask(task); + this.cancelTask(taskRunner); } } } @@ -188,7 +206,7 @@ export class TaskPool { private async cancelTask(task: TaskRunner) { try { this.logger.debug(`Cancelling task ${task.toString()}.`); - this.running.delete(task); + this.tasksInPool.delete(task.id); await task.cancel(); } catch (err) { this.logger.error(`Failed to cancel task ${task.toString()}: ${err}`); diff --git a/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts b/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts index dff8c1f24de0ae..5a36d6affe686c 100644 --- a/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts +++ b/x-pack/plugins/task_manager/server/task_running/task_runner.test.ts @@ -9,7 +9,7 @@ import _ from 'lodash'; import sinon from 'sinon'; import { secondsFromNow } from '../lib/intervals'; import { asOk, asErr } from '../lib/result_type'; -import { TaskManagerRunner, TaskRunResult } from '../task_running'; +import { TaskManagerRunner, TaskRunningStage, TaskRunResult } from '../task_running'; import { TaskEvent, asTaskRunEvent, asTaskMarkRunningEvent, TaskRun } from '../task_events'; import { ConcreteTaskInstance, TaskStatus } from '../task'; import { SavedObjectsErrorHelpers } from '../../../../../src/core/server'; @@ -17,6 +17,7 @@ import moment from 'moment'; import { TaskDefinitionRegistry, TaskTypeDictionary } from '../task_type_dictionary'; import { mockLogger } from '../test_utils'; import { throwUnrecoverableError } from './errors'; +import { taskStoreMock } from '../task_store.mock'; const minutesFromNow = (mins: number): Date => secondsFromNow(mins * 60); @@ -29,980 +30,834 @@ beforeAll(() => { afterAll(() => fakeTimer.restore()); describe('TaskManagerRunner', () => { - test('provides details about the task that is running', () => { - const { runner } = testOpts({ - instance: { - id: 'foo', - taskType: 'bar', - }, - }); + const pendingStageSetup = (opts: TestOpts) => testOpts(TaskRunningStage.PENDING, opts); + const readyToRunStageSetup = (opts: TestOpts) => testOpts(TaskRunningStage.READY_TO_RUN, opts); - expect(runner.id).toEqual('foo'); - expect(runner.taskType).toEqual('bar'); - expect(runner.toString()).toEqual('bar "foo"'); - }); - - test('queues a reattempt if the task fails', async () => { - const initialAttempts = _.random(0, 2); - const id = Date.now().toString(); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - params: { a: 'b' }, - state: { hey: 'there' }, - }, - definitions: { - bar: { - title: 'Bar!', - createTaskRunner: () => ({ - async run() { - throw new Error('Dangit!'); - }, - }), + describe('Pending Stage', () => { + test('provides details about the task that is running', async () => { + const { runner } = await pendingStageSetup({ + instance: { + id: 'foo', + taskType: 'bar', }, - }, + }); + + expect(runner.id).toEqual('foo'); + expect(runner.taskType).toEqual('bar'); + expect(runner.toString()).toEqual('bar "foo"'); }); - await runner.run(); + test('calculates retryAt by schedule when running a recurring task', async () => { + const intervalMinutes = 10; + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(0, 2); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: { + interval: `${intervalMinutes}m`, + }, + }, + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, + }, + }); - sinon.assert.calledOnce(store.update); - const instance = store.update.args[0][0]; + await runner.markTaskAsRunning(); - expect(instance.id).toEqual(id); - expect(instance.runAt.getTime()).toEqual(minutesFromNow(initialAttempts * 5).getTime()); - expect(instance.params).toEqual({ a: 'b' }); - expect(instance.state).toEqual({ hey: 'there' }); - }); + expect(store.update).toHaveBeenCalledTimes(1); + const instance = store.update.mock.calls[0][0]; - test('reschedules tasks that have an schedule', async () => { - const { runner, store } = testOpts({ - instance: { - schedule: { interval: '10m' }, - status: TaskStatus.Running, - startedAt: new Date(), - }, - definitions: { - bar: { - title: 'Bar!', - createTaskRunner: () => ({ - async run() { - return { state: {} }; - }, - }), - }, - }, + expect(instance.retryAt!.getTime()).toEqual( + instance.startedAt!.getTime() + intervalMinutes * 60 * 1000 + ); }); - await runner.run(); + test('calculates retryAt by default timout when it exceeds the schedule of a recurring task', async () => { + const intervalSeconds = 20; + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(0, 2); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: { + interval: `${intervalSeconds}s`, + }, + }, + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, + }, + }); - sinon.assert.calledOnce(store.update); - const instance = store.update.args[0][0]; + await runner.markTaskAsRunning(); - expect(instance.runAt.getTime()).toBeGreaterThan(minutesFromNow(9).getTime()); - expect(instance.runAt.getTime()).toBeLessThanOrEqual(minutesFromNow(10).getTime()); - }); + expect(store.update).toHaveBeenCalledTimes(1); + const instance = store.update.mock.calls[0][0]; - test('expiration returns time after which timeout will have elapsed from start', async () => { - const now = moment(); - const { runner } = testOpts({ - instance: { - schedule: { interval: '10m' }, - status: TaskStatus.Running, - startedAt: now.toDate(), - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `1m`, - createTaskRunner: () => ({ - async run() { - return { state: {} }; - }, - }), - }, - }, + expect(instance.retryAt!.getTime()).toEqual(instance.startedAt!.getTime() + 5 * 60 * 1000); }); - await runner.run(); - - expect(runner.isExpired).toBe(false); - expect(runner.expiration).toEqual(now.add(1, 'm').toDate()); - }); - - test('runDuration returns duration which has elapsed since start', async () => { - const now = moment().subtract(30, 's').toDate(); - const { runner } = testOpts({ - instance: { - schedule: { interval: '10m' }, - status: TaskStatus.Running, - startedAt: now, - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `1m`, - createTaskRunner: () => ({ - async run() { - return { state: {} }; - }, - }), + test('calculates retryAt by timeout if it exceeds the schedule when running a recurring task', async () => { + const timeoutMinutes = 1; + const intervalSeconds = 20; + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(0, 2); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: { + interval: `${intervalSeconds}s`, + }, }, - }, - }); + definitions: { + bar: { + title: 'Bar!', + timeout: `${timeoutMinutes}m`, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, + }, + }); - await runner.run(); + await runner.markTaskAsRunning(); - expect(runner.isExpired).toBe(false); - expect(runner.startedAt).toEqual(now); - }); + expect(store.update).toHaveBeenCalledTimes(1); + const instance = store.update.mock.calls[0][0]; - test('reschedules tasks that return a runAt', async () => { - const runAt = minutesFromNow(_.random(1, 10)); - const { runner, store } = testOpts({ - definitions: { - bar: { - title: 'Bar!', - createTaskRunner: () => ({ - async run() { - return { runAt, state: {} }; - }, - }), - }, - }, + expect(instance.retryAt!.getTime()).toEqual( + instance.startedAt!.getTime() + timeoutMinutes * 60 * 1000 + ); }); - await runner.run(); - - sinon.assert.calledOnce(store.update); - sinon.assert.calledWithMatch(store.update, { runAt }); - }); - - test('reschedules tasks that return a schedule', async () => { - const runAt = minutesFromNow(1); - const schedule = { - interval: '1m', - }; - const { runner, store } = testOpts({ - instance: { - status: TaskStatus.Running, - startedAt: new Date(), - }, - definitions: { - bar: { - title: 'Bar!', - createTaskRunner: () => ({ - async run() { - return { schedule, state: {} }; - }, - }), + test('sets startedAt, status, attempts and retryAt when claiming a task', async () => { + const timeoutMinutes = 1; + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(0, 2); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: undefined, }, - }, - }); + definitions: { + bar: { + title: 'Bar!', + timeout: `${timeoutMinutes}m`, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, + }, + }); - await runner.run(); + await runner.markTaskAsRunning(); - sinon.assert.calledOnce(store.update); - sinon.assert.calledWithMatch(store.update, { runAt }); - }); + expect(store.update).toHaveBeenCalledTimes(1); + const instance = store.update.mock.calls[0][0]; - test(`doesn't reschedule recurring tasks that throw an unrecoverable error`, async () => { - const id = _.random(1, 20).toString(); - const error = new Error('Dangit!'); - const onTaskEvent = jest.fn(); - const { runner, store, instance: originalInstance } = testOpts({ - onTaskEvent, - instance: { id, status: TaskStatus.Running, startedAt: new Date() }, - definitions: { - bar: { - title: 'Bar!', - createTaskRunner: () => ({ - async run() { - throwUnrecoverableError(error); - }, - }), - }, - }, + expect(instance.attempts).toEqual(initialAttempts + 1); + expect(instance.status).toBe('running'); + expect(instance.startedAt!.getTime()).toEqual(Date.now()); + expect(instance.retryAt!.getTime()).toEqual( + minutesFromNow((initialAttempts + 1) * 5).getTime() + timeoutMinutes * 60 * 1000 + ); }); - await runner.run(); - - const instance = store.update.args[0][0]; - expect(instance.status).toBe('failed'); - - expect(onTaskEvent).toHaveBeenCalledWith( - withAnyTiming( - asTaskRunEvent( + test('uses getRetry (returning date) to set retryAt when defined', async () => { + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(1, 3); + const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); + const timeoutMinutes = 1; + const getRetryStub = sinon.stub().returns(nextRetry); + const { runner, store } = await pendingStageSetup({ + instance: { id, - asErr({ - error, - task: originalInstance, - result: TaskRunResult.Failed, - }) - ) - ) - ); - expect(onTaskEvent).toHaveBeenCalledTimes(1); - }); - - test('tasks that return runAt override the schedule', async () => { - const runAt = minutesFromNow(_.random(5)); - const { runner, store } = testOpts({ - instance: { - schedule: { interval: '20m' }, - }, - definitions: { - bar: { - title: 'Bar!', - createTaskRunner: () => ({ - async run() { - return { runAt, state: {} }; - }, - }), + attempts: initialAttempts, + schedule: undefined, }, - }, - }); + definitions: { + bar: { + title: 'Bar!', + timeout: `${timeoutMinutes}m`, + getRetry: getRetryStub, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, + }, + }); - await runner.run(); + await runner.markTaskAsRunning(); - sinon.assert.calledOnce(store.update); - sinon.assert.calledWithMatch(store.update, { runAt }); - }); + expect(store.update).toHaveBeenCalledTimes(1); + sinon.assert.calledWith(getRetryStub, initialAttempts + 1); + const instance = store.update.mock.calls[0][0]; - test('removes non-recurring tasks after they complete', async () => { - const id = _.random(1, 20).toString(); - const { runner, store } = testOpts({ - instance: { - id, - schedule: undefined, - }, - definitions: { - bar: { - title: 'Bar!', - createTaskRunner: () => ({ - async run() { - return undefined; - }, - }), - }, - }, + expect(instance.retryAt!.getTime()).toEqual( + new Date(nextRetry.getTime() + timeoutMinutes * 60 * 1000).getTime() + ); }); - await runner.run(); - - sinon.assert.calledOnce(store.remove); - sinon.assert.calledWith(store.remove, id); - }); - - test('cancel cancels the task runner, if it is cancellable', async () => { - let wasCancelled = false; - const { runner, logger } = testOpts({ - definitions: { - bar: { - title: 'Bar!', - createTaskRunner: () => ({ - async run() { - const promise = new Promise((r) => setTimeout(r, 1000)); - fakeTimer.tick(1000); - await promise; - }, - async cancel() { - wasCancelled = true; - }, - }), + test('it returns false when markTaskAsRunning fails due to VERSION_CONFLICT_STATUS', async () => { + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(1, 3); + const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); + const timeoutMinutes = 1; + const getRetryStub = sinon.stub().returns(nextRetry); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: undefined, }, - }, - }); + definitions: { + bar: { + title: 'Bar!', + timeout: `${timeoutMinutes}m`, + getRetry: getRetryStub, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, + }, + }); - const promise = runner.run(); - await Promise.resolve(); - await runner.cancel(); - await promise; + store.update.mockRejectedValue( + SavedObjectsErrorHelpers.decorateConflictError(new Error('repo error')) + ); - expect(wasCancelled).toBeTruthy(); - expect(logger.warn).not.toHaveBeenCalled(); - }); + expect(await runner.markTaskAsRunning()).toEqual(false); + }); - test('debug logs if cancel is called on a non-cancellable task', async () => { - const { runner, logger } = testOpts({ - definitions: { - bar: { - title: 'Bar!', - createTaskRunner: () => ({ - run: async () => undefined, - }), + test('it throw when markTaskAsRunning fails for unexpected reasons', async () => { + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(1, 3); + const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); + const timeoutMinutes = 1; + const getRetryStub = sinon.stub().returns(nextRetry); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: undefined, }, - }, - }); + definitions: { + bar: { + title: 'Bar!', + timeout: `${timeoutMinutes}m`, + getRetry: getRetryStub, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, + }, + }); - const promise = runner.run(); - await runner.cancel(); - await promise; + store.update.mockRejectedValue( + SavedObjectsErrorHelpers.createGenericNotFoundError('type', 'id') + ); - expect(logger.debug).toHaveBeenCalledWith(`The task bar "foo" is not cancellable.`); - }); + return expect(runner.markTaskAsRunning()).rejects.toMatchInlineSnapshot( + `[Error: Saved object [type/id] not found]` + ); + }); - test('sets startedAt, status, attempts and retryAt when claiming a task', async () => { - const timeoutMinutes = 1; - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(0, 2); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: undefined, - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `${timeoutMinutes}m`, - createTaskRunner: () => ({ - run: async () => undefined, - }), + test(`it tries to increment a task's attempts when markTaskAsRunning fails for unexpected reasons`, async () => { + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(1, 3); + const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); + const timeoutMinutes = 1; + const getRetryStub = sinon.stub().returns(nextRetry); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: undefined, }, - }, - }); + definitions: { + bar: { + title: 'Bar!', + timeout: `${timeoutMinutes}m`, + getRetry: getRetryStub, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, + }, + }); - await runner.markTaskAsRunning(); + store.update.mockRejectedValueOnce(SavedObjectsErrorHelpers.createBadRequestError('type')); + store.update.mockResolvedValueOnce( + mockInstance({ + id, + attempts: initialAttempts, + schedule: undefined, + }) + ); - sinon.assert.calledOnce(store.update); - const instance = store.update.args[0][0]; + await expect(runner.markTaskAsRunning()).rejects.toMatchInlineSnapshot( + `[Error: type: Bad Request]` + ); - expect(instance.attempts).toEqual(initialAttempts + 1); - expect(instance.status).toBe('running'); - expect(instance.startedAt.getTime()).toEqual(Date.now()); - expect(instance.retryAt.getTime()).toEqual( - minutesFromNow((initialAttempts + 1) * 5).getTime() + timeoutMinutes * 60 * 1000 - ); - }); + expect(store.update).toHaveBeenCalledWith({ + ...mockInstance({ + id, + attempts: initialAttempts + 1, + schedule: undefined, + }), + status: TaskStatus.Idle, + startedAt: null, + retryAt: null, + ownerId: null, + }); + }); - test('calculates retryAt by schedule when running a recurring task', async () => { - const intervalMinutes = 10; - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(0, 2); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: { - interval: `${intervalMinutes}m`, + test(`it doesnt try to increment a task's attempts when markTaskAsRunning fails for version conflict`, async () => { + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(1, 3); + const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); + const timeoutMinutes = 1; + const getRetryStub = sinon.stub().returns(nextRetry); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: undefined, }, - }, - definitions: { - bar: { - title: 'Bar!', - createTaskRunner: () => ({ - run: async () => undefined, - }), + definitions: { + bar: { + title: 'Bar!', + timeout: `${timeoutMinutes}m`, + getRetry: getRetryStub, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, }, - }, - }); + }); - await runner.markTaskAsRunning(); + store.update.mockRejectedValueOnce( + SavedObjectsErrorHelpers.createConflictError('type', 'id') + ); + store.update.mockResolvedValueOnce( + mockInstance({ + id, + attempts: initialAttempts, + schedule: undefined, + }) + ); - sinon.assert.calledOnce(store.update); - const instance = store.update.args[0][0]; + await expect(runner.markTaskAsRunning()).resolves.toMatchInlineSnapshot(`false`); - expect(instance.retryAt.getTime()).toEqual( - instance.startedAt.getTime() + intervalMinutes * 60 * 1000 - ); - }); + expect(store.update).toHaveBeenCalledTimes(1); + }); - test('calculates retryAt by default timout when it exceeds the schedule of a recurring task', async () => { - const intervalSeconds = 20; - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(0, 2); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: { - interval: `${intervalSeconds}s`, + test(`it doesnt try to increment a task's attempts when markTaskAsRunning fails due to Saved Object not being found`, async () => { + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(1, 3); + const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); + const timeoutMinutes = 1; + const getRetryStub = sinon.stub().returns(nextRetry); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: undefined, }, - }, - definitions: { - bar: { - title: 'Bar!', - createTaskRunner: () => ({ - run: async () => undefined, - }), + definitions: { + bar: { + title: 'Bar!', + timeout: `${timeoutMinutes}m`, + getRetry: getRetryStub, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, }, - }, - }); + }); - await runner.markTaskAsRunning(); + store.update.mockRejectedValueOnce( + SavedObjectsErrorHelpers.createGenericNotFoundError('type', 'id') + ); + store.update.mockResolvedValueOnce( + mockInstance({ + id, + attempts: initialAttempts, + schedule: undefined, + }) + ); - sinon.assert.calledOnce(store.update); - const instance = store.update.args[0][0]; + await expect(runner.markTaskAsRunning()).rejects.toMatchInlineSnapshot( + `[Error: Saved object [type/id] not found]` + ); - expect(instance.retryAt.getTime()).toEqual(instance.startedAt.getTime() + 5 * 60 * 1000); - }); + expect(store.update).toHaveBeenCalledTimes(1); + }); - test('calculates retryAt by timeout if it exceeds the schedule when running a recurring task', async () => { - const timeoutMinutes = 1; - const intervalSeconds = 20; - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(0, 2); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: { - interval: `${intervalSeconds}s`, + test('uses getRetry (returning true) to set retryAt when defined', async () => { + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(1, 3); + const timeoutMinutes = 1; + const getRetryStub = sinon.stub().returns(true); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: undefined, }, - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `${timeoutMinutes}m`, - createTaskRunner: () => ({ - run: async () => undefined, - }), + definitions: { + bar: { + title: 'Bar!', + timeout: `${timeoutMinutes}m`, + getRetry: getRetryStub, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, }, - }, - }); + }); - await runner.markTaskAsRunning(); + await runner.markTaskAsRunning(); - sinon.assert.calledOnce(store.update); - const instance = store.update.args[0][0]; + expect(store.update).toHaveBeenCalledTimes(1); + sinon.assert.calledWith(getRetryStub, initialAttempts + 1); + const instance = store.update.mock.calls[0][0]; - expect(instance.retryAt.getTime()).toEqual( - instance.startedAt.getTime() + timeoutMinutes * 60 * 1000 - ); - }); + const attemptDelay = (initialAttempts + 1) * 5 * 60 * 1000; + const timeoutDelay = timeoutMinutes * 60 * 1000; + expect(instance.retryAt!.getTime()).toEqual( + new Date(Date.now() + attemptDelay + timeoutDelay).getTime() + ); + }); - test('uses getRetry function (returning date) on error when defined', async () => { - const initialAttempts = _.random(1, 3); - const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); - const id = Date.now().toString(); - const getRetryStub = sinon.stub().returns(nextRetry); - const error = new Error('Dangit!'); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - }, - definitions: { - bar: { - title: 'Bar!', - getRetry: getRetryStub, - createTaskRunner: () => ({ - async run() { - throw error; - }, - }), + test('uses getRetry (returning false) to set retryAt when defined', async () => { + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(1, 3); + const timeoutMinutes = 1; + const getRetryStub = sinon.stub().returns(false); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: undefined, }, - }, - }); + definitions: { + bar: { + title: 'Bar!', + timeout: `${timeoutMinutes}m`, + getRetry: getRetryStub, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, + }, + }); - await runner.run(); + await runner.markTaskAsRunning(); - sinon.assert.calledOnce(store.update); - sinon.assert.calledWith(getRetryStub, initialAttempts, error); - const instance = store.update.args[0][0]; + expect(store.update).toHaveBeenCalledTimes(1); + sinon.assert.calledWith(getRetryStub, initialAttempts + 1); + const instance = store.update.mock.calls[0][0]; - expect(instance.runAt.getTime()).toEqual(nextRetry.getTime()); - }); + expect(instance.retryAt!).toBeNull(); + expect(instance.status).toBe('running'); + }); - test('uses getRetry function (returning true) on error when defined', async () => { - const initialAttempts = _.random(1, 3); - const id = Date.now().toString(); - const getRetryStub = sinon.stub().returns(true); - const error = new Error('Dangit!'); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - }, - definitions: { - bar: { - title: 'Bar!', - getRetry: getRetryStub, - createTaskRunner: () => ({ - async run() { - throw error; - }, - }), + test('bypasses getRetry (returning false) of a recurring task to set retryAt when defined', async () => { + const id = _.random(1, 20).toString(); + const initialAttempts = _.random(1, 3); + const timeoutMinutes = 1; + const getRetryStub = sinon.stub().returns(false); + const { runner, store } = await pendingStageSetup({ + instance: { + id, + attempts: initialAttempts, + schedule: { interval: '1m' }, + startedAt: new Date(), }, - }, - }); + definitions: { + bar: { + title: 'Bar!', + timeout: `${timeoutMinutes}m`, + getRetry: getRetryStub, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, + }, + }); - await runner.run(); + await runner.markTaskAsRunning(); - sinon.assert.calledOnce(store.update); - sinon.assert.calledWith(getRetryStub, initialAttempts, error); - const instance = store.update.args[0][0]; + expect(store.update).toHaveBeenCalledTimes(1); + sinon.assert.notCalled(getRetryStub); + const instance = store.update.mock.calls[0][0]; - const expectedRunAt = new Date(Date.now() + initialAttempts * 5 * 60 * 1000); - expect(instance.runAt.getTime()).toEqual(expectedRunAt.getTime()); - }); + const timeoutDelay = timeoutMinutes * 60 * 1000; + expect(instance.retryAt!.getTime()).toEqual(new Date(Date.now() + timeoutDelay).getTime()); + }); - test('uses getRetry function (returning false) on error when defined', async () => { - const initialAttempts = _.random(1, 3); - const id = Date.now().toString(); - const getRetryStub = sinon.stub().returns(false); - const error = new Error('Dangit!'); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - }, - definitions: { - bar: { - title: 'Bar!', - getRetry: getRetryStub, - createTaskRunner: () => ({ - async run() { - throw error; + describe('TaskEvents', () => { + test('emits TaskEvent when a task is marked as running', async () => { + const id = _.random(1, 20).toString(); + const onTaskEvent = jest.fn(); + const { runner, instance, store } = await pendingStageSetup({ + onTaskEvent, + instance: { + id, + }, + definitions: { + bar: { + title: 'Bar!', + timeout: `1m`, + createTaskRunner: () => ({ + run: async () => undefined, + }), }, - }), - }, - }, - }); + }, + }); - await runner.run(); + store.update.mockResolvedValueOnce(instance); - sinon.assert.calledOnce(store.update); - sinon.assert.calledWith(getRetryStub, initialAttempts, error); - const instance = store.update.args[0][0]; + await runner.markTaskAsRunning(); - expect(instance.status).toBe('failed'); - }); + expect(onTaskEvent).toHaveBeenCalledWith(asTaskMarkRunningEvent(id, asOk(instance))); + }); - test('bypasses getRetry function (returning false) on error of a recurring task', async () => { - const initialAttempts = _.random(1, 3); - const id = Date.now().toString(); - const getRetryStub = sinon.stub().returns(false); - const error = new Error('Dangit!'); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: { interval: '1m' }, - startedAt: new Date(), - }, - definitions: { - bar: { - title: 'Bar!', - getRetry: getRetryStub, - createTaskRunner: () => ({ - async run() { - throw error; - }, - }), - }, - }, - }); + test('emits TaskEvent when a task fails to be marked as running', async () => { + expect.assertions(2); - await runner.run(); + const id = _.random(1, 20).toString(); + const onTaskEvent = jest.fn(); + const { runner, store } = await pendingStageSetup({ + onTaskEvent, + instance: { + id, + }, + definitions: { + bar: { + title: 'Bar!', + timeout: `1m`, + createTaskRunner: () => ({ + run: async () => undefined, + }), + }, + }, + }); - sinon.assert.calledOnce(store.update); - sinon.assert.notCalled(getRetryStub); - const instance = store.update.args[0][0]; + store.update.mockRejectedValueOnce(new Error('cant mark as running')); - const nextIntervalDelay = 60000; // 1m - const expectedRunAt = new Date(Date.now() + nextIntervalDelay); - expect(instance.runAt.getTime()).toEqual(expectedRunAt.getTime()); + try { + await runner.markTaskAsRunning(); + } catch (err) { + expect(onTaskEvent).toHaveBeenCalledWith(asTaskMarkRunningEvent(id, asErr(err))); + } + expect(onTaskEvent).toHaveBeenCalledTimes(1); + }); + }); }); - test('uses getRetry (returning date) to set retryAt when defined', async () => { - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(1, 3); - const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); - const timeoutMinutes = 1; - const getRetryStub = sinon.stub().returns(nextRetry); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: undefined, - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `${timeoutMinutes}m`, - getRetry: getRetryStub, - createTaskRunner: () => ({ - run: async () => undefined, - }), + describe('Ready To Run Stage', () => { + test('queues a reattempt if the task fails', async () => { + const initialAttempts = _.random(0, 2); + const id = Date.now().toString(); + const { runner, store } = await readyToRunStageSetup({ + instance: { + id, + attempts: initialAttempts, + params: { a: 'b' }, + state: { hey: 'there' }, }, - }, - }); - - await runner.markTaskAsRunning(); + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + throw new Error('Dangit!'); + }, + }), + }, + }, + }); - sinon.assert.calledOnce(store.update); - sinon.assert.calledWith(getRetryStub, initialAttempts + 1); - const instance = store.update.args[0][0]; + await runner.run(); - expect(instance.retryAt.getTime()).toEqual( - new Date(nextRetry.getTime() + timeoutMinutes * 60 * 1000).getTime() - ); - }); + expect(store.update).toHaveBeenCalledTimes(1); + const instance = store.update.mock.calls[0][0]; - test('it returns false when markTaskAsRunning fails due to VERSION_CONFLICT_STATUS', async () => { - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(1, 3); - const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); - const timeoutMinutes = 1; - const getRetryStub = sinon.stub().returns(nextRetry); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: undefined, - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `${timeoutMinutes}m`, - getRetry: getRetryStub, - createTaskRunner: () => ({ - run: async () => undefined, - }), - }, - }, + expect(instance.id).toEqual(id); + expect(instance.runAt.getTime()).toEqual(minutesFromNow(initialAttempts * 5).getTime()); + expect(instance.params).toEqual({ a: 'b' }); + expect(instance.state).toEqual({ hey: 'there' }); }); - store.update = sinon - .stub() - .throws(SavedObjectsErrorHelpers.decorateConflictError(new Error('repo error'))); - - expect(await runner.markTaskAsRunning()).toEqual(false); - }); - - test('it throw when markTaskAsRunning fails for unexpected reasons', async () => { - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(1, 3); - const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); - const timeoutMinutes = 1; - const getRetryStub = sinon.stub().returns(nextRetry); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: undefined, - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `${timeoutMinutes}m`, - getRetry: getRetryStub, - createTaskRunner: () => ({ - run: async () => undefined, - }), + test('reschedules tasks that have an schedule', async () => { + const { runner, store } = await readyToRunStageSetup({ + instance: { + schedule: { interval: '10m' }, + status: TaskStatus.Running, + startedAt: new Date(), }, - }, - }); + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + return { state: {} }; + }, + }), + }, + }, + }); - store.update = sinon - .stub() - .throws(SavedObjectsErrorHelpers.createGenericNotFoundError('type', 'id')); + await runner.run(); - return expect(runner.markTaskAsRunning()).rejects.toMatchInlineSnapshot( - `[Error: Saved object [type/id] not found]` - ); - }); + expect(store.update).toHaveBeenCalledTimes(1); + const instance = store.update.mock.calls[0][0]; - test(`it tries to increment a task's attempts when markTaskAsRunning fails for unexpected reasons`, async () => { - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(1, 3); - const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); - const timeoutMinutes = 1; - const getRetryStub = sinon.stub().returns(nextRetry); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: undefined, - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `${timeoutMinutes}m`, - getRetry: getRetryStub, - createTaskRunner: () => ({ - run: async () => undefined, - }), - }, - }, + expect(instance.runAt.getTime()).toBeGreaterThan(minutesFromNow(9).getTime()); + expect(instance.runAt.getTime()).toBeLessThanOrEqual(minutesFromNow(10).getTime()); }); - store.update = sinon.stub(); - store.update.onFirstCall().throws(SavedObjectsErrorHelpers.createBadRequestError('type')); - store.update.onSecondCall().resolves(); + test('expiration returns time after which timeout will have elapsed from start', async () => { + const now = moment(); + const { runner } = await readyToRunStageSetup({ + instance: { + schedule: { interval: '10m' }, + status: TaskStatus.Running, + startedAt: now.toDate(), + }, + definitions: { + bar: { + title: 'Bar!', + timeout: `1m`, + createTaskRunner: () => ({ + async run() { + return { state: {} }; + }, + }), + }, + }, + }); - await expect(runner.markTaskAsRunning()).rejects.toMatchInlineSnapshot( - `[Error: type: Bad Request]` - ); + await runner.run(); - sinon.assert.calledWith(store.update, { - ...mockInstance({ - id, - attempts: initialAttempts + 1, - schedule: undefined, - }), - status: TaskStatus.Idle, - startedAt: null, - retryAt: null, - ownerId: null, + expect(runner.isExpired).toBe(false); + expect(runner.expiration).toEqual(now.add(1, 'm').toDate()); }); - }); - test(`it doesnt try to increment a task's attempts when markTaskAsRunning fails for version conflict`, async () => { - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(1, 3); - const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); - const timeoutMinutes = 1; - const getRetryStub = sinon.stub().returns(nextRetry); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: undefined, - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `${timeoutMinutes}m`, - getRetry: getRetryStub, - createTaskRunner: () => ({ - run: async () => undefined, - }), + test('runDuration returns duration which has elapsed since start', async () => { + const now = moment().subtract(30, 's').toDate(); + const { runner } = await readyToRunStageSetup({ + instance: { + schedule: { interval: '10m' }, + status: TaskStatus.Running, + startedAt: now, }, - }, - }); - - store.update = sinon.stub(); - store.update.onFirstCall().throws(SavedObjectsErrorHelpers.createConflictError('type', 'id')); - store.update.onSecondCall().resolves(); - - await expect(runner.markTaskAsRunning()).resolves.toMatchInlineSnapshot(`false`); + definitions: { + bar: { + title: 'Bar!', + timeout: `1m`, + createTaskRunner: () => ({ + async run() { + return { state: {} }; + }, + }), + }, + }, + }); - sinon.assert.calledOnce(store.update); - }); + await runner.run(); - test(`it doesnt try to increment a task's attempts when markTaskAsRunning fails due to Saved Object not being found`, async () => { - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(1, 3); - const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); - const timeoutMinutes = 1; - const getRetryStub = sinon.stub().returns(nextRetry); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: undefined, - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `${timeoutMinutes}m`, - getRetry: getRetryStub, - createTaskRunner: () => ({ - run: async () => undefined, - }), - }, - }, + expect(runner.isExpired).toBe(false); + expect(runner.startedAt).toEqual(now); }); - store.update = sinon.stub(); - store.update - .onFirstCall() - .throws(SavedObjectsErrorHelpers.createGenericNotFoundError('type', 'id')); - store.update.onSecondCall().resolves(); - - await expect(runner.markTaskAsRunning()).rejects.toMatchInlineSnapshot( - `[Error: Saved object [type/id] not found]` - ); + test('reschedules tasks that return a runAt', async () => { + const runAt = minutesFromNow(_.random(1, 10)); + const { runner, store } = await readyToRunStageSetup({ + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + return { runAt, state: {} }; + }, + }), + }, + }, + }); - sinon.assert.calledOnce(store.update); - }); + await runner.run(); - test('uses getRetry (returning true) to set retryAt when defined', async () => { - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(1, 3); - const timeoutMinutes = 1; - const getRetryStub = sinon.stub().returns(true); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: undefined, - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `${timeoutMinutes}m`, - getRetry: getRetryStub, - createTaskRunner: () => ({ - run: async () => undefined, - }), - }, - }, + expect(store.update).toHaveBeenCalledTimes(1); + expect(store.update).toHaveBeenCalledWith(expect.objectContaining({ runAt })); }); - await runner.markTaskAsRunning(); - - sinon.assert.calledOnce(store.update); - sinon.assert.calledWith(getRetryStub, initialAttempts + 1); - const instance = store.update.args[0][0]; + test('reschedules tasks that return a schedule', async () => { + const runAt = minutesFromNow(1); + const schedule = { + interval: '1m', + }; + const { runner, store } = await readyToRunStageSetup({ + instance: { + status: TaskStatus.Running, + startedAt: new Date(), + }, + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + return { schedule, state: {} }; + }, + }), + }, + }, + }); - const attemptDelay = (initialAttempts + 1) * 5 * 60 * 1000; - const timeoutDelay = timeoutMinutes * 60 * 1000; - expect(instance.retryAt.getTime()).toEqual( - new Date(Date.now() + attemptDelay + timeoutDelay).getTime() - ); - }); + await runner.run(); - test('uses getRetry (returning false) to set retryAt when defined', async () => { - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(1, 3); - const timeoutMinutes = 1; - const getRetryStub = sinon.stub().returns(false); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: undefined, - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `${timeoutMinutes}m`, - getRetry: getRetryStub, - createTaskRunner: () => ({ - run: async () => undefined, - }), - }, - }, + expect(store.update).toHaveBeenCalledTimes(1); + expect(store.update).toHaveBeenCalledWith(expect.objectContaining({ runAt })); }); - await runner.markTaskAsRunning(); + test(`doesn't reschedule recurring tasks that throw an unrecoverable error`, async () => { + const id = _.random(1, 20).toString(); + const error = new Error('Dangit!'); + const onTaskEvent = jest.fn(); + const { runner, store, instance: originalInstance } = await readyToRunStageSetup({ + onTaskEvent, + instance: { id, status: TaskStatus.Running, startedAt: new Date() }, + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + throwUnrecoverableError(error); + }, + }), + }, + }, + }); - sinon.assert.calledOnce(store.update); - sinon.assert.calledWith(getRetryStub, initialAttempts + 1); - const instance = store.update.args[0][0]; + await runner.run(); - expect(instance.retryAt).toBeNull(); - expect(instance.status).toBe('running'); - }); + const instance = store.update.mock.calls[0][0]; + expect(instance.status).toBe('failed'); - test('bypasses getRetry (returning false) of a recurring task to set retryAt when defined', async () => { - const id = _.random(1, 20).toString(); - const initialAttempts = _.random(1, 3); - const timeoutMinutes = 1; - const getRetryStub = sinon.stub().returns(false); - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: { interval: '1m' }, - startedAt: new Date(), - }, - definitions: { - bar: { - title: 'Bar!', - timeout: `${timeoutMinutes}m`, - getRetry: getRetryStub, - createTaskRunner: () => ({ - run: async () => undefined, - }), - }, - }, + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming( + asTaskRunEvent( + id, + asErr({ + error, + task: originalInstance, + result: TaskRunResult.Failed, + }) + ) + ) + ); + expect(onTaskEvent).toHaveBeenCalledTimes(1); }); - await runner.markTaskAsRunning(); + test('tasks that return runAt override the schedule', async () => { + const runAt = minutesFromNow(_.random(5)); + const { runner, store } = await readyToRunStageSetup({ + instance: { + schedule: { interval: '20m' }, + }, + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + return { runAt, state: {} }; + }, + }), + }, + }, + }); - sinon.assert.calledOnce(store.update); - sinon.assert.notCalled(getRetryStub); - const instance = store.update.args[0][0]; + await runner.run(); - const timeoutDelay = timeoutMinutes * 60 * 1000; - expect(instance.retryAt.getTime()).toEqual(new Date(Date.now() + timeoutDelay).getTime()); - }); + expect(store.update).toHaveBeenCalledTimes(1); + expect(store.update).toHaveBeenCalledWith(expect.objectContaining({ runAt })); + }); - test('Fails non-recurring task when maxAttempts reached', async () => { - const id = _.random(1, 20).toString(); - const initialAttempts = 3; - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: undefined, - }, - definitions: { - bar: { - title: 'Bar!', - maxAttempts: 3, - createTaskRunner: () => ({ - run: async () => { - throw new Error(); - }, - }), + test('removes non-recurring tasks after they complete', async () => { + const id = _.random(1, 20).toString(); + const { runner, store } = await readyToRunStageSetup({ + instance: { + id, + schedule: undefined, }, - }, - }); + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + return undefined; + }, + }), + }, + }, + }); - await runner.run(); + await runner.run(); - sinon.assert.calledOnce(store.update); - const instance = store.update.args[0][0]; - expect(instance.attempts).toEqual(3); - expect(instance.status).toEqual('failed'); - expect(instance.retryAt).toBeNull(); - expect(instance.runAt.getTime()).toBeLessThanOrEqual(Date.now()); - }); + expect(store.remove).toHaveBeenCalledTimes(1); + expect(store.remove).toHaveBeenCalledWith(id); + }); - test(`Doesn't fail recurring tasks when maxAttempts reached`, async () => { - const id = _.random(1, 20).toString(); - const initialAttempts = 3; - const intervalSeconds = 10; - const { runner, store } = testOpts({ - instance: { - id, - attempts: initialAttempts, - schedule: { interval: `${intervalSeconds}s` }, - startedAt: new Date(), - }, - definitions: { - bar: { - title: 'Bar!', - maxAttempts: 3, - createTaskRunner: () => ({ - run: async () => { - throw new Error(); - }, - }), + test('cancel cancels the task runner, if it is cancellable', async () => { + let wasCancelled = false; + const { runner, logger } = await readyToRunStageSetup({ + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + const promise = new Promise((r) => setTimeout(r, 1000)); + fakeTimer.tick(1000); + await promise; + }, + async cancel() { + wasCancelled = true; + }, + }), + }, }, - }, - }); + }); - await runner.run(); + const promise = runner.run(); + await Promise.resolve(); + await runner.cancel(); + await promise; - sinon.assert.calledOnce(store.update); - const instance = store.update.args[0][0]; - expect(instance.attempts).toEqual(3); - expect(instance.status).toEqual('idle'); - expect(instance.runAt.getTime()).toEqual( - new Date(Date.now() + intervalSeconds * 1000).getTime() - ); - }); + expect(wasCancelled).toBeTruthy(); + expect(logger.warn).not.toHaveBeenCalled(); + }); - describe('TaskEvents', () => { - test('emits TaskEvent when a task is marked as running', async () => { - const id = _.random(1, 20).toString(); - const onTaskEvent = jest.fn(); - const { runner, instance, store } = testOpts({ - onTaskEvent, - instance: { - id, - }, + test('debug logs if cancel is called on a non-cancellable task', async () => { + const { runner, logger } = await readyToRunStageSetup({ definitions: { bar: { title: 'Bar!', - timeout: `1m`, createTaskRunner: () => ({ run: async () => undefined, }), @@ -1010,58 +865,63 @@ describe('TaskManagerRunner', () => { }, }); - store.update.returns(instance); + const promise = runner.run(); + await runner.cancel(); + await promise; - await runner.markTaskAsRunning(); - - expect(onTaskEvent).toHaveBeenCalledWith(asTaskMarkRunningEvent(id, asOk(instance))); + expect(logger.debug).toHaveBeenCalledWith(`The task bar "foo" is not cancellable.`); }); - test('emits TaskEvent when a task fails to be marked as running', async () => { - expect.assertions(2); - - const id = _.random(1, 20).toString(); - const onTaskEvent = jest.fn(); - const { runner, store } = testOpts({ - onTaskEvent, + test('uses getRetry function (returning date) on error when defined', async () => { + const initialAttempts = _.random(1, 3); + const nextRetry = new Date(Date.now() + _.random(15, 100) * 1000); + const id = Date.now().toString(); + const getRetryStub = sinon.stub().returns(nextRetry); + const error = new Error('Dangit!'); + const { runner, store } = await readyToRunStageSetup({ instance: { id, + attempts: initialAttempts, }, definitions: { bar: { title: 'Bar!', - timeout: `1m`, + getRetry: getRetryStub, createTaskRunner: () => ({ - run: async () => undefined, + async run() { + throw error; + }, }), }, }, }); - store.update.throws(new Error('cant mark as running')); + await runner.run(); - try { - await runner.markTaskAsRunning(); - } catch (err) { - expect(onTaskEvent).toHaveBeenCalledWith(asTaskMarkRunningEvent(id, asErr(err))); - } - expect(onTaskEvent).toHaveBeenCalledTimes(1); + expect(store.update).toHaveBeenCalledTimes(1); + sinon.assert.calledWith(getRetryStub, initialAttempts, error); + const instance = store.update.mock.calls[0][0]; + + expect(instance.runAt.getTime()).toEqual(nextRetry.getTime()); }); - test('emits TaskEvent when a task is run successfully', async () => { - const id = _.random(1, 20).toString(); - const onTaskEvent = jest.fn(); - const { runner, instance } = testOpts({ - onTaskEvent, + test('uses getRetry function (returning true) on error when defined', async () => { + const initialAttempts = _.random(1, 3); + const id = Date.now().toString(); + const getRetryStub = sinon.stub().returns(true); + const error = new Error('Dangit!'); + const { runner, store } = await readyToRunStageSetup({ instance: { id, + attempts: initialAttempts, }, definitions: { bar: { title: 'Bar!', + getRetry: getRetryStub, createTaskRunner: () => ({ async run() { - return { state: {} }; + throw error; }, }), }, @@ -1070,27 +930,31 @@ describe('TaskManagerRunner', () => { await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith( - withAnyTiming(asTaskRunEvent(id, asOk({ task: instance, result: TaskRunResult.Success }))) - ); + expect(store.update).toHaveBeenCalledTimes(1); + sinon.assert.calledWith(getRetryStub, initialAttempts, error); + const instance = store.update.mock.calls[0][0]; + + const expectedRunAt = new Date(Date.now() + initialAttempts * 5 * 60 * 1000); + expect(instance.runAt.getTime()).toEqual(expectedRunAt.getTime()); }); - test('emits TaskEvent when a recurring task is run successfully', async () => { - const id = _.random(1, 20).toString(); - const runAt = minutesFromNow(_.random(5)); - const onTaskEvent = jest.fn(); - const { runner, instance } = testOpts({ - onTaskEvent, + test('uses getRetry function (returning false) on error when defined', async () => { + const initialAttempts = _.random(1, 3); + const id = Date.now().toString(); + const getRetryStub = sinon.stub().returns(false); + const error = new Error('Dangit!'); + const { runner, store } = await readyToRunStageSetup({ instance: { id, - schedule: { interval: '1m' }, + attempts: initialAttempts, }, definitions: { bar: { title: 'Bar!', + getRetry: getRetryStub, createTaskRunner: () => ({ async run() { - return { runAt, state: {} }; + throw error; }, }), }, @@ -1099,23 +963,29 @@ describe('TaskManagerRunner', () => { await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith( - withAnyTiming(asTaskRunEvent(id, asOk({ task: instance, result: TaskRunResult.Success }))) - ); + expect(store.update).toHaveBeenCalledTimes(1); + sinon.assert.calledWith(getRetryStub, initialAttempts, error); + const instance = store.update.mock.calls[0][0]; + + expect(instance.status).toBe('failed'); }); - test('emits TaskEvent when a task run throws an error', async () => { - const id = _.random(1, 20).toString(); + test('bypasses getRetry function (returning false) on error of a recurring task', async () => { + const initialAttempts = _.random(1, 3); + const id = Date.now().toString(); + const getRetryStub = sinon.stub().returns(false); const error = new Error('Dangit!'); - const onTaskEvent = jest.fn(); - const { runner, instance } = testOpts({ - onTaskEvent, + const { runner, store } = await readyToRunStageSetup({ instance: { id, + attempts: initialAttempts, + schedule: { interval: '1m' }, + startedAt: new Date(), }, definitions: { bar: { title: 'Bar!', + getRetry: getRetryStub, createTaskRunner: () => ({ async run() { throw error; @@ -1124,33 +994,34 @@ describe('TaskManagerRunner', () => { }, }, }); + await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith( - withAnyTiming( - asTaskRunEvent(id, asErr({ error, task: instance, result: TaskRunResult.RetryScheduled })) - ) - ); - expect(onTaskEvent).toHaveBeenCalledTimes(1); + expect(store.update).toHaveBeenCalledTimes(1); + sinon.assert.notCalled(getRetryStub); + const instance = store.update.mock.calls[0][0]; + + const nextIntervalDelay = 60000; // 1m + const expectedRunAt = new Date(Date.now() + nextIntervalDelay); + expect(instance.runAt.getTime()).toEqual(expectedRunAt.getTime()); }); - test('emits TaskEvent when a task run returns an error', async () => { + test('Fails non-recurring task when maxAttempts reached', async () => { const id = _.random(1, 20).toString(); - const error = new Error('Dangit!'); - const onTaskEvent = jest.fn(); - const { runner, instance } = testOpts({ - onTaskEvent, + const initialAttempts = 3; + const { runner, store } = await readyToRunStageSetup({ instance: { id, - schedule: { interval: '1m' }, - startedAt: new Date(), + attempts: initialAttempts, + schedule: undefined, }, definitions: { bar: { title: 'Bar!', + maxAttempts: 3, createTaskRunner: () => ({ - async run() { - return { error, state: {} }; + run: async () => { + throw new Error(); }, }), }, @@ -1159,31 +1030,32 @@ describe('TaskManagerRunner', () => { await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith( - withAnyTiming( - asTaskRunEvent(id, asErr({ error, task: instance, result: TaskRunResult.RetryScheduled })) - ) - ); - expect(onTaskEvent).toHaveBeenCalledTimes(1); + expect(store.update).toHaveBeenCalledTimes(1); + const instance = store.update.mock.calls[0][0]; + expect(instance.attempts).toEqual(3); + expect(instance.status).toEqual('failed'); + expect(instance.retryAt!).toBeNull(); + expect(instance.runAt.getTime()).toBeLessThanOrEqual(Date.now()); }); - test('emits TaskEvent when a task returns an error and is marked as failed', async () => { + test(`Doesn't fail recurring tasks when maxAttempts reached`, async () => { const id = _.random(1, 20).toString(); - const error = new Error('Dangit!'); - const onTaskEvent = jest.fn(); - const { runner, store, instance: originalInstance } = testOpts({ - onTaskEvent, + const initialAttempts = 3; + const intervalSeconds = 10; + const { runner, store } = await readyToRunStageSetup({ instance: { id, + attempts: initialAttempts, + schedule: { interval: `${intervalSeconds}s` }, startedAt: new Date(), }, definitions: { bar: { title: 'Bar!', - getRetry: () => false, + maxAttempts: 3, createTaskRunner: () => ({ - async run() { - return { error, state: {} }; + run: async () => { + throw new Error(); }, }), }, @@ -1192,29 +1064,190 @@ describe('TaskManagerRunner', () => { await runner.run(); - const instance = store.update.args[0][0]; - expect(instance.status).toBe('failed'); + expect(store.update).toHaveBeenCalledTimes(1); + const instance = store.update.mock.calls[0][0]; + expect(instance.attempts).toEqual(3); + expect(instance.status).toEqual('idle'); + expect(instance.runAt.getTime()).toEqual( + new Date(Date.now() + intervalSeconds * 1000).getTime() + ); + }); - expect(onTaskEvent).toHaveBeenCalledWith( - withAnyTiming( - asTaskRunEvent( + describe('TaskEvents', () => { + test('emits TaskEvent when a task is run successfully', async () => { + const id = _.random(1, 20).toString(); + const onTaskEvent = jest.fn(); + const { runner, instance } = await readyToRunStageSetup({ + onTaskEvent, + instance: { id, - asErr({ - error, - task: originalInstance, - result: TaskRunResult.Failed, - }) + }, + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + return { state: {} }; + }, + }), + }, + }, + }); + + await runner.run(); + + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming(asTaskRunEvent(id, asOk({ task: instance, result: TaskRunResult.Success }))) + ); + }); + + test('emits TaskEvent when a recurring task is run successfully', async () => { + const id = _.random(1, 20).toString(); + const runAt = minutesFromNow(_.random(5)); + const onTaskEvent = jest.fn(); + const { runner, instance } = await readyToRunStageSetup({ + onTaskEvent, + instance: { + id, + schedule: { interval: '1m' }, + }, + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + return { runAt, state: {} }; + }, + }), + }, + }, + }); + + await runner.run(); + + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming(asTaskRunEvent(id, asOk({ task: instance, result: TaskRunResult.Success }))) + ); + }); + + test('emits TaskEvent when a task run throws an error', async () => { + const id = _.random(1, 20).toString(); + const error = new Error('Dangit!'); + const onTaskEvent = jest.fn(); + const { runner, instance } = await readyToRunStageSetup({ + onTaskEvent, + instance: { + id, + }, + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + throw error; + }, + }), + }, + }, + }); + await runner.run(); + + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming( + asTaskRunEvent( + id, + asErr({ error, task: instance, result: TaskRunResult.RetryScheduled }) + ) ) - ) - ); - expect(onTaskEvent).toHaveBeenCalledTimes(1); + ); + expect(onTaskEvent).toHaveBeenCalledTimes(1); + }); + + test('emits TaskEvent when a task run returns an error', async () => { + const id = _.random(1, 20).toString(); + const error = new Error('Dangit!'); + const onTaskEvent = jest.fn(); + const { runner, instance } = await readyToRunStageSetup({ + onTaskEvent, + instance: { + id, + schedule: { interval: '1m' }, + startedAt: new Date(), + }, + definitions: { + bar: { + title: 'Bar!', + createTaskRunner: () => ({ + async run() { + return { error, state: {} }; + }, + }), + }, + }, + }); + + await runner.run(); + + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming( + asTaskRunEvent( + id, + asErr({ error, task: instance, result: TaskRunResult.RetryScheduled }) + ) + ) + ); + expect(onTaskEvent).toHaveBeenCalledTimes(1); + }); + + test('emits TaskEvent when a task returns an error and is marked as failed', async () => { + const id = _.random(1, 20).toString(); + const error = new Error('Dangit!'); + const onTaskEvent = jest.fn(); + const { runner, store, instance: originalInstance } = await readyToRunStageSetup({ + onTaskEvent, + instance: { + id, + startedAt: new Date(), + }, + definitions: { + bar: { + title: 'Bar!', + getRetry: () => false, + createTaskRunner: () => ({ + async run() { + return { error, state: {} }; + }, + }), + }, + }, + }); + + await runner.run(); + + const instance = store.update.mock.calls[0][0]; + expect(instance.status).toBe('failed'); + + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming( + asTaskRunEvent( + id, + asErr({ + error, + task: originalInstance, + result: TaskRunResult.Failed, + }) + ) + ) + ); + expect(onTaskEvent).toHaveBeenCalledTimes(1); + }); }); }); interface TestOpts { instance?: Partial; definitions?: TaskDefinitionRegistry; - onTaskEvent?: (event: TaskEvent) => void; + onTaskEvent?: jest.Mock<(event: TaskEvent) => void>; } function withAnyTiming(taskRun: TaskRun) { @@ -1247,20 +1280,16 @@ describe('TaskManagerRunner', () => { ); } - function testOpts(opts: TestOpts) { + async function testOpts(stage: TaskRunningStage, opts: TestOpts) { const callCluster = sinon.stub(); const createTaskRunner = sinon.stub(); const logger = mockLogger(); const instance = mockInstance(opts.instance); - const store = { - update: sinon.stub(), - remove: sinon.stub(), - maxAttempts: 5, - }; + const store = taskStoreMock.create(); - store.update.returns(instance); + store.update.mockResolvedValue(instance); const definitions = new TaskTypeDictionary(logger); definitions.registerTaskDefinitions({ @@ -1274,6 +1303,7 @@ describe('TaskManagerRunner', () => { } const runner = new TaskManagerRunner({ + defaultMaxAttempts: 5, beforeRun: (context) => Promise.resolve(context), beforeMarkRunning: (context) => Promise.resolve(context), logger, @@ -1283,6 +1313,15 @@ describe('TaskManagerRunner', () => { onTaskEvent: opts.onTaskEvent, }); + if (stage === TaskRunningStage.READY_TO_RUN) { + await runner.markTaskAsRunning(); + // as we're testing the ReadyToRun stage specifically, clear mocks cakked by setup + store.update.mockClear(); + if (opts.onTaskEvent) { + opts.onTaskEvent.mockClear(); + } + } + return { callCluster, createTaskRunner, diff --git a/x-pack/plugins/task_manager/server/task_running/task_runner.ts b/x-pack/plugins/task_manager/server/task_running/task_runner.ts index ad5a2e11409ec8..8e061eae460280 100644 --- a/x-pack/plugins/task_manager/server/task_running/task_runner.ts +++ b/x-pack/plugins/task_manager/server/task_running/task_runner.ts @@ -63,11 +63,22 @@ export interface TaskRunner { markTaskAsRunning: () => Promise; run: () => Promise>; id: string; + stage: string; toString: () => string; } +export enum TaskRunningStage { + PENDING = 'PENDING', + READY_TO_RUN = 'READY_TO_RUN', + RAN = 'RAN', +} +export interface TaskRunning { + timestamp: Date; + stage: Stage; + task: Instance; +} + export interface Updatable { - readonly maxAttempts: number; update(doc: ConcreteTaskInstance): Promise; remove(id: string): Promise; } @@ -78,6 +89,7 @@ type Opts = { instance: ConcreteTaskInstance; store: Updatable; onTaskEvent?: (event: TaskRun | TaskMarkRunning) => void; + defaultMaxAttempts: number; } & Pick; export enum TaskRunResult { @@ -91,6 +103,16 @@ export enum TaskRunResult { Failed = 'Failed', } +// A ConcreteTaskInstance which we *know* has a `startedAt` Date on it +type ConcreteTaskInstanceWithStartedAt = ConcreteTaskInstance & { startedAt: Date }; + +// The three possible stages for a Task Runner - Pending -> ReadyToRun -> Ran +type PendingTask = TaskRunning; +type ReadyToRunTask = TaskRunning; +type RanTask = TaskRunning; + +type TaskRunningInstance = PendingTask | ReadyToRunTask | RanTask; + /** * Runs a background task, ensures that errors are properly handled, * allows for cancellation. @@ -101,13 +123,14 @@ export enum TaskRunResult { */ export class TaskManagerRunner implements TaskRunner { private task?: CancellableTask; - private instance: ConcreteTaskInstance; + private instance: TaskRunningInstance; private definitions: TaskTypeDictionary; private logger: Logger; private bufferedTaskStore: Updatable; private beforeRun: Middleware['beforeRun']; private beforeMarkRunning: Middleware['beforeMarkRunning']; private onTaskEvent: (event: TaskRun | TaskMarkRunning) => void; + private defaultMaxAttempts: number; /** * Creates an instance of TaskManagerRunner. @@ -126,29 +149,38 @@ export class TaskManagerRunner implements TaskRunner { store, beforeRun, beforeMarkRunning, + defaultMaxAttempts, onTaskEvent = identity, }: Opts) { - this.instance = sanitizeInstance(instance); + this.instance = asPending(sanitizeInstance(instance)); this.definitions = definitions; this.logger = logger; this.bufferedTaskStore = store; this.beforeRun = beforeRun; this.beforeMarkRunning = beforeMarkRunning; this.onTaskEvent = onTaskEvent; + this.defaultMaxAttempts = defaultMaxAttempts; } /** * Gets the id of this task instance. */ public get id() { - return this.instance.id; + return this.instance.task.id; } /** * Gets the task type of this task instance. */ public get taskType() { - return this.instance.taskType; + return this.instance.task.taskType; + } + + /** + * Get the stage this TaskRunner is at + */ + public get stage() { + return this.instance.stage; } /** @@ -162,14 +194,21 @@ export class TaskManagerRunner implements TaskRunner { * Gets the time at which this task will expire. */ public get expiration() { - return intervalFromDate(this.instance.startedAt!, this.definition.timeout)!; + return intervalFromDate( + // if the task is running, use it's started at, otherwise use the timestamp at + // which it was last updated + // this allows us to catch tasks that remain in Pending/Finalizing without being + // cleaned up + isReadyToRun(this.instance) ? this.instance.task.startedAt : this.instance.timestamp, + this.definition.timeout + )!; } /** * Gets the duration of the current task run */ public get startedAt() { - return this.instance.startedAt; + return this.instance.task.startedAt; } /** @@ -195,9 +234,16 @@ export class TaskManagerRunner implements TaskRunner { * @returns {Promise>} */ public async run(): Promise> { + if (!isReadyToRun(this.instance)) { + throw new Error( + `Running task ${this} failed as it ${ + isPending(this.instance) ? `isn't ready to be ran` : `has already been ran` + }` + ); + } this.logger.debug(`Running task ${this}`); const modifiedContext = await this.beforeRun({ - taskInstance: this.instance, + taskInstance: this.instance.task, }); const stopTaskTimer = startTaskTimer(); @@ -230,10 +276,16 @@ export class TaskManagerRunner implements TaskRunner { * @returns {Promise} */ public async markTaskAsRunning(): Promise { + if (!isPending(this.instance)) { + throw new Error( + `Marking task ${this} as running has failed as it ${ + isReadyToRun(this.instance) ? `is already running` : `has already been ran` + }` + ); + } performance.mark('markTaskAsRunning_start'); const apmTrans = apm.startTransaction(`taskManager markTaskAsRunning`, 'taskManager'); - apmTrans?.addLabels({ taskType: this.taskType, }); @@ -241,7 +293,7 @@ export class TaskManagerRunner implements TaskRunner { const now = new Date(); try { const { taskInstance } = await this.beforeMarkRunning({ - taskInstance: this.instance, + taskInstance: this.instance.task, }); const attempts = taskInstance.attempts + 1; @@ -258,22 +310,29 @@ export class TaskManagerRunner implements TaskRunner { ); } - this.instance = await this.bufferedTaskStore.update({ - ...taskInstance, - status: TaskStatus.Running, - startedAt: now, - attempts, - retryAt: - (this.instance.schedule - ? maxIntervalFromDate(now, this.instance.schedule!.interval, this.definition.timeout) - : this.getRetryDelay({ - attempts, - // Fake an error. This allows retry logic when tasks keep timing out - // and lets us set a proper "retryAt" value each time. - error: new Error('Task timeout'), - addDuration: this.definition.timeout, - })) ?? null, - }); + this.instance = asReadyToRun( + (await this.bufferedTaskStore.update({ + ...taskInstance, + status: TaskStatus.Running, + startedAt: now, + attempts, + retryAt: + (this.instance.task.schedule + ? maxIntervalFromDate( + now, + this.instance.task.schedule.interval, + this.definition.timeout + ) + : this.getRetryDelay({ + attempts, + // Fake an error. This allows retry logic when tasks keep timing out + // and lets us set a proper "retryAt" value each time. + error: new Error('Task timeout'), + addDuration: this.definition.timeout, + })) ?? null, + // This is a safe convertion as we're setting the startAt above + })) as ConcreteTaskInstanceWithStartedAt + ); const timeUntilClaimExpiresAfterUpdate = howManyMsUntilOwnershipClaimExpires( ownershipClaimedUntil @@ -288,7 +347,7 @@ export class TaskManagerRunner implements TaskRunner { if (apmTrans) apmTrans.end('success'); performanceStopMarkingTaskAsRunning(); - this.onTaskEvent(asTaskMarkRunningEvent(this.id, asOk(this.instance))); + this.onTaskEvent(asTaskMarkRunningEvent(this.id, asOk(this.instance.task))); return true; } catch (error) { if (apmTrans) apmTrans.end('failure'); @@ -299,7 +358,7 @@ export class TaskManagerRunner implements TaskRunner { // try to release claim as an unknown failure prevented us from marking as running mapErr((errReleaseClaim: Error) => { this.logger.error( - `[Task Runner] Task ${this.instance.id} failed to release claim after failure: ${errReleaseClaim}` + `[Task Runner] Task ${this.id} failed to release claim after failure: ${errReleaseClaim}` ); }, await this.releaseClaimAndIncrementAttempts()); } @@ -336,9 +395,9 @@ export class TaskManagerRunner implements TaskRunner { private async releaseClaimAndIncrementAttempts(): Promise> { return promiseResult( this.bufferedTaskStore.update({ - ...this.instance, + ...this.instance.task, status: TaskStatus.Idle, - attempts: this.instance.attempts + 1, + attempts: this.instance.task.attempts + 1, startedAt: null, retryAt: null, ownerId: null, @@ -347,12 +406,12 @@ export class TaskManagerRunner implements TaskRunner { } private shouldTryToScheduleRetry(): boolean { - if (this.instance.schedule) { + if (this.instance.task.schedule) { return true; } - const maxAttempts = this.definition.maxAttempts || this.bufferedTaskStore.maxAttempts; - return this.instance.attempts < maxAttempts; + const maxAttempts = this.definition.maxAttempts || this.defaultMaxAttempts; + return this.instance.task.attempts < maxAttempts; } private rescheduleFailedRun = ( @@ -361,7 +420,7 @@ export class TaskManagerRunner implements TaskRunner { const { state, error } = failureResult; if (this.shouldTryToScheduleRetry() && !isUnrecoverableError(error)) { // if we're retrying, keep the number of attempts - const { schedule, attempts } = this.instance; + const { schedule, attempts } = this.instance.task; const reschedule = failureResult.runAt ? { runAt: failureResult.runAt } @@ -399,7 +458,7 @@ export class TaskManagerRunner implements TaskRunner { // if retrying is possible (new runAt) or this is an recurring task - reschedule mapOk( ({ runAt, schedule: reschedule, state, attempts = 0 }: Partial) => { - const { startedAt, schedule } = this.instance; + const { startedAt, schedule } = this.instance.task; return asOk({ runAt: runAt || intervalFromDate(startedAt!, reschedule?.interval ?? schedule?.interval)!, @@ -413,16 +472,18 @@ export class TaskManagerRunner implements TaskRunner { unwrap )(result); - await this.bufferedTaskStore.update( - defaults( - { - ...fieldUpdates, - // reset fields that track the lifecycle of the concluded `task run` - startedAt: null, - retryAt: null, - ownerId: null, - }, - this.instance + this.instance = asRan( + await this.bufferedTaskStore.update( + defaults( + { + ...fieldUpdates, + // reset fields that track the lifecycle of the concluded `task run` + startedAt: null, + retryAt: null, + ownerId: null, + }, + this.instance.task + ) ) ); @@ -436,7 +497,8 @@ export class TaskManagerRunner implements TaskRunner { private async processResultWhenDone(): Promise { // not a recurring task: clean up by removing the task instance from store try { - await this.bufferedTaskStore.remove(this.instance.id); + await this.bufferedTaskStore.remove(this.id); + this.instance = asRan(this.instance.task); } catch (err) { if (err.statusCode === 404) { this.logger.warn(`Task cleanup of ${this} failed in processing. Was remove called twice?`); @@ -451,7 +513,7 @@ export class TaskManagerRunner implements TaskRunner { result: Result, taskTiming: TaskTiming ): Promise> { - const task = this.instance; + const { task } = this.instance; await eitherAsync( result, async ({ runAt, schedule }: SuccessfulRunResult) => { @@ -528,3 +590,38 @@ function performanceStopMarkingTaskAsRunning() { 'markTaskAsRunning_stop' ); } + +// A type that extracts the Instance type out of TaskRunningStage +// This helps us to better communicate to the developer what the expected "stage" +// in a specific place in the code might be +type InstanceOf = T extends TaskRunning ? I : never; + +function isPending(taskRunning: TaskRunningInstance): taskRunning is PendingTask { + return taskRunning.stage === TaskRunningStage.PENDING; +} +function asPending(task: InstanceOf): PendingTask { + return { + timestamp: new Date(), + stage: TaskRunningStage.PENDING, + task, + }; +} +function isReadyToRun(taskRunning: TaskRunningInstance): taskRunning is ReadyToRunTask { + return taskRunning.stage === TaskRunningStage.READY_TO_RUN; +} +function asReadyToRun( + task: InstanceOf +): ReadyToRunTask { + return { + timestamp: new Date(), + stage: TaskRunningStage.READY_TO_RUN, + task, + }; +} +function asRan(task: InstanceOf): RanTask { + return { + timestamp: new Date(), + stage: TaskRunningStage.RAN, + task, + }; +} diff --git a/x-pack/plugins/task_manager/server/task_scheduling.test.ts b/x-pack/plugins/task_manager/server/task_scheduling.test.ts index e495d416d5ab86..b142f2091291ed 100644 --- a/x-pack/plugins/task_manager/server/task_scheduling.test.ts +++ b/x-pack/plugins/task_manager/server/task_scheduling.test.ts @@ -7,13 +7,14 @@ import _ from 'lodash'; import { Subject } from 'rxjs'; -import { none } from 'fp-ts/lib/Option'; +import { none, some } from 'fp-ts/lib/Option'; import { asTaskMarkRunningEvent, asTaskRunEvent, asTaskClaimEvent, asTaskRunRequestEvent, + TaskClaimErrorType, } from './task_events'; import { TaskLifecycleEvent } from './polling_lifecycle'; import { taskPollingLifecycleMock } from './polling_lifecycle.mock'; @@ -24,17 +25,28 @@ import { createInitialMiddleware } from './lib/middleware'; import { taskStoreMock } from './task_store.mock'; import { TaskRunResult } from './task_running'; import { mockLogger } from './test_utils'; +import { TaskTypeDictionary } from './task_type_dictionary'; describe('TaskScheduling', () => { const mockTaskStore = taskStoreMock.create({}); const mockTaskManager = taskPollingLifecycleMock.create({}); + const definitions = new TaskTypeDictionary(mockLogger()); const taskSchedulingOpts = { taskStore: mockTaskStore, taskPollingLifecycle: mockTaskManager, logger: mockLogger(), middleware: createInitialMiddleware(), + definitions, }; + definitions.registerTaskDefinitions({ + foo: { + title: 'foo', + maxConcurrency: 2, + createTaskRunner: jest.fn(), + }, + }); + beforeEach(() => { jest.resetAllMocks(); }); @@ -114,7 +126,7 @@ describe('TaskScheduling', () => { const result = taskScheduling.runNow(id); - const task = { id } as ConcreteTaskInstance; + const task = mockTask({ id }); events$.next(asTaskRunEvent(id, asOk({ task, result: TaskRunResult.Success }))); return expect(result).resolves.toEqual({ id }); @@ -131,7 +143,7 @@ describe('TaskScheduling', () => { const result = taskScheduling.runNow(id); - const task = { id } as ConcreteTaskInstance; + const task = mockTask({ id }); events$.next(asTaskClaimEvent(id, asOk(task))); events$.next(asTaskMarkRunningEvent(id, asOk(task))); events$.next( @@ -161,7 +173,7 @@ describe('TaskScheduling', () => { const result = taskScheduling.runNow(id); - const task = { id } as ConcreteTaskInstance; + const task = mockTask({ id }); events$.next(asTaskClaimEvent(id, asOk(task))); events$.next(asTaskMarkRunningEvent(id, asErr(new Error('some thing gone wrong')))); @@ -183,7 +195,12 @@ describe('TaskScheduling', () => { const result = taskScheduling.runNow(id); - events$.next(asTaskClaimEvent(id, asErr(none))); + events$.next( + asTaskClaimEvent( + id, + asErr({ task: none, errorType: TaskClaimErrorType.CLAIMED_BY_ID_NOT_RETURNED }) + ) + ); await expect(result).rejects.toEqual( new Error(`Failed to run task "${id}" as it does not exist`) @@ -192,6 +209,34 @@ describe('TaskScheduling', () => { expect(mockTaskStore.getLifecycle).toHaveBeenCalledWith(id); }); + test('when a task claim due to insufficient capacity we return an explciit message', async () => { + const events$ = new Subject(); + const id = '01ddff11-e88a-4d13-bc4e-256164e755e2'; + + mockTaskStore.getLifecycle.mockResolvedValue(TaskLifecycleResult.NotFound); + + const taskScheduling = new TaskScheduling({ + ...taskSchedulingOpts, + taskPollingLifecycle: taskPollingLifecycleMock.create({ events$ }), + }); + + const result = taskScheduling.runNow(id); + + const task = mockTask({ id, taskType: 'foo' }); + events$.next( + asTaskClaimEvent( + id, + asErr({ task: some(task), errorType: TaskClaimErrorType.CLAIMED_BY_ID_OUT_OF_CAPACITY }) + ) + ); + + await expect(result).rejects.toEqual( + new Error( + `Failed to run task "${id}" as we would exceed the max concurrency of "${task.taskType}" which is 2. Rescheduled the task to ensure it is picked up as soon as possible.` + ) + ); + }); + test('when a task claim fails we ensure the task isnt already claimed', async () => { const events$ = new Subject(); const id = '01ddff11-e88a-4d13-bc4e-256164e755e2'; @@ -205,7 +250,12 @@ describe('TaskScheduling', () => { const result = taskScheduling.runNow(id); - events$.next(asTaskClaimEvent(id, asErr(none))); + events$.next( + asTaskClaimEvent( + id, + asErr({ task: none, errorType: TaskClaimErrorType.CLAIMED_BY_ID_NOT_RETURNED }) + ) + ); await expect(result).rejects.toEqual( new Error(`Failed to run task "${id}" as it is currently running`) @@ -227,7 +277,12 @@ describe('TaskScheduling', () => { const result = taskScheduling.runNow(id); - events$.next(asTaskClaimEvent(id, asErr(none))); + events$.next( + asTaskClaimEvent( + id, + asErr({ task: none, errorType: TaskClaimErrorType.CLAIMED_BY_ID_NOT_RETURNED }) + ) + ); await expect(result).rejects.toEqual( new Error(`Failed to run task "${id}" as it is currently running`) @@ -270,7 +325,12 @@ describe('TaskScheduling', () => { const result = taskScheduling.runNow(id); - events$.next(asTaskClaimEvent(id, asErr(none))); + events$.next( + asTaskClaimEvent( + id, + asErr({ task: none, errorType: TaskClaimErrorType.CLAIMED_BY_ID_NOT_RETURNED }) + ) + ); await expect(result).rejects.toMatchInlineSnapshot( `[Error: Failed to run task "01ddff11-e88a-4d13-bc4e-256164e755e2" for unknown reason (Current Task Lifecycle is "idle")]` @@ -292,7 +352,12 @@ describe('TaskScheduling', () => { const result = taskScheduling.runNow(id); - events$.next(asTaskClaimEvent(id, asErr(none))); + events$.next( + asTaskClaimEvent( + id, + asErr({ task: none, errorType: TaskClaimErrorType.CLAIMED_BY_ID_NOT_RETURNED }) + ) + ); await expect(result).rejects.toMatchInlineSnapshot( `[Error: Failed to run task "01ddff11-e88a-4d13-bc4e-256164e755e2" for unknown reason (Current Task Lifecycle is "failed")]` @@ -313,7 +378,7 @@ describe('TaskScheduling', () => { const result = taskScheduling.runNow(id); - const task = { id } as ConcreteTaskInstance; + const task = mockTask({ id }); const otherTask = { id: differentTask } as ConcreteTaskInstance; events$.next(asTaskClaimEvent(id, asOk(task))); events$.next(asTaskClaimEvent(differentTask, asOk(otherTask))); @@ -338,3 +403,23 @@ describe('TaskScheduling', () => { }); }); }); + +function mockTask(overrides: Partial = {}): ConcreteTaskInstance { + return { + id: 'claimed-by-id', + runAt: new Date(), + taskType: 'foo', + schedule: undefined, + attempts: 0, + status: TaskStatus.Claiming, + params: { hello: 'world' }, + state: { baby: 'Henhen' }, + user: 'jimbo', + scope: ['reporting'], + ownerId: '', + startedAt: null, + retryAt: null, + scheduledAt: new Date(), + ...overrides, + }; +} diff --git a/x-pack/plugins/task_manager/server/task_scheduling.ts b/x-pack/plugins/task_manager/server/task_scheduling.ts index 8ccedb85c560df..29e83ec911b795 100644 --- a/x-pack/plugins/task_manager/server/task_scheduling.ts +++ b/x-pack/plugins/task_manager/server/task_scheduling.ts @@ -8,7 +8,7 @@ import { filter } from 'rxjs/operators'; import { pipe } from 'fp-ts/lib/pipeable'; -import { Option, map as mapOptional, getOrElse } from 'fp-ts/lib/Option'; +import { Option, map as mapOptional, getOrElse, isSome } from 'fp-ts/lib/Option'; import { Logger } from '../../../../src/core/server'; import { asOk, either, map, mapErr, promiseResult } from './lib/result_type'; @@ -20,6 +20,8 @@ import { ErroredTask, OkResultOf, ErrResultOf, + ClaimTaskErr, + TaskClaimErrorType, } from './task_events'; import { Middleware } from './lib/middleware'; import { @@ -33,6 +35,7 @@ import { import { TaskStore } from './task_store'; import { ensureDeprecatedFieldsAreCorrected } from './lib/correct_deprecated_fields'; import { TaskLifecycleEvent, TaskPollingLifecycle } from './polling_lifecycle'; +import { TaskTypeDictionary } from './task_type_dictionary'; const VERSION_CONFLICT_STATUS = 409; @@ -41,6 +44,7 @@ export interface TaskSchedulingOpts { taskStore: TaskStore; taskPollingLifecycle: TaskPollingLifecycle; middleware: Middleware; + definitions: TaskTypeDictionary; } interface RunNowResult { @@ -52,6 +56,7 @@ export class TaskScheduling { private taskPollingLifecycle: TaskPollingLifecycle; private logger: Logger; private middleware: Middleware; + private definitions: TaskTypeDictionary; /** * Initializes the task manager, preventing any further addition of middleware, @@ -63,6 +68,7 @@ export class TaskScheduling { this.middleware = opts.middleware; this.taskPollingLifecycle = opts.taskPollingLifecycle; this.store = opts.taskStore; + this.definitions = opts.definitions; } /** @@ -122,10 +128,27 @@ export class TaskScheduling { .pipe(filter(({ id }: TaskLifecycleEvent) => id === taskId)) .subscribe((taskEvent: TaskLifecycleEvent) => { if (isTaskClaimEvent(taskEvent)) { - mapErr(async (error: Option) => { + mapErr(async (error: ClaimTaskErr) => { // reject if any error event takes place for the requested task subscription.unsubscribe(); - return reject(await this.identifyTaskFailureReason(taskId, error)); + if ( + isSome(error.task) && + error.errorType === TaskClaimErrorType.CLAIMED_BY_ID_OUT_OF_CAPACITY + ) { + const task = error.task.value; + const definition = this.definitions.get(task.taskType); + return reject( + new Error( + `Failed to run task "${taskId}" as we would exceed the max concurrency of "${ + definition?.title ?? task.taskType + }" which is ${ + definition?.maxConcurrency + }. Rescheduled the task to ensure it is picked up as soon as possible.` + ) + ); + } else { + return reject(await this.identifyTaskFailureReason(taskId, error.task)); + } }, taskEvent.event); } else { either, ErrResultOf>( diff --git a/x-pack/plugins/task_manager/server/task_store.mock.ts b/x-pack/plugins/task_manager/server/task_store.mock.ts index d4f863af6fe3b1..38d570f96220bc 100644 --- a/x-pack/plugins/task_manager/server/task_store.mock.ts +++ b/x-pack/plugins/task_manager/server/task_store.mock.ts @@ -5,38 +5,27 @@ * 2.0. */ -import { Observable, Subject } from 'rxjs'; -import { TaskClaim } from './task_events'; - import { TaskStore } from './task_store'; interface TaskStoreOptions { - maxAttempts?: number; index?: string; taskManagerId?: string; - events?: Observable; } export const taskStoreMock = { - create({ - maxAttempts = 0, - index = '', - taskManagerId = '', - events = new Subject(), - }: TaskStoreOptions) { + create({ index = '', taskManagerId = '' }: TaskStoreOptions = {}) { const mocked = ({ + convertToSavedObjectIds: jest.fn(), update: jest.fn(), remove: jest.fn(), schedule: jest.fn(), - claimAvailableTasks: jest.fn(), bulkUpdate: jest.fn(), get: jest.fn(), getLifecycle: jest.fn(), fetch: jest.fn(), aggregate: jest.fn(), - maxAttempts, + updateByQuery: jest.fn(), index, taskManagerId, - events, } as unknown) as jest.Mocked; return mocked; }, diff --git a/x-pack/plugins/task_manager/server/task_store.test.ts b/x-pack/plugins/task_manager/server/task_store.test.ts index dbf13a5f272810..25ee8cb0e23745 100644 --- a/x-pack/plugins/task_manager/server/task_store.test.ts +++ b/x-pack/plugins/task_manager/server/task_store.test.ts @@ -6,19 +6,16 @@ */ import _ from 'lodash'; -import uuid from 'uuid'; -import { filter, take, first } from 'rxjs/operators'; -import { Option, some, none } from 'fp-ts/lib/Option'; +import { first } from 'rxjs/operators'; import { TaskInstance, TaskStatus, TaskLifecycleResult, SerializedConcreteTaskInstance, - ConcreteTaskInstance, } from './task'; import { elasticsearchServiceMock } from '../../../../src/core/server/mocks'; -import { StoreOpts, OwnershipClaimingOpts, TaskStore, SearchOpts } from './task_store'; +import { TaskStore, SearchOpts } from './task_store'; import { savedObjectsRepositoryMock } from 'src/core/server/mocks'; import { SavedObjectsSerializer, @@ -26,12 +23,8 @@ import { SavedObjectAttributes, SavedObjectsErrorHelpers, } from 'src/core/server'; -import { asTaskClaimEvent, TaskEvent } from './task_events'; -import { asOk, asErr } from './lib/result_type'; import { TaskTypeDictionary } from './task_type_dictionary'; import { RequestEvent } from '@elastic/elasticsearch/lib/Transport'; -import { Search, UpdateByQuery } from '@elastic/elasticsearch/api/requestParams'; -import { BoolClauseWithAnyCondition, TermFilter } from './queries/query_clauses'; import { mockLogger } from './test_utils'; const savedObjectsClient = savedObjectsRepositoryMock.create(); @@ -76,7 +69,6 @@ describe('TaskStore', () => { taskManagerId: '', serializer, esClient: elasticsearchServiceMock.createClusterClient().asInternalUser, - maxAttempts: 2, definitions: taskDefinitions, savedObjectsRepository: savedObjectsClient, }); @@ -209,7 +201,6 @@ describe('TaskStore', () => { taskManagerId: '', serializer, esClient, - maxAttempts: 2, definitions: taskDefinitions, savedObjectsRepository: savedObjectsClient, }); @@ -265,809 +256,6 @@ describe('TaskStore', () => { }); }); - describe('claimAvailableTasks', () => { - async function testClaimAvailableTasks({ - opts = {}, - hits = generateFakeTasks(1), - claimingOpts, - versionConflicts = 2, - }: { - opts: Partial; - hits?: unknown[]; - claimingOpts: OwnershipClaimingOpts; - versionConflicts?: number; - }) { - const esClient = elasticsearchServiceMock.createClusterClient().asInternalUser; - esClient.search.mockResolvedValue(asApiResponse({ hits: { hits } })); - esClient.updateByQuery.mockResolvedValue( - asApiResponse({ - total: hits.length + versionConflicts, - updated: hits.length, - version_conflicts: versionConflicts, - }) - ); - - const store = new TaskStore({ - esClient, - maxAttempts: 2, - definitions: taskDefinitions, - serializer, - savedObjectsRepository: savedObjectsClient, - taskManagerId: '', - index: '', - ...opts, - }); - - const result = await store.claimAvailableTasks(claimingOpts); - - expect(esClient.updateByQuery.mock.calls[0][0]).toMatchObject({ - max_docs: claimingOpts.size, - }); - expect(esClient.search.mock.calls[0][0]).toMatchObject({ body: { size: claimingOpts.size } }); - return { - result, - args: { - search: esClient.search.mock.calls[0][0]! as Search<{ - query: BoolClauseWithAnyCondition; - size: number; - sort: string | string[]; - }>, - updateByQuery: esClient.updateByQuery.mock.calls[0][0]! as UpdateByQuery<{ - query: BoolClauseWithAnyCondition; - size: number; - sort: string | string[]; - script: object; - }>, - }, - }; - } - - test('it returns normally with no tasks when the index does not exist.', async () => { - const esClient = elasticsearchServiceMock.createClusterClient().asInternalUser; - esClient.updateByQuery.mockResolvedValue( - asApiResponse({ - total: 0, - updated: 0, - }) - ); - const store = new TaskStore({ - index: 'tasky', - taskManagerId: '', - serializer, - esClient, - definitions: taskDefinitions, - maxAttempts: 2, - savedObjectsRepository: savedObjectsClient, - }); - const { docs } = await store.claimAvailableTasks({ - claimOwnershipUntil: new Date(), - size: 10, - }); - expect(esClient.updateByQuery.mock.calls[0][0]).toMatchObject({ - ignore_unavailable: true, - max_docs: 10, - }); - expect(docs.length).toBe(0); - }); - - test('it filters claimed tasks down by supported types, maxAttempts, status, and runAt', async () => { - const maxAttempts = _.random(2, 43); - const customMaxAttempts = _.random(44, 100); - - const definitions = new TaskTypeDictionary(mockLogger()); - definitions.registerTaskDefinitions({ - foo: { - title: 'foo', - createTaskRunner: jest.fn(), - }, - bar: { - title: 'bar', - maxAttempts: customMaxAttempts, - createTaskRunner: jest.fn(), - }, - }); - - const { - args: { - updateByQuery: { body: { query, sort } = {} }, - }, - } = await testClaimAvailableTasks({ - opts: { - maxAttempts, - definitions, - }, - claimingOpts: { claimOwnershipUntil: new Date(), size: 10 }, - }); - expect(query).toMatchObject({ - bool: { - must: [ - { term: { type: 'task' } }, - { - bool: { - must: [ - { - bool: { - must: [ - { - bool: { - should: [ - { - bool: { - must: [ - { term: { 'task.status': 'idle' } }, - { range: { 'task.runAt': { lte: 'now' } } }, - ], - }, - }, - { - bool: { - must: [ - { - bool: { - should: [ - { term: { 'task.status': 'running' } }, - { term: { 'task.status': 'claiming' } }, - ], - }, - }, - { range: { 'task.retryAt': { lte: 'now' } } }, - ], - }, - }, - ], - }, - }, - ], - }, - }, - ], - filter: [ - { - bool: { - must_not: [ - { - bool: { - should: [ - { term: { 'task.status': 'running' } }, - { term: { 'task.status': 'claiming' } }, - ], - must: { range: { 'task.retryAt': { gt: 'now' } } }, - }, - }, - ], - }, - }, - ], - }, - }, - ], - }, - }); - expect(sort).toMatchObject([ - { - _script: { - type: 'number', - order: 'asc', - script: { - lang: 'painless', - source: ` -if (doc['task.retryAt'].size()!=0) { - return doc['task.retryAt'].value.toInstant().toEpochMilli(); -} -if (doc['task.runAt'].size()!=0) { - return doc['task.runAt'].value.toInstant().toEpochMilli(); -} - `, - }, - }, - }, - ]); - }); - - test('it supports claiming specific tasks by id', async () => { - const maxAttempts = _.random(2, 43); - const customMaxAttempts = _.random(44, 100); - const definitions = new TaskTypeDictionary(mockLogger()); - const taskManagerId = uuid.v1(); - const fieldUpdates = { - ownerId: taskManagerId, - retryAt: new Date(Date.now()), - }; - definitions.registerTaskDefinitions({ - foo: { - title: 'foo', - createTaskRunner: jest.fn(), - }, - bar: { - title: 'bar', - maxAttempts: customMaxAttempts, - createTaskRunner: jest.fn(), - }, - }); - const { - args: { - updateByQuery: { body: { query, script, sort } = {} }, - }, - } = await testClaimAvailableTasks({ - opts: { - taskManagerId, - maxAttempts, - definitions, - }, - claimingOpts: { - claimOwnershipUntil: new Date(), - size: 10, - claimTasksById: [ - '33c6977a-ed6d-43bd-98d9-3f827f7b7cd8', - 'a208b22c-14ec-4fb4-995f-d2ff7a3b03b8', - ], - }, - }); - - expect(query).toMatchObject({ - bool: { - must: [ - { term: { type: 'task' } }, - { - bool: { - must: [ - { - pinned: { - ids: [ - 'task:33c6977a-ed6d-43bd-98d9-3f827f7b7cd8', - 'task:a208b22c-14ec-4fb4-995f-d2ff7a3b03b8', - ], - organic: { - bool: { - must: [ - { - bool: { - should: [ - { - bool: { - must: [ - { term: { 'task.status': 'idle' } }, - { range: { 'task.runAt': { lte: 'now' } } }, - ], - }, - }, - { - bool: { - must: [ - { - bool: { - should: [ - { term: { 'task.status': 'running' } }, - { term: { 'task.status': 'claiming' } }, - ], - }, - }, - { range: { 'task.retryAt': { lte: 'now' } } }, - ], - }, - }, - ], - }, - }, - ], - }, - }, - }, - }, - ], - filter: [ - { - bool: { - must_not: [ - { - bool: { - should: [ - { term: { 'task.status': 'running' } }, - { term: { 'task.status': 'claiming' } }, - ], - must: { range: { 'task.retryAt': { gt: 'now' } } }, - }, - }, - ], - }, - }, - ], - }, - }, - ], - }, - }); - - expect(script).toMatchObject({ - source: ` - if (params.registeredTaskTypes.contains(ctx._source.task.taskType)) { - if (ctx._source.task.schedule != null || ctx._source.task.attempts < params.taskMaxAttempts[ctx._source.task.taskType] || params.claimTasksById.contains(ctx._id)) { - ctx._source.task.status = "claiming"; ${Object.keys(fieldUpdates) - .map((field) => `ctx._source.task.${field}=params.fieldUpdates.${field};`) - .join(' ')} - } else { - ctx._source.task.status = "failed"; - } - } else { - ctx._source.task.status = "unrecognized"; - } - `, - lang: 'painless', - params: { - fieldUpdates, - claimTasksById: [ - 'task:33c6977a-ed6d-43bd-98d9-3f827f7b7cd8', - 'task:a208b22c-14ec-4fb4-995f-d2ff7a3b03b8', - ], - registeredTaskTypes: ['foo', 'bar'], - taskMaxAttempts: { - bar: customMaxAttempts, - foo: maxAttempts, - }, - }, - }); - - expect(sort).toMatchObject([ - '_score', - { - _script: { - type: 'number', - order: 'asc', - script: { - lang: 'painless', - source: ` -if (doc['task.retryAt'].size()!=0) { - return doc['task.retryAt'].value.toInstant().toEpochMilli(); -} -if (doc['task.runAt'].size()!=0) { - return doc['task.runAt'].value.toInstant().toEpochMilli(); -} - `, - }, - }, - }, - ]); - }); - - test('it claims tasks by setting their ownerId, status and retryAt', async () => { - const taskManagerId = uuid.v1(); - const claimOwnershipUntil = new Date(Date.now()); - const fieldUpdates = { - ownerId: taskManagerId, - retryAt: claimOwnershipUntil, - }; - const { - args: { - updateByQuery: { body: { script } = {} }, - }, - } = await testClaimAvailableTasks({ - opts: { - taskManagerId, - }, - claimingOpts: { - claimOwnershipUntil, - size: 10, - }, - }); - expect(script).toMatchObject({ - source: ` - if (params.registeredTaskTypes.contains(ctx._source.task.taskType)) { - if (ctx._source.task.schedule != null || ctx._source.task.attempts < params.taskMaxAttempts[ctx._source.task.taskType] || params.claimTasksById.contains(ctx._id)) { - ctx._source.task.status = "claiming"; ${Object.keys(fieldUpdates) - .map((field) => `ctx._source.task.${field}=params.fieldUpdates.${field};`) - .join(' ')} - } else { - ctx._source.task.status = "failed"; - } - } else { - ctx._source.task.status = "unrecognized"; - } - `, - lang: 'painless', - params: { - fieldUpdates, - claimTasksById: [], - registeredTaskTypes: ['report', 'dernstraight', 'yawn'], - taskMaxAttempts: { - dernstraight: 2, - report: 2, - yawn: 2, - }, - }, - }); - }); - - test('it filters out running tasks', async () => { - const taskManagerId = uuid.v1(); - const claimOwnershipUntil = new Date(Date.now()); - const runAt = new Date(); - const tasks = [ - { - _id: 'task:aaa', - _source: { - type: 'task', - task: { - runAt, - taskType: 'foo', - schedule: undefined, - attempts: 0, - status: 'claiming', - params: '{ "hello": "world" }', - state: '{ "baby": "Henhen" }', - user: 'jimbo', - scope: ['reporting'], - ownerId: taskManagerId, - }, - }, - _seq_no: 1, - _primary_term: 2, - sort: ['a', 1], - }, - { - // this is invalid as it doesn't have the `type` prefix - _id: 'bbb', - _source: { - type: 'task', - task: { - runAt, - taskType: 'bar', - schedule: { interval: '5m' }, - attempts: 2, - status: 'claiming', - params: '{ "shazm": 1 }', - state: '{ "henry": "The 8th" }', - user: 'dabo', - scope: ['reporting', 'ceo'], - ownerId: taskManagerId, - }, - }, - _seq_no: 3, - _primary_term: 4, - sort: ['b', 2], - }, - ]; - const { - result: { docs }, - args: { - search: { body: { query } = {} }, - }, - } = await testClaimAvailableTasks({ - opts: { - taskManagerId, - }, - claimingOpts: { - claimOwnershipUntil, - size: 10, - }, - hits: tasks, - }); - - expect(query?.bool?.must).toContainEqual({ - bool: { - must: [ - { - term: { - 'task.ownerId': taskManagerId, - }, - }, - { term: { 'task.status': 'claiming' } }, - ], - }, - }); - - expect(docs).toMatchObject([ - { - attempts: 0, - id: 'aaa', - schedule: undefined, - params: { hello: 'world' }, - runAt, - scope: ['reporting'], - state: { baby: 'Henhen' }, - status: 'claiming', - taskType: 'foo', - user: 'jimbo', - ownerId: taskManagerId, - }, - ]); - }); - - test('it filters out invalid tasks that arent SavedObjects', async () => { - const taskManagerId = uuid.v1(); - const claimOwnershipUntil = new Date(Date.now()); - const runAt = new Date(); - const tasks = [ - { - _id: 'task:aaa', - _source: { - type: 'task', - task: { - runAt, - taskType: 'foo', - schedule: undefined, - attempts: 0, - status: 'claiming', - params: '{ "hello": "world" }', - state: '{ "baby": "Henhen" }', - user: 'jimbo', - scope: ['reporting'], - ownerId: taskManagerId, - }, - }, - _seq_no: 1, - _primary_term: 2, - sort: ['a', 1], - }, - { - _id: 'task:bbb', - _source: { - type: 'task', - task: { - runAt, - taskType: 'bar', - schedule: { interval: '5m' }, - attempts: 2, - status: 'running', - params: '{ "shazm": 1 }', - state: '{ "henry": "The 8th" }', - user: 'dabo', - scope: ['reporting', 'ceo'], - ownerId: taskManagerId, - }, - }, - _seq_no: 3, - _primary_term: 4, - sort: ['b', 2], - }, - ]; - const { - result: { docs } = {}, - args: { - search: { body: { query } = {} }, - }, - } = await testClaimAvailableTasks({ - opts: { - taskManagerId, - }, - claimingOpts: { - claimOwnershipUntil, - size: 10, - }, - hits: tasks, - }); - - expect(query?.bool?.must).toContainEqual({ - bool: { - must: [ - { - term: { - 'task.ownerId': taskManagerId, - }, - }, - { term: { 'task.status': 'claiming' } }, - ], - }, - }); - - expect(docs).toMatchObject([ - { - attempts: 0, - id: 'aaa', - schedule: undefined, - params: { hello: 'world' }, - runAt, - scope: ['reporting'], - state: { baby: 'Henhen' }, - status: 'claiming', - taskType: 'foo', - user: 'jimbo', - ownerId: taskManagerId, - }, - ]); - }); - - test('it returns task objects', async () => { - const taskManagerId = uuid.v1(); - const claimOwnershipUntil = new Date(Date.now()); - const runAt = new Date(); - const tasks = [ - { - _id: 'task:aaa', - _source: { - type: 'task', - task: { - runAt, - taskType: 'foo', - schedule: undefined, - attempts: 0, - status: 'claiming', - params: '{ "hello": "world" }', - state: '{ "baby": "Henhen" }', - user: 'jimbo', - scope: ['reporting'], - ownerId: taskManagerId, - }, - }, - _seq_no: 1, - _primary_term: 2, - sort: ['a', 1], - }, - { - _id: 'task:bbb', - _source: { - type: 'task', - task: { - runAt, - taskType: 'bar', - schedule: { interval: '5m' }, - attempts: 2, - status: 'claiming', - params: '{ "shazm": 1 }', - state: '{ "henry": "The 8th" }', - user: 'dabo', - scope: ['reporting', 'ceo'], - ownerId: taskManagerId, - }, - }, - _seq_no: 3, - _primary_term: 4, - sort: ['b', 2], - }, - ]; - const { - result: { docs } = {}, - args: { - search: { body: { query } = {} }, - }, - } = await testClaimAvailableTasks({ - opts: { - taskManagerId, - }, - claimingOpts: { - claimOwnershipUntil, - size: 10, - }, - hits: tasks, - }); - - expect(query?.bool?.must).toContainEqual({ - bool: { - must: [ - { - term: { - 'task.ownerId': taskManagerId, - }, - }, - { term: { 'task.status': 'claiming' } }, - ], - }, - }); - - expect(docs).toMatchObject([ - { - attempts: 0, - id: 'aaa', - schedule: undefined, - params: { hello: 'world' }, - runAt, - scope: ['reporting'], - state: { baby: 'Henhen' }, - status: 'claiming', - taskType: 'foo', - user: 'jimbo', - ownerId: taskManagerId, - }, - { - attempts: 2, - id: 'bbb', - schedule: { interval: '5m' }, - params: { shazm: 1 }, - runAt, - scope: ['reporting', 'ceo'], - state: { henry: 'The 8th' }, - status: 'claiming', - taskType: 'bar', - user: 'dabo', - ownerId: taskManagerId, - }, - ]); - }); - - test('it returns version_conflicts that do not include conflicts that were proceeded against', async () => { - const taskManagerId = uuid.v1(); - const claimOwnershipUntil = new Date(Date.now()); - const runAt = new Date(); - const tasks = [ - { - _id: 'task:aaa', - _source: { - type: 'task', - task: { - runAt, - taskType: 'foo', - schedule: undefined, - attempts: 0, - status: 'claiming', - params: '{ "hello": "world" }', - state: '{ "baby": "Henhen" }', - user: 'jimbo', - scope: ['reporting'], - ownerId: taskManagerId, - }, - }, - _seq_no: 1, - _primary_term: 2, - sort: ['a', 1], - }, - { - _id: 'task:bbb', - _source: { - type: 'task', - task: { - runAt, - taskType: 'bar', - schedule: { interval: '5m' }, - attempts: 2, - status: 'claiming', - params: '{ "shazm": 1 }', - state: '{ "henry": "The 8th" }', - user: 'dabo', - scope: ['reporting', 'ceo'], - ownerId: taskManagerId, - }, - }, - _seq_no: 3, - _primary_term: 4, - sort: ['b', 2], - }, - ]; - const maxDocs = 10; - const { - result: { stats: { tasksUpdated, tasksConflicted, tasksClaimed } = {} } = {}, - } = await testClaimAvailableTasks({ - opts: { - taskManagerId, - }, - claimingOpts: { - claimOwnershipUntil, - size: maxDocs, - }, - hits: tasks, - // assume there were 20 version conflists, but thanks to `conflicts="proceed"` - // we proceeded to claim tasks - versionConflicts: 20, - }); - - expect(tasksUpdated).toEqual(2); - // ensure we only count conflicts that *may* have counted against max_docs, no more than that - expect(tasksConflicted).toEqual(10 - tasksUpdated!); - expect(tasksClaimed).toEqual(2); - }); - - test('pushes error from saved objects client to errors$', async () => { - const esClient = elasticsearchServiceMock.createClusterClient().asInternalUser; - const store = new TaskStore({ - index: 'tasky', - taskManagerId: '', - serializer, - esClient, - definitions: taskDefinitions, - maxAttempts: 2, - savedObjectsRepository: savedObjectsClient, - }); - - const firstErrorPromise = store.errors$.pipe(first()).toPromise(); - esClient.updateByQuery.mockRejectedValue(new Error('Failure')); - await expect( - store.claimAvailableTasks({ - claimOwnershipUntil: new Date(), - size: 10, - }) - ).rejects.toThrowErrorMatchingInlineSnapshot(`"Failure"`); - expect(await firstErrorPromise).toMatchInlineSnapshot(`[Error: Failure]`); - }); - }); - describe('update', () => { let store: TaskStore; let esClient: ReturnType['asInternalUser']; @@ -1079,7 +267,6 @@ if (doc['task.runAt'].size()!=0) { taskManagerId: '', serializer, esClient, - maxAttempts: 2, definitions: taskDefinitions, savedObjectsRepository: savedObjectsClient, }); @@ -1179,7 +366,6 @@ if (doc['task.runAt'].size()!=0) { taskManagerId: '', serializer, esClient: elasticsearchServiceMock.createClusterClient().asInternalUser, - maxAttempts: 2, definitions: taskDefinitions, savedObjectsRepository: savedObjectsClient, }); @@ -1219,7 +405,6 @@ if (doc['task.runAt'].size()!=0) { taskManagerId: '', serializer, esClient: elasticsearchServiceMock.createClusterClient().asInternalUser, - maxAttempts: 2, definitions: taskDefinitions, savedObjectsRepository: savedObjectsClient, }); @@ -1251,7 +436,6 @@ if (doc['task.runAt'].size()!=0) { taskManagerId: '', serializer, esClient: elasticsearchServiceMock.createClusterClient().asInternalUser, - maxAttempts: 2, definitions: taskDefinitions, savedObjectsRepository: savedObjectsClient, }); @@ -1335,7 +519,6 @@ if (doc['task.runAt'].size()!=0) { taskManagerId: '', serializer, esClient: elasticsearchServiceMock.createClusterClient().asInternalUser, - maxAttempts: 2, definitions: taskDefinitions, savedObjectsRepository: savedObjectsClient, }); @@ -1355,7 +538,6 @@ if (doc['task.runAt'].size()!=0) { taskManagerId: '', serializer, esClient: elasticsearchServiceMock.createClusterClient().asInternalUser, - maxAttempts: 2, definitions: taskDefinitions, savedObjectsRepository: savedObjectsClient, }); @@ -1373,7 +555,6 @@ if (doc['task.runAt'].size()!=0) { taskManagerId: '', serializer, esClient: elasticsearchServiceMock.createClusterClient().asInternalUser, - maxAttempts: 2, definitions: taskDefinitions, savedObjectsRepository: savedObjectsClient, }); @@ -1381,283 +562,8 @@ if (doc['task.runAt'].size()!=0) { return expect(store.getLifecycle(randomId())).rejects.toThrow('Bad Request'); }); }); - - describe('task events', () => { - function generateTasks() { - const taskManagerId = uuid.v1(); - const runAt = new Date(); - const tasks = [ - { - _id: 'task:claimed-by-id', - _source: { - type: 'task', - task: { - runAt, - taskType: 'foo', - schedule: undefined, - attempts: 0, - status: 'claiming', - params: '{ "hello": "world" }', - state: '{ "baby": "Henhen" }', - user: 'jimbo', - scope: ['reporting'], - ownerId: taskManagerId, - startedAt: null, - retryAt: null, - scheduledAt: new Date(), - }, - }, - _seq_no: 1, - _primary_term: 2, - sort: ['a', 1], - }, - { - _id: 'task:claimed-by-schedule', - _source: { - type: 'task', - task: { - runAt, - taskType: 'bar', - schedule: { interval: '5m' }, - attempts: 2, - status: 'claiming', - params: '{ "shazm": 1 }', - state: '{ "henry": "The 8th" }', - user: 'dabo', - scope: ['reporting', 'ceo'], - ownerId: taskManagerId, - startedAt: null, - retryAt: null, - scheduledAt: new Date(), - }, - }, - _seq_no: 3, - _primary_term: 4, - sort: ['b', 2], - }, - { - _id: 'task:already-running', - _source: { - type: 'task', - task: { - runAt, - taskType: 'bar', - schedule: { interval: '5m' }, - attempts: 2, - status: 'running', - params: '{ "shazm": 1 }', - state: '{ "henry": "The 8th" }', - user: 'dabo', - scope: ['reporting', 'ceo'], - ownerId: taskManagerId, - startedAt: null, - retryAt: null, - scheduledAt: new Date(), - }, - }, - _seq_no: 3, - _primary_term: 4, - sort: ['b', 2], - }, - ]; - - return { taskManagerId, runAt, tasks }; - } - - function instantiateStoreWithMockedApiResponses() { - const { taskManagerId, runAt, tasks } = generateTasks(); - - const esClient = elasticsearchServiceMock.createClusterClient().asInternalUser; - esClient.search.mockResolvedValue(asApiResponse({ hits: { hits: tasks } })); - esClient.updateByQuery.mockResolvedValue( - asApiResponse({ - total: tasks.length, - updated: tasks.length, - }) - ); - - const store = new TaskStore({ - esClient, - maxAttempts: 2, - definitions: taskDefinitions, - serializer, - savedObjectsRepository: savedObjectsClient, - taskManagerId, - index: '', - }); - - return { taskManagerId, runAt, store }; - } - - test('emits an event when a task is succesfully claimed by id', async () => { - const { taskManagerId, runAt, store } = instantiateStoreWithMockedApiResponses(); - - const promise = store.events - .pipe( - filter( - (event: TaskEvent>) => - event.id === 'claimed-by-id' - ), - take(1) - ) - .toPromise(); - - await store.claimAvailableTasks({ - claimTasksById: ['claimed-by-id'], - claimOwnershipUntil: new Date(), - size: 10, - }); - - const event = await promise; - expect(event).toMatchObject( - asTaskClaimEvent( - 'claimed-by-id', - asOk({ - id: 'claimed-by-id', - runAt, - taskType: 'foo', - schedule: undefined, - attempts: 0, - status: 'claiming' as TaskStatus, - params: { hello: 'world' }, - state: { baby: 'Henhen' }, - user: 'jimbo', - scope: ['reporting'], - ownerId: taskManagerId, - startedAt: null, - retryAt: null, - scheduledAt: new Date(), - }) - ) - ); - }); - - test('emits an event when a task is succesfully by scheduling', async () => { - const { taskManagerId, runAt, store } = instantiateStoreWithMockedApiResponses(); - - const promise = store.events - .pipe( - filter( - (event: TaskEvent>) => - event.id === 'claimed-by-schedule' - ), - take(1) - ) - .toPromise(); - - await store.claimAvailableTasks({ - claimTasksById: ['claimed-by-id'], - claimOwnershipUntil: new Date(), - size: 10, - }); - - const event = await promise; - expect(event).toMatchObject( - asTaskClaimEvent( - 'claimed-by-schedule', - asOk({ - id: 'claimed-by-schedule', - runAt, - taskType: 'bar', - schedule: { interval: '5m' }, - attempts: 2, - status: 'claiming' as TaskStatus, - params: { shazm: 1 }, - state: { henry: 'The 8th' }, - user: 'dabo', - scope: ['reporting', 'ceo'], - ownerId: taskManagerId, - startedAt: null, - retryAt: null, - scheduledAt: new Date(), - }) - ) - ); - }); - - test('emits an event when the store fails to claim a required task by id', async () => { - const { taskManagerId, runAt, store } = instantiateStoreWithMockedApiResponses(); - - const promise = store.events - .pipe( - filter( - (event: TaskEvent>) => - event.id === 'already-running' - ), - take(1) - ) - .toPromise(); - - await store.claimAvailableTasks({ - claimTasksById: ['already-running'], - claimOwnershipUntil: new Date(), - size: 10, - }); - - const event = await promise; - expect(event).toMatchObject( - asTaskClaimEvent( - 'already-running', - asErr( - some({ - id: 'already-running', - runAt, - taskType: 'bar', - schedule: { interval: '5m' }, - attempts: 2, - status: 'running' as TaskStatus, - params: { shazm: 1 }, - state: { henry: 'The 8th' }, - user: 'dabo', - scope: ['reporting', 'ceo'], - ownerId: taskManagerId, - startedAt: null, - retryAt: null, - scheduledAt: new Date(), - }) - ) - ) - ); - }); - - test('emits an event when the store fails to find a task which was required by id', async () => { - const { store } = instantiateStoreWithMockedApiResponses(); - - const promise = store.events - .pipe( - filter( - (event: TaskEvent>) => - event.id === 'unknown-task' - ), - take(1) - ) - .toPromise(); - - await store.claimAvailableTasks({ - claimTasksById: ['unknown-task'], - claimOwnershipUntil: new Date(), - size: 10, - }); - - const event = await promise; - expect(event).toMatchObject(asTaskClaimEvent('unknown-task', asErr(none))); - }); - }); }); -function generateFakeTasks(count: number = 1) { - return _.times(count, (index) => ({ - _id: `task:id-${index}`, - _source: { - type: 'task', - task: {}, - }, - _seq_no: _.random(1, 5), - _primary_term: _.random(1, 5), - sort: ['a', _.random(1, 5)], - })); -} - const asApiResponse = (body: T): RequestEvent => ({ body, diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts index b72f1826b813bf..0b54f2779065f6 100644 --- a/x-pack/plugins/task_manager/server/task_store.ts +++ b/x-pack/plugins/task_manager/server/task_store.ts @@ -8,13 +8,9 @@ /* * This module contains helpers for managing the task manager storage layer. */ -import apm from 'elastic-apm-node'; -import { Subject, Observable } from 'rxjs'; -import { omit, difference, partition, map, defaults } from 'lodash'; - -import { some, none } from 'fp-ts/lib/Option'; - -import { SearchResponse, UpdateDocumentByQueryResponse } from 'elasticsearch'; +import { Subject } from 'rxjs'; +import { omit, defaults } from 'lodash'; +import { ReindexResponseBase, SearchResponse, UpdateDocumentByQueryResponse } from 'elasticsearch'; import { SavedObject, SavedObjectsSerializer, @@ -32,38 +28,15 @@ import { TaskLifecycle, TaskLifecycleResult, SerializedConcreteTaskInstance, - TaskStatus, } from './task'; -import { TaskClaim, asTaskClaimEvent } from './task_events'; - -import { - asUpdateByQuery, - shouldBeOneOf, - mustBeAllOf, - filterDownBy, - asPinnedQuery, - matchesClauses, - SortOptions, -} from './queries/query_clauses'; - -import { - updateFieldsAndMarkAsFailed, - IdleTaskWithExpiredRunAt, - InactiveTasks, - RunningOrClaimingTaskWithExpiredRetryAt, - SortByRunAtAndRetryAt, - tasksClaimedByOwner, -} from './queries/mark_available_tasks_as_claimed'; import { TaskTypeDictionary } from './task_type_dictionary'; - import { ESSearchResponse, ESSearchBody } from '../../../typings/elasticsearch'; export interface StoreOpts { esClient: ElasticsearchClient; index: string; taskManagerId: string; - maxAttempts: number; definitions: TaskTypeDictionary; savedObjectsRepository: ISavedObjectsRepository; serializer: SavedObjectsSerializer; @@ -88,25 +61,10 @@ export interface UpdateByQueryOpts extends SearchOpts { max_docs?: number; } -export interface OwnershipClaimingOpts { - claimOwnershipUntil: Date; - claimTasksById?: string[]; - size: number; -} - export interface FetchResult { docs: ConcreteTaskInstance[]; } -export interface ClaimOwnershipResult { - stats: { - tasksUpdated: number; - tasksConflicted: number; - tasksClaimed: number; - }; - docs: ConcreteTaskInstance[]; -} - export type BulkUpdateResult = Result< ConcreteTaskInstance, { entity: ConcreteTaskInstance; error: Error } @@ -123,7 +81,6 @@ export interface UpdateByQueryResult { * interface into the index. */ export class TaskStore { - public readonly maxAttempts: number; public readonly index: string; public readonly taskManagerId: string; public readonly errors$ = new Subject(); @@ -132,14 +89,12 @@ export class TaskStore { private definitions: TaskTypeDictionary; private savedObjectsRepository: ISavedObjectsRepository; private serializer: SavedObjectsSerializer; - private events$: Subject; /** * Constructs a new TaskStore. * @param {StoreOpts} opts * @prop {esClient} esClient - An elasticsearch client * @prop {string} index - The name of the task manager index - * @prop {number} maxAttempts - The maximum number of attempts before a task will be abandoned * @prop {TaskDefinition} definition - The definition of the task being run * @prop {serializer} - The saved object serializer * @prop {savedObjectsRepository} - An instance to the saved objects repository @@ -148,21 +103,22 @@ export class TaskStore { this.esClient = opts.esClient; this.index = opts.index; this.taskManagerId = opts.taskManagerId; - this.maxAttempts = opts.maxAttempts; this.definitions = opts.definitions; this.serializer = opts.serializer; this.savedObjectsRepository = opts.savedObjectsRepository; - this.events$ = new Subject(); } - public get events(): Observable { - return this.events$; + /** + * Convert ConcreteTaskInstance Ids to match their SavedObject format as serialized + * in Elasticsearch + * @param tasks - The task being scheduled. + */ + public convertToSavedObjectIds( + taskIds: Array + ): Array { + return taskIds.map((id) => this.serializer.generateRawId(undefined, 'task', id)); } - private emitEvents = (events: TaskClaim[]) => { - events.forEach((event) => this.events$.next(event)); - }; - /** * Schedules a task. * @@ -201,144 +157,6 @@ export class TaskStore { }); } - /** - * Claims available tasks from the index, which are ready to be run. - * - runAt is now or past - * - is not currently claimed by any instance of Kibana - * - has a type that is in our task definitions - * - * @param {OwnershipClaimingOpts} options - * @returns {Promise} - */ - public claimAvailableTasks = async ({ - claimOwnershipUntil, - claimTasksById = [], - size, - }: OwnershipClaimingOpts): Promise => { - const claimTasksByIdWithRawIds = claimTasksById.map((id) => - this.serializer.generateRawId(undefined, 'task', id) - ); - - const { - updated: tasksUpdated, - version_conflicts: tasksConflicted, - } = await this.markAvailableTasksAsClaimed(claimOwnershipUntil, claimTasksByIdWithRawIds, size); - - const docs = - tasksUpdated > 0 ? await this.sweepForClaimedTasks(claimTasksByIdWithRawIds, size) : []; - - const [documentsReturnedById, documentsClaimedBySchedule] = partition(docs, (doc) => - claimTasksById.includes(doc.id) - ); - - const [documentsClaimedById, documentsRequestedButNotClaimed] = partition( - documentsReturnedById, - // we filter the schduled tasks down by status is 'claiming' in the esearch, - // but we do not apply this limitation on tasks claimed by ID so that we can - // provide more detailed error messages when we fail to claim them - (doc) => doc.status === TaskStatus.Claiming - ); - - const documentsRequestedButNotReturned = difference( - claimTasksById, - map(documentsReturnedById, 'id') - ); - - this.emitEvents([ - ...documentsClaimedById.map((doc) => asTaskClaimEvent(doc.id, asOk(doc))), - ...documentsClaimedBySchedule.map((doc) => asTaskClaimEvent(doc.id, asOk(doc))), - ...documentsRequestedButNotClaimed.map((doc) => asTaskClaimEvent(doc.id, asErr(some(doc)))), - ...documentsRequestedButNotReturned.map((id) => asTaskClaimEvent(id, asErr(none))), - ]); - - return { - stats: { - tasksUpdated, - tasksConflicted, - tasksClaimed: documentsClaimedById.length + documentsClaimedBySchedule.length, - }, - docs: docs.filter((doc) => doc.status === TaskStatus.Claiming), - }; - }; - - private async markAvailableTasksAsClaimed( - claimOwnershipUntil: OwnershipClaimingOpts['claimOwnershipUntil'], - claimTasksById: OwnershipClaimingOpts['claimTasksById'], - size: OwnershipClaimingOpts['size'] - ): Promise { - const registeredTaskTypes = this.definitions.getAllTypes(); - const taskMaxAttempts = [...this.definitions].reduce((accumulator, [type, { maxAttempts }]) => { - return { ...accumulator, [type]: maxAttempts || this.maxAttempts }; - }, {}); - const queryForScheduledTasks = mustBeAllOf( - // Either a task with idle status and runAt <= now or - // status running or claiming with a retryAt <= now. - shouldBeOneOf(IdleTaskWithExpiredRunAt, RunningOrClaimingTaskWithExpiredRetryAt) - ); - - // The documents should be sorted by runAt/retryAt, unless there are pinned - // tasks being queried, in which case we want to sort by score first, and then - // the runAt/retryAt. That way we'll get the pinned tasks first. Note that - // the score seems to favor newer documents rather than older documents, so - // if there are not pinned tasks being queried, we do NOT want to sort by score - // at all, just by runAt/retryAt. - const sort: SortOptions = [SortByRunAtAndRetryAt]; - if (claimTasksById && claimTasksById.length) { - sort.unshift('_score'); - } - - const apmTrans = apm.startTransaction(`taskManager markAvailableTasksAsClaimed`, 'taskManager'); - const result = await this.updateByQuery( - asUpdateByQuery({ - query: matchesClauses( - mustBeAllOf( - claimTasksById && claimTasksById.length - ? asPinnedQuery(claimTasksById, queryForScheduledTasks) - : queryForScheduledTasks - ), - filterDownBy(InactiveTasks) - ), - update: updateFieldsAndMarkAsFailed( - { - ownerId: this.taskManagerId, - retryAt: claimOwnershipUntil, - }, - claimTasksById || [], - registeredTaskTypes, - taskMaxAttempts - ), - sort, - }), - { - max_docs: size, - } - ); - - if (apmTrans) apmTrans.end(); - return result; - } - - /** - * Fetches tasks from the index, which are owned by the current Kibana instance - */ - private async sweepForClaimedTasks( - claimTasksById: OwnershipClaimingOpts['claimTasksById'], - size: OwnershipClaimingOpts['size'] - ): Promise { - const claimedTasksQuery = tasksClaimedByOwner(this.taskManagerId); - const { docs } = await this.search({ - query: - claimTasksById && claimTasksById.length - ? asPinnedQuery(claimTasksById, claimedTasksQuery) - : claimedTasksQuery, - size, - sort: SortByRunAtAndRetryAt, - seq_no_primary_term: true, - }); - - return docs; - } - /** * Updates the specified doc in the index, returning the doc * with its version up to date. @@ -527,7 +345,7 @@ export class TaskStore { return body; } - private async updateByQuery( + public async updateByQuery( opts: UpdateByQuerySearchOpts = {}, // eslint-disable-next-line @typescript-eslint/naming-convention { max_docs: max_docs }: UpdateByQueryOpts = {} @@ -549,17 +367,11 @@ export class TaskStore { }, }); - /** - * When we run updateByQuery with conflicts='proceed', it's possible for the `version_conflicts` - * to count against the specified `max_docs`, as per https://github.com/elastic/elasticsearch/issues/63671 - * In order to correct for that happening, we only count `version_conflicts` if we haven't updated as - * many docs as we could have. - * This is still no more than an estimation, as there might have been less docuemnt to update that the - * `max_docs`, but we bias in favour of over zealous `version_conflicts` as that's the best indicator we - * have for an unhealthy cluster distribution of Task Manager polling intervals - */ - const conflictsCorrectedForContinuation = - max_docs && version_conflicts + updated > max_docs ? max_docs - updated : version_conflicts; + const conflictsCorrectedForContinuation = correctVersionConflictsForContinuation( + updated, + version_conflicts, + max_docs + ); return { total, @@ -572,6 +384,22 @@ export class TaskStore { } } } +/** + * When we run updateByQuery with conflicts='proceed', it's possible for the `version_conflicts` + * to count against the specified `max_docs`, as per https://github.com/elastic/elasticsearch/issues/63671 + * In order to correct for that happening, we only count `version_conflicts` if we haven't updated as + * many docs as we could have. + * This is still no more than an estimation, as there might have been less docuemnt to update that the + * `max_docs`, but we bias in favour of over zealous `version_conflicts` as that's the best indicator we + * have for an unhealthy cluster distribution of Task Manager polling intervals + */ +export function correctVersionConflictsForContinuation( + updated: ReindexResponseBase['updated'], + versionConflicts: ReindexResponseBase['version_conflicts'], + maxDocs?: number +) { + return maxDocs && versionConflicts + updated > maxDocs ? maxDocs - updated : versionConflicts; +} function taskInstanceToAttributes(doc: TaskInstance): SerializedConcreteTaskInstance { return { diff --git a/x-pack/plugins/task_manager/server/task_type_dictionary.ts b/x-pack/plugins/task_manager/server/task_type_dictionary.ts index 4230eb9ce4b737..63a0548d79d322 100644 --- a/x-pack/plugins/task_manager/server/task_type_dictionary.ts +++ b/x-pack/plugins/task_manager/server/task_type_dictionary.ts @@ -28,6 +28,10 @@ export class TaskTypeDictionary { return [...this.definitions.keys()]; } + public getAllDefinitions() { + return [...this.definitions.values()]; + } + public has(type: string) { return this.definitions.has(type); } diff --git a/x-pack/test/plugin_api_integration/plugins/sample_task_plugin/server/init_routes.ts b/x-pack/test/plugin_api_integration/plugins/sample_task_plugin/server/init_routes.ts index 2878d7d5f8220b..57beb40b164592 100644 --- a/x-pack/test/plugin_api_integration/plugins/sample_task_plugin/server/init_routes.ts +++ b/x-pack/test/plugin_api_integration/plugins/sample_task_plugin/server/init_routes.ts @@ -218,10 +218,9 @@ export function initRoutes( await ensureIndexIsRefreshed(); const taskManager = await taskManagerStart; return res.ok({ body: await taskManager.get(req.params.taskId) }); - } catch (err) { - return res.ok({ body: err }); + } catch ({ isBoom, output, message }) { + return res.ok({ body: isBoom ? output.payload : { message } }); } - return res.ok({ body: {} }); } ); @@ -251,6 +250,7 @@ export function initRoutes( res: KibanaResponseFactory ): Promise> { try { + await ensureIndexIsRefreshed(); let tasksFound = 0; const taskManager = await taskManagerStart; do { @@ -261,8 +261,8 @@ export function initRoutes( await Promise.all(tasks.map((task) => taskManager.remove(task.id))); } while (tasksFound > 0); return res.ok({ body: 'OK' }); - } catch (err) { - return res.ok({ body: err }); + } catch ({ isBoom, output, message }) { + return res.ok({ body: isBoom ? output.payload : { message } }); } } ); diff --git a/x-pack/test/plugin_api_integration/plugins/sample_task_plugin/server/plugin.ts b/x-pack/test/plugin_api_integration/plugins/sample_task_plugin/server/plugin.ts index 3aee35ed0bff3f..2031551410894a 100644 --- a/x-pack/test/plugin_api_integration/plugins/sample_task_plugin/server/plugin.ts +++ b/x-pack/test/plugin_api_integration/plugins/sample_task_plugin/server/plugin.ts @@ -105,6 +105,20 @@ export class SampleTaskManagerFixturePlugin // fail after the first failed run maxAttempts: 1, }, + sampleTaskWithSingleConcurrency: { + ...defaultSampleTaskConfig, + title: 'Sample Task With Single Concurrency', + maxConcurrency: 1, + timeout: '60s', + description: 'A sample task that can only have one concurrent instance.', + }, + sampleTaskWithLimitedConcurrency: { + ...defaultSampleTaskConfig, + title: 'Sample Task With Max Concurrency of 2', + maxConcurrency: 2, + timeout: '60s', + description: 'A sample task that can only have two concurrent instance.', + }, sampleRecurringTaskTimingOut: { title: 'Sample Recurring Task that Times Out', description: 'A sample task that times out each run.', diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts index 231150a8148354..d99c1dac9a25e9 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -34,6 +34,7 @@ interface MonitoringStats { timestamp: string; value: { drift: Record; + drift_by_type: Record>; load: Record; execution: { duration: Record>; @@ -43,6 +44,7 @@ interface MonitoringStats { last_successful_poll: string; last_polling_delay: string; duration: Record; + claim_duration: Record; result_frequency_percent_as_number: Record; }; }; @@ -174,7 +176,8 @@ export default function ({ getService }: FtrProviderContext) { const { runtime: { - value: { drift, load, polling, execution }, + // eslint-disable-next-line @typescript-eslint/naming-convention + value: { drift, drift_by_type, load, polling, execution }, }, } = (await getHealth()).stats; @@ -192,11 +195,21 @@ export default function ({ getService }: FtrProviderContext) { expect(typeof polling.duration.p95).to.eql('number'); expect(typeof polling.duration.p99).to.eql('number'); + expect(typeof polling.claim_duration.p50).to.eql('number'); + expect(typeof polling.claim_duration.p90).to.eql('number'); + expect(typeof polling.claim_duration.p95).to.eql('number'); + expect(typeof polling.claim_duration.p99).to.eql('number'); + expect(typeof drift.p50).to.eql('number'); expect(typeof drift.p90).to.eql('number'); expect(typeof drift.p95).to.eql('number'); expect(typeof drift.p99).to.eql('number'); + expect(typeof drift_by_type.sampleTask.p50).to.eql('number'); + expect(typeof drift_by_type.sampleTask.p90).to.eql('number'); + expect(typeof drift_by_type.sampleTask.p95).to.eql('number'); + expect(typeof drift_by_type.sampleTask.p99).to.eql('number'); + expect(typeof load.p50).to.eql('number'); expect(typeof load.p90).to.eql('number'); expect(typeof load.p95).to.eql('number'); diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts index 353be5e872aed7..26333ecabd505d 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts @@ -51,7 +51,7 @@ type SerializedConcreteTaskInstance = Omit< }; export default function ({ getService }: FtrProviderContext) { - const es = getService('legacyEs'); + const es = getService('es'); const log = getService('log'); const retry = getService('retry'); const config = getService('config'); @@ -59,30 +59,46 @@ export default function ({ getService }: FtrProviderContext) { const supertest = supertestAsPromised(url.format(config.get('servers.kibana'))); describe('scheduling and running tasks', () => { - beforeEach( - async () => await supertest.delete('/api/sample_tasks').set('kbn-xsrf', 'xxx').expect(200) - ); + beforeEach(async () => { + // clean up before each test + return await supertest.delete('/api/sample_tasks').set('kbn-xsrf', 'xxx').expect(200); + }); beforeEach(async () => { const exists = await es.indices.exists({ index: testHistoryIndex }); - if (exists) { + if (exists.body) { await es.deleteByQuery({ index: testHistoryIndex, - q: 'type:task', refresh: true, + body: { query: { term: { type: 'task' } } }, }); } else { await es.indices.create({ index: testHistoryIndex, body: { mappings: { - properties: taskManagerIndexMapping, + properties: { + type: { + type: 'keyword', + }, + taskId: { + type: 'keyword', + }, + params: taskManagerIndexMapping.params, + state: taskManagerIndexMapping.state, + runAt: taskManagerIndexMapping.runAt, + }, }, }, }); } }); + after(async () => { + // clean up after last test + return await supertest.delete('/api/sample_tasks').set('kbn-xsrf', 'xxx').expect(200); + }); + function currentTasks(): Promise<{ docs: Array>; }> { @@ -98,7 +114,27 @@ export default function ({ getService }: FtrProviderContext) { return supertest .get(`/api/sample_tasks/task/${task}`) .send({ task }) - .expect(200) + .expect((response) => { + expect(response.status).to.eql(200); + expect(typeof JSON.parse(response.text).id).to.eql(`string`); + }) + .then((response) => response.body); + } + + function currentTaskError( + task: string + ): Promise<{ + statusCode: number; + error: string; + message: string; + }> { + return supertest + .get(`/api/sample_tasks/task/${task}`) + .send({ task }) + .expect(function (response) { + expect(response.status).to.eql(200); + expect(typeof JSON.parse(response.text).message).to.eql(`string`); + }) .then((response) => response.body); } @@ -106,13 +142,21 @@ export default function ({ getService }: FtrProviderContext) { return supertest.get(`/api/ensure_tasks_index_refreshed`).send({}).expect(200); } - function historyDocs(taskId?: string): Promise { + async function historyDocs(taskId?: string): Promise { return es .search({ index: testHistoryIndex, - q: taskId ? `taskId:${taskId}` : 'type:task', + body: { + query: { + term: { type: 'task' }, + }, + }, }) - .then((result: SearchResults) => result.hits.hits); + .then((result) => + ((result.body as unknown) as SearchResults).hits.hits.filter((task) => + taskId ? task._source?.taskId === taskId : true + ) + ); } function scheduleTask( @@ -123,7 +167,10 @@ export default function ({ getService }: FtrProviderContext) { .set('kbn-xsrf', 'xxx') .send({ task }) .expect(200) - .then((response: { body: SerializedConcreteTaskInstance }) => response.body); + .then((response: { body: SerializedConcreteTaskInstance }) => { + log.debug(`Task Scheduled: ${response.body.id}`); + return response.body; + }); } function runTaskNow(task: { id: string }) { @@ -252,8 +299,7 @@ export default function ({ getService }: FtrProviderContext) { }); await retry.try(async () => { - const [scheduledTask] = (await currentTasks()).docs; - expect(scheduledTask.id).to.eql(task.id); + const scheduledTask = await currentTask(task.id); expect(scheduledTask.attempts).to.be.greaterThan(0); expect(Date.parse(scheduledTask.runAt)).to.be.greaterThan( Date.parse(task.runAt) + 5 * 60 * 1000 @@ -271,8 +317,7 @@ export default function ({ getService }: FtrProviderContext) { }); await retry.try(async () => { - const [scheduledTask] = (await currentTasks()).docs; - expect(scheduledTask.id).to.eql(task.id); + const scheduledTask = await currentTask(task.id); const retryAt = Date.parse(scheduledTask.retryAt!); expect(isNaN(retryAt)).to.be(false); @@ -296,7 +341,7 @@ export default function ({ getService }: FtrProviderContext) { await retry.try(async () => { expect((await historyDocs(originalTask.id)).length).to.eql(1); - const [task] = (await currentTasks<{ count: number }>()).docs; + const task = await currentTask<{ count: number }>(originalTask.id); expect(task.attempts).to.eql(0); expect(task.state.count).to.eql(count + 1); @@ -467,6 +512,134 @@ export default function ({ getService }: FtrProviderContext) { }); }); + it('should only run as many instances of a task as its maxConcurrency will allow', async () => { + // should run as there's only one and maxConcurrency on this TaskType is 1 + const firstWithSingleConcurrency = await scheduleTask({ + taskType: 'sampleTaskWithSingleConcurrency', + params: { + waitForEvent: 'releaseFirstWaveOfTasks', + }, + }); + + // should run as there's only two and maxConcurrency on this TaskType is 2 + const [firstLimitedConcurrency, secondLimitedConcurrency] = await Promise.all([ + scheduleTask({ + taskType: 'sampleTaskWithLimitedConcurrency', + params: { + waitForEvent: 'releaseFirstWaveOfTasks', + }, + }), + scheduleTask({ + taskType: 'sampleTaskWithLimitedConcurrency', + params: { + waitForEvent: 'releaseSecondWaveOfTasks', + }, + }), + ]); + + await retry.try(async () => { + expect((await historyDocs(firstWithSingleConcurrency.id)).length).to.eql(1); + expect((await historyDocs(firstLimitedConcurrency.id)).length).to.eql(1); + expect((await historyDocs(secondLimitedConcurrency.id)).length).to.eql(1); + }); + + // should not run as there one running and maxConcurrency on this TaskType is 1 + const secondWithSingleConcurrency = await scheduleTask({ + taskType: 'sampleTaskWithSingleConcurrency', + params: { + waitForEvent: 'releaseSecondWaveOfTasks', + }, + }); + + // should not run as there are two running and maxConcurrency on this TaskType is 2 + const thirdWithLimitedConcurrency = await scheduleTask({ + taskType: 'sampleTaskWithLimitedConcurrency', + params: { + waitForEvent: 'releaseSecondWaveOfTasks', + }, + }); + + // schedule a task that should get picked up before the two blocked tasks + const taskWithUnlimitedConcurrency = await scheduleTask({ + taskType: 'sampleTask', + params: {}, + }); + + await retry.try(async () => { + expect((await historyDocs(taskWithUnlimitedConcurrency.id)).length).to.eql(1); + expect((await currentTask(secondWithSingleConcurrency.id)).status).to.eql('idle'); + expect((await currentTask(thirdWithLimitedConcurrency.id)).status).to.eql('idle'); + }); + + // release the running SingleConcurrency task and only one of the LimitedConcurrency tasks + await releaseTasksWaitingForEventToComplete('releaseFirstWaveOfTasks'); + + await retry.try(async () => { + // ensure the completed tasks were deleted + expect((await currentTaskError(firstWithSingleConcurrency.id)).message).to.eql( + `Saved object [task/${firstWithSingleConcurrency.id}] not found` + ); + expect((await currentTaskError(firstLimitedConcurrency.id)).message).to.eql( + `Saved object [task/${firstLimitedConcurrency.id}] not found` + ); + + // ensure blocked tasks is still running + expect((await currentTask(secondLimitedConcurrency.id)).status).to.eql('running'); + + // ensure the blocked tasks begin running + expect((await currentTask(secondWithSingleConcurrency.id)).status).to.eql('running'); + expect((await currentTask(thirdWithLimitedConcurrency.id)).status).to.eql('running'); + }); + + // release blocked task + await releaseTasksWaitingForEventToComplete('releaseSecondWaveOfTasks'); + }); + + it('should return a task run error result when RunNow is called at a time that would cause the task to exceed its maxConcurrency', async () => { + // should run as there's only one and maxConcurrency on this TaskType is 1 + const firstWithSingleConcurrency = await scheduleTask({ + taskType: 'sampleTaskWithSingleConcurrency', + // include a schedule so that the task isn't deleted after completion + schedule: { interval: `30m` }, + params: { + waitForEvent: 'releaseRunningTaskWithSingleConcurrency', + }, + }); + + // should not run as the first is running + const secondWithSingleConcurrency = await scheduleTask({ + taskType: 'sampleTaskWithSingleConcurrency', + params: { + waitForEvent: 'releaseRunningTaskWithSingleConcurrency', + }, + }); + + // run the first tasks once just so that we can be sure it runs in response to our + // runNow callm, rather than the initial execution + await retry.try(async () => { + expect((await historyDocs(firstWithSingleConcurrency.id)).length).to.eql(1); + }); + await releaseTasksWaitingForEventToComplete('releaseRunningTaskWithSingleConcurrency'); + + // wait for second task to stall + await retry.try(async () => { + expect((await historyDocs(secondWithSingleConcurrency.id)).length).to.eql(1); + }); + + // run the first task again using runNow - should fail due to concurrency concerns + const failedRunNowResult = await runTaskNow({ + id: firstWithSingleConcurrency.id, + }); + + expect(failedRunNowResult).to.eql({ + id: firstWithSingleConcurrency.id, + error: `Error: Failed to run task "${firstWithSingleConcurrency.id}" as we would exceed the max concurrency of "Sample Task With Single Concurrency" which is 1. Rescheduled the task to ensure it is picked up as soon as possible.`, + }); + + // release the second task + await releaseTasksWaitingForEventToComplete('releaseRunningTaskWithSingleConcurrency'); + }); + it('should return a task run error result when running a task now fails', async () => { const originalTask = await scheduleTask({ taskType: 'sampleTask',