feat(core): Cancel runner task on timeout in external mode (#12101)

This commit is contained in:
Iván Ovejero 2024-12-10 12:50:22 +01:00 committed by Tomi Turtiainen
parent 78315aca3d
commit f18263bc8f
11 changed files with 219 additions and 34 deletions

View file

@ -15,6 +15,7 @@
"N8N_RUNNERS_SERVER_ENABLED",
"N8N_RUNNERS_SERVER_HOST",
"N8N_RUNNERS_SERVER_PORT",
"N8N_RUNNERS_TASK_TIMEOUT",
"NODE_FUNCTION_ALLOW_BUILTIN",
"NODE_FUNCTION_ALLOW_EXTERNAL",
"NODE_OPTIONS",

View file

@ -43,11 +43,11 @@ export class TaskRunnersConfig {
@Env('N8N_RUNNERS_MAX_CONCURRENCY')
maxConcurrency: number = 5;
/** How long (in seconds) a task is allowed to take for completion, else the task will be aborted and the runner restarted. Must be greater than 0. */
/** How long (in seconds) a task is allowed to take for completion, else the task will be aborted. (In internal mode, the runner will also be restarted.) Must be greater than 0. */
@Env('N8N_RUNNERS_TASK_TIMEOUT')
taskTimeout: number = 60;
/** How often (in seconds) the runner must send a heartbeat to the broker, else the task will be aborted and the runner restarted. Must be greater than 0. */
/** How often (in seconds) the runner must send a heartbeat to the broker, else the task will be aborted. (In internal mode, the runner will also be restarted.) Must be greater than 0. */
@Env('N8N_RUNNERS_HEARTBEAT_INTERVAL')
heartbeatInterval: number = 30;
}

View file

@ -37,6 +37,9 @@ export class BaseRunnerConfig {
@Env('GENERIC_TIMEZONE')
timezone: string = 'America/New_York';
@Env('N8N_RUNNERS_TASK_TIMEOUT')
taskTimeout: number = 60;
@Nested
healthcheckServer!: HealthcheckServerConfig;
}

View file

@ -1,3 +1,4 @@
import { mock } from 'jest-mock-extended';
import { DateTime } from 'luxon';
import { setGlobalState, type CodeExecutionMode, type IDataObject } from 'n8n-workflow';
import fs from 'node:fs';
@ -61,7 +62,7 @@ describe('JsTaskRunner', () => {
runner?: JsTaskRunner;
}) => {
jest.spyOn(runner, 'requestData').mockResolvedValue(taskData);
return await runner.executeTask(task);
return await runner.executeTask(task, mock<AbortSignal>());
};
afterEach(() => {

View file

@ -0,0 +1,7 @@
import { ApplicationError } from 'n8n-workflow';
/**
 * Error signalling that a task was cancelled.
 *
 * The message is the supplied reason prefixed with `Task cancelled: `, and the
 * error is reported at `warning` level.
 */
export class TaskCancelledError extends ApplicationError {
	/**
	 * @param reason Human-readable explanation of why the task was cancelled.
	 */
	constructor(reason: string) {
		const message = `Task cancelled: ${reason}`;

		super(message, { level: 'warning' });
	}
}

View file

@ -0,0 +1,30 @@
import { ApplicationError } from 'n8n-workflow';
/**
 * Error describing a task that exceeded its allowed execution time.
 *
 * Besides the short message, it carries a `description` with a numbered list
 * of suggested fixes, separated by `<br/>` line breaks.
 */
export class TimeoutError extends ApplicationError {
	/** Explanation of the timeout followed by numbered suggestions. */
	description: string;

	/**
	 * @param taskTimeout Timeout (in seconds) that was exceeded; only used to
	 * build the message, with "second" in the singular when it equals 1.
	 */
	constructor(taskTimeout: number) {
		super(
			`Task execution timed out after ${taskTimeout} ${taskTimeout === 1 ? 'second' : 'seconds'}`,
		);

		// Number each suggestion starting at 1 and separate them with line breaks.
		const numberedFixes = [
			'Optimize your script to prevent long-running tasks, e.g. by processing data in smaller batches.',
			'Ensure that all paths in your script are able to terminate, i.e. no infinite loops.',
		]
			.map((fix, position) => `${position + 1}. ${fix}`)
			.join('<br/>');

		this.description = `The task runner was taking too long on this task, so the task was aborted. You can try the following:<br/><br/>${numberedFixes}`;
	}
}

View file

@ -32,6 +32,7 @@ import { BuiltInsParserState } from './built-ins-parser/built-ins-parser-state';
import { isErrorLike } from './errors/error-like';
import { ExecutionError } from './errors/execution-error';
import { makeSerializable } from './errors/serializable-error';
import { TimeoutError } from './errors/timeout-error';
import type { RequireResolver } from './require-resolver';
import { createRequireResolver } from './require-resolver';
import { validateRunForAllItemsOutput, validateRunForEachItemOutput } from './result-validation';
@ -94,7 +95,7 @@ export class JsTaskRunner extends TaskRunner {
});
}
async executeTask(task: Task<JSExecSettings>): Promise<TaskResultData> {
async executeTask(task: Task<JSExecSettings>, signal: AbortSignal): Promise<TaskResultData> {
const settings = task.settings;
a.ok(settings, 'JS Code not sent to runner');
@ -133,8 +134,8 @@ export class JsTaskRunner extends TaskRunner {
const result =
settings.nodeMode === 'runOnceForAllItems'
? await this.runForAllItems(task.taskId, settings, data, workflow, customConsole)
: await this.runForEachItem(task.taskId, settings, data, workflow, customConsole);
? await this.runForAllItems(task.taskId, settings, data, workflow, customConsole, signal)
: await this.runForEachItem(task.taskId, settings, data, workflow, customConsole, signal);
return {
result,
@ -183,6 +184,7 @@ export class JsTaskRunner extends TaskRunner {
data: JsTaskData,
workflow: Workflow,
customConsole: CustomConsole,
signal: AbortSignal,
): Promise<INodeExecutionData[]> {
const dataProxy = this.createDataProxy(data, workflow, data.itemIndex);
const inputItems = data.connectionInputData;
@ -199,10 +201,26 @@ export class JsTaskRunner extends TaskRunner {
};
try {
const result = (await runInNewContext(
`globalThis.global = globalThis; module.exports = async function VmCodeWrapper() {${settings.code}\n}()`,
context,
)) as TaskResultData['result'];
const result = await new Promise<TaskResultData['result']>((resolve, reject) => {
const abortHandler = () => {
reject(new TimeoutError(this.taskTimeout));
};
signal.addEventListener('abort', abortHandler, { once: true });
const taskResult = runInNewContext(
`globalThis.global = globalThis; module.exports = async function VmCodeWrapper() {${settings.code}\n}()`,
context,
{ timeout: this.taskTimeout * 1000 },
) as Promise<TaskResultData['result']>;
void taskResult
.then(resolve)
.catch(reject)
.finally(() => {
signal.removeEventListener('abort', abortHandler);
});
});
if (result === null) {
return [];
@ -230,6 +248,7 @@ export class JsTaskRunner extends TaskRunner {
data: JsTaskData,
workflow: Workflow,
customConsole: CustomConsole,
signal: AbortSignal,
): Promise<INodeExecutionData[]> {
const inputItems = data.connectionInputData;
const returnData: INodeExecutionData[] = [];
@ -255,10 +274,26 @@ export class JsTaskRunner extends TaskRunner {
};
try {
let result = (await runInNewContext(
`module.exports = async function VmCodeWrapper() {${settings.code}\n}()`,
context,
)) as INodeExecutionData | undefined;
let result = await new Promise<INodeExecutionData | undefined>((resolve, reject) => {
const abortHandler = () => {
reject(new TimeoutError(this.taskTimeout));
};
signal.addEventListener('abort', abortHandler);
const taskResult = runInNewContext(
`module.exports = async function VmCodeWrapper() {${settings.code}\n}()`,
context,
{ timeout: this.taskTimeout * 1000 },
) as Promise<INodeExecutionData>;
void taskResult
.then(resolve)
.catch(reject)
.finally(() => {
signal.removeEventListener('abort', abortHandler);
});
});
// Filter out null values
if (result === null) {

View file

@ -8,6 +8,8 @@ import type { BrokerMessage, RunnerMessage } from '@/message-types';
import { TaskRunnerNodeTypes } from '@/node-types';
import { RPC_ALLOW_LIST, type TaskResultData } from '@/runner-types';
import { TaskCancelledError } from './js-task-runner/errors/task-cancelled-error';
export interface Task<T = unknown> {
taskId: string;
settings?: T;
@ -21,12 +23,14 @@ export interface TaskOffer {
}
/**
 * Bookkeeping entry for a pending data request, holding the callbacks that
 * settle the promise handed back to the requesting code.
 */
interface DataRequest {
	/** ID of this request. */
	requestId: string;

	/** ID of the task this request was made for. */
	taskId: string;

	/** Fulfils the pending request with the received data. */
	resolve: (data: unknown) => void;

	/** Fails the pending request with the given error. */
	reject: (error: unknown) => void;
}
interface NodeTypesRequest {
taskId: string;
requestId: string;
resolve: (data: unknown) => void;
reject: (error: unknown) => void;
@ -82,14 +86,20 @@ export abstract class TaskRunner extends EventEmitter {
private idleTimer: NodeJS.Timeout | undefined;
/** How long (in seconds) a task is allowed to take for completion, else the task will be aborted. */
protected readonly taskTimeout: number;
/** How long (in seconds) a runner may be idle for before exit. */
private readonly idleTimeout: number;
protected taskCancellations = new Map<Task['taskId'], AbortController>();
constructor(opts: TaskRunnerOpts) {
super();
this.taskType = opts.taskType;
this.name = opts.name ?? 'Node.js Task Runner SDK';
this.maxConcurrency = opts.maxConcurrency;
this.taskTimeout = opts.taskTimeout;
this.idleTimeout = opts.idleTimeout;
const wsUrl = `ws://${opts.n8nUri}/runners/_ws?id=${this.id}`;
@ -208,7 +218,7 @@ export abstract class TaskRunner extends EventEmitter {
this.offerAccepted(message.offerId, message.taskId);
break;
case 'broker:taskcancel':
this.taskCancelled(message.taskId);
this.taskCancelled(message.taskId, message.reason);
break;
case 'broker:tasksettings':
void this.receivedSettings(message.taskId, message.settings);
@ -283,17 +293,35 @@ export abstract class TaskRunner extends EventEmitter {
});
}
taskCancelled(taskId: string) {
taskCancelled(taskId: string, reason: string) {
const task = this.runningTasks.get(taskId);
if (!task) {
return;
}
task.cancelled = true;
if (task.active) {
// TODO
} else {
this.runningTasks.delete(taskId);
for (const [requestId, request] of this.dataRequests.entries()) {
if (request.taskId === taskId) {
request.reject(new TaskCancelledError(reason));
this.dataRequests.delete(requestId);
}
}
for (const [requestId, request] of this.nodeTypesRequests.entries()) {
if (request.taskId === taskId) {
request.reject(new TaskCancelledError(reason));
this.nodeTypesRequests.delete(requestId);
}
}
const controller = this.taskCancellations.get(taskId);
if (controller) {
controller.abort();
this.taskCancellations.delete(taskId);
}
if (!task.active) this.runningTasks.delete(taskId);
this.sendOffers();
}
@ -326,20 +354,33 @@ export abstract class TaskRunner extends EventEmitter {
this.runningTasks.delete(taskId);
return;
}
const controller = new AbortController();
this.taskCancellations.set(taskId, controller);
const taskTimeout = setTimeout(() => {
if (!task.cancelled) {
controller.abort();
this.taskCancellations.delete(taskId);
}
}, this.taskTimeout * 1_000);
task.settings = settings;
task.active = true;
try {
const data = await this.executeTask(task);
const data = await this.executeTask(task, controller.signal);
this.taskDone(taskId, data);
} catch (error) {
this.taskErrored(taskId, error);
if (!task.cancelled) this.taskErrored(taskId, error);
} finally {
clearTimeout(taskTimeout);
this.taskCancellations.delete(taskId);
this.resetIdleTimer();
}
}
// eslint-disable-next-line @typescript-eslint/naming-convention
async executeTask(_task: Task): Promise<TaskResultData> {
async executeTask(_task: Task, _signal: AbortSignal): Promise<TaskResultData> {
throw new ApplicationError('Unimplemented');
}
@ -352,6 +393,7 @@ export abstract class TaskRunner extends EventEmitter {
const nodeTypesPromise = new Promise<T>((resolve, reject) => {
this.nodeTypesRequests.set(requestId, {
requestId,
taskId,
resolve: resolve as (data: unknown) => void,
reject,
});
@ -380,6 +422,7 @@ export abstract class TaskRunner extends EventEmitter {
const p = new Promise<T>((resolve, reject) => {
this.dataRequests.set(requestId, {
requestId,
taskId,
resolve: resolve as (data: unknown) => void,
reject,
});

View file

@ -6,6 +6,7 @@ import { ApplicationError, type INodeTypeBaseDescription } from 'n8n-workflow';
import { Time } from '@/constants';
import { TaskRejectError } from '../errors';
import { TaskRunnerTimeoutError } from '../errors/task-runner-timeout.error';
import type { RunnerLifecycleEvents } from '../runner-lifecycle-events';
import { TaskBroker } from '../task-broker.service';
import type { TaskOffer, TaskRequest, TaskRunner } from '../task-broker.service';
@ -721,7 +722,7 @@ describe('TaskBroker', () => {
beforeAll(() => {
jest.useFakeTimers();
config = mock<TaskRunnersConfig>({ taskTimeout: 30 });
config = mock<TaskRunnersConfig>({ taskTimeout: 30, mode: 'internal' });
taskBroker = new TaskBroker(mock(), config, runnerLifecycleEvents);
});
@ -800,7 +801,7 @@ describe('TaskBroker', () => {
expect(taskBroker.getTasks().get(taskId)).toBeUndefined();
});
it('on timeout, we should emit `runner:timed-out-during-task` event and send error to requester', async () => {
it('[internal mode] on timeout, we should emit `runner:timed-out-during-task` event and send error to requester', async () => {
jest.spyOn(global, 'clearTimeout');
const taskId = 'task1';
@ -839,5 +840,50 @@ describe('TaskBroker', () => {
expect(taskBroker.getTasks().get(taskId)).toBeUndefined();
});
// Verifies the broker's timeout handling when runners are in external mode:
// the runner is told to cancel the task, the requester receives a
// `TaskRunnerTimeoutError`, and the task is removed from the broker.
it('[external mode] on timeout, we should instruct the runner to cancel and send error to requester', async () => {
// Replace the shared broker with one configured for external mode.
// NOTE(review): this local `config` shadows the outer variable, and `taskBroker`
// is not restored afterwards — confirm no later test relies on internal mode.
const config = mock<TaskRunnersConfig>({ taskTimeout: 30, mode: 'external' });
taskBroker = new TaskBroker(mock(), config, runnerLifecycleEvents);
jest.spyOn(global, 'clearTimeout');
const taskId = 'task1';
const runnerId = 'runner1';
const requesterId = 'requester1';
const runner = mock<TaskRunner>({ id: runnerId });
const runnerCallback = jest.fn();
const requesterCallback = jest.fn();
taskBroker.registerRunner(runner, runnerCallback);
taskBroker.registerRequester(requesterId, requesterCallback);
// Seed the broker with a task already assigned to the runner and requester.
taskBroker.setTasks({
[taskId]: { id: taskId, runnerId, requesterId, taskType: 'test' },
});
// presumably sending settings is what arms the task timeout — verify against TaskBroker
await taskBroker.sendTaskSettings(taskId, {});
// Drop calls recorded so far so the assertion below only sees messages sent after the timeout.
runnerCallback.mockClear();
// Fake timers are enabled for this suite, so this fires every pending timer at once.
jest.runAllTimers();
await Promise.resolve(); // for timeout callback
await Promise.resolve(); // for sending messages to runner and requester
await Promise.resolve(); // for task cleanup and removal
// The runner must be told to cancel the task, with the timeout as the reason.
expect(runnerCallback).toHaveBeenLastCalledWith({
type: 'broker:taskcancel',
taskId,
reason: 'Task execution timed out',
});
// The requester must be notified of the failure with a timeout error.
expect(requesterCallback).toHaveBeenCalledWith({
type: 'broker:taskerror',
taskId,
error: expect.any(TaskRunnerTimeoutError),
});
expect(clearTimeout).toHaveBeenCalled();
// The timed-out task must no longer be tracked by the broker.
expect(taskBroker.getTasks().get(taskId)).toBeUndefined();
});
});
});

View file

@ -1,15 +1,23 @@
import type { TaskRunnerMode } from '@n8n/config/src/configs/runners.config';
import { ApplicationError } from 'n8n-workflow';
export class TaskRunnerTimeoutError extends ApplicationError {
description: string;
constructor(taskTimeout: number, isSelfHosted: boolean) {
constructor({
taskTimeout,
isSelfHosted,
mode,
}: { taskTimeout: number; isSelfHosted: boolean; mode: TaskRunnerMode }) {
super(
`Task execution timed out after ${taskTimeout} ${taskTimeout === 1 ? 'second' : 'seconds'}`,
);
const subtitle =
'The task runner was taking too long on this task, so it was suspected of being unresponsive and restarted, and the task was aborted. You can try the following:';
const subtitles = {
internal:
'The task runner was taking too long on this task, so it was suspected of being unresponsive and restarted, and the task was aborted.',
external: 'The task runner was taking too long on this task, so the task was aborted.',
};
const fixes = {
optimizeScript:
@ -27,7 +35,7 @@ export class TaskRunnerTimeoutError extends ApplicationError {
.map((suggestion, index) => `${index + 1}. ${suggestion}`)
.join('<br/>');
const description = `${subtitle}<br/><br/>${suggestionsText}`;
const description = `${mode === 'internal' ? subtitles.internal : subtitles.external} You can try the following:<br/><br/>${suggestionsText}`;
this.description = description;
}

View file

@ -459,14 +459,25 @@ export class TaskBroker {
const task = this.tasks.get(taskId);
if (!task) return;
this.runnerLifecycleEvents.emit('runner:timed-out-during-task');
if (this.taskRunnersConfig.mode === 'internal') {
this.runnerLifecycleEvents.emit('runner:timed-out-during-task');
} else if (this.taskRunnersConfig.mode === 'external') {
await this.messageRunner(task.runnerId, {
type: 'broker:taskcancel',
taskId,
reason: 'Task execution timed out',
});
}
const { taskTimeout, mode } = this.taskRunnersConfig;
await this.taskErrorHandler(
taskId,
new TaskRunnerTimeoutError(
this.taskRunnersConfig.taskTimeout,
config.getEnv('deployment.type') !== 'cloud',
),
new TaskRunnerTimeoutError({
taskTimeout,
isSelfHosted: config.getEnv('deployment.type') !== 'cloud',
mode,
}),
);
}