feat(core): Detect restart loop in a task runner process (no-changelog) (#12003)

Co-authored-by: Iván Ovejero <ivov.src@gmail.com>
This commit is contained in:
Tomi Turtiainen 2024-12-09 13:11:29 +02:00 committed by GitHub
parent 00897f6634
commit 516f3b7b4b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 205 additions and 2 deletions

View file

@ -0,0 +1,57 @@
import { TaskRunnersConfig } from '@n8n/config';
import { mock } from 'jest-mock-extended';
import type { Logger } from '@/logging/logger.service';
import type { TaskRunnerAuthService } from '@/runners/auth/task-runner-auth.service';
import { TaskRunnerRestartLoopError } from '@/runners/errors/task-runner-restart-loop-error';
import { RunnerLifecycleEvents } from '@/runners/runner-lifecycle-events';
import { TaskRunnerProcess } from '@/runners/task-runner-process';
import { TaskRunnerProcessRestartLoopDetector } from '@/runners/task-runner-process-restart-loop-detector';
describe('TaskRunnerProcessRestartLoopDetector', () => {
const mockLogger = mock<Logger>();
const mockAuthService = mock<TaskRunnerAuthService>();
const runnerConfig = new TaskRunnersConfig();
const taskRunnerProcess = new TaskRunnerProcess(
mockLogger,
runnerConfig,
mockAuthService,
new RunnerLifecycleEvents(),
);
it('should detect a restart loop if process exits 5 times within 5s', () => {
const restartLoopDetector = new TaskRunnerProcessRestartLoopDetector(taskRunnerProcess);
let emittedError: TaskRunnerRestartLoopError | undefined = undefined;
restartLoopDetector.on('restart-loop-detected', (error) => {
emittedError = error;
});
taskRunnerProcess.emit('exit');
taskRunnerProcess.emit('exit');
taskRunnerProcess.emit('exit');
taskRunnerProcess.emit('exit');
taskRunnerProcess.emit('exit');
expect(emittedError).toBeInstanceOf(TaskRunnerRestartLoopError);
});
it('should not detect a restart loop if process exits less than 5 times within 5s', () => {
jest.useFakeTimers();
const restartLoopDetector = new TaskRunnerProcessRestartLoopDetector(taskRunnerProcess);
let emittedError: TaskRunnerRestartLoopError | undefined = undefined;
restartLoopDetector.on('restart-loop-detected', (error) => {
emittedError = error;
});
taskRunnerProcess.emit('exit');
taskRunnerProcess.emit('exit');
taskRunnerProcess.emit('exit');
taskRunnerProcess.emit('exit');
jest.advanceTimersByTime(5010);
taskRunnerProcess.emit('exit');
expect(emittedError).toBeUndefined();
});
});

View file

@ -0,0 +1,14 @@
import { ApplicationError } from 'n8n-workflow';
export class TaskRunnerRestartLoopError extends ApplicationError {
constructor(
public readonly howManyTimes: number,
public readonly timePeriodMs: number,
) {
const message = `Task runner has restarted ${howManyTimes} times within ${timePeriodMs / 1000} seconds. This is an abnormally high restart rate that suggests a bug or other issue is preventing your runner process from starting up. If this issues persists, please file a report at: https://github.com/n8n-io/n8n/issues`;
super(message, {
level: 'fatal',
});
}
}

View file

@ -1,9 +1,13 @@
import { TaskRunnersConfig } from '@n8n/config';
import { ErrorReporterProxy, sleep } from 'n8n-workflow';
import * as a from 'node:assert/strict';
import Container, { Service } from 'typedi';
import { OnShutdown } from '@/decorators/on-shutdown';
import { Logger } from '@/logging/logger.service';
import type { TaskRunnerRestartLoopError } from '@/runners/errors/task-runner-restart-loop-error';
import type { TaskRunnerProcess } from '@/runners/task-runner-process';
import { TaskRunnerProcessRestartLoopDetector } from '@/runners/task-runner-process-restart-loop-detector';
import { MissingAuthTokenError } from './errors/missing-auth-token.error';
import { TaskRunnerWsServer } from './runner-ws-server';
@ -25,7 +29,14 @@ export class TaskRunnerModule {
private taskRunnerProcess: TaskRunnerProcess | undefined;
constructor(private readonly runnerConfig: TaskRunnersConfig) {}
private taskRunnerProcessRestartLoopDetector: TaskRunnerProcessRestartLoopDetector | undefined;
constructor(
private readonly logger: Logger,
private readonly runnerConfig: TaskRunnersConfig,
) {
this.logger = this.logger.scoped('task-runner');
}
async start() {
a.ok(this.runnerConfig.enabled, 'Task runner is disabled');
@ -83,6 +94,14 @@ export class TaskRunnerModule {
const { TaskRunnerProcess } = await import('@/runners/task-runner-process');
this.taskRunnerProcess = Container.get(TaskRunnerProcess);
this.taskRunnerProcessRestartLoopDetector = new TaskRunnerProcessRestartLoopDetector(
this.taskRunnerProcess,
);
this.taskRunnerProcessRestartLoopDetector.on(
'restart-loop-detected',
this.onRunnerRestartLoopDetected,
);
await this.taskRunnerProcess.start();
const { InternalTaskRunnerDisconnectAnalyzer } = await import(
@ -92,4 +111,13 @@ export class TaskRunnerModule {
Container.get(InternalTaskRunnerDisconnectAnalyzer),
);
}
private onRunnerRestartLoopDetected = async (error: TaskRunnerRestartLoopError) => {
this.logger.error(error.message);
ErrorReporterProxy.error(error);
// Allow some time for the error to be flushed
await sleep(1000);
process.exit(1);
};
}

View file

@ -0,0 +1,73 @@
import { Time } from '@/constants';
import { TaskRunnerRestartLoopError } from '@/runners/errors/task-runner-restart-loop-error';
import type { TaskRunnerProcess } from '@/runners/task-runner-process';
import { TypedEmitter } from '@/typed-emitter';
const MAX_RESTARTS = 5;
const RESTARTS_WINDOW = 2 * Time.seconds.toMilliseconds;
type TaskRunnerProcessRestartLoopDetectorEventMap = {
'restart-loop-detected': TaskRunnerRestartLoopError;
};
/**
* A class to monitor the task runner process for restart loops
*/
export class TaskRunnerProcessRestartLoopDetector extends TypedEmitter<TaskRunnerProcessRestartLoopDetectorEventMap> {
/**
* How many times the process needs to restart for it to be detected
* being in a loop.
*/
private readonly maxCount = MAX_RESTARTS;
/**
* The time interval in which the process needs to restart `maxCount` times
* to be detected as being in a loop.
*/
private readonly restartsWindow = RESTARTS_WINDOW;
private numRestarts = 0;
/** Time when the first restart of a loop happened within a time window */
private firstRestartedAt = Date.now();
constructor(private readonly taskRunnerProcess: TaskRunnerProcess) {
super();
this.taskRunnerProcess.on('exit', () => {
this.increment();
if (this.isMaxCountExceeded()) {
this.emit(
'restart-loop-detected',
new TaskRunnerRestartLoopError(this.numRestarts, this.msSinceFirstIncrement()),
);
}
});
}
/**
* Increments the counter
*/
private increment() {
const now = Date.now();
if (now > this.firstRestartedAt + this.restartsWindow) {
this.reset();
}
this.numRestarts++;
}
private reset() {
this.numRestarts = 0;
this.firstRestartedAt = Date.now();
}
private isMaxCountExceeded() {
return this.numRestarts >= this.maxCount;
}
private msSinceFirstIncrement() {
return Date.now() - this.firstRestartedAt;
}
}

View file

@ -1,4 +1,5 @@
import { TaskRunnersConfig } from '@n8n/config';
import { mock } from 'jest-mock-extended';
import Container from 'typedi';
import { MissingAuthTokenError } from '@/runners/errors/missing-auth-token.error';
@ -32,7 +33,7 @@ describe('TaskRunnerModule in external mode', () => {
runnerConfig.enabled = true;
runnerConfig.authToken = '';
const module = new TaskRunnerModule(runnerConfig);
const module = new TaskRunnerModule(mock(), runnerConfig);
await expect(module.start()).rejects.toThrowError(MissingAuthTokenError);
});

View file

@ -3,6 +3,7 @@ import Container from 'typedi';
import { TaskRunnerWsServer } from '@/runners/runner-ws-server';
import { TaskBroker } from '@/runners/task-broker.service';
import { TaskRunnerProcess } from '@/runners/task-runner-process';
import { TaskRunnerProcessRestartLoopDetector } from '@/runners/task-runner-process-restart-loop-detector';
import { retryUntil } from '@test-integration/retry-until';
import { setupBrokerTestServer } from '@test-integration/utils/task-broker-test-server';
@ -84,4 +85,33 @@ describe('TaskRunnerProcess', () => {
expect(getNumRegisteredRunners()).toBe(1);
expect(runnerProcess.pid).not.toBe(processId);
});
it('should work together with restart loop detector', async () => {
// Arrange
const restartLoopDetector = new TaskRunnerProcessRestartLoopDetector(runnerProcess);
let restartLoopDetectedEventEmitted = false;
restartLoopDetector.once('restart-loop-detected', () => {
restartLoopDetectedEventEmitted = true;
});
// Act
await runnerProcess.start();
// Simulate a restart loop
for (let i = 0; i < 5; i++) {
await retryUntil(() => {
expect(runnerProcess.pid).toBeDefined();
});
// @ts-expect-error private property
runnerProcess.process?.kill();
await new Promise((resolve) => {
runnerProcess.once('exit', resolve);
});
}
// Assert
expect(restartLoopDetectedEventEmitted).toBe(true);
});
});