refactor(core): Remove watchdog interval (#11295)

Iván Ovejero 2024-10-17 13:51:56 +02:00 committed by GitHub
parent c79aa01a48
commit 83ca7f8e90
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 12 additions and 57 deletions


@@ -82,10 +82,6 @@ class BullConfig {
 	@Nested
 	redis: RedisConfig;
 
-	/** How often (in seconds) to poll the Bull queue to identify executions finished during a Redis crash. `0` to disable. May increase Redis traffic significantly. */
-	@Env('QUEUE_RECOVERY_INTERVAL')
-	queueRecoveryInterval: number = 60; // watchdog interval
-
 	/** @deprecated How long (in seconds) a worker must wait for active executions to finish before exiting. Use `N8N_GRACEFUL_SHUTDOWN_TIMEOUT` instead */
 	@Env('QUEUE_WORKER_TIMEOUT')
 	gracefulShutdownTimeout: number = 30;
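For context, the removed `queueRecoveryInterval` field followed the same `@Env` pattern as the surrounding settings: a numeric value sourced from an environment variable, defaulting to 60 seconds, with `0` meaning disabled. A minimal sketch of that env-to-number mapping, purely as an illustration (the `readNumberEnv` helper is hypothetical and is not the actual `@n8n/config` decorator implementation):

```ts
// Hypothetical helper mirroring the semantics described in the removed doc comment.
// Not part of n8n; the real mapping is handled by the @Env decorator in @n8n/config.
function readNumberEnv(name: string, defaultValue: number): number {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === '') return defaultValue;
  const parsed = Number(raw);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}

// Before this commit: poll every 60s by default; 0 disabled the watchdog entirely.
const queueRecoveryInterval = readNumberEnv('QUEUE_RECOVERY_INTERVAL', 60);
const watchdogEnabled = queueRecoveryInterval > 0;
```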


@@ -211,7 +211,6 @@ describe('GlobalConfig', () => {
 				clusterNodes: '',
 				tls: false,
 			},
-			queueRecoveryInterval: 60,
 			gracefulShutdownTimeout: 30,
 			prefix: 'bull',
 			settings: {


@@ -2,6 +2,16 @@
 
 This list shows all the versions which include breaking changes and how to upgrade.
 
+# 1.65.0
+
+### What changed?
+
+Queue polling via the env var `QUEUE_RECOVERY_INTERVAL` has been removed.
+
+### When is action necessary?
+
+If you have set the env var `QUEUE_RECOVERY_INTERVAL`, you can remove it, as it no longer has any effect.
+
 # 1.63.0
 
 ### What changed?
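Since the variable is now ignored rather than rejected, upgrades will not fail if it is still set. A small optional check, not part of n8n, that you could run before upgrading to spot a leftover setting:

```ts
// Optional pre-upgrade check (illustration only, not shipped with n8n):
// warn if the now-ignored variable is still present so it can be removed.
if (process.env.QUEUE_RECOVERY_INTERVAL !== undefined) {
  console.warn(
    'QUEUE_RECOVERY_INTERVAL is set but has no effect since n8n 1.65.0; remove it from your environment.',
  );
}
```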


@@ -2,7 +2,6 @@
 /* eslint-disable @typescript-eslint/no-unsafe-member-access */
 /* eslint-disable @typescript-eslint/no-shadow */
 /* eslint-disable @typescript-eslint/no-unsafe-assignment */
-import { GlobalConfig } from '@n8n/config';
 import { InstanceSettings, WorkflowExecute } from 'n8n-core';
 import type {
 	ExecutionError,
@@ -30,7 +29,7 @@ import { ExternalHooks } from '@/external-hooks';
 import { Logger } from '@/logging/logger.service';
 import { NodeTypes } from '@/node-types';
 import type { ScalingService } from '@/scaling/scaling.service';
-import type { Job, JobData, JobResult } from '@/scaling/scaling.types';
+import type { Job, JobData } from '@/scaling/scaling.types';
 import { PermissionChecker } from '@/user-management/permission-checker';
 import * as WorkflowExecuteAdditionalData from '@/workflow-execute-additional-data';
 import * as WorkflowHelpers from '@/workflow-helpers';
@@ -439,54 +438,8 @@ export class WorkflowRunner {
       reject(error);
     });
 
-    const jobData: Promise<JobResult> = job.finished();
-
-    const { queueRecoveryInterval } = Container.get(GlobalConfig).queue.bull;
-
-    const racingPromises: Array<Promise<JobResult>> = [jobData];
-
-    let clearWatchdogInterval;
-
-    if (queueRecoveryInterval > 0) {
-      /** ***********************************************
-       * Long explanation about what this solves:      *
-       * This only happens in a very specific scenario *
-       * when Redis crashes and recovers shortly       *
-       * but during this time, some execution(s)       *
-       * finished. The end result is that the main     *
-       * process will wait indefinitely and never      *
-       * get a response. This adds an active polling to*
-       * the queue that allows us to identify that the *
-       * execution finished and get information from   *
-       * the database.                                 *
-       ************************************************ */
-      let watchDogInterval: NodeJS.Timeout | undefined;
-      const watchDog: Promise<JobResult> = new Promise((res) => {
-        watchDogInterval = setInterval(async () => {
-          const currentJob = await this.scalingService.getJob(job.id);
-          // When null means job is finished (not found in queue)
-          if (currentJob === null) {
-            // Mimic worker's success message
-            res({ success: true });
-          }
-        }, queueRecoveryInterval * 1000);
-      });
-
-      racingPromises.push(watchDog);
-
-      clearWatchdogInterval = () => {
-        if (watchDogInterval) {
-          clearInterval(watchDogInterval);
-          watchDogInterval = undefined;
-        }
-      };
-    }
-
     try {
-      await Promise.race(racingPromises);
-      if (clearWatchdogInterval !== undefined) {
-        clearWatchdogInterval();
-      }
+      await job.finished();
     } catch (error) {
       // We use "getWorkflowHooksWorkerExecuter" as "getWorkflowHooksWorkerMain" does not contain the
       // "workflowExecuteAfter" which we require.
@@ -497,9 +450,6 @@ export class WorkflowRunner {
         { retryOf: data.retryOf ? data.retryOf.toString() : undefined },
       );
       this.logger.error(`Problem with execution ${executionId}: ${error.message}. Aborting.`);
-      if (clearWatchdogInterval !== undefined) {
-        clearWatchdogInterval();
-      }
       await this.processError(error, new Date(), data.executionMode, executionId, hooks);
       reject(error);
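For reference, the deleted block raced Bull's `job.finished()` promise against a periodic poll of the queue, so that an execution which completed while Redis was briefly down would still unblock the main process; after this commit the runner simply awaits `job.finished()`. A stripped-down sketch of the removed pattern (illustration only: the structural `BullQueue`/`BullJob` interfaces below stand in for Bull's real types, and in n8n the lookup went through `scalingService.getJob` with the interval taken from `QUEUE_RECOVERY_INTERVAL`):

```ts
// Minimal structural stand-ins for Bull's Queue and Job types (illustration only).
interface BullJob {
  id: string;
  finished(): Promise<unknown>; // resolves when the worker reports the job done
}
interface BullQueue {
  getJob(id: string): Promise<BullJob | null>; // null once the job is gone from the queue
}

// Sketch of the removed watchdog: resolve either when Bull reports the job
// finished, or when polling shows the job can no longer be found in the queue.
async function waitForJob(queue: BullQueue, job: BullJob, pollSeconds: number): Promise<unknown> {
  const finished = job.finished();

  let timer: NodeJS.Timeout | undefined;
  const watchdog = new Promise<{ success: boolean }>((resolve) => {
    timer = setInterval(async () => {
      const current = await queue.getJob(job.id);
      if (current === null) resolve({ success: true }); // job finished during a Redis outage
    }, pollSeconds * 1000);
  });

  try {
    return await Promise.race([finished, watchdog]);
  } finally {
    if (timer) clearInterval(timer);
  }
}
```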