import type Bull from 'bull';
import { Service } from 'typedi';
import type { ExecutionError, IExecuteResponsePromiseData } from 'n8n-workflow';
import { ActiveExecutions } from '@/ActiveExecutions';
import { decodeWebhookResponse } from '@/helpers/decodeWebhookResponse';
import {
	getRedisClusterClient,
	getRedisClusterNodes,
	getRedisPrefix,
	getRedisStandardClient,
} from './services/redis/RedisServiceHelper';
import type { RedisClientType } from './services/redis/RedisServiceBaseClasses';
import config from '@/config';
export type JobId = Bull.JobId;
export type Job = Bull.Job<JobData>;
export type JobQueue = Bull.Queue<JobData>;
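/** Data stored with every job: the execution to run and whether the workflow's static data should be loaded for it. */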
export interface JobData {
	executionId: string;
	loadStaticData: boolean;
}
export interface JobResponse {
	success: boolean;
	error?: ExecutionError;
}
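/** Reported by a worker via job progress to forward a webhook response for an execution back to the main instance. */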
export interface WebhookResponse {
	executionId: string;
	response: IExecuteResponsePromiseData;
}
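/**
 * Service wrapping the Bull queue ('jobs') that n8n uses in queue mode: the
 * main instance enqueues executions and worker processes pick them up.
 */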
@Service()
export class Queue {
	private jobQueue: JobQueue;

	constructor(private activeExecutions: ActiveExecutions) {}
	async init() {
		const bullPrefix = config.getEnv('queue.bull.prefix');
		const prefix = getRedisPrefix(bullPrefix);
		const clusterNodes = getRedisClusterNodes();
		const usesRedisCluster = clusterNodes.length > 0;
		// eslint-disable-next-line @typescript-eslint/naming-convention
		const { default: Bull } = await import('bull');
		// eslint-disable-next-line @typescript-eslint/naming-convention
		const { default: Redis } = await import('ioredis');
		// Disabling the ready check is necessary, as it allows the worker to
		// quickly reconnect to Redis if Redis crashes or is unreachable for some
		// time. With it enabled, the worker might take minutes to realize that
		// Redis is back up and resume working.
		// More here: https://github.com/OptimalBits/bull/issues/890
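		// (Assumption: the ready check itself is turned off inside the Redis client
		// helpers above, e.g. via ioredis's `enableReadyCheck` option.)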
		this.jobQueue = new Bull('jobs', {
			prefix,
			createClient: (type, clientConfig) =>
				usesRedisCluster
					? getRedisClusterClient(Redis, clientConfig, (type + '(bull)') as RedisClientType)
					: getRedisStandardClient(Redis, clientConfig, (type + '(bull)') as RedisClientType),
		});
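		// Workers report webhook responses through Bull's job progress events; the
		// main instance decodes the payload and resolves the response promise
		// registered for that execution.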
		this.jobQueue.on('global:progress', (jobId, progress: WebhookResponse) => {
			this.activeExecutions.resolveResponsePromise(
				progress.executionId,
				decodeWebhookResponse(progress.response),
			);
		});
	}
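	/** Enqueues a job for the given execution; a worker will pick it up and run it. */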
	async add(jobData: JobData, jobOptions: object): Promise<Job> {
		return this.jobQueue.add(jobData, jobOptions);
	}
	async getJob(jobId: JobId): Promise<Job | null> {
		return this.jobQueue.getJob(jobId);
	}
	async getJobs(jobTypes: Bull.JobStatus[]): Promise<Job[]> {
		return this.jobQueue.getJobs(jobTypes);
	}
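	/** Registers the worker's job handler; Bull runs up to `concurrency` jobs in parallel. */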
	async process(concurrency: number, fn: Bull.ProcessCallbackFunction<JobData>): Promise<void> {
		return this.jobQueue.process(concurrency, fn);
	}
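	/** Health check against the queue's underlying Redis connection. */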
	async ping(): Promise<string> {
		return this.jobQueue.client.ping();
	}
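	/** Pauses the queue; with `isLocal` set, only this instance stops taking on new jobs. */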
	async pause(isLocal?: boolean): Promise<void> {
		return this.jobQueue.pause(isLocal);
	}
	getBullObjectInstance(): JobQueue {
		if (this.jobQueue === undefined) {
			// If the queue is not initialized yet, throw an error, since we do not
			// want to hand out an undefined queue
			throw new Error('Queue is not initialized yet!');
		}
		return this.jobQueue;
	}
	/**
	 * Stops a job: a running job is signalled to stop, a job that has not started
	 * yet is removed from the queue.
	 *
	 * @param job A Job instance
	 * @returns boolean true if we were able to securely stop the job
	 */
	async stopJob(job: Job): Promise<boolean> {
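		// A progress value of -1 serves as the stop signal that the worker
		// running this job is expected to watch for.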
		if (await job.isActive()) {
			// Job is already running so tell it to stop
			await job.progress(-1);
			return true;
		}
		// Job did not get started yet so remove from queue
		try {
			await job.remove();
			return true;
		} catch (e) {
			await job.progress(-1);
		}

		return false;
	}
}
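
/*
 * Illustrative usage (a sketch, not part of this module): in queue mode a worker
 * process would typically resolve this service from the typedi container,
 * initialize it and register its job handler. The import path, concurrency
 * value and handler body below are hypothetical.
 *
 *   import { Container } from 'typedi';
 *   import { Queue } from '@/Queue';
 *
 *   const queue = Container.get(Queue);
 *   await queue.init();
 *   await queue.process(10, async (job) => {
 *     // run the execution identified by job.data.executionId, then report back
 *     return { success: true };
 *   });
 */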