diff --git a/packages/cli/src/evaluation/test-runner/__tests__/evaluation-metrics.ee.test.ts b/packages/cli/src/evaluation/test-runner/__tests__/evaluation-metrics.ee.test.ts new file mode 100644 index 0000000000..27daf3aa79 --- /dev/null +++ b/packages/cli/src/evaluation/test-runner/__tests__/evaluation-metrics.ee.test.ts @@ -0,0 +1,72 @@ +import { EvaluationMetrics } from '../evaluation-metrics.ee'; + +describe('EvaluationMetrics', () => { + test('should aggregate metrics correctly', () => { + const testMetricNames = new Set(['metric1', 'metric2']); + const metrics = new EvaluationMetrics(testMetricNames); + + metrics.addResults({ metric1: 1, metric2: 0 }); + metrics.addResults({ metric1: 0.5, metric2: 0.2 }); + + const aggregatedMetrics = metrics.getAggregatedMetrics(); + + expect(aggregatedMetrics).toEqual({ metric1: 0.75, metric2: 0.1 }); + }); + + test('should aggregate only numbers', () => { + const testMetricNames = new Set(['metric1', 'metric2']); + const metrics = new EvaluationMetrics(testMetricNames); + + metrics.addResults({ metric1: 1, metric2: 0 }); + metrics.addResults({ metric1: '0.5', metric2: 0.2 }); + metrics.addResults({ metric1: 'not a number', metric2: [1, 2, 3] }); + + const aggregatedUpMetrics = metrics.getAggregatedMetrics(); + + expect(aggregatedUpMetrics).toEqual({ metric1: 1, metric2: 0.1 }); + }); + + test('should handle missing values', () => { + const testMetricNames = new Set(['metric1', 'metric2']); + const metrics = new EvaluationMetrics(testMetricNames); + + metrics.addResults({ metric1: 1 }); + metrics.addResults({ metric2: 0.2 }); + + const aggregatedMetrics = metrics.getAggregatedMetrics(); + + expect(aggregatedMetrics).toEqual({ metric1: 1, metric2: 0.2 }); + }); + + test('should handle empty metrics', () => { + const testMetricNames = new Set(['metric1', 'metric2']); + const metrics = new EvaluationMetrics(testMetricNames); + + const aggregatedMetrics = metrics.getAggregatedMetrics(); + + 
expect(aggregatedMetrics).toEqual({}); + }); + + test('should handle empty testMetrics', () => { + const metrics = new EvaluationMetrics(new Set()); + + metrics.addResults({ metric1: 1, metric2: 0 }); + metrics.addResults({ metric1: 0.5, metric2: 0.2 }); + + const aggregatedMetrics = metrics.getAggregatedMetrics(); + + expect(aggregatedMetrics).toEqual({}); + }); + + test('should ignore non-relevant values', () => { + const testMetricNames = new Set(['metric1']); + const metrics = new EvaluationMetrics(testMetricNames); + + metrics.addResults({ metric1: 1, notRelevant: 0 }); + metrics.addResults({ metric1: 0.5, notRelevant2: { foo: 'bar' } }); + + const aggregatedMetrics = metrics.getAggregatedMetrics(); + + expect(aggregatedMetrics).toEqual({ metric1: 0.75 }); + }); +}); diff --git a/packages/cli/src/evaluation/test-runner/__tests__/mock-data/workflow.evaluation.json b/packages/cli/src/evaluation/test-runner/__tests__/mock-data/workflow.evaluation.json index c2386f010c..6ec7f2c386 100644 --- a/packages/cli/src/evaluation/test-runner/__tests__/mock-data/workflow.evaluation.json +++ b/packages/cli/src/evaluation/test-runner/__tests__/mock-data/workflow.evaluation.json @@ -57,6 +57,12 @@ "name": "success", "value": true, "type": "boolean" + }, + { + "id": "877d1bf8-31a7-4571-9293-a6837b51d22b", + "name": "metric1", + "value": 0.1, + "type": "number" } ] }, diff --git a/packages/cli/src/evaluation/test-runner/__tests__/test-runner.service.ee.test.ts b/packages/cli/src/evaluation/test-runner/__tests__/test-runner.service.ee.test.ts index 5c8f8e958b..cdb8e848d9 100644 --- a/packages/cli/src/evaluation/test-runner/__tests__/test-runner.service.ee.test.ts +++ b/packages/cli/src/evaluation/test-runner/__tests__/test-runner.service.ee.test.ts @@ -2,15 +2,17 @@ import type { SelectQueryBuilder } from '@n8n/typeorm'; import { stringify } from 'flatted'; import { readFileSync } from 'fs'; import { mock, mockDeep } from 'jest-mock-extended'; -import type { IRun } from 
'n8n-workflow'; +import type { GenericValue, IRun } from 'n8n-workflow'; import path from 'path'; import type { ActiveExecutions } from '@/active-executions'; import type { ExecutionEntity } from '@/databases/entities/execution-entity'; import type { TestDefinition } from '@/databases/entities/test-definition.ee'; +import type { TestMetric } from '@/databases/entities/test-metric.ee'; import type { TestRun } from '@/databases/entities/test-run.ee'; import type { User } from '@/databases/entities/user'; import type { ExecutionRepository } from '@/databases/repositories/execution.repository'; +import type { TestMetricRepository } from '@/databases/repositories/test-metric.repository.ee'; import type { TestRunRepository } from '@/databases/repositories/test-run.repository.ee'; import type { WorkflowRepository } from '@/databases/repositories/workflow.repository'; import type { WorkflowRunner } from '@/workflow-runner'; @@ -58,12 +60,38 @@ function mockExecutionData() { }); } +function mockEvaluationExecutionData(metrics: Record<string, GenericValue>) { + return mock<IRun>({ + data: { + resultData: { + lastNodeExecuted: 'lastNode', + runData: { + lastNode: [ + { + data: { + main: [ + [ + { + json: metrics, + }, + ], + ], + }, + }, + ], + }, + }, + }, + }); +} + describe('TestRunnerService', () => { const executionRepository = mock<ExecutionRepository>(); const workflowRepository = mock<WorkflowRepository>(); const workflowRunner = mock<WorkflowRunner>(); const activeExecutions = mock<ActiveExecutions>(); const testRunRepository = mock<TestRunRepository>(); + const testMetricRepository = mock<TestMetricRepository>(); beforeEach(() => { const executionsQbMock = mockDeep<SelectQueryBuilder<ExecutionEntity>>({ @@ -80,6 +108,11 @@ describe('TestRunnerService', () => { .mockResolvedValueOnce(executionMocks[1]); testRunRepository.createTestRun.mockResolvedValue(mock<TestRun>({ id: 'test-run-id' })); + + testMetricRepository.find.mockResolvedValue([ + mock<TestMetric>({ name: 'metric1' }), + mock<TestMetric>({ name: 'metric2' }), + ]); }); afterEach(() => { @@ -97,6 +130,7 @@ describe('TestRunnerService', () => { executionRepository, activeExecutions, testRunRepository, +
testMetricRepository, ); expect(testRunnerService).toBeInstanceOf(TestRunnerService); @@ -109,6 +143,7 @@ describe('TestRunnerService', () => { executionRepository, activeExecutions, testRunRepository, + testMetricRepository, ); workflowRepository.findById.calledWith('workflow-under-test-id').mockResolvedValueOnce({ @@ -143,6 +178,7 @@ describe('TestRunnerService', () => { executionRepository, activeExecutions, testRunRepository, + testMetricRepository, ); workflowRepository.findById.calledWith('workflow-under-test-id').mockResolvedValueOnce({ @@ -166,17 +202,17 @@ describe('TestRunnerService', () => { .mockResolvedValue(mockExecutionData()); activeExecutions.getPostExecutePromise - .calledWith('some-execution-id-2') + .calledWith('some-execution-id-3') .mockResolvedValue(mockExecutionData()); // Mock executions of evaluation workflow activeExecutions.getPostExecutePromise - .calledWith('some-execution-id-3') - .mockResolvedValue(mockExecutionData()); + .calledWith('some-execution-id-2') + .mockResolvedValue(mockEvaluationExecutionData({ metric1: 1, metric2: 0 })); activeExecutions.getPostExecutePromise .calledWith('some-execution-id-4') - .mockResolvedValue(mockExecutionData()); + .mockResolvedValue(mockEvaluationExecutionData({ metric1: 0.5 })); await testRunnerService.runTest( mock<User>(), @@ -225,7 +261,8 @@ describe('TestRunnerService', () => { expect(testRunRepository.markAsRunning).toHaveBeenCalledWith('test-run-id'); expect(testRunRepository.markAsCompleted).toHaveBeenCalledTimes(1); expect(testRunRepository.markAsCompleted).toHaveBeenCalledWith('test-run-id', { - success: false, + metric1: 0.75, + metric2: 0, }); }); }); diff --git a/packages/cli/src/evaluation/test-runner/evaluation-metrics.ee.ts b/packages/cli/src/evaluation/test-runner/evaluation-metrics.ee.ts new file mode 100644 index 0000000000..ab5c921f8c --- /dev/null +++ b/packages/cli/src/evaluation/test-runner/evaluation-metrics.ee.ts @@ -0,0 +1,32 @@ +import type { IDataObject } from 'n8n-workflow';
+ +export class EvaluationMetrics { + private readonly rawMetricsByName = new Map<string, number[]>(); + + constructor(private readonly metricNames: Set<string>) { + for (const metricName of metricNames) { + this.rawMetricsByName.set(metricName, []); + } + } + + addResults(result: IDataObject) { + for (const [metricName, metricValue] of Object.entries(result)) { + if (typeof metricValue === 'number' && this.metricNames.has(metricName)) { + this.rawMetricsByName.get(metricName)!.push(metricValue); + } + } + } + + getAggregatedMetrics() { + const aggregatedMetrics: Record<string, number> = {}; + + for (const [metricName, metricValues] of this.rawMetricsByName.entries()) { + if (metricValues.length > 0) { + const metricSum = metricValues.reduce((acc, val) => acc + val, 0); + aggregatedMetrics[metricName] = metricSum / metricValues.length; + } + } + + return aggregatedMetrics; + } +} diff --git a/packages/cli/src/evaluation/test-runner/test-runner.service.ee.ts b/packages/cli/src/evaluation/test-runner/test-runner.service.ee.ts index 433a84c9dc..5aaaf25558 100644 --- a/packages/cli/src/evaluation/test-runner/test-runner.service.ee.ts +++ b/packages/cli/src/evaluation/test-runner/test-runner.service.ee.ts @@ -15,11 +15,13 @@ import type { TestDefinition } from '@/databases/entities/test-definition.ee'; import type { User } from '@/databases/entities/user'; import type { WorkflowEntity } from '@/databases/entities/workflow-entity'; import { ExecutionRepository } from '@/databases/repositories/execution.repository'; +import { TestMetricRepository } from '@/databases/repositories/test-metric.repository.ee'; import { TestRunRepository } from '@/databases/repositories/test-run.repository.ee'; import { WorkflowRepository } from '@/databases/repositories/workflow.repository'; import { getRunData } from '@/workflow-execute-additional-data'; import { WorkflowRunner } from '@/workflow-runner'; +import { EvaluationMetrics } from './evaluation-metrics.ee'; import { createPinData, getPastExecutionStartNode } from
'./utils.ee'; /** @@ -40,6 +42,7 @@ export class TestRunnerService { private readonly executionRepository: ExecutionRepository, private readonly activeExecutions: ActiveExecutions, private readonly testRunRepository: TestRunRepository, + private readonly testMetricRepository: TestMetricRepository, ) {} /** @@ -113,6 +116,11 @@ export class TestRunnerService { return await executePromise; } + /** + * Evaluation result is the first item in the output of the last node + * executed in the evaluation workflow. Defaults to an empty object + * in case the node doesn't produce any output items. + */ private extractEvaluationResult(execution: IRun): IDataObject { const lastNodeExecuted = execution.data.resultData.lastNodeExecuted; assert(lastNodeExecuted, 'Could not find the last node executed in evaluation workflow'); @@ -124,6 +132,21 @@ export class TestRunnerService { return mainConnectionData?.[0]?.json ?? {}; } + /** + * Get the metrics to collect from the evaluation workflow execution results. + */ + private async getTestMetricNames(testDefinitionId: string) { + const metrics = await this.testMetricRepository.find({ + where: { + testDefinition: { + id: testDefinitionId, + }, + }, + }); + + return new Set(metrics.map((m) => m.name)); + } + /** * Creates a new test run for the given test definition. */ @@ -152,11 +175,15 @@ export class TestRunnerService { .andWhere('execution.workflowId = :workflowId', { workflowId: test.workflowId }) .getMany(); + // Get the metrics to collect from the evaluation workflow + const testMetricNames = await this.getTestMetricNames(test.id); + // 2. 
Run over all the test cases await this.testRunRepository.markAsRunning(testRun.id); - const metrics = []; + // Object to collect the results of the evaluation workflow executions + const metrics = new EvaluationMetrics(testMetricNames); for (const { id: pastExecutionId } of pastExecutions) { // Fetch past execution with data @@ -192,12 +219,10 @@ export class TestRunnerService { assert(evalExecution); // Extract the output of the last node executed in the evaluation workflow - metrics.push(this.extractEvaluationResult(evalExecution)); + metrics.addResults(this.extractEvaluationResult(evalExecution)); } - // TODO: 3. Aggregate the results - // Now we just set success to true if all the test cases passed - const aggregatedMetrics = { success: metrics.every((metric) => metric.success) }; + const aggregatedMetrics = metrics.getAggregatedMetrics(); await this.testRunRepository.markAsCompleted(testRun.id, aggregatedMetrics); }