diff --git a/packages/core/src/NodeExecuteFunctions.ts b/packages/core/src/NodeExecuteFunctions.ts index 2c2541285c..f49bc046a9 100644 --- a/packages/core/src/NodeExecuteFunctions.ts +++ b/packages/core/src/NodeExecuteFunctions.ts @@ -861,6 +861,10 @@ async function httpRequest( return result.data; } +export function getBinaryPath(binaryDataId: string): string { + return BinaryDataManager.getInstance().getBinaryPath(binaryDataId); +} + /** * Returns binary file metadata */ @@ -2262,6 +2266,7 @@ const getNodeHelperFunctions = ({ const getBinaryHelperFunctions = ({ executionId, }: IWorkflowExecuteAdditionalData): BinaryHelperFunctions => ({ + getBinaryPath, getBinaryStream, getBinaryMetadata, binaryToBuffer, diff --git a/packages/editor-ui/package.json b/packages/editor-ui/package.json index 520a991704..fdf1dc01a1 100644 --- a/packages/editor-ui/package.json +++ b/packages/editor-ui/package.json @@ -85,7 +85,6 @@ "@testing-library/jest-dom": "^5.16.5", "@testing-library/user-event": "^14.4.3", "@testing-library/vue": "^5.8.3", - "@types/canvas-confetti": "^1.6.0", "@types/dateformat": "^3.0.0", "@types/file-saver": "^2.0.1", "@types/humanize-duration": "^3.27.1", diff --git a/packages/nodes-base/nodes/ReadPdf/ReadPDF.node.ts b/packages/nodes-base/nodes/ReadPdf/ReadPDF.node.ts index 6626c94e53..0379f06293 100644 --- a/packages/nodes-base/nodes/ReadPdf/ReadPDF.node.ts +++ b/packages/nodes-base/nodes/ReadPdf/ReadPDF.node.ts @@ -1,12 +1,32 @@ -import type { - IExecuteFunctions, - IDataObject, - INodeExecutionData, - INodeType, - INodeTypeDescription, +import { + BINARY_ENCODING, + type IExecuteFunctions, + type INodeExecutionData, + type INodeType, + type INodeTypeDescription, } from 'n8n-workflow'; -import pdf from 'pdf-parse'; +import { getDocument as readPDF, version as pdfJsVersion } from 'pdfjs-dist'; + +type Document = Awaited>['promise']>; +type Page = Awaited>>; +type TextContent = Awaited>; + +const parseText = (textContent: TextContent) => { + let lastY = undefined; + const text = []; + for (const item of textContent.items) { + if ('str' in item) { + if (lastY == item.transform[5] || !lastY) { + text.push(item.str); + } else { + text.push(`\n${item.str}`); + } + lastY = item.transform[5]; + } + } + return text.join(''); +}; export class ReadPDF implements INodeType { description: INodeTypeDescription = { @@ -32,6 +52,26 @@ export class ReadPDF implements INodeType { required: true, description: 'Name of the binary property from which to read the PDF file', }, + { + displayName: 'Encrypted', + name: 'encrypted', + type: 'boolean', + default: false, + required: true, + }, + { + displayName: 'Password', + name: 'password', + type: 'string', + typeOptions: { password: true }, + default: '', + description: 'Password to decrypt the PDF file with', + displayOptions: { + show: { + encrypted: [true], + }, + }, + }, ], }; @@ -40,27 +80,50 @@ export class ReadPDF implements INodeType { const returnData: INodeExecutionData[] = []; const length = items.length; - let item: INodeExecutionData; for (let itemIndex = 0; itemIndex < length; itemIndex++) { try { - item = items[itemIndex]; const binaryPropertyName = this.getNodeParameter('binaryPropertyName', itemIndex); + const binaryData = this.helpers.assertBinaryData(itemIndex, binaryPropertyName); - if (item.binary === undefined) { - item.binary = {}; + const params: { password?: string; url?: URL; data?: ArrayBuffer } = {}; + + if (this.getNodeParameter('encrypted', itemIndex) === true) { + params.password = this.getNodeParameter('password', itemIndex) as string; } - const binaryDataBuffer = await this.helpers.getBinaryDataBuffer( - itemIndex, - binaryPropertyName, - ); - returnData.push({ - binary: item.binary, + if (binaryData.id) { + const binaryPath = this.helpers.getBinaryPath(binaryData.id); + params.url = new URL(`file://${binaryPath}`); + } else { + params.data = Buffer.from(binaryData.data, BINARY_ENCODING).buffer; + } - json: (await pdf(binaryDataBuffer)) as unknown as IDataObject, + const document = await readPDF(params).promise; + const { info, metadata } = await document + .getMetadata() + .catch(() => ({ info: null, metadata: null })); + + const pages = []; + for (let i = 1; i <= document.numPages; i++) { + const page = await document.getPage(i); + const text = await page.getTextContent().then(parseText); + pages.push(text); + } + + returnData.push({ + binary: items[itemIndex].binary, + json: { + numpages: document.numPages, + numrender: document.numPages, + info, + metadata: metadata?.getAll(), + text: pages.join('\n\n'), + version: pdfJsVersion, + }, }); } catch (error) { + console.log(error); if (this.continueOnFail()) { returnData.push({ json: { diff --git a/packages/nodes-base/nodes/ReadPdf/test/ReadPDF-encrypted.workflow.json b/packages/nodes-base/nodes/ReadPdf/test/ReadPDF-encrypted.workflow.json new file mode 100644 index 0000000000..7258819ead --- /dev/null +++ b/packages/nodes-base/nodes/ReadPdf/test/ReadPDF-encrypted.workflow.json @@ -0,0 +1,87 @@ +{ + "nodes": [ + { + "name": "When clicking \"Execute Workflow\"", + "type": "n8n-nodes-base.manualTrigger", + "typeVersion": 1, + "parameters": {}, + "position": [660, 580] + }, + { + "name": "Read sample-encrypted.pdf", + "type": "n8n-nodes-base.readBinaryFile", + "typeVersion": 1, + "parameters": { + "filePath": "C:\\Test\\sample-encrypted.pdf" + }, + "position": [880, 780] + }, + { + "name": "Read PDF (encrypted)", + "type": "n8n-nodes-base.readPDF", + "typeVersion": 1, + "parameters": { + "encrypted": true, + "password": "ReaderPassword" + }, + "position": [1100, 780] + } + ], + "pinData": { + "Read PDF (encrypted)": [ + { + "binary": { + "data": { + "fileExtension": "pdf", + "fileName": "sample-encrypted.pdf", + "fileSize": "18.9 kB", + "mimeType": "application/pdf" + } + }, + "json": { + "numpages": 1, + "numrender": 1, + "info": { + "PDFFormatVersion": "1.7", + "Language": null, + "EncryptFilterName": "Standard", + "IsLinearized": false, + "IsAcroFormPresent": false, + "IsXFAPresent": false, + "IsCollectionPresent": false, + "IsSignaturesPresent": false, + "ModDate": "D:20230210122750Z", + "Producer": "iLovePDF", + "Title": "sample" + }, + "text": "N8N\nSample PDF\nLorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor\ninvidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et\njusto duo dolores et ea rebum.", + "version": "2.16.105" + } + } + ] + }, + "connections": { + "When clicking \"Execute Workflow\"": { + "main": [ + [ + { + "node": "Read sample-encrypted.pdf", + "type": "main", + "index": 0 + } + ] + ] + }, + "Read sample-encrypted.pdf": { + "main": [ + [ + { + "node": "Read PDF (encrypted)", + "type": "main", + "index": 0 + } + ] + ] + } + } +} diff --git a/packages/nodes-base/nodes/ReadPdf/test/ReadPDF.test.ts b/packages/nodes-base/nodes/ReadPdf/test/ReadPDF.test.ts index efe6c44f11..cd91f69a1d 100644 --- a/packages/nodes-base/nodes/ReadPdf/test/ReadPDF.test.ts +++ b/packages/nodes-base/nodes/ReadPdf/test/ReadPDF.test.ts @@ -1,60 +1,11 @@ -/* eslint-disable @typescript-eslint/no-loop-func */ -import * as Helpers from '@test/nodes/Helpers'; -import type { WorkflowTestData } from '@test/nodes/types'; -import { executeWorkflow } from '@test/nodes/ExecuteWorkflow'; -import path from 'path'; +import { getWorkflowFilenames, initBinaryDataManager, testWorkflows } from '@test/nodes/Helpers'; describe('Test Read PDF Node', () => { - beforeEach(async () => { - await Helpers.initBinaryDataManager(); + const workflows = getWorkflowFilenames(__dirname); + + beforeAll(async () => { + await initBinaryDataManager(); }); - const workflow = Helpers.readJsonFileSync('nodes/ReadPdf/test/ReadPDF.workflow.json'); - const node = workflow.nodes.find((n: any) => n.name === 'Read Binary File'); - node.parameters.filePath = path.join(__dirname, 'sample.pdf'); - - const testData: WorkflowTestData = { - description: 'nodes/ReadPdf/test/ReadPDF.workflow.json', - input: { - workflowData: workflow, - }, - output: { - nodeData: { - 'Read PDF': [ - [ - { - json: { - numpages: 1, - numrender: 1, - info: { - PDFFormatVersion: '1.4', - IsAcroFormPresent: false, - IsXFAPresent: false, - Title: 'sample', - Producer: 'iLovePDF', - ModDate: 'D:20230210122750Z', - }, - metadata: null, - text: '\n\nN8N\nSample PDF\nLorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor\ninvidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et\njusto duo dolores et ea rebum.', - version: '1.10.100', - }, - }, - ], - ], - }, - }, - }; - - const nodeTypes = Helpers.setup(testData); - - test(testData.description, async () => { - const { result } = await executeWorkflow(testData, nodeTypes); - const resultNodeData = Helpers.getResultNodeData(result, testData); - - // delete binary data because we test against json only - delete resultNodeData[0].resultData[0]![0].binary; - expect(resultNodeData[0].resultData).toEqual(testData.output.nodeData['Read PDF']); - - expect(result.finished).toEqual(true); - }); + testWorkflows(workflows); }); diff --git a/packages/nodes-base/nodes/ReadPdf/test/ReadPDF.workflow.json b/packages/nodes-base/nodes/ReadPdf/test/ReadPDF.workflow.json index 9d6aaf0c1f..4a56640108 100644 --- a/packages/nodes-base/nodes/ReadPdf/test/ReadPDF.workflow.json +++ b/packages/nodes-base/nodes/ReadPdf/test/ReadPDF.workflow.json @@ -1,47 +1,75 @@ { - "name": "Read PDF node unit test", "nodes": [ { - "parameters": {}, - "id": "0c9db33c-dd15-4088-9d12-b9f3b8f1fa96", "name": "When clicking \"Execute Workflow\"", "type": "n8n-nodes-base.manualTrigger", "typeVersion": 1, - "position": [960, 540] - }, - { "parameters": {}, - "id": "86abdc3b-206d-4b67-a37f-6b67b6bd3bbc", - "name": "Read PDF", - "type": "n8n-nodes-base.readPDF", - "typeVersion": 1, - "position": [1400, 540] + "position": [660, 580] }, { + "name": "Read sample.pdf", + "type": "n8n-nodes-base.readBinaryFile", + "typeVersion": 1, "parameters": { "filePath": "C:\\Test\\sample.pdf" }, - "id": "2f6d241e-44a4-4213-b49a-166201946a89", - "name": "Read Binary File", - "type": "n8n-nodes-base.readBinaryFile", + "position": [880, 580] + }, + { + "name": "Read PDF", + "type": "n8n-nodes-base.readPDF", "typeVersion": 1, - "position": [1180, 540] + "parameters": {}, + "position": [1100, 580] } ], - "pinData": {}, + "pinData": { + "Read PDF": [ + { + "binary": { + "data": { + "fileExtension": "pdf", + "fileName": "sample.pdf", + "fileSize": "17.8 kB", + "mimeType": "application/pdf" + } + }, + "json": { + "numpages": 1, + "numrender": 1, + "info": { + "PDFFormatVersion": "1.4", + "Language": null, + "EncryptFilterName": null, + "IsLinearized": false, + "IsAcroFormPresent": false, + "IsXFAPresent": false, + "IsCollectionPresent": false, + "IsSignaturesPresent": false, + "Title": "sample", + "Producer": "iLovePDF", + "ModDate": "D:20230210122750Z" + }, + "text": "N8N\nSample PDF\nLorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor\ninvidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et\njusto duo dolores et ea rebum.", + "version": "2.16.105" + } + } + ] + }, "connections": { "When clicking \"Execute Workflow\"": { "main": [ [ { - "node": "Read Binary File", + "node": "Read sample.pdf", "type": "main", "index": 0 } ] ] }, - "Read Binary File": { + "Read sample.pdf": { "main": [ [ { @@ -52,13 +80,5 @@ ] ] } - }, - "active": false, - "settings": {}, - "versionId": "9802b48d-727a-40ef-ad87-d544a9a648a7", - "id": "188", - "meta": { - "instanceId": "104a4d08d8897b8bdeb38aaca515021075e0bd8544c983c2bb8c86e6a8e6081c" - }, - "tags": [] + } } diff --git a/packages/nodes-base/nodes/ReadPdf/test/sample-encrypted.pdf b/packages/nodes-base/nodes/ReadPdf/test/sample-encrypted.pdf new file mode 100644 index 0000000000..52d7fe80e8 Binary files /dev/null and b/packages/nodes-base/nodes/ReadPdf/test/sample-encrypted.pdf differ diff --git a/packages/nodes-base/package.json b/packages/nodes-base/package.json index 3fcae99d34..8513350c64 100644 --- a/packages/nodes-base/package.json +++ b/packages/nodes-base/package.json @@ -770,7 +770,6 @@ "@types/mssql": "^6.0.2", "@types/node-ssh": "^7.0.1", "@types/nodemailer": "^6.4.0", - "@types/pdf-parse": "^1.1.1", "@types/promise-ftp": "^1.3.4", "@types/redis": "^2.8.11", "@types/request-promise-native": "~1.0.15", @@ -826,7 +825,7 @@ "node-ssh": "^12.0.0", "nodemailer": "^6.7.1", "otpauth": "^9.1.1", - "pdf-parse": "^1.1.1", + "pdfjs-dist": "^2.16.105", "pg": "^8.3.0", "pg-promise": "^10.5.8", "pretty-bytes": "^5.6.0", diff --git a/packages/nodes-base/test/nodes/Helpers.ts b/packages/nodes-base/test/nodes/Helpers.ts index b30e6acd34..dc359391dd 100644 --- a/packages/nodes-base/test/nodes/Helpers.ts +++ b/packages/nodes-base/test/nodes/Helpers.ts @@ -321,6 +321,15 @@ export const equalityTest = async (testData: WorkflowTestData, types: INodeTypes const resultNodeData = getResultNodeData(result, testData); resultNodeData.forEach(({ nodeName, resultData }) => { const msg = `Equality failed for "${testData.description}" at node "${nodeName}"`; + resultData.forEach((item) => { + item?.forEach(({ binary }) => { + if (binary) { + // @ts-ignore + delete binary.data.data; + delete binary.data.directory; + } + }); + }); return expect(resultData, msg).toEqual(testData.output.nodeData[nodeName]); }); @@ -345,6 +354,14 @@ export const workflowToTests = (workflowFiles: string[]) => { for (const filePath of workflowFiles) { const description = filePath.replace('.json', ''); const workflowData = readJsonFileSync(filePath); + const testDir = path.join(baseDir, path.dirname(filePath)); + workflowData.nodes.forEach((node) => { + if (node.parameters) { + node.parameters = JSON.parse( + JSON.stringify(node.parameters).replace(/"C:\\\\Test\\\\(.*)"/, `"${testDir}/$1"`), + ); + } + }); if (workflowData.pinData === undefined) { throw new Error('Workflow data does not contain pinData'); } diff --git a/packages/workflow/src/Interfaces.ts b/packages/workflow/src/Interfaces.ts index c71e350294..d60d6ab6a0 100644 --- a/packages/workflow/src/Interfaces.ts +++ b/packages/workflow/src/Interfaces.ts @@ -683,6 +683,7 @@ export interface BinaryHelperFunctions { setBinaryDataBuffer(data: IBinaryData, binaryData: Buffer): Promise; copyBinaryFile(): Promise; binaryToBuffer(body: Buffer | Readable): Promise; + getBinaryPath(binaryDataId: string): string; getBinaryStream(binaryDataId: string, chunkSize?: number): Readable; getBinaryMetadata(binaryDataId: string): Promise; } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c732f0e8a6..b361326851 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -926,9 +926,6 @@ importers: '@testing-library/vue': specifier: ^5.8.3 version: 5.8.3(vue-template-compiler@2.7.14)(vue@2.7.14) - '@types/canvas-confetti': - specifier: ^1.6.0 - version: 1.6.0 '@types/dateformat': specifier: ^3.0.0 version: 3.0.1 @@ -1131,9 +1128,9 @@ importers: otpauth: specifier: ^9.1.1 version: 9.1.1 - pdf-parse: - specifier: ^1.1.1 - version: 1.1.1 + pdfjs-dist: + specifier: ^2.16.105 + version: 2.16.105 pg: specifier: ^8.3.0 version: 8.8.0 @@ -1246,9 +1243,6 @@ importers: '@types/nodemailer': specifier: ^6.4.0 version: 6.4.6 - '@types/pdf-parse': - specifier: ^1.1.1 - version: 1.1.1 '@types/promise-ftp': specifier: ^1.3.4 version: 1.3.4 @@ -7016,10 +7010,6 @@ packages: '@types/connect': 3.4.35 '@types/node': 18.16.16 - /@types/canvas-confetti@1.6.0: - resolution: {integrity: sha512-Yq6rIccwcco0TLD5SMUrIM7Fk7Fe/C0jmNRxJJCLtAF6gebDkPuUjK5EHedxecm69Pi/aA+It39Ux4OHmFhjRw==} - dev: true - /@types/caseless@0.12.2: resolution: {integrity: sha512-6ckxMjBBD8URvjB6J3NcnuAn5Pkl7t3TizAg+xdlzzQGSPSmBcXf8KoIH0ua/i+tio+ZRUHEXp0HEmvaR4kt0w==} dev: true @@ -7443,10 +7433,6 @@ packages: '@types/express': 4.17.14 dev: true - /@types/pdf-parse@1.1.1: - resolution: {integrity: sha512-lDBKAslCwvfK2uvS1Uk+UCpGvw+JRy5vnBFANPKFSY92n/iEnunXi0KVBjPJXhsM4jtdcPnS7tuZ0zjA9x6piQ==} - dev: true - /@types/prettier@2.7.1: resolution: {integrity: sha512-ri0UmynRRvZiiUJdiz38MmIblKK+oH30MztdBVR95dv/Ubw6neWSb8u1XpRb72L4qsZOhz+L+z9JD40SJmfWow==} dev: true @@ -11470,6 +11456,11 @@ packages: dependencies: domelementtype: 2.3.0 + /dommatrix@1.0.3: + resolution: {integrity: sha512-l32Xp/TLgWb8ReqbVJAFIvXmY7go4nTxxlWiAFyhoQw9RKEOHBZNnyGvJWqDVSPmq3Y9HlM4npqF/T6VMOXhww==} + deprecated: dommatrix is no longer maintained. Please use @thednp/dommatrix. + dev: false + /domutils@2.8.0: resolution: {integrity: sha512-w96Cjofp72M5IIhpjgobBimYEfoPjx1Vx0BSX9P30WBdZW2WIKU0T1Bd0kz2eNZ9ikjKgHbEyKx8BB6H1L3h3A==} dependencies: @@ -17046,10 +17037,6 @@ packages: minimatch: 3.1.2 dev: true - /node-ensure@0.0.0: - resolution: {integrity: sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==} - dev: false - /node-fetch-native@1.0.1: resolution: {integrity: sha512-VzW+TAk2wE4X9maiKMlT+GsPU4OMmR1U9CrHSmd3DFLn2IcZ9VJ6M6BBugGfYUnPCLSYxXdZy17M0BEJyhUTwg==} dev: true @@ -17909,14 +17896,16 @@ packages: resolution: {integrity: sha512-KG8UEiEVkR3wGEb4m5yZkVCzigAD+cVEJck2CzYZO37ZGJfctvVptVO192MwrtPhzONn6go8ylnOdMhKqi4nfg==} dev: false - /pdf-parse@1.1.1: - resolution: {integrity: sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==} - engines: {node: '>=6.8.1'} + /pdfjs-dist@2.16.105: + resolution: {integrity: sha512-J4dn41spsAwUxCpEoVf6GVoz908IAA3mYiLmNxg8J9kfRXc2jxpbUepcP0ocp0alVNLFthTAM8DZ1RaHh8sU0A==} + peerDependencies: + worker-loader: ^3.0.8 + peerDependenciesMeta: + worker-loader: + optional: true dependencies: - debug: 3.2.7(supports-color@8.1.1) - node-ensure: 0.0.0 - transitivePeerDependencies: - - supports-color + dommatrix: 1.0.3 + web-streams-polyfill: 3.2.1 dev: false /peek-readable@4.1.0: @@ -22296,6 +22285,11 @@ packages: graceful-fs: 4.2.10 dev: true + /web-streams-polyfill@3.2.1: + resolution: {integrity: sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==} + engines: {node: '>= 8'} + dev: false + /webidl-conversions@3.0.1: resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==}