feat(Read PDF Node): Replace pdf-parse with pdfjs, and add support for streaming and encrypted PDFs (#6640)

This commit is contained in:
कारतोफ्फेलस्क्रिप्ट™ 2023-07-18 20:07:29 +02:00 committed by GitHub
parent acda7f269f
commit 0a31b8e2b4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 267 additions and 131 deletions

View file

@ -861,6 +861,10 @@ async function httpRequest(
return result.data;
}
/**
 * Looks up the path associated with a binary data id.
 *
 * Delegates to the shared `BinaryDataManager` instance and returns its answer unchanged.
 *
 * @param binaryDataId Identifier of the stored binary data.
 * @returns The path string reported by the binary data manager.
 */
export function getBinaryPath(binaryDataId: string): string {
	const manager = BinaryDataManager.getInstance();
	return manager.getBinaryPath(binaryDataId);
}
/**
* Returns binary file metadata
*/
@ -2262,6 +2266,7 @@ const getNodeHelperFunctions = ({
const getBinaryHelperFunctions = ({
executionId,
}: IWorkflowExecuteAdditionalData): BinaryHelperFunctions => ({
getBinaryPath,
getBinaryStream,
getBinaryMetadata,
binaryToBuffer,

View file

@ -85,7 +85,6 @@
"@testing-library/jest-dom": "^5.16.5",
"@testing-library/user-event": "^14.4.3",
"@testing-library/vue": "^5.8.3",
"@types/canvas-confetti": "^1.6.0",
"@types/dateformat": "^3.0.0",
"@types/file-saver": "^2.0.1",
"@types/humanize-duration": "^3.27.1",

View file

@ -1,12 +1,32 @@
import type {
IExecuteFunctions,
IDataObject,
INodeExecutionData,
INodeType,
INodeTypeDescription,
import {
BINARY_ENCODING,
type IExecuteFunctions,
type INodeExecutionData,
type INodeType,
type INodeTypeDescription,
} from 'n8n-workflow';
import pdf from 'pdf-parse';
import { getDocument as readPDF, version as pdfJsVersion } from 'pdfjs-dist';
// These types are derived from the `readPDF` signature instead of being imported by name.

/** The document object that the loading task returned by `readPDF` resolves to. */
type Document = Awaited<ReturnType<typeof readPDF>['promise']>;

/** A single page, as resolved by `Document.getPage`. */
type Page = Awaited<ReturnType<Document['getPage']>>;

/** The value resolved by `Page.getTextContent`. */
type TextContent = Awaited<ReturnType<Page['getTextContent']>>;
/**
 * Flattens the items of a text-content object into a single string.
 *
 * Only items that carry a `str` property contribute text; other items are skipped.
 * Items are concatenated in order, and a newline is inserted before an item whose
 * `transform[5]` value (tracked here as the Y position) differs from that of the
 * previous text item.
 *
 * @param textContent Text content of one page, as produced by `Page.getTextContent`.
 * @returns The joined text, with `\n` separating runs of items that share a Y position.
 */
const parseText = (textContent: TextContent) => {
	let lastY: number | undefined;
	const text: string[] = [];
	for (const item of textContent.items) {
		if ('str' in item) {
			const currentY: number = item.transform[5];
			// Check for `undefined` explicitly: a truthiness test would mistake a previous
			// Y position of 0 for "no previous item" and drop the line break after it.
			if (lastY === undefined || lastY === currentY) {
				text.push(item.str);
			} else {
				text.push(`\n${item.str}`);
			}
			lastY = currentY;
		}
	}
	return text.join('');
};
export class ReadPDF implements INodeType {
description: INodeTypeDescription = {
@ -32,6 +52,26 @@ export class ReadPDF implements INodeType {
required: true,
description: 'Name of the binary property from which to read the PDF file',
},
{
displayName: 'Encrypted',
name: 'encrypted',
type: 'boolean',
default: false,
required: true,
},
{
displayName: 'Password',
name: 'password',
type: 'string',
typeOptions: { password: true },
default: '',
description: 'Password to decrypt the PDF file with',
displayOptions: {
show: {
encrypted: [true],
},
},
},
],
};
@ -40,27 +80,50 @@ export class ReadPDF implements INodeType {
const returnData: INodeExecutionData[] = [];
const length = items.length;
let item: INodeExecutionData;
for (let itemIndex = 0; itemIndex < length; itemIndex++) {
try {
item = items[itemIndex];
const binaryPropertyName = this.getNodeParameter('binaryPropertyName', itemIndex);
const binaryData = this.helpers.assertBinaryData(itemIndex, binaryPropertyName);
if (item.binary === undefined) {
item.binary = {};
const params: { password?: string; url?: URL; data?: ArrayBuffer } = {};
if (this.getNodeParameter('encrypted', itemIndex) === true) {
params.password = this.getNodeParameter('password', itemIndex) as string;
}
const binaryDataBuffer = await this.helpers.getBinaryDataBuffer(
itemIndex,
binaryPropertyName,
);
returnData.push({
binary: item.binary,
if (binaryData.id) {
const binaryPath = this.helpers.getBinaryPath(binaryData.id);
params.url = new URL(`file://${binaryPath}`);
} else {
params.data = Buffer.from(binaryData.data, BINARY_ENCODING).buffer;
}
json: (await pdf(binaryDataBuffer)) as unknown as IDataObject,
const document = await readPDF(params).promise;
const { info, metadata } = await document
.getMetadata()
.catch(() => ({ info: null, metadata: null }));
const pages = [];
for (let i = 1; i <= document.numPages; i++) {
const page = await document.getPage(i);
const text = await page.getTextContent().then(parseText);
pages.push(text);
}
returnData.push({
binary: items[itemIndex].binary,
json: {
numpages: document.numPages,
numrender: document.numPages,
info,
metadata: metadata?.getAll(),
text: pages.join('\n\n'),
version: pdfJsVersion,
},
});
} catch (error) {
console.log(error);
if (this.continueOnFail()) {
returnData.push({
json: {

View file

@ -0,0 +1,87 @@
{
"nodes": [
{
"name": "When clicking \"Execute Workflow\"",
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"parameters": {},
"position": [660, 580]
},
{
"name": "Read sample-encrypted.pdf",
"type": "n8n-nodes-base.readBinaryFile",
"typeVersion": 1,
"parameters": {
"filePath": "C:\\Test\\sample-encrypted.pdf"
},
"position": [880, 780]
},
{
"name": "Read PDF (encrypted)",
"type": "n8n-nodes-base.readPDF",
"typeVersion": 1,
"parameters": {
"encrypted": true,
"password": "ReaderPassword"
},
"position": [1100, 780]
}
],
"pinData": {
"Read PDF (encrypted)": [
{
"binary": {
"data": {
"fileExtension": "pdf",
"fileName": "sample-encrypted.pdf",
"fileSize": "18.9 kB",
"mimeType": "application/pdf"
}
},
"json": {
"numpages": 1,
"numrender": 1,
"info": {
"PDFFormatVersion": "1.7",
"Language": null,
"EncryptFilterName": "Standard",
"IsLinearized": false,
"IsAcroFormPresent": false,
"IsXFAPresent": false,
"IsCollectionPresent": false,
"IsSignaturesPresent": false,
"ModDate": "D:20230210122750Z",
"Producer": "iLovePDF",
"Title": "sample"
},
"text": "N8N\nSample PDF\nLorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor\ninvidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et\njusto duo dolores et ea rebum.",
"version": "2.16.105"
}
}
]
},
"connections": {
"When clicking \"Execute Workflow\"": {
"main": [
[
{
"node": "Read sample-encrypted.pdf",
"type": "main",
"index": 0
}
]
]
},
"Read sample-encrypted.pdf": {
"main": [
[
{
"node": "Read PDF (encrypted)",
"type": "main",
"index": 0
}
]
]
}
}
}

View file

@ -1,60 +1,11 @@
/* eslint-disable @typescript-eslint/no-loop-func */
import * as Helpers from '@test/nodes/Helpers';
import type { WorkflowTestData } from '@test/nodes/types';
import { executeWorkflow } from '@test/nodes/ExecuteWorkflow';
import path from 'path';
import { getWorkflowFilenames, initBinaryDataManager, testWorkflows } from '@test/nodes/Helpers';
describe('Test Read PDF Node', () => {
beforeEach(async () => {
await Helpers.initBinaryDataManager();
const workflows = getWorkflowFilenames(__dirname);
beforeAll(async () => {
await initBinaryDataManager();
});
const workflow = Helpers.readJsonFileSync('nodes/ReadPdf/test/ReadPDF.workflow.json');
const node = workflow.nodes.find((n: any) => n.name === 'Read Binary File');
node.parameters.filePath = path.join(__dirname, 'sample.pdf');
const testData: WorkflowTestData = {
description: 'nodes/ReadPdf/test/ReadPDF.workflow.json',
input: {
workflowData: workflow,
},
output: {
nodeData: {
'Read PDF': [
[
{
json: {
numpages: 1,
numrender: 1,
info: {
PDFFormatVersion: '1.4',
IsAcroFormPresent: false,
IsXFAPresent: false,
Title: 'sample',
Producer: 'iLovePDF',
ModDate: 'D:20230210122750Z',
},
metadata: null,
text: '\n\nN8N\nSample PDF\nLorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor\ninvidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et\njusto duo dolores et ea rebum.',
version: '1.10.100',
},
},
],
],
},
},
};
const nodeTypes = Helpers.setup(testData);
test(testData.description, async () => {
const { result } = await executeWorkflow(testData, nodeTypes);
const resultNodeData = Helpers.getResultNodeData(result, testData);
// delete binary data because we test against json only
delete resultNodeData[0].resultData[0]![0].binary;
expect(resultNodeData[0].resultData).toEqual(testData.output.nodeData['Read PDF']);
expect(result.finished).toEqual(true);
});
testWorkflows(workflows);
});

View file

@ -1,47 +1,75 @@
{
"name": "Read PDF node unit test",
"nodes": [
{
"parameters": {},
"id": "0c9db33c-dd15-4088-9d12-b9f3b8f1fa96",
"name": "When clicking \"Execute Workflow\"",
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"position": [960, 540]
},
{
"parameters": {},
"id": "86abdc3b-206d-4b67-a37f-6b67b6bd3bbc",
"name": "Read PDF",
"type": "n8n-nodes-base.readPDF",
"typeVersion": 1,
"position": [1400, 540]
"position": [660, 580]
},
{
"name": "Read sample.pdf",
"type": "n8n-nodes-base.readBinaryFile",
"typeVersion": 1,
"parameters": {
"filePath": "C:\\Test\\sample.pdf"
},
"id": "2f6d241e-44a4-4213-b49a-166201946a89",
"name": "Read Binary File",
"type": "n8n-nodes-base.readBinaryFile",
"position": [880, 580]
},
{
"name": "Read PDF",
"type": "n8n-nodes-base.readPDF",
"typeVersion": 1,
"position": [1180, 540]
"parameters": {},
"position": [1100, 580]
}
],
"pinData": {},
"pinData": {
"Read PDF": [
{
"binary": {
"data": {
"fileExtension": "pdf",
"fileName": "sample.pdf",
"fileSize": "17.8 kB",
"mimeType": "application/pdf"
}
},
"json": {
"numpages": 1,
"numrender": 1,
"info": {
"PDFFormatVersion": "1.4",
"Language": null,
"EncryptFilterName": null,
"IsLinearized": false,
"IsAcroFormPresent": false,
"IsXFAPresent": false,
"IsCollectionPresent": false,
"IsSignaturesPresent": false,
"Title": "sample",
"Producer": "iLovePDF",
"ModDate": "D:20230210122750Z"
},
"text": "N8N\nSample PDF\nLorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor\ninvidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et\njusto duo dolores et ea rebum.",
"version": "2.16.105"
}
}
]
},
"connections": {
"When clicking \"Execute Workflow\"": {
"main": [
[
{
"node": "Read Binary File",
"node": "Read sample.pdf",
"type": "main",
"index": 0
}
]
]
},
"Read Binary File": {
"Read sample.pdf": {
"main": [
[
{
@ -52,13 +80,5 @@
]
]
}
},
"active": false,
"settings": {},
"versionId": "9802b48d-727a-40ef-ad87-d544a9a648a7",
"id": "188",
"meta": {
"instanceId": "104a4d08d8897b8bdeb38aaca515021075e0bd8544c983c2bb8c86e6a8e6081c"
},
"tags": []
}
}

View file

@ -770,7 +770,6 @@
"@types/mssql": "^6.0.2",
"@types/node-ssh": "^7.0.1",
"@types/nodemailer": "^6.4.0",
"@types/pdf-parse": "^1.1.1",
"@types/promise-ftp": "^1.3.4",
"@types/redis": "^2.8.11",
"@types/request-promise-native": "~1.0.15",
@ -826,7 +825,7 @@
"node-ssh": "^12.0.0",
"nodemailer": "^6.7.1",
"otpauth": "^9.1.1",
"pdf-parse": "^1.1.1",
"pdfjs-dist": "^2.16.105",
"pg": "^8.3.0",
"pg-promise": "^10.5.8",
"pretty-bytes": "^5.6.0",

View file

@ -321,6 +321,15 @@ export const equalityTest = async (testData: WorkflowTestData, types: INodeTypes
const resultNodeData = getResultNodeData(result, testData);
resultNodeData.forEach(({ nodeName, resultData }) => {
const msg = `Equality failed for "${testData.description}" at node "${nodeName}"`;
resultData.forEach((item) => {
item?.forEach(({ binary }) => {
if (binary) {
// @ts-ignore
delete binary.data.data;
delete binary.data.directory;
}
});
});
return expect(resultData, msg).toEqual(testData.output.nodeData[nodeName]);
});
@ -345,6 +354,14 @@ export const workflowToTests = (workflowFiles: string[]) => {
for (const filePath of workflowFiles) {
const description = filePath.replace('.json', '');
const workflowData = readJsonFileSync<IWorkflowBase>(filePath);
const testDir = path.join(baseDir, path.dirname(filePath));
workflowData.nodes.forEach((node) => {
if (node.parameters) {
node.parameters = JSON.parse(
JSON.stringify(node.parameters).replace(/"C:\\\\Test\\\\(.*)"/, `"${testDir}/$1"`),
);
}
});
if (workflowData.pinData === undefined) {
throw new Error('Workflow data does not contain pinData');
}

View file

@ -683,6 +683,7 @@ export interface BinaryHelperFunctions {
setBinaryDataBuffer(data: IBinaryData, binaryData: Buffer): Promise<IBinaryData>;
copyBinaryFile(): Promise<never>;
binaryToBuffer(body: Buffer | Readable): Promise<Buffer>;
getBinaryPath(binaryDataId: string): string;
getBinaryStream(binaryDataId: string, chunkSize?: number): Readable;
getBinaryMetadata(binaryDataId: string): Promise<BinaryMetadata>;
}

View file

@ -926,9 +926,6 @@ importers:
'@testing-library/vue':
specifier: ^5.8.3
version: 5.8.3(vue-template-compiler@2.7.14)(vue@2.7.14)
'@types/canvas-confetti':
specifier: ^1.6.0
version: 1.6.0
'@types/dateformat':
specifier: ^3.0.0
version: 3.0.1
@ -1131,9 +1128,9 @@ importers:
otpauth:
specifier: ^9.1.1
version: 9.1.1
pdf-parse:
specifier: ^1.1.1
version: 1.1.1
pdfjs-dist:
specifier: ^2.16.105
version: 2.16.105
pg:
specifier: ^8.3.0
version: 8.8.0
@ -1246,9 +1243,6 @@ importers:
'@types/nodemailer':
specifier: ^6.4.0
version: 6.4.6
'@types/pdf-parse':
specifier: ^1.1.1
version: 1.1.1
'@types/promise-ftp':
specifier: ^1.3.4
version: 1.3.4
@ -7016,10 +7010,6 @@ packages:
'@types/connect': 3.4.35
'@types/node': 18.16.16
/@types/canvas-confetti@1.6.0:
resolution: {integrity: sha512-Yq6rIccwcco0TLD5SMUrIM7Fk7Fe/C0jmNRxJJCLtAF6gebDkPuUjK5EHedxecm69Pi/aA+It39Ux4OHmFhjRw==}
dev: true
/@types/caseless@0.12.2:
resolution: {integrity: sha512-6ckxMjBBD8URvjB6J3NcnuAn5Pkl7t3TizAg+xdlzzQGSPSmBcXf8KoIH0ua/i+tio+ZRUHEXp0HEmvaR4kt0w==}
dev: true
@ -7443,10 +7433,6 @@ packages:
'@types/express': 4.17.14
dev: true
/@types/pdf-parse@1.1.1:
resolution: {integrity: sha512-lDBKAslCwvfK2uvS1Uk+UCpGvw+JRy5vnBFANPKFSY92n/iEnunXi0KVBjPJXhsM4jtdcPnS7tuZ0zjA9x6piQ==}
dev: true
/@types/prettier@2.7.1:
resolution: {integrity: sha512-ri0UmynRRvZiiUJdiz38MmIblKK+oH30MztdBVR95dv/Ubw6neWSb8u1XpRb72L4qsZOhz+L+z9JD40SJmfWow==}
dev: true
@ -11470,6 +11456,11 @@ packages:
dependencies:
domelementtype: 2.3.0
/dommatrix@1.0.3:
resolution: {integrity: sha512-l32Xp/TLgWb8ReqbVJAFIvXmY7go4nTxxlWiAFyhoQw9RKEOHBZNnyGvJWqDVSPmq3Y9HlM4npqF/T6VMOXhww==}
deprecated: dommatrix is no longer maintained. Please use @thednp/dommatrix.
dev: false
/domutils@2.8.0:
resolution: {integrity: sha512-w96Cjofp72M5IIhpjgobBimYEfoPjx1Vx0BSX9P30WBdZW2WIKU0T1Bd0kz2eNZ9ikjKgHbEyKx8BB6H1L3h3A==}
dependencies:
@ -17046,10 +17037,6 @@ packages:
minimatch: 3.1.2
dev: true
/node-ensure@0.0.0:
resolution: {integrity: sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==}
dev: false
/node-fetch-native@1.0.1:
resolution: {integrity: sha512-VzW+TAk2wE4X9maiKMlT+GsPU4OMmR1U9CrHSmd3DFLn2IcZ9VJ6M6BBugGfYUnPCLSYxXdZy17M0BEJyhUTwg==}
dev: true
@ -17909,14 +17896,16 @@ packages:
resolution: {integrity: sha512-KG8UEiEVkR3wGEb4m5yZkVCzigAD+cVEJck2CzYZO37ZGJfctvVptVO192MwrtPhzONn6go8ylnOdMhKqi4nfg==}
dev: false
/pdf-parse@1.1.1:
resolution: {integrity: sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==}
engines: {node: '>=6.8.1'}
/pdfjs-dist@2.16.105:
resolution: {integrity: sha512-J4dn41spsAwUxCpEoVf6GVoz908IAA3mYiLmNxg8J9kfRXc2jxpbUepcP0ocp0alVNLFthTAM8DZ1RaHh8sU0A==}
peerDependencies:
worker-loader: ^3.0.8
peerDependenciesMeta:
worker-loader:
optional: true
dependencies:
debug: 3.2.7(supports-color@8.1.1)
node-ensure: 0.0.0
transitivePeerDependencies:
- supports-color
dommatrix: 1.0.3
web-streams-polyfill: 3.2.1
dev: false
/peek-readable@4.1.0:
@ -22296,6 +22285,11 @@ packages:
graceful-fs: 4.2.10
dev: true
/web-streams-polyfill@3.2.1:
resolution: {integrity: sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==}
engines: {node: '>= 8'}
dev: false
/webidl-conversions@3.0.1:
resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==}