2024-02-22 00:20:07 -08:00
|
|
|
import { pipeline } from 'stream/promises';
|
|
|
|
import { createWriteStream } from 'fs';
|
2023-11-30 02:59:37 -08:00
|
|
|
import type { IExecuteFunctions, INodeExecutionData } from 'n8n-workflow';
|
2023-12-08 04:42:32 -08:00
|
|
|
import { NodeOperationError, BINARY_ENCODING } from 'n8n-workflow';
|
2023-11-29 03:13:55 -08:00
|
|
|
|
2024-05-24 05:43:17 -07:00
|
|
|
import type { TextSplitter } from '@langchain/textsplitters';
|
2024-03-07 02:36:36 -08:00
|
|
|
import type { Document } from '@langchain/core/documents';
|
2024-05-24 05:43:17 -07:00
|
|
|
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv';
|
|
|
|
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx';
|
2023-11-29 03:13:55 -08:00
|
|
|
import { JSONLoader } from 'langchain/document_loaders/fs/json';
|
2024-05-24 05:43:17 -07:00
|
|
|
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
|
2023-11-29 03:13:55 -08:00
|
|
|
import { TextLoader } from 'langchain/document_loaders/fs/text';
|
2024-05-24 05:43:17 -07:00
|
|
|
import { EPubLoader } from '@langchain/community/document_loaders/fs/epub';
|
2023-11-30 02:59:37 -08:00
|
|
|
import { file as tmpFile, type DirectoryResult } from 'tmp-promise';
|
|
|
|
|
2023-11-29 03:13:55 -08:00
|
|
|
import { getMetadataFiltersValues } from './helpers';
|
|
|
|
|
|
|
|
/**
 * Maps every value of the node's `loader` parameter to the mime types that
 * loader is allowed to handle.
 *
 * Key order is significant: when the selected loader does not match the
 * data's mime type, the first key (in declaration order) whose list contains
 * the mime type is the one suggested to the user.
 */
const SUPPORTED_MIME_TYPES = {
	// Wildcard entry for the automatic selection mode.
	auto: ['*/*'],
	pdfLoader: ['application/pdf'],
	csvLoader: ['text/csv'],
	epubLoader: ['application/epub+zip'],
	docxLoader: ['application/vnd.openxmlformats-officedocument.wordprocessingml.document'],
	// Plain-text variants; all of them are read as text.
	textLoader: ['text/plain', 'text/mdx', 'text/md'],
	jsonLoader: ['application/json'],
};
|
|
|
|
|
|
|
|
/**
 * Turns the binary data attached to n8n items into LangChain `Document`s.
 *
 * For each item the binary property named by `binaryDataKey` is read, a
 * LangChain file loader is chosen from the data's mime type, the content is
 * loaded (and split when a text splitter was supplied), and the metadata
 * returned by `getMetadataFiltersValues` is merged into every document.
 */
export class N8nBinaryLoader {
	private context: IExecuteFunctions;

	private optionsPrefix: string;

	private binaryDataKey: string;

	private textSplitter?: TextSplitter;

	/**
	 * @param context Execution context used to read node parameters and binary data.
	 * @param optionsPrefix Prefix prepended to the loader option parameter names
	 *   (`splitPages`, `column`, `separator`, `pointers`).
	 * @param binaryDataKey Name of the binary property to read from each item.
	 * @param textSplitter Optional splitter applied to the loaded documents.
	 */
	constructor(
		context: IExecuteFunctions,
		optionsPrefix = '',
		binaryDataKey = '',
		textSplitter?: TextSplitter,
	) {
		this.context = context;
		this.textSplitter = textSplitter;
		this.optionsPrefix = optionsPrefix;
		this.binaryDataKey = binaryDataKey;
	}

	/**
	 * Processes every item in order and concatenates the resulting documents.
	 *
	 * @param items Items to process; when omitted an empty array is returned.
	 * @returns All documents produced by `processItem`, in item order.
	 */
	async processAll(items?: INodeExecutionData[]): Promise<Document[]> {
		const docs: Document[] = [];

		if (!items) return [];

		for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
			const processedDocuments = await this.processItem(items[itemIndex], itemIndex);

			docs.push(...processedDocuments);
		}

		return docs;
	}

	/**
	 * Loads the binary data of a single item into documents.
	 *
	 * @param item The item being processed; a falsy item yields an empty array.
	 * @param itemIndex Index of the item, used to resolve node parameters and binary data.
	 * @returns The loaded (and optionally split) documents with the metadata merged in.
	 * @throws NodeOperationError when the mime type is not supported at all, when it
	 *   does not match the explicitly selected loader, or when the temporary EPUB
	 *   file cannot be written.
	 */
	async processItem(item: INodeExecutionData, itemIndex: number): Promise<Document[]> {
		const selectedLoader = this.context.getNodeParameter(
			'loader',
			itemIndex,
			'auto',
		) as keyof typeof SUPPORTED_MIME_TYPES;

		const docs: Document[] = [];
		const metadata = getMetadataFiltersValues(this.context, itemIndex);

		if (!item) return [];

		const binaryData = this.context.helpers.assertBinaryData(itemIndex, this.binaryDataKey);
		const { mimeType } = binaryData;

		// Reject mime types no loader knows about first, so the loader-mismatch
		// error below can always name a concrete loader instead of "undefined".
		if (!Object.values(SUPPORTED_MIME_TYPES).flat().includes(mimeType)) {
			throw new NodeOperationError(this.context.getNode(), `Unsupported mime type: ${mimeType}`);
		}

		// Check if loader matches the mime-type of the data
		if (selectedLoader !== 'auto' && !SUPPORTED_MIME_TYPES[selectedLoader].includes(mimeType)) {
			const neededLoader = Object.keys(SUPPORTED_MIME_TYPES).find((loader) =>
				SUPPORTED_MIME_TYPES[loader as keyof typeof SUPPORTED_MIME_TYPES].includes(mimeType),
			);

			throw new NodeOperationError(
				this.context.getNode(),
				`Mime type doesn't match selected loader. Please select under "Loader Type": ${neededLoader}`,
			);
		}

		// Binary data with an id is fetched as a stream; otherwise the inline
		// encoded payload is decoded. Either way the loaders receive a Blob.
		let fileBlob: Blob;
		if (binaryData.id) {
			const binaryBuffer = await this.context.helpers.binaryToBuffer(
				await this.context.helpers.getBinaryStream(binaryData.id),
			);
			fileBlob = new Blob([binaryBuffer], {
				type: mimeType,
			});
		} else {
			fileBlob = new Blob([Buffer.from(binaryData.data, BINARY_ENCODING)], {
				type: mimeType,
			});
		}

		let loader: PDFLoader | CSVLoader | EPubLoader | DocxLoader | TextLoader | JSONLoader;
		let cleanupTmpFile: DirectoryResult['cleanup'] | undefined = undefined;

		try {
			switch (mimeType) {
				case 'application/pdf': {
					const splitPages = this.context.getNodeParameter(
						`${this.optionsPrefix}splitPages`,
						itemIndex,
						false,
					) as boolean;
					loader = new PDFLoader(fileBlob, {
						splitPages,
					});
					break;
				}
				case 'text/csv': {
					const column = this.context.getNodeParameter(
						`${this.optionsPrefix}column`,
						itemIndex,
						null,
					) as string;
					const separator = this.context.getNodeParameter(
						`${this.optionsPrefix}separator`,
						itemIndex,
						',',
					) as string;

					loader = new CSVLoader(fileBlob, {
						// The parameter default is null; the loader option expects undefined.
						column: column ?? undefined,
						separator,
					});
					break;
				}
				case 'application/epub+zip': {
					// EPubLoader currently does not accept Blobs https://github.com/langchain-ai/langchainjs/issues/1623
					// so the data is written to a temporary file that is removed in
					// the `finally` block below.
					const tmpFileData = await tmpFile({ prefix: 'epub-loader-' });
					cleanupTmpFile = tmpFileData.cleanup;
					try {
						const bufferData = await fileBlob.arrayBuffer();
						await pipeline([new Uint8Array(bufferData)], createWriteStream(tmpFileData.path));
						loader = new EPubLoader(tmpFileData.path);
					} catch (error) {
						throw new NodeOperationError(this.context.getNode(), error as Error);
					}
					break;
				}
				case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': {
					loader = new DocxLoader(fileBlob);
					break;
				}
				case 'text/plain': {
					loader = new TextLoader(fileBlob);
					break;
				}
				case 'application/json': {
					const pointers = this.context.getNodeParameter(
						`${this.optionsPrefix}pointers`,
						itemIndex,
						'',
					) as string;
					const pointersArray = pointers.split(',').map((pointer) => pointer.trim());
					loader = new JSONLoader(fileBlob, pointersArray);
					break;
				}
				default: {
					// Remaining supported mime types (the other text variants) are read as text.
					loader = new TextLoader(fileBlob);
				}
			}

			const loadedDoc = this.textSplitter
				? await this.textSplitter.splitDocuments(await loader.load())
				: await loader.load();

			docs.push(...loadedDoc);
		} finally {
			// Always remove the temporary EPUB file, including when loading or
			// splitting fails, so failed runs do not leave files behind.
			if (cleanupTmpFile) {
				await cleanupTmpFile();
			}
		}

		if (metadata) {
			docs.forEach((document) => {
				document.metadata = {
					...document.metadata,
					...metadata,
				};
			});
		}

		return docs;
	}
}
|