mirror of
https://github.com/n8n-io/n8n.git
synced 2024-11-15 09:04:07 -08:00
87def60979
Github issue / Community forum post (link here to close automatically): https://community.n8n.io/t/langchain-memory-chat/23733 --------- Signed-off-by: Oleg Ivaniv <me@olegivaniv.com> Co-authored-by: Oleg Ivaniv <me@olegivaniv.com> Co-authored-by: Val <68596159+valya@users.noreply.github.com> Co-authored-by: Alex Grozav <alex@grozav.com> Co-authored-by: कारतोफ्फेलस्क्रिप्ट™ <aditya@netroy.in> Co-authored-by: Deborah <deborah@starfallprojects.co.uk> Co-authored-by: Jesper Bylund <mail@jesperbylund.com> Co-authored-by: Jon <jonathan.bennetts@gmail.com> Co-authored-by: Michael Kret <88898367+michael-radency@users.noreply.github.com> Co-authored-by: Giulio Andreini <andreini@netseven.it> Co-authored-by: Mason Geloso <Mason.geloso@gmail.com> Co-authored-by: Mason Geloso <hone@Masons-Mac-mini.local> Co-authored-by: Mutasem Aldmour <mutasem@n8n.io>
83 lines
2.4 KiB
TypeScript
83 lines
2.4 KiB
TypeScript
// Modified version of https://github.com/hwchase17/langchainjs/blob/main/langchain/src/document_loaders/fs/epub.ts
|
|
// to support loading of EPUB files from a Buffer
|
|
import { parseEpub } from '@gxl/epub-parser';
|
|
import { BaseDocumentLoader } from 'langchain/document_loaders/base';
|
|
import { Document } from 'langchain/document';
|
|
import { htmlToText } from 'html-to-text';
|
|
/**
|
|
* A class that extends the `BaseDocumentLoader` class. It represents a
|
|
* document loader that loads documents from EPUB files.
|
|
*/
|
|
export class N8nEPubLoader extends BaseDocumentLoader {
	/** When true, `load()` emits one `Document` per chapter; otherwise a single merged one. */
	private readonly splitChapters: boolean;

	/**
	 * @param file Raw bytes of the EPUB file.
	 * @param options Loader options.
	 * @param options.splitChapters Whether to return one document per chapter
	 * (default `true`) or a single document with all chapters joined.
	 */
	constructor(
		public file: Buffer,
		{ splitChapters = true } = {},
	) {
		super();
		this.splitChapters = splitChapters;
	}

	/**
	 * Converts every usable section of a parsed EPUB into plain text.
	 *
	 * Sections without an `id` or without an `htmlString` are skipped. The
	 * section `id` is reported as the `chapter` metadata entry.
	 *
	 * @param epub The (still pending) result of `parseEpub`.
	 * @returns One `{ pageContent, metadata }` entry per usable section, in
	 * the order the sections appear in `sections`.
	 */
	protected async parse(
		epub: ReturnType<typeof parseEpub>,
	): Promise<Array<{ pageContent: string; metadata?: object }>> {
		// The promise is awaited here (rather than by the caller) because
		// epub-parser does not export a type for the resolved value of parseEpub.
		const parsed = await epub;

		const chapters: Array<{ pageContent: string; metadata?: object }> = [];

		// Everything below is synchronous, so a plain loop is enough; skipping
		// unusable sections up front avoids placeholder values and type casts.
		for (const section of parsed.sections ?? []) {
			if (!section.id) continue;

			const html = section.htmlString;
			if (!html) continue;

			chapters.push({
				pageContent: htmlToText(html),
				metadata: { chapter: section.id },
			});
		}

		return chapters;
	}

	/**
	 * Parses the EPUB held in `file` and wraps the extracted text in
	 * `Document` instances.
	 *
	 * @returns With `splitChapters` enabled, one `Document` per chapter
	 * carrying that chapter's metadata. Otherwise a single `Document` whose
	 * content is all chapter texts joined by a blank line, without chapter
	 * metadata.
	 */
	public async load(): Promise<Document[]> {
		const epub = parseEpub(this.file, { type: 'buffer' });
		const chapters = await this.parse(epub);

		if (this.splitChapters) {
			return chapters.map(
				({ pageContent, metadata }) =>
					new Document({
						pageContent,
						// Copy so the Document does not share the object created by parse().
						metadata: { ...metadata },
					}),
			);
		}

		const mergedContent = chapters.map(({ pageContent }) => pageContent).join('\n\n');
		return [new Document({ pageContent: mergedContent })];
	}
}
|