diff --git a/packages/nodes-base/nodes/Html/Html.node.ts b/packages/nodes-base/nodes/Html/Html.node.ts index f83ebb9318..0a8b5ba33d 100644 --- a/packages/nodes-base/nodes/Html/Html.node.ts +++ b/packages/nodes-base/nodes/Html/Html.node.ts @@ -95,6 +95,20 @@ const extractionValuesCollection: INodeProperties = { placeholder: 'class', description: 'The name of the attribute to return the value off', }, + { + displayName: 'Skip Selectors', + name: 'skipSelectors', + type: 'string', + displayOptions: { + show: { + returnValue: ['text'], + '@version': [{ _cnd: { gt: 1.1 } }], + }, + }, + default: '', + placeholder: 'e.g. img, .className, #ItemId', + description: 'Comma-separated list of selectors to skip in the text extraction', + }, { displayName: 'Return Array', name: 'returnArray', @@ -114,7 +128,7 @@ export class Html implements INodeType { name: 'html', icon: 'file:html.svg', group: ['transform'], - version: [1, 1.1], + version: [1, 1.1, 1.2], subtitle: '={{ $parameter["operation"] }}', description: 'Work with HTML', defaults: { @@ -277,6 +291,14 @@ export class Html implements INodeType { description: 'Whether to remove automatically all spaces and newlines from the beginning and end of the values', }, + { + displayName: 'Clean Up Text', + name: 'cleanUpText', + type: 'boolean', + default: true, + description: + 'Whether to remove remove leading and trailing whitespaces, line breaks (newlines) and condense multiple consecutive whitespaces into a single space', + }, ], }, // ---------------------------------- @@ -548,14 +570,19 @@ export class Html implements INodeType { // An array should be returned so iterate over one // value at a time newItem.json[valueData.key] = []; - htmlElement.each((i, el) => { + htmlElement.each((_, el) => { (newItem.json[valueData.key] as Array).push( - getValue($(el), valueData, options), + getValue($(el), valueData, options, nodeVersion), ); }); } else { // One single value should be returned - newItem.json[valueData.key] = 
getValue(htmlElement, valueData, options); + newItem.json[valueData.key] = getValue( + htmlElement, + valueData, + options, + nodeVersion, + ); } } returnData.push(newItem); diff --git a/packages/nodes-base/nodes/Html/test/Html.node.test.ts b/packages/nodes-base/nodes/Html/test/Html.node.test.ts new file mode 100644 index 0000000000..9fc798c198 --- /dev/null +++ b/packages/nodes-base/nodes/Html/test/Html.node.test.ts @@ -0,0 +1,5 @@ +import { testWorkflows, getWorkflowFilenames } from '@test/nodes/Helpers'; + +const workflows = getWorkflowFilenames(__dirname); + +describe('Test Html Node > extractHtmlContent', () => testWorkflows(workflows)); diff --git a/packages/nodes-base/nodes/Html/test/extractHtmlContent.workflow.json b/packages/nodes-base/nodes/Html/test/extractHtmlContent.workflow.json new file mode 100644 index 0000000000..e027d29e07 --- /dev/null +++ b/packages/nodes-base/nodes/Html/test/extractHtmlContent.workflow.json @@ -0,0 +1,469 @@ +{ + "name": "html extract fix", + "nodes": [ + { + "parameters": {}, + "id": "b421815f-bbeb-480d-a759-6a0360a050b6", + "name": "When clicking \"Execute Workflow\"", + "type": "n8n-nodes-base.manualTrigger", + "typeVersion": 1, + "position": [ + 480, + 780 + ] + }, + { + "parameters": { + "operation": "extractHtmlContent", + "extractionValues": { + "values": [ + { + "key": "data", + "cssSelector": "html" + } + ] + }, + "options": { + "cleanUpText": true + } + }, + "id": "73ed18ec-3a26-4300-b917-240faa810a33", + "name": "HTML", + "type": "n8n-nodes-base.html", + "typeVersion": 1.2, + "position": [ + 1280, + 260 + ] + }, + { + "parameters": { + "operation": "extractHtmlContent", + "extractionValues": { + "values": [ + { + "key": "data", + "cssSelector": "html", + "skipSelectors": "img, a" + } + ] + }, + "options": {} + }, + "id": "d8eaf4c4-be91-43ba-b5ff-efcc7ece60b0", + "name": "HTML2", + "type": "n8n-nodes-base.html", + "typeVersion": 1.2, + "position": [ + 1280, + 600 + ] + }, + { + "parameters": { + "operation": 
"extractHtmlContent", + "extractionValues": { + "values": [ + { + "key": "data", + "cssSelector": "p", + "returnArray": true + } + ] + }, + "options": {} + }, + "id": "145a5168-69fd-49bd-a2a0-07841854937c", + "name": "HTML3", + "type": "n8n-nodes-base.html", + "typeVersion": 1.2, + "position": [ + 1280, + 760 + ] + }, + { + "parameters": { + "operation": "extractHtmlContent", + "extractionValues": { + "values": [ + { + "key": "data", + "cssSelector": "=html" + } + ] + }, + "options": { + "trimValues": true + } + }, + "id": "7a370ce9-e4c4-46e0-89a4-881a6c8a7019", + "name": "HTML1", + "type": "n8n-nodes-base.html", + "typeVersion": 1.2, + "position": [ + 1280, + 420 + ] + }, + { + "parameters": { + "operation": "extractHtmlContent", + "extractionValues": { + "values": [ + { + "key": "data", + "cssSelector": "div", + "returnValue": "attribute" + } + ] + }, + "options": {} + }, + "id": "64c9005d-f9fe-457e-8e24-f1c59e76aeae", + "name": "HTML4", + "type": "n8n-nodes-base.html", + "typeVersion": 1.2, + "position": [ + 1280, + 940 + ] + }, + { + "parameters": { + "operation": "extractHtmlContent", + "extractionValues": { + "values": [ + { + "key": "data", + "cssSelector": "body", + "returnValue": "html" + } + ] + }, + "options": {} + }, + "id": "eef6b477-2c28-4c20-884f-93bbd48f9d0d", + "name": "HTML5", + "type": "n8n-nodes-base.html", + "typeVersion": 1.2, + "position": [ + 1280, + 1120 + ] + }, + { + "parameters": { + "operation": "extractHtmlContent", + "extractionValues": { + "values": [ + { + "key": "data", + "cssSelector": "#text-id", + "returnValue": "value" + } + ] + }, + "options": {} + }, + "id": "a548e5e3-0dcd-4f52-a581-bd046ef325b3", + "name": "HTML6", + "type": "n8n-nodes-base.html", + "typeVersion": 1.2, + "position": [ + 1280, + 1280 + ] + }, + { + "parameters": { + "jsCode": "return {\n data: `\n\n\tMy Page\t\n\n\n\t

My Page

\n\t

Hello World

\n\t
\n\t\t

Another paragraph\\n

\n\t\t

Yet \\r\\n\\r\\n\\t\\t\\t\\t\\t\\t\\another paragraph\\n

\n\t\t

And\\one more\\n

\n\t
\n\t\"n8n.io\n\tn8n.io\n \n\n`\n};" + }, + "id": "ed46f03d-6cde-4225-beab-fdbe82bf095f", + "name": "Code", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [ + 780, + 740 + ] + }, + { + "parameters": {}, + "id": "9d4c07df-3348-4b0e-b144-dfb038bddb99", + "name": "No Operation, do nothing", + "type": "n8n-nodes-base.noOp", + "typeVersion": 1, + "position": [ + 1500, + 260 + ] + }, + { + "parameters": {}, + "id": "0dd56421-7e3a-4908-a933-c4e09de6b7d5", + "name": "No Operation, do nothing1", + "type": "n8n-nodes-base.noOp", + "typeVersion": 1, + "position": [ + 1580, + 620 + ] + }, + { + "parameters": {}, + "id": "15c21267-b4b7-4805-ad40-32060400fcef", + "name": "No Operation, do nothing2", + "type": "n8n-nodes-base.noOp", + "typeVersion": 1, + "position": [ + 1600, + 780 + ] + }, + { + "parameters": {}, + "id": "d6fd1d78-e24b-4237-bd66-620130f0e5fc", + "name": "No Operation, do nothing3", + "type": "n8n-nodes-base.noOp", + "typeVersion": 1, + "position": [ + 1640, + 1140 + ] + }, + { + "parameters": {}, + "id": "f8b7457b-7b46-4af6-958c-ad770f29e587", + "name": "No Operation, do nothing4", + "type": "n8n-nodes-base.noOp", + "typeVersion": 1, + "position": [ + 1560, + 960 + ] + }, + { + "parameters": {}, + "id": "f509b012-78ea-4e14-9ee4-ebdd562efe3e", + "name": "No Operation, do nothing5", + "type": "n8n-nodes-base.noOp", + "typeVersion": 1, + "position": [ + 1560, + 420 + ] + }, + { + "parameters": {}, + "id": "173f20fb-80aa-42d1-97b1-fc7a751fbedd", + "name": "No Operation, do nothing6", + "type": "n8n-nodes-base.noOp", + "typeVersion": 1, + "position": [ + 1580, + 1300 + ] + } + ], + "pinData": { + "No Operation, do nothing4": [ + { + "json": { + "data": { + "class": "content" + } + } + } + ], + "No Operation, do nothing": [ + { + "json": { + "data": "MY PAGEHello WorldAnother paragraphYet another paragraphAndone moren8n.io logo [https://n8n.io/n8n-logo.png] n8n.io [https://n8n.io]" + } + } + ], + "No Operation, do nothing5": [ + { + "json": { + 
"data": "MY PAGE\n\nHello World\n\nAnother paragraph\n\nYet another paragraph\n\nAndone more\n\nn8n.io logo [https://n8n.io/n8n-logo.png] n8n.io [https://n8n.io]" + } + } + ], + "No Operation, do nothing1": [ + { + "json": { + "data": "MY PAGE\n\nHello World\n\nAnother paragraph\n\nYet another paragraph\n\nAndone more" + } + } + ], + "No Operation, do nothing2": [ + { + "json": { + "data": [ + "Hello World", + "Another paragraph", + "Yet another paragraph", + "Andone more" + ] + } + } + ], + "No Operation, do nothing3": [ + { + "json": { + "data": "\n\t

My Page

\n\t

Hello World

\n\t
\n\t\t

Another paragraph\n

\n\t\t

Yet \n\n\t\t\t\t\t\tanother paragraph\n

\n\t\t

Andone more\n

\n\t
\n\t\"n8n.io\n\tn8n.io\n \n\n" + } + } + ], + "No Operation, do nothing6": [ + { + "json": { + "data": "n8n" + } + } + ] + }, + "connections": { + "When clicking \"Execute Workflow\"": { + "main": [ + [ + { + "node": "Code", + "type": "main", + "index": 0 + } + ] + ] + }, + "Code": { + "main": [ + [ + { + "node": "HTML", + "type": "main", + "index": 0 + }, + { + "node": "HTML1", + "type": "main", + "index": 0 + }, + { + "node": "HTML2", + "type": "main", + "index": 0 + }, + { + "node": "HTML3", + "type": "main", + "index": 0 + }, + { + "node": "HTML4", + "type": "main", + "index": 0 + }, + { + "node": "HTML5", + "type": "main", + "index": 0 + }, + { + "node": "HTML6", + "type": "main", + "index": 0 + } + ] + ] + }, + "HTML": { + "main": [ + [ + { + "node": "No Operation, do nothing", + "type": "main", + "index": 0 + } + ] + ] + }, + "HTML6": { + "main": [ + [ + { + "node": "No Operation, do nothing6", + "type": "main", + "index": 0 + } + ] + ] + }, + "HTML5": { + "main": [ + [ + { + "node": "No Operation, do nothing3", + "type": "main", + "index": 0 + } + ] + ] + }, + "HTML4": { + "main": [ + [ + { + "node": "No Operation, do nothing4", + "type": "main", + "index": 0 + } + ] + ] + }, + "HTML3": { + "main": [ + [ + { + "node": "No Operation, do nothing2", + "type": "main", + "index": 0 + } + ] + ] + }, + "HTML2": { + "main": [ + [ + { + "node": "No Operation, do nothing1", + "type": "main", + "index": 0 + } + ] + ] + }, + "HTML1": { + "main": [ + [ + { + "node": "No Operation, do nothing5", + "type": "main", + "index": 0 + } + ] + ] + } + }, + "active": false, + "settings": { + "executionOrder": "v1" + }, + "versionId": "170b087f-19bf-4cbd-90cf-d684fb112034", + "meta": { + "templateCredsSetupCompleted": true, + "instanceId": "be251a83c052a9862eeac953816fbb1464f89dfbf79d7ac490a8e336a8cc8bfd" + }, + "id": "vqwcz5PIBQmAw4SZ", + "tags": [] +} diff --git a/packages/nodes-base/nodes/Html/types.ts b/packages/nodes-base/nodes/Html/types.ts index bbe5ab68ac..251720845c 
import type { IDataObject } from 'n8n-workflow';
import type { IValueData, Cheerio } from './types';
import { convert } from 'html-to-text';

/**
 * Builds the html-to-text options for the comma-separated `skipSelectors` setting.
 *
 * Every non-blank entry becomes a selector definition with the `skip` format.
 * Blank entries (for example from "img, , a" or a trailing comma) are dropped
 * so that no empty selector string is handed to html-to-text.
 *
 * @param skipSelectors Comma-separated list of selectors, may be empty or undefined
 * @returns The options object, or `undefined` when there is nothing to skip so
 * that `convert` runs with its defaults
 */
function buildSkipOptions(skipSelectors?: string) {
	if (!skipSelectors) return undefined;

	const selectors = skipSelectors
		.split(',')
		.map((entry) => entry.trim())
		.filter((entry) => entry !== '')
		.map((selector) => ({ selector, format: 'skip' }));

	return selectors.length > 0 ? { selectors } : undefined;
}

// The extraction functions, keyed by the `returnValue` of an extraction value
const extractFunctions: {
	[key: string]: ($: Cheerio, valueData: IValueData, nodeVersion: number) => string | undefined;
} = {
	attribute: ($: Cheerio, valueData: IValueData): string | undefined =>
		$.attr(valueData.attribute!),
	html: ($: Cheerio, _valueData: IValueData): string | undefined => $.html() || undefined,
	text: ($: Cheerio, valueData: IValueData, nodeVersion: number): string | undefined => {
		// Node versions up to 1.1 keep the plain cheerio text extraction
		if (nodeVersion <= 1.1) return $.text() || undefined;

		// Newer versions convert the selection's HTML to text with html-to-text,
		// optionally skipping the elements matched by `skipSelectors`
		const html = $.html() || '';
		return convert(html, buildSkipOptions(valueData.skipSelectors));
	},
	value: ($: Cheerio, _valueData: IValueData): string | undefined => $.val(),
};

/**
 * Simple helper function which extracts a value and applies the options.
 *
 * @param $ The cheerio selection to read from
 * @param valueData Extraction settings; `returnValue` selects the extraction function
 * @param options Node options; `trimValues` and `cleanUpText` are read here
 * @param nodeVersion Version of the node, forwarded to the extraction function
 * @returns The extracted string, or `undefined` when the extraction yields nothing
 */
export function getValue(
	$: Cheerio,
	valueData: IValueData,
	options: IDataObject,
	nodeVersion: number,
) {
	let value = extractFunctions[valueData.returnValue]($, valueData, nodeVersion);

	// Nothing was extracted, so there is nothing to post-process
	if (value === undefined) {
		return value;
	}

	if (options.trimValues) {
		value = value.trim();
	}

	if (options.cleanUpText) {
		value = value
			// strip leading and trailing whitespace
			.replace(/^\s+|\s+$/g, '')
			// remove line breaks; no space is inserted, so text on adjacent lines is joined
			.replace(/(\r\n|\n|\r)/gm, '')
			// condense every remaining run of whitespace into a single space
			.replace(/\s+/g, ' ');
	}

	return value;
}
1.25.0 + '@types/html-to-text': + specifier: ^9.0.1 + version: 9.0.4 '@types/imap-simple': specifier: ^4.2.0 version: 4.2.5