feat(HTML Extract Node): Better text extraction, option to specify selectors to skip, option to clean up text data (#8586)

This commit is contained in:
Michael Kret 2024-02-12 12:52:51 +02:00 committed by GitHub
parent 510bf8905d
commit 32281d12d7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 555 additions and 12 deletions

View file

@ -95,6 +95,20 @@ const extractionValuesCollection: INodeProperties = {
placeholder: 'class',
description: 'The name of the attribute to return the value off',
},
// Optional per-value setting listing the selectors to leave out of the text extraction.
// Per displayOptions it is shown only when "returnValue" is 'text' and the node
// version is greater than 1.1.
{
displayName: 'Skip Selectors',
name: 'skipSelectors',
type: 'string',
displayOptions: {
show: {
returnValue: ['text'],
'@version': [{ _cnd: { gt: 1.1 } }],
},
},
default: '',
placeholder: 'e.g. img, .className, #ItemId',
description: 'Comma-separated list of selectors to skip in the text extraction',
},
{
displayName: 'Return Array',
name: 'returnArray',
@ -114,7 +128,7 @@ export class Html implements INodeType {
name: 'html',
icon: 'file:html.svg',
group: ['transform'],
version: [1, 1.1],
version: [1, 1.1, 1.2],
subtitle: '={{ $parameter["operation"] }}',
description: 'Work with HTML',
defaults: {
@ -277,6 +291,14 @@ export class Html implements INodeType {
description:
'Whether to remove automatically all spaces and newlines from the beginning and end of the values',
},
// Boolean option controlling the whitespace/line-break clean-up of extracted values.
{
displayName: 'Clean Up Text',
name: 'cleanUpText',
type: 'boolean',
default: true,
description:
'Whether to remove leading and trailing whitespaces, line breaks (newlines) and condense multiple consecutive whitespaces into a single space',
},
],
},
// ----------------------------------
@ -548,14 +570,19 @@ export class Html implements INodeType {
// An array should be returned so iterate over one
// value at a time
newItem.json[valueData.key] = [];
htmlElement.each((i, el) => {
htmlElement.each((_, el) => {
(newItem.json[valueData.key] as Array<string | undefined>).push(
getValue($(el), valueData, options),
getValue($(el), valueData, options, nodeVersion),
);
});
} else {
// One single value should be returned
newItem.json[valueData.key] = getValue(htmlElement, valueData, options);
newItem.json[valueData.key] = getValue(
htmlElement,
valueData,
options,
nodeVersion,
);
}
}
returnData.push(newItem);

View file

@ -0,0 +1,5 @@
import { testWorkflows, getWorkflowFilenames } from '@test/nodes/Helpers';
// Workflow fixture file names, looked up by the shared test helper from this directory.
const workflowFiles = getWorkflowFilenames(__dirname);

// Hand the collected fixtures to the shared workflow test runner inside one suite.
describe('Test Html Node > extractHtmlContent', () => {
	return testWorkflows(workflowFiles);
});

View file

@ -0,0 +1,469 @@
{
"name": "html extract fix",
"nodes": [
{
"parameters": {},
"id": "b421815f-bbeb-480d-a759-6a0360a050b6",
"name": "When clicking \"Execute Workflow\"",
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"position": [
480,
780
]
},
{
"parameters": {
"operation": "extractHtmlContent",
"extractionValues": {
"values": [
{
"key": "data",
"cssSelector": "html"
}
]
},
"options": {
"cleanUpText": true
}
},
"id": "73ed18ec-3a26-4300-b917-240faa810a33",
"name": "HTML",
"type": "n8n-nodes-base.html",
"typeVersion": 1.2,
"position": [
1280,
260
]
},
{
"parameters": {
"operation": "extractHtmlContent",
"extractionValues": {
"values": [
{
"key": "data",
"cssSelector": "html",
"skipSelectors": "img, a"
}
]
},
"options": {}
},
"id": "d8eaf4c4-be91-43ba-b5ff-efcc7ece60b0",
"name": "HTML2",
"type": "n8n-nodes-base.html",
"typeVersion": 1.2,
"position": [
1280,
600
]
},
{
"parameters": {
"operation": "extractHtmlContent",
"extractionValues": {
"values": [
{
"key": "data",
"cssSelector": "p",
"returnArray": true
}
]
},
"options": {}
},
"id": "145a5168-69fd-49bd-a2a0-07841854937c",
"name": "HTML3",
"type": "n8n-nodes-base.html",
"typeVersion": 1.2,
"position": [
1280,
760
]
},
{
"parameters": {
"operation": "extractHtmlContent",
"extractionValues": {
"values": [
{
"key": "data",
"cssSelector": "=html"
}
]
},
"options": {
"trimValues": true
}
},
"id": "7a370ce9-e4c4-46e0-89a4-881a6c8a7019",
"name": "HTML1",
"type": "n8n-nodes-base.html",
"typeVersion": 1.2,
"position": [
1280,
420
]
},
{
"parameters": {
"operation": "extractHtmlContent",
"extractionValues": {
"values": [
{
"key": "data",
"cssSelector": "div",
"returnValue": "attribute"
}
]
},
"options": {}
},
"id": "64c9005d-f9fe-457e-8e24-f1c59e76aeae",
"name": "HTML4",
"type": "n8n-nodes-base.html",
"typeVersion": 1.2,
"position": [
1280,
940
]
},
{
"parameters": {
"operation": "extractHtmlContent",
"extractionValues": {
"values": [
{
"key": "data",
"cssSelector": "body",
"returnValue": "html"
}
]
},
"options": {}
},
"id": "eef6b477-2c28-4c20-884f-93bbd48f9d0d",
"name": "HTML5",
"type": "n8n-nodes-base.html",
"typeVersion": 1.2,
"position": [
1280,
1120
]
},
{
"parameters": {
"operation": "extractHtmlContent",
"extractionValues": {
"values": [
{
"key": "data",
"cssSelector": "#text-id",
"returnValue": "value"
}
]
},
"options": {}
},
"id": "a548e5e3-0dcd-4f52-a581-bd046ef325b3",
"name": "HTML6",
"type": "n8n-nodes-base.html",
"typeVersion": 1.2,
"position": [
1280,
1280
]
},
{
"parameters": {
"jsCode": "return {\n data: `<html>\n<head>\n\t<title>My Page</title>\t\n</head>\n<body>\n\t<h1>My Page</h1>\n\t<p>Hello World</p>\n\t<div class=\"content\">\n\t\t<p>Another paragraph\\n</p>\n\t\t<p>Yet \\r\\n\\r\\n\\t\\t\\t\\t\\t\\t\\another paragraph\\n</p>\n\t\t<p>And\\one more\\n</p>\n\t</div>\n\t<img src=\"https://n8n.io/n8n-logo.png\" alt=\"n8n.io logo\" />\n\t<a href=\"https://n8n.io\">n8n.io</a>\n <input id=\"text-id\" type=\"text\" value=\"n8n\" />\n</body>\n</html>`\n};"
},
"id": "ed46f03d-6cde-4225-beab-fdbe82bf095f",
"name": "Code",
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
780,
740
]
},
{
"parameters": {},
"id": "9d4c07df-3348-4b0e-b144-dfb038bddb99",
"name": "No Operation, do nothing",
"type": "n8n-nodes-base.noOp",
"typeVersion": 1,
"position": [
1500,
260
]
},
{
"parameters": {},
"id": "0dd56421-7e3a-4908-a933-c4e09de6b7d5",
"name": "No Operation, do nothing1",
"type": "n8n-nodes-base.noOp",
"typeVersion": 1,
"position": [
1580,
620
]
},
{
"parameters": {},
"id": "15c21267-b4b7-4805-ad40-32060400fcef",
"name": "No Operation, do nothing2",
"type": "n8n-nodes-base.noOp",
"typeVersion": 1,
"position": [
1600,
780
]
},
{
"parameters": {},
"id": "d6fd1d78-e24b-4237-bd66-620130f0e5fc",
"name": "No Operation, do nothing3",
"type": "n8n-nodes-base.noOp",
"typeVersion": 1,
"position": [
1640,
1140
]
},
{
"parameters": {},
"id": "f8b7457b-7b46-4af6-958c-ad770f29e587",
"name": "No Operation, do nothing4",
"type": "n8n-nodes-base.noOp",
"typeVersion": 1,
"position": [
1560,
960
]
},
{
"parameters": {},
"id": "f509b012-78ea-4e14-9ee4-ebdd562efe3e",
"name": "No Operation, do nothing5",
"type": "n8n-nodes-base.noOp",
"typeVersion": 1,
"position": [
1560,
420
]
},
{
"parameters": {},
"id": "173f20fb-80aa-42d1-97b1-fc7a751fbedd",
"name": "No Operation, do nothing6",
"type": "n8n-nodes-base.noOp",
"typeVersion": 1,
"position": [
1580,
1300
]
}
],
"pinData": {
"No Operation, do nothing4": [
{
"json": {
"data": {
"class": "content"
}
}
}
],
"No Operation, do nothing": [
{
"json": {
"data": "MY PAGEHello WorldAnother paragraphYet another paragraphAndone moren8n.io logo [https://n8n.io/n8n-logo.png] n8n.io [https://n8n.io]"
}
}
],
"No Operation, do nothing5": [
{
"json": {
"data": "MY PAGE\n\nHello World\n\nAnother paragraph\n\nYet another paragraph\n\nAndone more\n\nn8n.io logo [https://n8n.io/n8n-logo.png] n8n.io [https://n8n.io]"
}
}
],
"No Operation, do nothing1": [
{
"json": {
"data": "MY PAGE\n\nHello World\n\nAnother paragraph\n\nYet another paragraph\n\nAndone more"
}
}
],
"No Operation, do nothing2": [
{
"json": {
"data": [
"Hello World",
"Another paragraph",
"Yet another paragraph",
"Andone more"
]
}
}
],
"No Operation, do nothing3": [
{
"json": {
"data": "\n\t<h1>My Page</h1>\n\t<p>Hello World</p>\n\t<div class=\"content\">\n\t\t<p>Another paragraph\n</p>\n\t\t<p>Yet \n\n\t\t\t\t\t\tanother paragraph\n</p>\n\t\t<p>Andone more\n</p>\n\t</div>\n\t<img src=\"https://n8n.io/n8n-logo.png\" alt=\"n8n.io logo\">\n\t<a href=\"https://n8n.io\">n8n.io</a>\n <input id=\"text-id\" type=\"text\" value=\"n8n\">\n\n"
}
}
],
"No Operation, do nothing6": [
{
"json": {
"data": "n8n"
}
}
]
},
"connections": {
"When clicking \"Execute Workflow\"": {
"main": [
[
{
"node": "Code",
"type": "main",
"index": 0
}
]
]
},
"Code": {
"main": [
[
{
"node": "HTML",
"type": "main",
"index": 0
},
{
"node": "HTML1",
"type": "main",
"index": 0
},
{
"node": "HTML2",
"type": "main",
"index": 0
},
{
"node": "HTML3",
"type": "main",
"index": 0
},
{
"node": "HTML4",
"type": "main",
"index": 0
},
{
"node": "HTML5",
"type": "main",
"index": 0
},
{
"node": "HTML6",
"type": "main",
"index": 0
}
]
]
},
"HTML": {
"main": [
[
{
"node": "No Operation, do nothing",
"type": "main",
"index": 0
}
]
]
},
"HTML6": {
"main": [
[
{
"node": "No Operation, do nothing6",
"type": "main",
"index": 0
}
]
]
},
"HTML5": {
"main": [
[
{
"node": "No Operation, do nothing3",
"type": "main",
"index": 0
}
]
]
},
"HTML4": {
"main": [
[
{
"node": "No Operation, do nothing4",
"type": "main",
"index": 0
}
]
]
},
"HTML3": {
"main": [
[
{
"node": "No Operation, do nothing2",
"type": "main",
"index": 0
}
]
]
},
"HTML2": {
"main": [
[
{
"node": "No Operation, do nothing1",
"type": "main",
"index": 0
}
]
]
},
"HTML1": {
"main": [
[
{
"node": "No Operation, do nothing5",
"type": "main",
"index": 0
}
]
]
}
},
"active": false,
"settings": {
"executionOrder": "v1"
},
"versionId": "170b087f-19bf-4cbd-90cf-d684fb112034",
"meta": {
"templateCredsSetupCompleted": true,
"instanceId": "be251a83c052a9862eeac953816fbb1464f89dfbf79d7ac490a8e336a8cc8bfd"
},
"id": "vqwcz5PIBQmAw4SZ",
"tags": []
}

View file

@ -4,6 +4,7 @@ export type Cheerio = ReturnType<typeof cheerio>;
export interface IValueData {
attribute?: string;
skipSelectors?: string;
cssSelector: string;
returnValue: string;
key: string;

View file

@ -1,26 +1,59 @@
import type { IDataObject } from 'n8n-workflow';
import type { IValueData, Cheerio } from './types';
import { convert } from 'html-to-text';
// The extraction functions
/**
 * Extractors keyed by the `returnValue` setting of an extraction value.
 * Each receives the selected cheerio element(s), the extraction settings and
 * the node version, and returns the extracted string (or `undefined`).
 */
const extractFunctions: {
	[key: string]: ($: Cheerio, valueData: IValueData, nodeVersion: number) => string | undefined;
} = {
	attribute: ($: Cheerio, valueData: IValueData): string | undefined =>
		$.attr(valueData.attribute!),
	html: ($: Cheerio, _valueData: IValueData): string | undefined => $.html() || undefined,
	text: ($: Cheerio, valueData: IValueData, nodeVersion: number): string | undefined => {
		// Node versions up to 1.1 keep the plain cheerio text extraction.
		if (nodeVersion <= 1.1) return $.text() || undefined;

		const html = $.html() || '';

		// Turn the comma-separated list into html-to-text "skip" selectors.
		// Empty entries (trailing/double commas, whitespace-only input) are dropped
		// so that no empty selector is handed to html-to-text.
		const selectors = (valueData.skipSelectors ?? '')
			.split(',')
			.map((selector) => selector.trim())
			.filter((selector) => selector !== '')
			.map((selector) => ({ selector, format: 'skip' }));

		// Without anything to skip, convert with the library defaults.
		return convert(html, selectors.length > 0 ? { selectors } : undefined);
	},
	value: ($: Cheerio, _valueData: IValueData): string | undefined => $.val(),
};

/**
 * Simple helper function which extracts a value and applies the options.
 *
 * @param $ The cheerio selection to extract from
 * @param valueData Extraction settings; `returnValue` picks the extractor
 * @param options Node options; `trimValues` and `cleanUpText` are read here
 * @param nodeVersion Version of the node, forwarded to the extractor
 * @returns The extracted (optionally trimmed / cleaned) string, or `undefined`
 *          when the extractor produced no value
 */
export function getValue(
	$: Cheerio,
	valueData: IValueData,
	options: IDataObject,
	nodeVersion: number,
): string | undefined {
	let value = extractFunctions[valueData.returnValue]($, valueData, nodeVersion);

	// Nothing extracted: there is nothing to post-process.
	if (value === undefined) {
		return value;
	}

	if (options.trimValues) {
		value = value.trim();
	}

	if (options.cleanUpText) {
		// Strip surrounding whitespace, remove line breaks entirely (they are not
		// replaced by a space), then collapse remaining whitespace runs.
		value = value
			.replace(/^\s+|\s+$/g, '')
			.replace(/(\r\n|\n|\r)/gm, '')
			.replace(/\s+/g, ' ');
	}

	return value;
}

View file

@ -270,7 +270,7 @@ export class HtmlExtract implements INodeType {
// An array should be returned so iterate over one
// value at a time
newItem.json[valueData.key] = [];
htmlElement.each((i, el) => {
htmlElement.each((_, el) => {
(newItem.json[valueData.key] as Array<string | undefined>).push(
getValue($(el), valueData, options),
);

View file

@ -805,6 +805,7 @@
"@types/cron": "~1.7.1",
"@types/eventsource": "^1.1.2",
"@types/express": "^4.17.6",
"@types/html-to-text": "^9.0.1",
"@types/gm": "^1.25.0",
"@types/imap-simple": "^4.2.0",
"@types/js-nacl": "^1.3.0",
@ -842,6 +843,7 @@
"csv-parse": "5.5.0",
"currency-codes": "2.1.0",
"eventsource": "2.0.2",
"html-to-text": "9.0.5",
"fast-glob": "3.2.12",
"fflate": "0.7.4",
"get-system-fonts": "2.0.2",

View file

@ -1260,6 +1260,9 @@ importers:
gm:
specifier: 1.25.0
version: 1.25.0
html-to-text:
specifier: 9.0.5
version: 9.0.5
iconv-lite:
specifier: 0.6.3
version: 0.6.3
@ -1426,6 +1429,9 @@ importers:
'@types/gm':
specifier: ^1.25.0
version: 1.25.0
'@types/html-to-text':
specifier: ^9.0.1
version: 9.0.4
'@types/imap-simple':
specifier: ^4.2.0
version: 4.2.5