From 50f3d9bb5a3b949001ba6fee6b0444ead5018ff5 Mon Sep 17 00:00:00 2001
From: Jan Oberhauser
Date: Sat, 28 Dec 2019 21:28:15 -0600
Subject: [PATCH] :sparkles: Add HTML Extract-Node

---
 packages/editor-ui/src/main.ts                |   2 +
 .../nodes/HtmlExtract/HtmlExtract.node.ts     | 306 ++++++++++++++++++
 packages/nodes-base/package.json              |   3 +
 3 files changed, 311 insertions(+)
 create mode 100644 packages/nodes-base/nodes/HtmlExtract/HtmlExtract.node.ts

diff --git a/packages/editor-ui/src/main.ts b/packages/editor-ui/src/main.ts
index 9cfa172e33..048a718d96 100644
--- a/packages/editor-ui/src/main.ts
+++ b/packages/editor-ui/src/main.ts
@@ -34,6 +34,7 @@ import {
 	faClone,
 	faCloud,
 	faCopy,
+	faCut,
 	faDotCircle,
 	faEdit,
 	faEnvelope,
@@ -106,6 +107,7 @@ library.add(faCogs);
 library.add(faClone);
 library.add(faCloud);
 library.add(faCopy);
+library.add(faCut);
 library.add(faDotCircle);
 library.add(faEdit);
 library.add(faEnvelope);
diff --git a/packages/nodes-base/nodes/HtmlExtract/HtmlExtract.node.ts b/packages/nodes-base/nodes/HtmlExtract/HtmlExtract.node.ts
new file mode 100644
index 0000000000..ddedbc0823
--- /dev/null
+++ b/packages/nodes-base/nodes/HtmlExtract/HtmlExtract.node.ts
@@ -0,0 +1,306 @@
+import * as cheerio from 'cheerio';
+import { IExecuteFunctions } from 'n8n-core';
+import {
+	INodeExecutionData,
+	INodeType,
+	INodeTypeDescription,
+	IDataObject,
+} from 'n8n-workflow';
+
+/**
+ * One entry of the "Extraction Values" collection configured on the node.
+ */
+interface IValueData {
+	attribute?: string;
+	cssSelector: string;
+	returnValue: string;
+	key: string;
+	returnArray: boolean;
+}
+
+
+// The extraction functions, keyed by the value of the "Return Value" parameter
+const extractFunctions: {
+	[key: string]: ($: Cheerio, valueData: IValueData) => string | undefined;
+} = {
+	attribute: ($: Cheerio, valueData: IValueData): string | undefined => $.attr(valueData.attribute!),
+	html: ($: Cheerio, valueData: IValueData): string | undefined => $.html() || undefined,
+	text: ($: Cheerio, valueData: IValueData): string | undefined => $.text(),
+	value: ($: Cheerio, valueData: IValueData): string | undefined => $.val(),
+};
+
+
+/**
+ * Extracts the configured kind of value from the given cheerio selection
+ * and trims it unless the "trimValues" option is explicitly set to false.
+ *
+ * @param $ The cheerio selection to read the value from
+ * @param valueData The extraction definition (return value type, attribute, ...)
+ * @param options The node options; only "trimValues" is read
+ * @returns The extracted value, or undefined if the extraction function returned none
+ * @throws If no extraction function exists for the requested return value type
+ */
+function getValue($: Cheerio, valueData: IValueData, options: IDataObject) {
+	const extractFunction = extractFunctions[valueData.returnValue];
+	if (extractFunction === undefined) {
+		throw new Error(`The return value "${valueData.returnValue}" is not supported!`);
+	}
+
+	const value = extractFunction($, valueData);
+	if (options.trimValues === false || value === undefined) {
+		return value;
+	}
+
+	return value.trim();
+}
+
+
+/**
+ * Node which extracts values from HTML with the help of CSS selectors.
+ */
+export class HtmlExtract implements INodeType {
+	description: INodeTypeDescription = {
+		displayName: 'HTML Extract',
+		name: 'htmlExtract',
+		icon: 'fa:cut',
+		group: ['transform'],
+		version: 1,
+		subtitle: '={{$parameter["sourceData"] + ": " + $parameter["dataPropertyName"]}}',
+		description: 'Extracts data from HTML',
+		defaults: {
+			name: 'HTML Extract',
+			color: '#333377',
+		},
+		inputs: ['main'],
+		outputs: ['main'],
+		properties: [
+			{
+				displayName: 'Source Data',
+				name: 'sourceData',
+				type: 'options',
+				options: [
+					{
+						name: 'Binary',
+						value: 'binary',
+					},
+					{
+						name: 'JSON',
+						value: 'json',
+					},
+				],
+				default: 'json',
+				description: 'If HTML should be read from binary or json data.',
+			},
+			{
+				displayName: 'Binary Property',
+				name: 'dataPropertyName',
+				type: 'string',
+				displayOptions: {
+					show: {
+						sourceData: [
+							'binary',
+						],
+					},
+				},
+				default: 'data',
+				required: true,
+				description: 'Name of the binary property in which the HTML to extract the data from can be found.',
+			},
+			{
+				displayName: 'JSON Property',
+				name: 'dataPropertyName',
+				type: 'string',
+				displayOptions: {
+					show: {
+						sourceData: [
+							'json',
+						],
+					},
+				},
+				default: 'data',
+				required: true,
+				description: 'Name of the json property in which the HTML to extract the data from can be found.<br />The property can either contain a string or an array of strings.',
+			},
+			{
+				displayName: 'Extraction Values',
+				name: 'extractionValues',
+				placeholder: 'Add Value',
+				type: 'fixedCollection',
+				typeOptions: {
+					multipleValues: true,
+				},
+				description: 'The extraction values.',
+				default: {},
+				options: [
+					{
+						name: 'values',
+						displayName: 'Values',
+						values: [
+							{
+								displayName: 'Key',
+								name: 'key',
+								type: 'string',
+								default: '',
+								description: 'The key under which the extracted value should be saved.',
+							},
+							{
+								displayName: 'CSS Selector',
+								name: 'cssSelector',
+								type: 'string',
+								default: '',
+								placeholder: '.price',
+								description: 'The CSS selector to use.',
+							},
+							{
+								displayName: 'Return Value',
+								name: 'returnValue',
+								type: 'options',
+								options: [
+									{
+										name: 'Attribute',
+										value: 'attribute',
+										description: 'Get an attribute value like "class" from an element.',
+									},
+									{
+										name: 'HTML',
+										value: 'html',
+										description: 'Get the HTML the element contains.',
+									},
+									{
+										name: 'Text',
+										value: 'text',
+										description: 'Get only the text content of the element.',
+									},
+									{
+										name: 'Value',
+										value: 'value',
+										description: 'Get value of an input, select or textarea.',
+									},
+								],
+								default: 'text',
+								description: 'What kind of data should be returned.',
+							},
+							{
+								displayName: 'Attribute',
+								name: 'attribute',
+								type: 'string',
+								displayOptions: {
+									show: {
+										returnValue: [
+											'attribute',
+										],
+									},
+								},
+								default: '',
+								placeholder: 'class',
+								description: 'The name of the attribute to return the value off.',
+							},
+							{
+								displayName: 'Return Array',
+								name: 'returnArray',
+								type: 'boolean',
+								default: false,
+								description: 'Returns the values as an array so if multiple ones get found they also get<br />returned separately.<br />If not set all will be returned as a single string.',
+							},
+						],
+					},
+				],
+			},
+
+			{
+				displayName: 'Options',
+				name: 'options',
+				type: 'collection',
+				placeholder: 'Add Option',
+				default: {},
+				options: [
+					{
+						displayName: 'Trim Values',
+						name: 'trimValues',
+						type: 'boolean',
+						default: true,
+						description: 'Removes automatically all spaces and newlines from<br />the beginning and end of the values.',
+					},
+				],
+			},
+		],
+	};
+
+
+	/**
+	 * Extracts the configured values from the HTML of every input item.
+	 * Creates one output item per HTML string found on an input item.
+	 *
+	 * @returns The newly created items, as returned by prepareOutputData
+	 * @throws If the configured JSON or binary property is missing on an item
+	 */
+	async execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]> {
+		const items = this.getInputData();
+
+		const returnData: INodeExecutionData[] = [];
+
+		for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
+			const dataPropertyName = this.getNodeParameter('dataPropertyName', itemIndex) as string;
+			const extractionValues = this.getNodeParameter('extractionValues', itemIndex) as IDataObject;
+			const options = this.getNodeParameter('options', itemIndex, {}) as IDataObject;
+			const sourceData = this.getNodeParameter('sourceData', itemIndex) as string;
+
+			const item = items[itemIndex];
+
+			// The parameter defaults to an empty object, so "values" is missing when no
+			// extraction value got defined. Fall back to an empty list instead of failing.
+			const valuesToExtract = (extractionValues.values as IValueData[] | undefined) || [];
+
+			let htmlArray: string[] | string = [];
+			if (sourceData === 'json') {
+				if (item.json[dataPropertyName] === undefined) {
+					throw new Error(`No property named "${dataPropertyName}" exists!`);
+				}
+				htmlArray = item.json[dataPropertyName] as string;
+			} else {
+				if (item.binary === undefined) {
+					throw new Error(`The item does not contain binary data!`);
+				}
+				if (item.binary[dataPropertyName] === undefined) {
+					throw new Error(`No property named "${dataPropertyName}" exists!`);
+				}
+				// Decode the base64 encoded binary data into a UTF-8 string
+				htmlArray = Buffer.from(item.binary[dataPropertyName].data, 'base64').toString('utf8');
+			}
+
+			// Convert it always to an array so that it works with a string or an array of strings
+			if (!Array.isArray(htmlArray)) {
+				htmlArray = [htmlArray];
+			}
+
+			for (const html of htmlArray as string[]) {
+				const $ = cheerio.load(html);
+
+				const newItem: INodeExecutionData = {
+					json: {},
+				};
+
+				// Iterate over all the defined values which should be extracted
+				for (const valueData of valuesToExtract) {
+					const htmlElement = $(valueData.cssSelector);
+
+					if (valueData.returnArray === true) {
+						// An array should be returned so iterate over one
+						// value at a time
+						const values: Array<string | undefined> = [];
+						htmlElement.each((i, el) => {
+							values.push(getValue($(el), valueData, options));
+						});
+						newItem.json[valueData.key] = values;
+					} else {
+						// One single value should be returned
+						newItem.json[valueData.key] = getValue(htmlElement, valueData, options);
+					}
+				}
+				returnData.push(newItem);
+			}
+		}
+
+		return this.prepareOutputData(returnData);
+	}
+}
diff --git a/packages/nodes-base/package.json b/packages/nodes-base/package.json
index fa5665b66e..ebd64cab02 100644
--- a/packages/nodes-base/package.json
+++ b/packages/nodes-base/package.json
@@ -109,6 +109,7 @@
             "dist/nodes/Google/GoogleDrive.node.js",
             "dist/nodes/Google/GoogleSheets.node.js",
             "dist/nodes/GraphQL/GraphQL.node.js",
+            "dist/nodes/HtmlExtract/HtmlExtract.node.js",
             "dist/nodes/HttpRequest.node.js",
             "dist/nodes/Hubspot/Hubspot.node.js",
             "dist/nodes/If.node.js",
@@ -167,6 +168,7 @@
     "devDependencies": {
         "@types/aws4": "^1.5.1",
         "@types/basic-auth": "^1.1.2",
+        "@types/cheerio": "^0.22.15",
         "@types/cron": "^1.6.1",
         "@types/express": "^4.16.1",
         "@types/gm": "^1.18.2",
@@ -189,6 +191,7 @@
     "dependencies": {
         "aws4": "^1.8.0",
         "basic-auth": "^2.0.1",
+        "cheerio": "^1.0.0-rc.3",
         "cron": "^1.6.0",
         "glob-promise": "^3.4.0",
         "gm": "^1.23.1",