n8n/packages/nodes-base/nodes/HtmlExtract/HtmlExtract.node.ts

281 lines
7.6 KiB
TypeScript
Raw Normal View History

2019-12-28 19:28:15 -08:00
import * as cheerio from 'cheerio';
import { IExecuteFunctions } from 'n8n-core';
import {
IDataObject,
2019-12-28 19:28:15 -08:00
INodeExecutionData,
INodeType,
INodeTypeDescription,
:sparkles: Improve node error handling (#1309) * Add path mapping and response error interfaces * Add error handling and throwing functionality * Refactor error handling into a single function * Re-implement error handling in Hacker News node * Fix linting details * Re-implement error handling in Spotify node * Re-implement error handling in G Suite Admin node * :construction: create basic setup NodeError * :construction: add httpCodes * :construction: add path priolist * :construction: handle statusCode in error, adjust interfaces * :construction: fixing type issues w/Ivan * :construction: add error exploration * 👔 fix linter issues * :wrench: improve object check * :construction: remove path passing from NodeApiError * :construction: add multi error + refactor findProperty method * 👔 allow any * :wrench: handle multi error message callback * :zap: change return type of callback * :zap: add customCallback to MultiError * :construction: refactor to use INode * :hammer: handle arrays, continue search after first null property found * 🚫 refactor method access * :construction: setup NodeErrorView * :zap: change timestamp to Date.now * :books: Add documentation for methods and constants * :construction: change message setting * 🚚 move NodeErrors to workflow * :sparkles: add new ErrorView for Nodes * :art: improve error notification * :art: refactor interfaces * :zap: add WorkflowOperationError, refactor error throwing * 👕 fix linter issues * :art: rename param * :bug: fix handling normal errors * :zap: add usage of NodeApiError * :art: fix throw new error instead of constructor * :art: remove unnecessary code/comments * :art: adjusted spacing + updated status messages * :art: fix tab indentation * ✨ Replace current errors with custom errors (#1576) * :zap: Introduce NodeApiError in catch blocks * :zap: Introduce NodeOperationError in nodes * :zap: Add missing errors and remove incompatible * :zap: Fix NodeOperationError in incompatible nodes * :wrench: Adjust error handling in missed nodes PayPal, FileMaker, Reddit, Taiga and Facebook Graph API nodes * :hammer: Adjust Strava Trigger node error handling * :hammer: Adjust AWS nodes error handling * :hammer: Remove duplicate instantiation of NodeApiError * :bug: fix strava trigger node error handling * Add XML parsing to NodeApiError constructor (#1633) * :bug: Remove type annotation from catch variable * :sparkles: Add XML parsing to NodeApiError * :zap: Simplify error handling in Rekognition node * :zap: Pass in XML flag in generic functions * :fire: Remove try/catch wrappers at call sites * :hammer: Refactor setting description from XML * :hammer: Refactor let to const in resource loaders * :zap: Find property in parsed XML * :zap: Change let to const * :fire: Remove unneeded try/catch block * :shirt: Fix linting issues * :bug: Fix errors from merge conflict resolution * :zap: Add custom errors to latest contributions * :shirt: Fix linting issues * :zap: Refactor MongoDB helpers for custom errors * :bug: Correct custom error type * :zap: Apply feedback to A nodes * :zap: Apply feedback to missed A node * :zap: Apply feedback to B-D nodes * :zap: Apply feedback to E-F nodes * :zap: Apply feedback to G nodes * :zap: Apply feedback to H-L nodes * :zap: Apply feedback to M nodes * :zap: Apply feedback to P nodes * :zap: Apply feedback to R nodes * :zap: Apply feedback to S nodes * :zap: Apply feedback to T nodes * :zap: Apply feedback to V-Z nodes * :zap: Add HTTP code to iterable node error * :hammer: Standardize e as error * :hammer: Standardize err as error * :zap: Fix error handling for non-standard nodes Co-authored-by: Ben Hesseldieck <b.hesseldieck@gmail.com> Co-authored-by: Ben Hesseldieck <b.hesseldieck@gmail.com> Co-authored-by: Ben Hesseldieck <1849459+BHesseldieck@users.noreply.github.com>
2021-04-16 09:33:36 -07:00
NodeOperationError,
2019-12-28 19:28:15 -08:00
} from 'n8n-workflow';
2020-09-24 01:02:05 -07:00
type Cheerio = ReturnType<typeof cheerio>;
2019-12-28 19:28:15 -08:00
interface IValueData {
attribute?: string;
cssSelector: string;
returnValue: string;
key: string;
returnArray: boolean;
}
// The extraction functions
const extractFunctions: {
[key: string]: ($: Cheerio, valueData: IValueData) => string | undefined;
} = {
attribute: ($: Cheerio, valueData: IValueData): string | undefined => $.attr(valueData.attribute!),
html: ($: Cheerio, valueData: IValueData): string | undefined => $.html() || undefined,
text: ($: Cheerio, valueData: IValueData): string | undefined => $.text(),
value: ($: Cheerio, valueData: IValueData): string | undefined => $.val(),
};
/**
* Simple helper function which applies options
*/
function getValue($: Cheerio, valueData: IValueData, options: IDataObject) {
const value = extractFunctions[valueData.returnValue]($, valueData);
if (options.trimValues === false || value === undefined) {
return value;
}
return value.trim();
}
export class HtmlExtract implements INodeType {
description: INodeTypeDescription = {
displayName: 'HTML Extract',
name: 'htmlExtract',
icon: 'fa:cut',
group: ['transform'],
version: 1,
subtitle: '={{$parameter["sourceData"] + ": " + $parameter["dataPropertyName"]}}',
description: 'Extracts data from HTML',
defaults: {
name: 'HTML Extract',
color: '#333377',
},
inputs: ['main'],
outputs: ['main'],
properties: [
{
displayName: 'Source Data',
name: 'sourceData',
type: 'options',
options: [
{
name: 'Binary',
value: 'binary',
},
{
name: 'JSON',
value: 'json',
},
],
default: 'json',
description: 'If HTML should be read from binary or json data.',
},
{
displayName: 'Binary Property',
name: 'dataPropertyName',
type: 'string',
displayOptions: {
show: {
sourceData: [
'binary',
],
},
},
default: 'data',
required: true,
description: 'Name of the binary property in which the HTML to extract the data from can be found.',
},
{
displayName: 'JSON Property',
name: 'dataPropertyName',
type: 'string',
displayOptions: {
show: {
sourceData: [
'json',
],
},
},
default: 'data',
required: true,
description: 'Name of the json property in which the HTML to extract the data from can be found.<br />The property can either contain a string or an array of strings.',
},
{
displayName: 'Extraction Values',
name: 'extractionValues',
placeholder: 'Add Value',
type: 'fixedCollection',
typeOptions: {
multipleValues: true,
},
description: 'The extraction values.',
default: {},
options: [
{
name: 'values',
displayName: 'Values',
values: [
{
displayName: 'Key',
name: 'key',
type: 'string',
default: '',
description: 'The key under which the extracted value should be saved.',
},
{
displayName: 'CSS Selector',
name: 'cssSelector',
type: 'string',
default: '',
placeholder: '.price',
description: 'The CSS selector to use.',
},
{
displayName: 'Return Value',
name: 'returnValue',
type: 'options',
options: [
{
name: 'Attribute',
value: 'attribute',
description: 'Get an attribute value like "class" from an element.',
},
{
name: 'HTML',
value: 'html',
description: 'Get the HTML the element contains.',
},
{
name: 'Text',
value: 'text',
description: 'Get only the text content of the element.',
},
{
name: 'Value',
value: 'value',
description: 'Get value of an input, select or textarea.',
},
],
default: 'text',
description: 'What kind of data should be returned.',
},
{
displayName: 'Attribute',
name: 'attribute',
type: 'string',
displayOptions: {
show: {
returnValue: [
'attribute',
],
},
},
default: '',
placeholder: 'class',
description: 'The name of the attribute to return the value off.',
},
{
displayName: 'Return Array',
name: 'returnArray',
type: 'boolean',
default: false,
description: 'Returns the values as an array so if multiple ones get found they also get<br />returned separately.If not set all will be returned as a single string.',
},
],
},
],
},
{
displayName: 'Options',
name: 'options',
type: 'collection',
placeholder: 'Add Option',
default: {},
options: [
{
displayName: 'Trim Values',
name: 'trimValues',
type: 'boolean',
default: true,
description: 'Removes automatically all spaces and newlines from<br />the beginning and end of the values.',
},
],
2020-10-22 06:46:03 -07:00
},
],
2019-12-28 19:28:15 -08:00
};
async execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]> {
const items = this.getInputData();
const returnData: INodeExecutionData[] = [];
let item: INodeExecutionData;
for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
const dataPropertyName = this.getNodeParameter('dataPropertyName', itemIndex) as string;
const extractionValues = this.getNodeParameter('extractionValues', itemIndex) as IDataObject;
const options = this.getNodeParameter('options', itemIndex, {}) as IDataObject;
const sourceData = this.getNodeParameter('sourceData', itemIndex) as string;
item = items[itemIndex];
let htmlArray: string[] | string = [];
if (sourceData === 'json') {
if (item.json[dataPropertyName] === undefined) {
:sparkles: Improve node error handling (#1309) * Add path mapping and response error interfaces * Add error handling and throwing functionality * Refactor error handling into a single function * Re-implement error handling in Hacker News node * Fix linting details * Re-implement error handling in Spotify node * Re-implement error handling in G Suite Admin node * :construction: create basic setup NodeError * :construction: add httpCodes * :construction: add path priolist * :construction: handle statusCode in error, adjust interfaces * :construction: fixing type issues w/Ivan * :construction: add error exploration * 👔 fix linter issues * :wrench: improve object check * :construction: remove path passing from NodeApiError * :construction: add multi error + refactor findProperty method * 👔 allow any * :wrench: handle multi error message callback * :zap: change return type of callback * :zap: add customCallback to MultiError * :construction: refactor to use INode * :hammer: handle arrays, continue search after first null property found * 🚫 refactor method access * :construction: setup NodeErrorView * :zap: change timestamp to Date.now * :books: Add documentation for methods and constants * :construction: change message setting * 🚚 move NodeErrors to workflow * :sparkles: add new ErrorView for Nodes * :art: improve error notification * :art: refactor interfaces * :zap: add WorkflowOperationError, refactor error throwing * 👕 fix linter issues * :art: rename param * :bug: fix handling normal errors * :zap: add usage of NodeApiError * :art: fix throw new error instead of constructor * :art: remove unnecessary code/comments * :art: adjusted spacing + updated status messages * :art: fix tab indentation * ✨ Replace current errors with custom errors (#1576) * :zap: Introduce NodeApiError in catch blocks * :zap: Introduce NodeOperationError in nodes * :zap: Add missing errors and remove incompatible * :zap: Fix NodeOperationError in incompatible nodes * :wrench: Adjust error handling in missed nodes PayPal, FileMaker, Reddit, Taiga and Facebook Graph API nodes * :hammer: Adjust Strava Trigger node error handling * :hammer: Adjust AWS nodes error handling * :hammer: Remove duplicate instantiation of NodeApiError * :bug: fix strava trigger node error handling * Add XML parsing to NodeApiError constructor (#1633) * :bug: Remove type annotation from catch variable * :sparkles: Add XML parsing to NodeApiError * :zap: Simplify error handling in Rekognition node * :zap: Pass in XML flag in generic functions * :fire: Remove try/catch wrappers at call sites * :hammer: Refactor setting description from XML * :hammer: Refactor let to const in resource loaders * :zap: Find property in parsed XML * :zap: Change let to const * :fire: Remove unneeded try/catch block * :shirt: Fix linting issues * :bug: Fix errors from merge conflict resolution * :zap: Add custom errors to latest contributions * :shirt: Fix linting issues * :zap: Refactor MongoDB helpers for custom errors * :bug: Correct custom error type * :zap: Apply feedback to A nodes * :zap: Apply feedback to missed A node * :zap: Apply feedback to B-D nodes * :zap: Apply feedback to E-F nodes * :zap: Apply feedback to G nodes * :zap: Apply feedback to H-L nodes * :zap: Apply feedback to M nodes * :zap: Apply feedback to P nodes * :zap: Apply feedback to R nodes * :zap: Apply feedback to S nodes * :zap: Apply feedback to T nodes * :zap: Apply feedback to V-Z nodes * :zap: Add HTTP code to iterable node error * :hammer: Standardize e as error * :hammer: Standardize err as error * :zap: Fix error handling for non-standard nodes Co-authored-by: Ben Hesseldieck <b.hesseldieck@gmail.com> Co-authored-by: Ben Hesseldieck <b.hesseldieck@gmail.com> Co-authored-by: Ben Hesseldieck <1849459+BHesseldieck@users.noreply.github.com>
2021-04-16 09:33:36 -07:00
throw new NodeOperationError(this.getNode(), `No property named "${dataPropertyName}" exists!`);
2019-12-28 19:28:15 -08:00
}
htmlArray = item.json[dataPropertyName] as string;
} else {
if (item.binary === undefined) {
:sparkles: Improve node error handling (#1309) * Add path mapping and response error interfaces * Add error handling and throwing functionality * Refactor error handling into a single function * Re-implement error handling in Hacker News node * Fix linting details * Re-implement error handling in Spotify node * Re-implement error handling in G Suite Admin node * :construction: create basic setup NodeError * :construction: add httpCodes * :construction: add path priolist * :construction: handle statusCode in error, adjust interfaces * :construction: fixing type issues w/Ivan * :construction: add error exploration * 👔 fix linter issues * :wrench: improve object check * :construction: remove path passing from NodeApiError * :construction: add multi error + refactor findProperty method * 👔 allow any * :wrench: handle multi error message callback * :zap: change return type of callback * :zap: add customCallback to MultiError * :construction: refactor to use INode * :hammer: handle arrays, continue search after first null property found * 🚫 refactor method access * :construction: setup NodeErrorView * :zap: change timestamp to Date.now * :books: Add documentation for methods and constants * :construction: change message setting * 🚚 move NodeErrors to workflow * :sparkles: add new ErrorView for Nodes * :art: improve error notification * :art: refactor interfaces * :zap: add WorkflowOperationError, refactor error throwing * 👕 fix linter issues * :art: rename param * :bug: fix handling normal errors * :zap: add usage of NodeApiError * :art: fix throw new error instead of constructor * :art: remove unnecessary code/comments * :art: adjusted spacing + updated status messages * :art: fix tab indentation * ✨ Replace current errors with custom errors (#1576) * :zap: Introduce NodeApiError in catch blocks * :zap: Introduce NodeOperationError in nodes * :zap: Add missing errors and remove incompatible * :zap: Fix NodeOperationError in incompatible nodes * :wrench: Adjust error handling in missed nodes PayPal, FileMaker, Reddit, Taiga and Facebook Graph API nodes * :hammer: Adjust Strava Trigger node error handling * :hammer: Adjust AWS nodes error handling * :hammer: Remove duplicate instantiation of NodeApiError * :bug: fix strava trigger node error handling * Add XML parsing to NodeApiError constructor (#1633) * :bug: Remove type annotation from catch variable * :sparkles: Add XML parsing to NodeApiError * :zap: Simplify error handling in Rekognition node * :zap: Pass in XML flag in generic functions * :fire: Remove try/catch wrappers at call sites * :hammer: Refactor setting description from XML * :hammer: Refactor let to const in resource loaders * :zap: Find property in parsed XML * :zap: Change let to const * :fire: Remove unneeded try/catch block * :shirt: Fix linting issues * :bug: Fix errors from merge conflict resolution * :zap: Add custom errors to latest contributions * :shirt: Fix linting issues * :zap: Refactor MongoDB helpers for custom errors * :bug: Correct custom error type * :zap: Apply feedback to A nodes * :zap: Apply feedback to missed A node * :zap: Apply feedback to B-D nodes * :zap: Apply feedback to E-F nodes * :zap: Apply feedback to G nodes * :zap: Apply feedback to H-L nodes * :zap: Apply feedback to M nodes * :zap: Apply feedback to P nodes * :zap: Apply feedback to R nodes * :zap: Apply feedback to S nodes * :zap: Apply feedback to T nodes * :zap: Apply feedback to V-Z nodes * :zap: Add HTTP code to iterable node error * :hammer: Standardize e as error * :hammer: Standardize err as error * :zap: Fix error handling for non-standard nodes Co-authored-by: Ben Hesseldieck <b.hesseldieck@gmail.com> Co-authored-by: Ben Hesseldieck <b.hesseldieck@gmail.com> Co-authored-by: Ben Hesseldieck <1849459+BHesseldieck@users.noreply.github.com>
2021-04-16 09:33:36 -07:00
throw new NodeOperationError(this.getNode(), `No item does not contain binary data!`);
2019-12-28 19:28:15 -08:00
}
if (item.binary[dataPropertyName] === undefined) {
:sparkles: Improve node error handling (#1309) * Add path mapping and response error interfaces * Add error handling and throwing functionality * Refactor error handling into a single function * Re-implement error handling in Hacker News node * Fix linting details * Re-implement error handling in Spotify node * Re-implement error handling in G Suite Admin node * :construction: create basic setup NodeError * :construction: add httpCodes * :construction: add path priolist * :construction: handle statusCode in error, adjust interfaces * :construction: fixing type issues w/Ivan * :construction: add error exploration * 👔 fix linter issues * :wrench: improve object check * :construction: remove path passing from NodeApiError * :construction: add multi error + refactor findProperty method * 👔 allow any * :wrench: handle multi error message callback * :zap: change return type of callback * :zap: add customCallback to MultiError * :construction: refactor to use INode * :hammer: handle arrays, continue search after first null property found * 🚫 refactor method access * :construction: setup NodeErrorView * :zap: change timestamp to Date.now * :books: Add documentation for methods and constants * :construction: change message setting * 🚚 move NodeErrors to workflow * :sparkles: add new ErrorView for Nodes * :art: improve error notification * :art: refactor interfaces * :zap: add WorkflowOperationError, refactor error throwing * 👕 fix linter issues * :art: rename param * :bug: fix handling normal errors * :zap: add usage of NodeApiError * :art: fix throw new error instead of constructor * :art: remove unnecessary code/comments * :art: adjusted spacing + updated status messages * :art: fix tab indentation * ✨ Replace current errors with custom errors (#1576) * :zap: Introduce NodeApiError in catch blocks * :zap: Introduce NodeOperationError in nodes * :zap: Add missing errors and remove incompatible * :zap: Fix NodeOperationError in incompatible nodes * :wrench: Adjust error handling in missed nodes PayPal, FileMaker, Reddit, Taiga and Facebook Graph API nodes * :hammer: Adjust Strava Trigger node error handling * :hammer: Adjust AWS nodes error handling * :hammer: Remove duplicate instantiation of NodeApiError * :bug: fix strava trigger node error handling * Add XML parsing to NodeApiError constructor (#1633) * :bug: Remove type annotation from catch variable * :sparkles: Add XML parsing to NodeApiError * :zap: Simplify error handling in Rekognition node * :zap: Pass in XML flag in generic functions * :fire: Remove try/catch wrappers at call sites * :hammer: Refactor setting description from XML * :hammer: Refactor let to const in resource loaders * :zap: Find property in parsed XML * :zap: Change let to const * :fire: Remove unneeded try/catch block * :shirt: Fix linting issues * :bug: Fix errors from merge conflict resolution * :zap: Add custom errors to latest contributions * :shirt: Fix linting issues * :zap: Refactor MongoDB helpers for custom errors * :bug: Correct custom error type * :zap: Apply feedback to A nodes * :zap: Apply feedback to missed A node * :zap: Apply feedback to B-D nodes * :zap: Apply feedback to E-F nodes * :zap: Apply feedback to G nodes * :zap: Apply feedback to H-L nodes * :zap: Apply feedback to M nodes * :zap: Apply feedback to P nodes * :zap: Apply feedback to R nodes * :zap: Apply feedback to S nodes * :zap: Apply feedback to T nodes * :zap: Apply feedback to V-Z nodes * :zap: Add HTTP code to iterable node error * :hammer: Standardize e as error * :hammer: Standardize err as error * :zap: Fix error handling for non-standard nodes Co-authored-by: Ben Hesseldieck <b.hesseldieck@gmail.com> Co-authored-by: Ben Hesseldieck <b.hesseldieck@gmail.com> Co-authored-by: Ben Hesseldieck <1849459+BHesseldieck@users.noreply.github.com>
2021-04-16 09:33:36 -07:00
throw new NodeOperationError(this.getNode(), `No property named "${dataPropertyName}" exists!`);
2019-12-28 19:28:15 -08:00
}
htmlArray = Buffer.from(item.binary[dataPropertyName].data, 'base64').toString('utf8');
}
// Convert it always to array that it works with a string or an array of strings
if (!Array.isArray(htmlArray)) {
htmlArray = [htmlArray];
}
for (const html of htmlArray as string[]) {
const $ = cheerio.load(html);
const newItem: INodeExecutionData = {
json: {},
};
// Itterate over all the defined values which should be extracted
let htmlElement;
for (const valueData of extractionValues.values as IValueData[]) {
htmlElement = $(valueData.cssSelector);
if (valueData.returnArray === true) {
// An array should be returned so itterate over one
// value at a time
newItem.json[valueData.key as string] = [];
htmlElement.each((i, el) => {
(newItem.json[valueData.key as string] as Array<string | undefined>).push(getValue($(el), valueData, options));
});
} else {
// One single value should be returned
newItem.json[valueData.key as string] = getValue(htmlElement, valueData, options);
}
}
returnData.push(newItem);
}
}
return this.prepareOutputData(returnData);
}
}