2019-12-28 19:28:15 -08:00
import * as cheerio from 'cheerio' ;
import { IExecuteFunctions } from 'n8n-core' ;
import {
2020-10-01 05:01:39 -07:00
IDataObject ,
2019-12-28 19:28:15 -08:00
INodeExecutionData ,
INodeType ,
INodeTypeDescription ,
2021-04-16 09:33:36 -07:00
NodeOperationError ,
2019-12-28 19:28:15 -08:00
} from 'n8n-workflow' ;
2020-09-24 01:02:05 -07:00
// Alias for whatever type a call to the imported `cheerio` function returns.
// It is used below as the type of the element selection handed to the
// extraction functions and to getValue().
type Cheerio = ReturnType < typeof cheerio > ;
2019-12-28 19:28:15 -08:00
/**
 * One entry of the "Extraction Values" collection: describes a single value
 * that should be pulled out of the HTML.
 */
interface IValueData {
	/** Name of the property on the output item's json under which the result is stored. */
	key: string;

	/** CSS selector used to find the element(s) to read from. */
	cssSelector: string;

	/** Name of the extraction function to apply; used as the lookup key into `extractFunctions`. */
	returnValue: string;

	/** Attribute name that is read by the `attribute` extraction function. */
	attribute?: string;

	/** When `true`, every matched element contributes its own entry to an array. */
	returnArray: boolean;
}
/**
 * Lookup table of extraction functions, keyed by the `returnValue` of an
 * extraction entry. Each function reads one kind of data from the given
 * selection.
 */
const extractFunctions: {
	[key: string]: ($: Cheerio, valueData: IValueData) => string | undefined;
} = {
	// Reads the attribute named by `valueData.attribute`.
	attribute($: Cheerio, valueData: IValueData): string | undefined {
		return $.attr(valueData.attribute!);
	},

	// Reads the HTML of the selection; any falsy result is reported as undefined.
	html($: Cheerio, _valueData: IValueData): string | undefined {
		const markup = $.html();
		return markup ? markup : undefined;
	},

	// Reads the text content of the selection.
	text($: Cheerio, _valueData: IValueData): string | undefined {
		return $.text();
	},

	// Reads the selection's value via `.val()`.
	value($: Cheerio, _valueData: IValueData): string | undefined {
		return $.val();
	},
};
/**
 * Extracts a single value from the given selection and applies the node
 * options to it.
 *
 * @param $ The element selection to read from.
 * @param valueData Describes what to extract; `returnValue` selects the
 *   extraction function from `extractFunctions`.
 * @param options Node options. Values are trimmed unless `trimValues` is
 *   explicitly `false`.
 * @returns The extracted value (trimmed when applicable), or `undefined` when
 *   the extraction function produced nothing.
 * @throws Error when `valueData.returnValue` does not name an extraction function.
 */
function getValue($: Cheerio, valueData: IValueData, options: IDataObject) {
	const extract = extractFunctions[valueData.returnValue];
	if (typeof extract !== 'function') {
		// Fail with a readable message instead of "... is not a function".
		throw new Error(`Unsupported return value "${valueData.returnValue}"`);
	}

	const value = extract($, valueData);

	// Only real strings can be trimmed. Despite the declared return type, the
	// underlying cheerio calls may hand back something else at runtime (for
	// example `.val()` on a multi-select); such results are passed through
	// untouched rather than crashing on `.trim()`.
	if (options.trimValues === false || typeof value !== 'string') {
		return value;
	}

	return value.trim();
}
/**
 * n8n node that extracts values from HTML using CSS selectors.
 *
 * The HTML is read either from a JSON property (a string or an array of
 * strings) or from a binary property of each input item. One output item is
 * produced per HTML document.
 */
export class HtmlExtract implements INodeType {
	description: INodeTypeDescription = {
		displayName: 'HTML Extract',
		name: 'htmlExtract',
		icon: 'fa:cut',
		group: ['transform'],
		version: 1,
		subtitle: '={{$parameter["sourceData"] + ": " + $parameter["dataPropertyName"]}}',
		description: 'Extracts data from HTML',
		defaults: {
			name: 'HTML Extract',
			color: '#333377',
		},
		inputs: ['main'],
		outputs: ['main'],
		properties: [
			{
				displayName: 'Source Data',
				name: 'sourceData',
				type: 'options',
				options: [
					{
						name: 'Binary',
						value: 'binary',
					},
					{
						name: 'JSON',
						value: 'json',
					},
				],
				default: 'json',
				description: 'If HTML should be read from binary or json data.',
			},
			{
				displayName: 'Binary Property',
				name: 'dataPropertyName',
				type: 'string',
				displayOptions: {
					show: {
						sourceData: [
							'binary',
						],
					},
				},
				default: 'data',
				required: true,
				description: 'Name of the binary property in which the HTML to extract the data from can be found.',
			},
			{
				displayName: 'JSON Property',
				name: 'dataPropertyName',
				type: 'string',
				displayOptions: {
					show: {
						sourceData: [
							'json',
						],
					},
				},
				default: 'data',
				required: true,
				description: 'Name of the json property in which the HTML to extract the data from can be found. The property can either contain a string or an array of strings.',
			},
			{
				displayName: 'Extraction Values',
				name: 'extractionValues',
				placeholder: 'Add Value',
				type: 'fixedCollection',
				typeOptions: {
					multipleValues: true,
				},
				description: 'The extraction values.',
				default: {},
				options: [
					{
						name: 'values',
						displayName: 'Values',
						values: [
							{
								displayName: 'Key',
								name: 'key',
								type: 'string',
								default: '',
								description: 'The key under which the extracted value should be saved.',
							},
							{
								displayName: 'CSS Selector',
								name: 'cssSelector',
								type: 'string',
								default: '',
								placeholder: '.price',
								description: 'The CSS selector to use.',
							},
							{
								displayName: 'Return Value',
								name: 'returnValue',
								type: 'options',
								options: [
									{
										name: 'Attribute',
										value: 'attribute',
										description: 'Get an attribute value like "class" from an element.',
									},
									{
										name: 'HTML',
										value: 'html',
										description: 'Get the HTML the element contains.',
									},
									{
										name: 'Text',
										value: 'text',
										description: 'Get only the text content of the element.',
									},
									{
										name: 'Value',
										value: 'value',
										description: 'Get value of an input, select or textarea.',
									},
								],
								default: 'text',
								description: 'What kind of data should be returned.',
							},
							{
								displayName: 'Attribute',
								name: 'attribute',
								type: 'string',
								displayOptions: {
									show: {
										returnValue: [
											'attribute',
										],
									},
								},
								default: '',
								placeholder: 'class',
								description: 'The name of the attribute to return the value off.',
							},
							{
								displayName: 'Return Array',
								name: 'returnArray',
								type: 'boolean',
								default: false,
								description: 'Returns the values as an array so if multiple ones get found they also get returned separately. If not set all will be returned as a single string.',
							},
						],
					},
				],
			},
			{
				displayName: 'Options',
				name: 'options',
				type: 'collection',
				placeholder: 'Add Option',
				default: {},
				options: [
					{
						displayName: 'Trim Values',
						name: 'trimValues',
						type: 'boolean',
						default: true,
						description: 'Removes automatically all spaces and newlines from the beginning and end of the values.',
					},
				],
			},
		],
	};

	/**
	 * Runs the extraction for every input item.
	 *
	 * For each item the HTML source is resolved (JSON or binary property),
	 * parsed with cheerio, and every configured extraction value is written to
	 * a new output item under its key.
	 *
	 * @returns The extracted items, wrapped via `prepareOutputData`.
	 * @throws NodeOperationError when the configured property (or the binary
	 *   data as a whole) is missing on an item and "continue on fail" is off.
	 *   With "continue on fail" enabled, an item of the form
	 *   `{ error: <message> }` is emitted instead.
	 */
	async execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]> {
		const items = this.getInputData();
		const returnData: INodeExecutionData[] = [];

		for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
			try {
				const dataPropertyName = this.getNodeParameter('dataPropertyName', itemIndex) as string;
				const extractionValues = this.getNodeParameter('extractionValues', itemIndex) as IDataObject;
				const options = this.getNodeParameter('options', itemIndex, {}) as IDataObject;
				const sourceData = this.getNodeParameter('sourceData', itemIndex) as string;

				const item = items[itemIndex];

				let htmlArray: string[] | string = [];
				if (sourceData === 'json') {
					if (item.json[dataPropertyName] === undefined) {
						throw new NodeOperationError(this.getNode(), `No property named "${dataPropertyName}" exists!`);
					}
					htmlArray = item.json[dataPropertyName] as string;
				} else {
					if (item.binary === undefined) {
						throw new NodeOperationError(this.getNode(), 'The item does not contain binary data!');
					}
					if (item.binary[dataPropertyName] === undefined) {
						throw new NodeOperationError(this.getNode(), `No property named "${dataPropertyName}" exists!`);
					}

					const binaryDataBuffer = await this.helpers.getBinaryDataBuffer(itemIndex, dataPropertyName);
					htmlArray = binaryDataBuffer.toString('utf-8');
				}

				// Always work on an array so that a single string and an array
				// of strings are handled the same way.
				if (!Array.isArray(htmlArray)) {
					htmlArray = [htmlArray];
				}

				// The collection defaults to `{}`, so `values` is missing when no
				// extraction value has been configured; treat that as "nothing to extract".
				const valuesToExtract = (extractionValues.values as IValueData[] | undefined) || [];

				for (const html of htmlArray as string[]) {
					const $ = cheerio.load(html);

					const newItem: INodeExecutionData = {
						json: {},
					};

					// Iterate over all the defined values which should be extracted
					for (const valueData of valuesToExtract) {
						const htmlElement = $(valueData.cssSelector);

						if (valueData.returnArray === true) {
							// An array should be returned, so handle the matched
							// elements one at a time.
							const collected: Array<string | undefined> = [];
							htmlElement.each((i, el) => {
								collected.push(getValue($(el), valueData, options));
							});
							newItem.json[valueData.key] = collected;
						} else {
							// One single value should be returned
							newItem.json[valueData.key] = getValue(htmlElement, valueData, options);
						}
					}

					returnData.push(newItem);
				}
			} catch (error) {
				if (this.continueOnFail()) {
					// Report the failure as data and carry on with the next item.
					returnData.push({ json: { error: (error as Error).message } });
					continue;
				}

				throw error;
			}
		}

		return this.prepareOutputData(returnData);
	}
}