2022-04-08 14:32:08 -07:00
import cheerio from 'cheerio' ;
2023-01-27 03:22:44 -08:00
import type { IExecuteFunctions } from 'n8n-core' ;
import type {
2020-10-01 05:01:39 -07:00
IDataObject ,
2019-12-28 19:28:15 -08:00
INodeExecutionData ,
INodeType ,
INodeTypeDescription ,
} from 'n8n-workflow' ;
2023-01-27 03:22:44 -08:00
import { NodeOperationError } from 'n8n-workflow' ;
2019-12-28 19:28:15 -08:00
2020-09-24 01:02:05 -07:00
// Type of a Cheerio selection, i.e. whatever calling the `cheerio` default export returns.
type Cheerio = ReturnType < typeof cheerio > ;
2019-12-28 19:28:15 -08:00
/**
 * Settings of a single entry of the "Extraction Values" collection.
 */
interface IValueData {
	/** Key under which the extracted value is saved on the output item. */
	key: string;

	/** CSS selector used to look up the element(s) in the HTML. */
	cssSelector: string;

	/** Kind of data to return; used as the lookup key into `extractFunctions`. */
	returnValue: string;

	/** Name of the attribute to read; only shown in the UI when `returnValue` is "attribute". */
	attribute?: string;

	/** Whether every match is returned separately as an array entry. */
	returnArray: boolean;
}
/**
 * The extraction functions, keyed by the value of the "Return Value" option.
 *
 * Each function receives the Cheerio selection to read from and the settings of
 * the extraction value being processed, and returns the extracted string or
 * `undefined` when there is nothing to return.
 */
const extractFunctions: {
	[key: string]: ($: Cheerio, valueData: IValueData) => string | undefined;
} = {
	attribute: ($: Cheerio, valueData: IValueData): string | undefined => {
		// Calling `attr()` with an empty/undefined name makes Cheerio hand back the
		// whole attribute map instead of a string, which breaks the declared
		// return type (and the trimming done by callers). Treat a missing name as
		// "nothing to extract".
		if (!valueData.attribute) {
			return undefined;
		}
		return $.attr(valueData.attribute);
	},

	// `html()` yields null when the selection is empty; `||` maps that (and an
	// empty string) to undefined.
	html: ($: Cheerio, _valueData: IValueData): string | undefined => $.html() || undefined,

	text: ($: Cheerio, _valueData: IValueData): string | undefined => $.text(),

	value: ($: Cheerio, _valueData: IValueData): string | undefined => $.val(),
};
/**
 * Simple helper function which extracts a value and applies the options.
 *
 * @param $ Cheerio selection to extract the value from
 * @param valueData Settings of the extraction value; `returnValue` selects the
 *   entry of `extractFunctions` to run
 * @param options Node options; values are trimmed unless `trimValues` is
 *   explicitly `false`
 * @returns The extracted value, or `undefined` if the extractor found nothing
 * @throws Error if `valueData.returnValue` does not name an extraction function
 */
function getValue($: Cheerio, valueData: IValueData, options: IDataObject) {
	// Own-property lookup so that inherited keys such as "constructor" are not
	// mistaken for an extraction function.
	const extract = Object.prototype.hasOwnProperty.call(extractFunctions, valueData.returnValue)
		? extractFunctions[valueData.returnValue]
		: undefined;
	if (typeof extract !== 'function') {
		throw new Error(`The return value "${valueData.returnValue}" is not supported!`);
	}

	const value = extract($, valueData);

	// Only strings can be trimmed. At runtime an extractor may hand back
	// something else despite its declared type (e.g. Cheerio's `val()` yields an
	// array for `<select multiple>`); pass such values through untouched
	// instead of crashing on `.trim()`.
	if (options.trimValues === false || typeof value !== 'string') {
		return value;
	}

	return value.trim();
}
/**
 * Node which extracts data from HTML with the help of CSS selectors.
 *
 * The HTML is read per input item from either a JSON property (a string or an
 * array of strings) or a binary property. Every HTML document results in one
 * output item which contains one key per configured extraction value.
 */
export class HtmlExtract implements INodeType {
	description: INodeTypeDescription = {
		displayName: 'HTML Extract',
		name: 'htmlExtract',
		icon: 'fa:cut',
		group: ['transform'],
		version: 1,
		hidden: true,
		subtitle: '={{$parameter["sourceData"] + ": " + $parameter["dataPropertyName"]}}',
		description: 'Extracts data from HTML',
		defaults: {
			name: 'HTML Extract',
			color: '#333377',
		},
		inputs: ['main'],
		outputs: ['main'],
		properties: [
			{
				displayName: 'Source Data',
				name: 'sourceData',
				type: 'options',
				options: [
					{
						name: 'Binary',
						value: 'binary',
					},
					{
						name: 'JSON',
						value: 'json',
					},
				],
				default: 'json',
				description: 'If HTML should be read from binary or JSON data',
			},
			{
				displayName: 'Binary Property',
				name: 'dataPropertyName',
				type: 'string',
				displayOptions: {
					show: {
						sourceData: ['binary'],
					},
				},
				default: 'data',
				required: true,
				description:
					'Name of the binary property in which the HTML to extract the data from can be found',
			},
			{
				displayName: 'JSON Property',
				name: 'dataPropertyName',
				type: 'string',
				displayOptions: {
					show: {
						sourceData: ['json'],
					},
				},
				default: 'data',
				required: true,
				description:
					'Name of the JSON property in which the HTML to extract the data from can be found. The property can either contain a string or an array of strings.',
			},
			{
				displayName: 'Extraction Values',
				name: 'extractionValues',
				placeholder: 'Add Value',
				type: 'fixedCollection',
				typeOptions: {
					multipleValues: true,
				},
				default: {},
				options: [
					{
						name: 'values',
						displayName: 'Values',
						values: [
							{
								displayName: 'Key',
								name: 'key',
								type: 'string',
								default: '',
								description: 'The key under which the extracted value should be saved',
							},
							{
								displayName: 'CSS Selector',
								name: 'cssSelector',
								type: 'string',
								default: '',
								placeholder: '.price',
								description: 'The CSS selector to use',
							},
							{
								displayName: 'Return Value',
								name: 'returnValue',
								type: 'options',
								options: [
									{
										name: 'Attribute',
										value: 'attribute',
										description: 'Get an attribute value like "class" from an element',
									},
									{
										name: 'HTML',
										value: 'html',
										description: 'Get the HTML the element contains',
									},
									{
										name: 'Text',
										value: 'text',
										description: 'Get only the text content of the element',
									},
									{
										name: 'Value',
										value: 'value',
										description: 'Get value of an input, select or textarea',
									},
								],
								default: 'text',
								description: 'What kind of data should be returned',
							},
							{
								displayName: 'Attribute',
								name: 'attribute',
								type: 'string',
								displayOptions: {
									show: {
										returnValue: ['attribute'],
									},
								},
								default: '',
								placeholder: 'class',
								description: 'The name of the attribute to return the value off',
							},
							{
								displayName: 'Return Array',
								name: 'returnArray',
								type: 'boolean',
								default: false,
								description:
									'Whether to return the values as an array so if multiple ones get found they also get returned separately. If not set all will be returned as a single string.',
							},
						],
					},
				],
			},
			{
				displayName: 'Options',
				name: 'options',
				type: 'collection',
				placeholder: 'Add Option',
				default: {},
				options: [
					{
						displayName: 'Trim Values',
						name: 'trimValues',
						type: 'boolean',
						default: true,
						description:
							'Whether to remove automatically all spaces and newlines from the beginning and end of the values',
					},
				],
			},
		],
	};

	/**
	 * Runs the extraction for every input item.
	 *
	 * @returns One output item per HTML document found on the input items, each
	 *   paired with the input item it originated from
	 * @throws NodeOperationError if the configured JSON/binary property does not
	 *   exist or the item carries no binary data. When `continueOnFail()` is
	 *   enabled the error message is emitted as an `{ error }` item instead.
	 */
	async execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]> {
		const items = this.getInputData();
		const returnData: INodeExecutionData[] = [];

		for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
			try {
				const dataPropertyName = this.getNodeParameter('dataPropertyName', itemIndex);
				const extractionValues = this.getNodeParameter(
					'extractionValues',
					itemIndex,
				) as IDataObject;
				const options = this.getNodeParameter('options', itemIndex, {});
				const sourceData = this.getNodeParameter('sourceData', itemIndex) as string;

				// The collection defaults to `{}`, so `values` is missing as long as
				// no extraction value got added. Fall back to an empty list instead
				// of failing with "undefined is not iterable".
				const valuesToExtract = (extractionValues.values ?? []) as IValueData[];

				const item = items[itemIndex];

				let htmlArray: string[] | string = [];
				if (sourceData === 'json') {
					if (item.json[dataPropertyName] === undefined) {
						throw new NodeOperationError(
							this.getNode(),
							`No property named "${dataPropertyName}" exists!`,
							{ itemIndex },
						);
					}
					htmlArray = item.json[dataPropertyName] as string;
				} else {
					if (item.binary === undefined) {
						throw new NodeOperationError(this.getNode(), 'The item does not contain binary data!', {
							itemIndex,
						});
					}
					if (item.binary[dataPropertyName] === undefined) {
						throw new NodeOperationError(
							this.getNode(),
							`No property named "${dataPropertyName}" exists!`,
							{ itemIndex },
						);
					}

					const binaryDataBuffer = await this.helpers.getBinaryDataBuffer(
						itemIndex,
						dataPropertyName,
					);
					htmlArray = binaryDataBuffer.toString('utf-8');
				}

				// Always convert it to an array so that it works with a string or an array of strings
				if (!Array.isArray(htmlArray)) {
					htmlArray = [htmlArray];
				}

				for (const html of htmlArray as string[]) {
					const $ = cheerio.load(html);

					const newItem: INodeExecutionData = {
						json: {},
						pairedItem: {
							item: itemIndex,
						},
					};

					// Iterate over all the defined values which should be extracted
					for (const valueData of valuesToExtract) {
						const htmlElement = $(valueData.cssSelector);

						if (valueData.returnArray) {
							// An array should be returned so iterate over one
							// value at a time
							const extracted: Array<string | undefined> = [];
							htmlElement.each((_index, el) => {
								extracted.push(getValue($(el), valueData, options));
							});
							newItem.json[valueData.key] = extracted;
						} else {
							// One single value should be returned
							newItem.json[valueData.key] = getValue(htmlElement, valueData, options);
						}
					}

					returnData.push(newItem);
				}
			} catch (error) {
				if (this.continueOnFail()) {
					// Report the failure as data and carry on with the next item
					returnData.push({
						json: {
							error: error.message,
						},
						pairedItem: {
							item: itemIndex,
						},
					});
					continue;
				}

				throw error;
			}
		}

		return this.prepareOutputData(returnData);
	}
}