mirror of
https://github.com/n8n-io/n8n.git
synced 2025-03-05 20:50:17 -08:00
fix(core): Improve domain and url matching for extractDomain and extractUrl (#6010)
* fix(core): Fix domain and url matching for isDomain/isUrl/extractDomain/extractUrl * Document regex and include www in the domain * Lint fix
This commit is contained in:
parent
71ed1f410c
commit
33fb73217d
|
@ -21,16 +21,95 @@ const hashFunctions: Record<string, typeof CryptoJS.MD5> = {
|
||||||
// All symbols from https://www.xe.com/symbols/ as of 2022/11/09
|
// All symbols from https://www.xe.com/symbols/ as of 2022/11/09
|
||||||
const CURRENCY_REGEXP =
|
const CURRENCY_REGEXP =
|
||||||
/(\u004c\u0065\u006b|\u060b|\u0024|\u0192|\u20bc|\u0042\u0072|\u0042\u005a\u0024|\u0024\u0062|\u004b\u004d|\u0050|\u043b\u0432|\u0052\u0024|\u17db|\u00a5|\u20a1|\u006b\u006e|\u20b1|\u004b\u010d|\u006b\u0072|\u0052\u0044\u0024|\u00a3|\u20ac|\u00a2|\u0051|\u004c|\u0046\u0074|\u20b9|\u0052\u0070|\ufdfc|\u20aa|\u004a\u0024|\u20a9|\u20ad|\u0434\u0435\u043d|\u0052\u004d|\u20a8|\u20ae|\u004d\u0054|\u0043\u0024|\u20a6|\u0042\u002f\u002e|\u0047\u0073|\u0053\u002f\u002e|\u007a\u0142|\u006c\u0065\u0069|\u20bd|\u0414\u0438\u043d\u002e|\u0053|\u0052|\u0043\u0048\u0046|\u004e\u0054\u0024|\u0e3f|\u0054\u0054\u0024|\u20ba|\u20b4|\u0024\u0055|\u0042\u0073|\u20ab|\u005a\u0024)/gu;
|
/(\u004c\u0065\u006b|\u060b|\u0024|\u0192|\u20bc|\u0042\u0072|\u0042\u005a\u0024|\u0024\u0062|\u004b\u004d|\u0050|\u043b\u0432|\u0052\u0024|\u17db|\u00a5|\u20a1|\u006b\u006e|\u20b1|\u004b\u010d|\u006b\u0072|\u0052\u0044\u0024|\u00a3|\u20ac|\u00a2|\u0051|\u004c|\u0046\u0074|\u20b9|\u0052\u0070|\ufdfc|\u20aa|\u004a\u0024|\u20a9|\u20ad|\u0434\u0435\u043d|\u0052\u004d|\u20a8|\u20ae|\u004d\u0054|\u0043\u0024|\u20a6|\u0042\u002f\u002e|\u0047\u0073|\u0053\u002f\u002e|\u007a\u0142|\u006c\u0065\u0069|\u20bd|\u0414\u0438\u043d\u002e|\u0053|\u0052|\u0043\u0048\u0046|\u004e\u0054\u0024|\u0e3f|\u0054\u0054\u0024|\u20ba|\u20b4|\u0024\u0055|\u0042\u0073|\u20ab|\u005a\u0024)/gu;
|
||||||
const DOMAIN_REGEXP = /^[a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9](?:\.[a-zA-Z]{2,})+$/;
|
|
||||||
|
|
||||||
// This won't validate or catch literally valid email address, just what most people
|
/*
|
||||||
// would expect
|
Extract the domain part from various inputs, including URLs, email addresses, and plain domains.
|
||||||
|
|
||||||
|
/^(?:(?:https?|ftp):\/\/)? // Match optional http, https, or ftp protocols
|
||||||
|
(?:mailto:)? // Match optional mailto:
|
||||||
|
(?:\/\/)? // Match optional double slashes
|
||||||
|
( // Capture the domain part
|
||||||
|
(?:www\.)? // Match optional www prefix (kept inside the captured domain)
|
||||||
|
(?:(?:[-\w]+\.)+ // Match one or more subdomains
|
||||||
|
(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode encoded IDN(xn--80aswg.xn--p1ai)
|
||||||
|
|localhost // Match localhost
|
||||||
|
|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses
|
||||||
|
)
|
||||||
|
)
|
||||||
|
(?::\d+)? // Match optional port number
|
||||||
|
(?:\/[^\s?]*)? // Match optional path
|
||||||
|
(?:\?[^\s#]*)? // Match optional query string
|
||||||
|
(?:#[^\s]*)?$/i; // Match optional hash fragment
|
||||||
|
*/
|
||||||
|
const DOMAIN_EXTRACT_REGEXP =
|
||||||
|
/^(?:(?:https?|ftp):\/\/)?(?:mailto:)?(?:\/\/)?((?:www\.)?(?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;
|
||||||
|
|
||||||
|
/*
|
||||||
|
Matches domain names given without a protocol; subdomains are allowed, and a port, path, query string, or fragment may follow
|
||||||
|
|
||||||
|
/^(?:www\.)? // Match optional www prefix
|
||||||
|
( // Capture the domain part
|
||||||
|
(?:(?:[-\w]+\.)+ // Match one or more subdomains
|
||||||
|
(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode encoded IDN
|
||||||
|
|localhost // Match localhost
|
||||||
|
|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses
|
||||||
|
)
|
||||||
|
)
|
||||||
|
(?::\d+)? // Match optional port number
|
||||||
|
(?:\/[^\s?]*)? // Match optional path
|
||||||
|
(?:\?[^\s#]*)? // Match optional query string
|
||||||
|
(?:#[^\s]*)?$/i; // Match optional fragment at the end of the string
|
||||||
|
*/
|
||||||
|
const DOMAIN_REGEXP =
|
||||||
|
/^(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;
|
||||||
|
|
||||||
|
/*
|
||||||
|
Matches email addresses
|
||||||
|
|
||||||
|
/(
|
||||||
|
( // Capture local part of the email address
|
||||||
|
([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*) // One or more characters not in the set, followed by
|
||||||
|
a period, followed by one or more characters not in the set
|
||||||
|
|(".+") // Or one or more characters inside quotes
|
||||||
|
)
|
||||||
|
)
|
||||||
|
@ // Match @ symbol
|
||||||
|
(?<domain>( // Capture the domain part of the email address
|
||||||
|
\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\] // Match IPv4 address inside brackets
|
||||||
|
|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}) // Or match a domain: one or more dot-terminated labels followed by a TLD of at least two letters
|
||||||
|
))/;
|
||||||
|
*/
|
||||||
const EMAIL_REGEXP =
|
const EMAIL_REGEXP =
|
||||||
/(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@(?<domain>(\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/;
|
/(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@(?<domain>(\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/;
|
||||||
|
|
||||||
// This also might not catch every possible URL
|
/*
|
||||||
|
Matches URLs with strict beginning and end of the string checks
|
||||||
|
|
||||||
|
/^(?:(?:https?|ftp):\/\/) // Match http, https, or ftp protocols at the start of the string
|
||||||
|
(?:www\.)? // Match optional www prefix
|
||||||
|
( // Capture the domain part
|
||||||
|
(?:(?:[-\w]+\.)+ // Match one or more subdomains
|
||||||
|
(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode encoded IDN
|
||||||
|
|localhost // Match localhost
|
||||||
|
|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses
|
||||||
|
)
|
||||||
|
)
|
||||||
|
(?::\d+)? // Match optional port number
|
||||||
|
(?:\/[^\s?#]*)? // Match optional path
|
||||||
|
(?:\?[^\s#]*)? // Match optional query string
|
||||||
|
(?=([^\s]+#.*)?) // Positive lookahead for a fragment identifier; its content is optional, so it never rejects a match
|
||||||
|
#?[^\s]*$/i; // Match an optional "#" and any remaining non-whitespace characters up to the end of the string
|
||||||
|
*/
|
||||||
|
const URL_REGEXP_EXACT =
|
||||||
|
/^(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*$/i;
|
||||||
|
|
||||||
|
/*
|
||||||
|
Same as URL_REGEXP_EXACT but without the strict beginning and end of the string checks to allow for
|
||||||
|
matching URLs in the middle of a string
|
||||||
|
*/
|
||||||
const URL_REGEXP =
|
const URL_REGEXP =
|
||||||
/https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{2,}\b([-a-zA-Z0-9()\[\]@:%_\+.~#?&//=]*)/;
|
/(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*/i;
|
||||||
|
|
||||||
const CHAR_TEST_REGEXP = /\p{L}/u;
|
const CHAR_TEST_REGEXP = /\p{L}/u;
|
||||||
const PUNC_TEST_REGEXP = /[!?.]/;
|
const PUNC_TEST_REGEXP = /[!?.]/;
|
||||||
|
@ -182,24 +261,7 @@ function isNumeric(value: string) {
|
||||||
}
|
}
|
||||||
|
|
||||||
function isUrl(value: string) {
|
function isUrl(value: string) {
|
||||||
let url: URL;
|
return URL_REGEXP_EXACT.test(value);
|
||||||
try {
|
|
||||||
url = new URL(value);
|
|
||||||
} catch {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// URL constructor tolerates missing `//` after protocol so check manually
|
|
||||||
for (const scheme of ['http:', 'https:']) {
|
|
||||||
if (
|
|
||||||
url.protocol === scheme &&
|
|
||||||
value.slice(scheme.length, scheme.length + '//'.length) === '//'
|
|
||||||
) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function isDomain(value: string) {
|
function isDomain(value: string) {
|
||||||
|
@ -272,9 +334,13 @@ function extractDomain(value: string) {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
return matched.groups?.domain;
|
return matched.groups?.domain;
|
||||||
} else if (isUrl(value)) {
|
|
||||||
return new URL(value).hostname;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const domainMatch = value.match(DOMAIN_EXTRACT_REGEXP);
|
||||||
|
if (domainMatch) {
|
||||||
|
return domainMatch[1];
|
||||||
|
}
|
||||||
|
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -97,13 +97,43 @@ describe('Data Transformation Functions', () => {
|
||||||
|
|
||||||
test('.isUrl should work on a string', () => {
|
test('.isUrl should work on a string', () => {
|
||||||
expect(evaluate('={{ "https://example.com/".isUrl() }}')).toEqual(true);
|
expect(evaluate('={{ "https://example.com/".isUrl() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "http://example.com/".isUrl() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "ftp://example.com/".isUrl() }}')).toEqual(true);
|
||||||
expect(evaluate('={{ "example.com".isUrl() }}')).toEqual(false);
|
expect(evaluate('={{ "example.com".isUrl() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "www.example.com".isUrl() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "https://www.example.com/".isUrl() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "https://example.com/path".isUrl() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "https://example.com/path?query=1".isUrl() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "https://example.com/path#fragment".isUrl() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "https://example.com:8080".isUrl() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "https://example.com?query=1".isUrl() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "https://example.com#fragment".isUrl() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "example.com/path".isUrl() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "http:///".isUrl() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "https://".isUrl() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "example".isUrl() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "".isUrl() }}')).toEqual(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('.isDomain should work on a string', () => {
|
test('.isDomain should work on a string', () => {
|
||||||
expect(evaluate('={{ "example.com".isDomain() }}')).toEqual(true);
|
expect(evaluate('={{ "example.com".isDomain() }}')).toEqual(true);
|
||||||
expect(evaluate('={{ "asdf".isDomain() }}')).toEqual(false);
|
expect(evaluate('={{ "asdf".isDomain() }}')).toEqual(false);
|
||||||
expect(evaluate('={{ "https://example.com/".isDomain() }}')).toEqual(false);
|
expect(evaluate('={{ "https://example.com/".isDomain() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "www.example.com".isDomain() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "subdomain.example.com".isDomain() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "example.co.uk".isDomain() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "example".isDomain() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "example.".isDomain() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ ".com".isDomain() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "example..com".isDomain() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "example_com".isDomain() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "example/com".isDomain() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "example com".isDomain() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "www.example..com".isDomain() }}')).toEqual(false);
|
||||||
|
expect(evaluate('={{ "123.com".isDomain() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "xn--80aswg.xn--p1ai".isDomain() }}')).toEqual(true); // Punycode domain
|
||||||
|
expect(evaluate('={{ "example.com:8080".isDomain() }}')).toEqual(true);
|
||||||
|
expect(evaluate('={{ "".isDomain() }}')).toEqual(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('.toSnakeCase should work on a string', () => {
|
test('.toSnakeCase should work on a string', () => {
|
||||||
|
@ -127,11 +157,43 @@ describe('Data Transformation Functions', () => {
|
||||||
'={{ "I am a test with a url: https://example.net/ and I am a test with an email: test@example.org".extractUrl() }}',
|
'={{ "I am a test with a url: https://example.net/ and I am a test with an email: test@example.org".extractUrl() }}',
|
||||||
),
|
),
|
||||||
).toEqual('https://example.net/');
|
).toEqual('https://example.net/');
|
||||||
|
expect(evaluate('={{ "Check this out: https://subdomain.example.com:3000/path?q=1#hash".extractUrl() }}')).toEqual('https://subdomain.example.com:3000/path?q=1#hash');
|
||||||
|
expect(evaluate('={{ "Invalid URL: http:///example.com".extractUrl() }}')).toEqual(undefined);
|
||||||
|
expect(evaluate('={{ "Mixed content: https://www.example.com and http://www.example.org".extractUrl() }}')).toEqual('https://www.example.com');
|
||||||
|
expect(evaluate('={{ "Text without URL: This is just a simple text".extractUrl() }}')).toEqual(undefined);
|
||||||
|
expect(evaluate('={{ "URL with Unicode: http://www.xn--80aswg.xn--j1amh".extractUrl() }}')).toEqual('http://www.xn--80aswg.xn--j1amh');
|
||||||
|
expect(evaluate('={{ "Localhost URL: http://localhost:8080/test?x=1".extractUrl() }}')).toEqual('http://localhost:8080/test?x=1');
|
||||||
|
expect(evaluate('={{ "IP URL: http://192.168.1.1:8000/path?q=value#frag".extractUrl() }}')).toEqual('http://192.168.1.1:8000/path?q=value#frag');
|
||||||
});
|
});
|
||||||
|
|
||||||
test('.extractDomain should work on a string', () => {
|
test('.extractDomain should work on a string', () => {
|
||||||
expect(evaluate('={{ "test@example.org".extractDomain() }}')).toEqual('example.org');
|
expect(evaluate('={{ "test@example.org".extractDomain() }}')).toEqual('example.org');
|
||||||
expect(evaluate('={{ "https://example.org/".extractDomain() }}')).toEqual('example.org');
|
expect(evaluate('={{ "https://example.org/".extractDomain() }}')).toEqual('example.org');
|
||||||
|
expect(evaluate('={{ "https://www.google.com".extractDomain() }}')).toEqual('www.google.com');
|
||||||
|
expect(evaluate('={{ "http://example.org".extractDomain() }}')).toEqual('example.org');
|
||||||
|
expect(evaluate('={{ "ftp://ftp.example.com".extractDomain() }}')).toEqual('ftp.example.com');
|
||||||
|
expect(evaluate('={{ "google.com".extractDomain() }}')).toEqual('google.com');
|
||||||
|
expect(evaluate('={{ "www.example.net".extractDomain() }}')).toEqual('www.example.net');
|
||||||
|
expect(evaluate('={{ "//example.com".extractDomain() }}')).toEqual('example.com');
|
||||||
|
expect(evaluate('={{ "mailto:john.doe@example.com".extractDomain() }}')).toEqual('example.com');
|
||||||
|
expect(evaluate('={{ "tel:+1-555-123-4567".extractDomain() }}')).toEqual(undefined);
|
||||||
|
expect(evaluate('={{ "jane.doe@example.org".extractDomain() }}')).toEqual('example.org');
|
||||||
|
expect(evaluate('={{ "name+tag@example.com".extractDomain() }}')).toEqual('example.com');
|
||||||
|
expect(evaluate('={{ "first.last@example.co.uk".extractDomain() }}')).toEqual('example.co.uk');
|
||||||
|
expect(evaluate('={{ "user@subdomain.example.com".extractDomain() }}')).toEqual('subdomain.example.com');
|
||||||
|
expect(evaluate('={{ "www.example.net?test=1213".extractDomain() }}')).toEqual('www.example.net');
|
||||||
|
expect(evaluate('={{ "www.example.net?test".extractDomain() }}')).toEqual('www.example.net');
|
||||||
|
expect(evaluate('={{ "www.example.net#tesdt123".extractDomain() }}')).toEqual('www.example.net');
|
||||||
|
expect(evaluate('={{ "https://www.example.net?test=1213".extractDomain() }}')).toEqual('www.example.net');
|
||||||
|
expect(evaluate('={{ "https://www.example.net?test".extractDomain() }}')).toEqual('www.example.net');
|
||||||
|
expect(evaluate('={{ "https://www.example.net#tesdt123".extractDomain() }}')).toEqual('www.example.net');
|
||||||
|
expect(evaluate('={{ "https://192.168.1.1".extractDomain() }}')).toEqual('192.168.1.1');
|
||||||
|
expect(evaluate('={{ "http://www.xn--80aswg.xn--j1amh".extractDomain() }}')).toEqual('www.xn--80aswg.xn--j1amh');
|
||||||
|
expect(evaluate('={{ "https://localhost".extractDomain() }}')).toEqual('localhost');
|
||||||
|
expect(evaluate('={{ "https://localhost?test=123".extractDomain() }}')).toEqual('localhost');
|
||||||
|
expect(evaluate('={{ "https://www.example_with_underscore.com".extractDomain() }}')).toEqual('www.example_with_underscore.com');
|
||||||
|
expect(evaluate('={{ "https://www.example.com:8080".extractDomain() }}')).toEqual('www.example.com');
|
||||||
|
expect(evaluate('={{ "https://example.space".extractDomain() }}')).toEqual('example.space');
|
||||||
});
|
});
|
||||||
|
|
||||||
test('.extractEmail should work on a string', () => {
|
test('.extractEmail should work on a string', () => {
|
||||||
|
|
Loading…
Reference in a new issue