mirror of
https://github.com/n8n-io/n8n.git
synced 2024-11-10 06:34:05 -08:00
fix(core): Improve domain and url matching for extractDomain and extractUrl (#6010)
* fix(core): Fix domain and url matching for isDomain/isUrl/extractDomain/extractUrl * Document regex and include www in the domain * Lint fix
This commit is contained in:
parent
71ed1f410c
commit
33fb73217d
|
@ -21,16 +21,95 @@ const hashFunctions: Record<string, typeof CryptoJS.MD5> = {
|
|||
// All symbols from https://www.xe.com/symbols/ as of 2022/11/09
|
||||
const CURRENCY_REGEXP =
|
||||
/(\u004c\u0065\u006b|\u060b|\u0024|\u0192|\u20bc|\u0042\u0072|\u0042\u005a\u0024|\u0024\u0062|\u004b\u004d|\u0050|\u043b\u0432|\u0052\u0024|\u17db|\u00a5|\u20a1|\u006b\u006e|\u20b1|\u004b\u010d|\u006b\u0072|\u0052\u0044\u0024|\u00a3|\u20ac|\u00a2|\u0051|\u004c|\u0046\u0074|\u20b9|\u0052\u0070|\ufdfc|\u20aa|\u004a\u0024|\u20a9|\u20ad|\u0434\u0435\u043d|\u0052\u004d|\u20a8|\u20ae|\u004d\u0054|\u0043\u0024|\u20a6|\u0042\u002f\u002e|\u0047\u0073|\u0053\u002f\u002e|\u007a\u0142|\u006c\u0065\u0069|\u20bd|\u0414\u0438\u043d\u002e|\u0053|\u0052|\u0043\u0048\u0046|\u004e\u0054\u0024|\u0e3f|\u0054\u0054\u0024|\u20ba|\u20b4|\u0024\u0055|\u0042\u0073|\u20ab|\u005a\u0024)/gu;
|
||||
const DOMAIN_REGEXP = /^[a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9](?:\.[a-zA-Z]{2,})+$/;
|
||||
|
||||
// This won't validate or catch every technically valid email address, just what most people
|
||||
// would expect
|
||||
/*
|
||||
Extract the domain part from various inputs, including URLs, email addresses, and plain domains.
|
||||
|
||||
/^(?:(?:https?|ftp):\/\/)? // Match optional http, https, or ftp protocols
|
||||
(?:mailto:)? // Match optional mailto:
|
||||
(?:\/\/)? // Match optional double slashes
|
||||
( // Capture the domain part
|
||||
(?:www\.)? // Match optional www prefix (kept as part of the captured domain)
|
||||
(?:(?:[-\w]+\.)+ // Match one or more subdomains
|
||||
(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode-encoded IDN (e.g. xn--80aswg.xn--p1ai)
|
||||
|localhost // Match localhost
|
||||
|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses
|
||||
)
|
||||
)
|
||||
(?::\d+)? // Match optional port number
|
||||
(?:\/[^\s?]*)? // Match optional path
|
||||
(?:\?[^\s#]*)? // Match optional query string
|
||||
(?:#[^\s]*)?$/i; // Match optional hash fragment
|
||||
*/
|
||||
const DOMAIN_EXTRACT_REGEXP =
|
||||
/^(?:(?:https?|ftp):\/\/)?(?:mailto:)?(?:\/\/)?((?:www\.)?(?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;
|
||||
|
||||
/*
|
||||
Matches domain names given without a protocol; an optional www prefix, subdomains, port, path, query string and fragment are allowed
|
||||
|
||||
/^(?:www\.)? // Match optional www prefix
|
||||
( // Capture the domain part
|
||||
(?:(?:[-\w]+\.)+ // Match one or more subdomains
|
||||
(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode encoded IDN
|
||||
|localhost // Match localhost
|
||||
|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses
|
||||
)
|
||||
)
|
||||
(?::\d+)? // Match optional port number
|
||||
(?:\/[^\s?]*)? // Match optional path
|
||||
(?:\?[^\s#]*)? // Match optional query string
|
||||
(?:#[^\s]*)?$/i; // Match optional fragment at the end of the string
|
||||
*/
|
||||
const DOMAIN_REGEXP =
|
||||
/^(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;
|
||||
|
||||
/*
|
||||
Matches email addresses
|
||||
|
||||
/(
|
||||
( // Capture local part of the email address
|
||||
([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*) // One or more characters not in the set, followed by
|
||||
zero or more groups of a period plus one or more characters not in the set
|
||||
|(".+") // Or one or more characters inside quotes
|
||||
)
|
||||
)
|
||||
@ // Match @ symbol
|
||||
(?<domain>( // Capture the domain part of the email address
|
||||
\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\] // Match IPv4 address inside brackets
|
||||
|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}) // Or match one or more dot-separated labels followed by a TLD of at least two letters
|
||||
))/;
|
||||
*/
|
||||
const EMAIL_REGEXP =
|
||||
/(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@(?<domain>(\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/;
|
||||
|
||||
// This also might not catch every possible URL
|
||||
/*
|
||||
Matches URLs with strict beginning and end of the string checks
|
||||
|
||||
/^(?:(?:https?|ftp):\/\/) // Match http, https, or ftp protocols at the start of the string
|
||||
(?:www\.)? // Match optional www prefix
|
||||
( // Capture the domain part
|
||||
(?:(?:[-\w]+\.)+ // Match one or more subdomains
|
||||
(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) // Match top-level domain or Punycode encoded IDN
|
||||
|localhost // Match localhost
|
||||
|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses
|
||||
)
|
||||
)
|
||||
(?::\d+)? // Match optional port number
|
||||
(?:\/[^\s?#]*)? // Match optional path
|
||||
(?:\?[^\s#]*)? // Match optional query string
|
||||
(?=([^\s]+#.*)?) // Positive lookahead for the fragment identifier
|
||||
#?[^\s]*$/i; // Match optional fragment at the end of the string
|
||||
*/
|
||||
const URL_REGEXP_EXACT =
|
||||
/^(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*$/i;
|
||||
|
||||
/*
|
||||
Same as URL_REGEXP_EXACT but without the strict beginning and end of the string checks to allow for
|
||||
matching URLs in the middle of a string
|
||||
*/
|
||||
const URL_REGEXP =
|
||||
/https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{2,}\b([-a-zA-Z0-9()\[\]@:%_\+.~#?&//=]*)/;
|
||||
/(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*/i;
|
||||
|
||||
const CHAR_TEST_REGEXP = /\p{L}/u;
|
||||
const PUNC_TEST_REGEXP = /[!?.]/;
|
||||
|
@ -182,24 +261,7 @@ function isNumeric(value: string) {
|
|||
}
|
||||
|
||||
function isUrl(value: string) {
|
||||
let url: URL;
|
||||
try {
|
||||
url = new URL(value);
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
|
||||
// URL constructor tolerates missing `//` after protocol so check manually
|
||||
for (const scheme of ['http:', 'https:']) {
|
||||
if (
|
||||
url.protocol === scheme &&
|
||||
value.slice(scheme.length, scheme.length + '//'.length) === '//'
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
return URL_REGEXP_EXACT.test(value);
|
||||
}
|
||||
|
||||
function isDomain(value: string) {
|
||||
|
@ -272,9 +334,13 @@ function extractDomain(value: string) {
|
|||
return undefined;
|
||||
}
|
||||
return matched.groups?.domain;
|
||||
} else if (isUrl(value)) {
|
||||
return new URL(value).hostname;
|
||||
}
|
||||
|
||||
const domainMatch = value.match(DOMAIN_EXTRACT_REGEXP);
|
||||
if (domainMatch) {
|
||||
return domainMatch[1];
|
||||
}
|
||||
|
||||
return undefined;
|
||||
}
|
||||
|
||||
|
|
|
@ -97,13 +97,43 @@ describe('Data Transformation Functions', () => {
|
|||
|
||||
test('.isUrl should work on a string', () => {
|
||||
expect(evaluate('={{ "https://example.com/".isUrl() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "http://example.com/".isUrl() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "ftp://example.com/".isUrl() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "example.com".isUrl() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "www.example.com".isUrl() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "https://www.example.com/".isUrl() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "https://example.com/path".isUrl() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "https://example.com/path?query=1".isUrl() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "https://example.com/path#fragment".isUrl() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "https://example.com:8080".isUrl() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "https://example.com?query=1".isUrl() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "https://example.com#fragment".isUrl() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "example.com/path".isUrl() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "http:///".isUrl() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "https://".isUrl() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "example".isUrl() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "".isUrl() }}')).toEqual(false);
|
||||
});
|
||||
|
||||
test('.isDomain should work on a string', () => {
|
||||
expect(evaluate('={{ "example.com".isDomain() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "asdf".isDomain() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "https://example.com/".isDomain() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "www.example.com".isDomain() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "subdomain.example.com".isDomain() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "example.co.uk".isDomain() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "example".isDomain() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "example.".isDomain() }}')).toEqual(false);
|
||||
expect(evaluate('={{ ".com".isDomain() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "example..com".isDomain() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "example_com".isDomain() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "example/com".isDomain() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "example com".isDomain() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "www.example..com".isDomain() }}')).toEqual(false);
|
||||
expect(evaluate('={{ "123.com".isDomain() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "xn--80aswg.xn--p1ai".isDomain() }}')).toEqual(true); // Punycode domain
|
||||
expect(evaluate('={{ "example.com:8080".isDomain() }}')).toEqual(true);
|
||||
expect(evaluate('={{ "".isDomain() }}')).toEqual(false);
|
||||
});
|
||||
|
||||
test('.toSnakeCase should work on a string', () => {
|
||||
|
@ -127,11 +157,43 @@ describe('Data Transformation Functions', () => {
|
|||
'={{ "I am a test with a url: https://example.net/ and I am a test with an email: test@example.org".extractUrl() }}',
|
||||
),
|
||||
).toEqual('https://example.net/');
|
||||
expect(evaluate('={{ "Check this out: https://subdomain.example.com:3000/path?q=1#hash".extractUrl() }}')).toEqual('https://subdomain.example.com:3000/path?q=1#hash');
|
||||
expect(evaluate('={{ "Invalid URL: http:///example.com".extractUrl() }}')).toEqual(undefined);
|
||||
expect(evaluate('={{ "Mixed content: https://www.example.com and http://www.example.org".extractUrl() }}')).toEqual('https://www.example.com');
|
||||
expect(evaluate('={{ "Text without URL: This is just a simple text".extractUrl() }}')).toEqual(undefined);
|
||||
expect(evaluate('={{ "URL with Unicode: http://www.xn--80aswg.xn--j1amh".extractUrl() }}')).toEqual('http://www.xn--80aswg.xn--j1amh');
|
||||
expect(evaluate('={{ "Localhost URL: http://localhost:8080/test?x=1".extractUrl() }}')).toEqual('http://localhost:8080/test?x=1');
|
||||
expect(evaluate('={{ "IP URL: http://192.168.1.1:8000/path?q=value#frag".extractUrl() }}')).toEqual('http://192.168.1.1:8000/path?q=value#frag');
|
||||
});
|
||||
|
||||
test('.extractDomain should work on a string', () => {
|
||||
expect(evaluate('={{ "test@example.org".extractDomain() }}')).toEqual('example.org');
|
||||
expect(evaluate('={{ "https://example.org/".extractDomain() }}')).toEqual('example.org');
|
||||
expect(evaluate('={{ "https://www.google.com".extractDomain() }}')).toEqual('www.google.com');
|
||||
expect(evaluate('={{ "http://example.org".extractDomain() }}')).toEqual('example.org');
|
||||
expect(evaluate('={{ "ftp://ftp.example.com".extractDomain() }}')).toEqual('ftp.example.com');
|
||||
expect(evaluate('={{ "google.com".extractDomain() }}')).toEqual('google.com');
|
||||
expect(evaluate('={{ "www.example.net".extractDomain() }}')).toEqual('www.example.net');
|
||||
expect(evaluate('={{ "//example.com".extractDomain() }}')).toEqual('example.com');
|
||||
expect(evaluate('={{ "mailto:john.doe@example.com".extractDomain() }}')).toEqual('example.com');
|
||||
expect(evaluate('={{ "tel:+1-555-123-4567".extractDomain() }}')).toEqual(undefined);
|
||||
expect(evaluate('={{ "jane.doe@example.org".extractDomain() }}')).toEqual('example.org');
|
||||
expect(evaluate('={{ "name+tag@example.com".extractDomain() }}')).toEqual('example.com');
|
||||
expect(evaluate('={{ "first.last@example.co.uk".extractDomain() }}')).toEqual('example.co.uk');
|
||||
expect(evaluate('={{ "user@subdomain.example.com".extractDomain() }}')).toEqual('subdomain.example.com');
|
||||
expect(evaluate('={{ "www.example.net?test=1213".extractDomain() }}')).toEqual('www.example.net');
|
||||
expect(evaluate('={{ "www.example.net?test".extractDomain() }}')).toEqual('www.example.net');
|
||||
expect(evaluate('={{ "www.example.net#tesdt123".extractDomain() }}')).toEqual('www.example.net');
|
||||
expect(evaluate('={{ "https://www.example.net?test=1213".extractDomain() }}')).toEqual('www.example.net');
|
||||
expect(evaluate('={{ "https://www.example.net?test".extractDomain() }}')).toEqual('www.example.net');
|
||||
expect(evaluate('={{ "https://www.example.net#tesdt123".extractDomain() }}')).toEqual('www.example.net');
|
||||
expect(evaluate('={{ "https://192.168.1.1".extractDomain() }}')).toEqual('192.168.1.1');
|
||||
expect(evaluate('={{ "http://www.xn--80aswg.xn--j1amh".extractDomain() }}')).toEqual('www.xn--80aswg.xn--j1amh');
|
||||
expect(evaluate('={{ "https://localhost".extractDomain() }}')).toEqual('localhost');
|
||||
expect(evaluate('={{ "https://localhost?test=123".extractDomain() }}')).toEqual('localhost');
|
||||
expect(evaluate('={{ "https://www.example_with_underscore.com".extractDomain() }}')).toEqual('www.example_with_underscore.com');
|
||||
expect(evaluate('={{ "https://www.example.com:8080".extractDomain() }}')).toEqual('www.example.com');
|
||||
expect(evaluate('={{ "https://example.space".extractDomain() }}')).toEqual('example.space');
|
||||
});
|
||||
|
||||
test('.extractEmail should work on a string', () => {
|
||||
|
|
Loading…
Reference in a new issue