fix(core): Improve domain and url matching for extractDomain and extractUrl (#6010)

* fix(core): Fix domain and url matching for isDomain/isUrl/extractDomain/extractUrl * Document regex and include www in the domain * Lint fix
2025-03-05 20:50:17 -08:00 · 2023-04-20 09:01:27 +02:00 · 2023-04-20 09:01:27 +02:00 · 33fb73217d
parent 71ed1f410c
commit 33fb73217d
2 changed files with 153 additions and 25 deletions
--- a/packages/workflow/src/Extensions/StringExtensions.ts
+++ b/packages/workflow/src/Extensions/StringExtensions.ts
@ -21,16 +21,95 @@ const hashFunctions: Record<string, typeof CryptoJS.MD5> = {
 // All symbols from https://www.xe.com/symbols/ as for 2022/11/09
 const CURRENCY_REGEXP =
 	/(\u004c\u0065\u006b|\u060b|\u0024|\u0192|\u20bc|\u0042\u0072|\u0042\u005a\u0024|\u0024\u0062|\u004b\u004d|\u0050|\u043b\u0432|\u0052\u0024|\u17db|\u00a5|\u20a1|\u006b\u006e|\u20b1|\u004b\u010d|\u006b\u0072|\u0052\u0044\u0024|\u00a3|\u20ac|\u00a2|\u0051|\u004c|\u0046\u0074|\u20b9|\u0052\u0070|\ufdfc|\u20aa|\u004a\u0024|\u20a9|\u20ad|\u0434\u0435\u043d|\u0052\u004d|\u20a8|\u20ae|\u004d\u0054|\u0043\u0024|\u20a6|\u0042\u002f\u002e|\u0047\u0073|\u0053\u002f\u002e|\u007a\u0142|\u006c\u0065\u0069|\u20bd|\u0414\u0438\u043d\u002e|\u0053|\u0052|\u0043\u0048\u0046|\u004e\u0054\u0024|\u0e3f|\u0054\u0054\u0024|\u20ba|\u20b4|\u0024\u0055|\u0042\u0073|\u20ab|\u005a\u0024)/gu;
-const DOMAIN_REGEXP = /^[a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9](?:\.[a-zA-Z]{2,})+$/;

-// This won't validate or catch literally valid email address, just what most people
-// would expect
+/*
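+	Extract the domain part from URLs and plain domains, with an optional protocol, mailto: or // prefix.
+	The pattern does not accept a user@ part before the domain, so plain email addresses do not match it.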
+
+	/^(?:(?:https?|ftp):\/\/)? 								// Match optional http, https, or ftp protocols
+  (?:mailto:)?               								// Match optional mailto:
+  (?:\/\/)?                  								// Match optional double slashes
+  (                           							// Capture the domain part
+    (?:www\.)?               								// Match optional www prefix (kept inside the captured domain)
+    (?:(?:[-\w]+\.)+          							// Match one or more subdomains
+      (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) 		// Match top-level domain or Punycode encoded IDN(xn--80aswg.xn--p1ai)
+      |localhost              							// Match localhost
+      |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} 	// Match IPv4 addresses
+    )
+  )
+  (?::\d+)?                   							// Match optional port number
+  (?:\/[^\s?]*)?              							// Match optional path
+  (?:\?[^\s#]*)?              							// Match optional query string
+  (?:#[^\s]*)?$/i;            							// Match optional hash fragment
+*/
+const DOMAIN_EXTRACT_REGEXP =
+	/^(?:(?:https?|ftp):\/\/)?(?:mailto:)?(?:\/\/)?((?:www\.)?(?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;
+
+/*
+	Matches domain names that have no protocol prefix. An optional www prefix (not captured), subdomains,
+	port number, path, query string and fragment are allowed.
+
+	/^(?:www\.)? 															// Match optional www prefix
+  (                         								// Capture the domain part
+    (?:(?:[-\w]+\.)+        								// Match one or more subdomains
+      (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) 		// Match top-level domain or Punycode encoded IDN
+      |localhost            								// Match localhost
+      |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} 	// Match IPv4 addresses
+    )
+  )
+  (?::\d+)?                 								// Match optional port number
+  (?:\/[^\s?]*)?            								// Match optional path
+  (?:\?[^\s#]*)?            								// Match optional query string
+  (?:#[^\s]*)?$/i;          								// Match optional fragment at the end of the string
+*/
+const DOMAIN_REGEXP =
+	/^(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?$/i;
+
+/*
+	Matches email addresses
+
+	/(
+    ( 																											// Capture local part of the email address
+      ([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*) 	// One or more characters not in the set, optionally followed by
+																														further groups of a period plus one or more characters not in the set
+      |(".+") 																							// Or one or more characters inside quotes
+    )
+  )
+  @                             														// Match @ symbol
+  (?<domain>(                   														// Capture the domain part of the email address
+    \[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\] 			// Match IPv4 address inside brackets
+    |(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}) 											// Or match one or more dot-terminated labels followed by a TLD of at least two letters
+  ))/;
+*/
 const EMAIL_REGEXP =
 	/(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@(?<domain>(\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/;

-// This also might not catch every possible URL
+/*
+	Matches URLs with strict beginning and end of the string checks
+
+	/^(?:(?:https?|ftp):\/\/) 							// Match http, https, or ftp protocols at the start of the string
+  (?:www\.)?               								// Match optional www prefix
+  (                        								// Capture the domain part
+    (?:(?:[-\w]+\.)+       								// Match one or more subdomains
+      (?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+) 	// Match top-level domain or Punycode encoded IDN
+      |localhost           								// Match localhost
+      |\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} // Match IPv4 addresses
+    )
+  )
+  (?::\d+)?                								// Match optional port number
+  (?:\/[^\s?#]*)?          								// Match optional path
+  (?:\?[^\s#]*)?           								// Match optional query string
+  (?=([^\s]+#.*)?)         								// Lookahead for a fragment identifier; its content is optional, so it never rejects a match
+  #?[^\s]*$/i;              							// Match an optional # and any remaining non-whitespace characters up to the end of the string
+*/
+const URL_REGEXP_EXACT =
+	/^(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*$/i;
+
+/*
+	Same as URL_REGEXP_EXACT but without the strict beginning and end of the string checks to allow for
+	matching URLs in the middle of a string
+*/
 const URL_REGEXP =
-	/https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{2,}\b([-a-zA-Z0-9()\[\]@:%_\+.~#?&//=]*)/;
+	/(?:(?:https?|ftp):\/\/)(?:www\.)?((?:(?:[-\w]+\.)+(?:[a-zA-Z]{2,}|xn--[a-zA-Z0-9]+)|localhost|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?=([^\s]+#.*)?)#?[^\s]*/i;

 const CHAR_TEST_REGEXP = /\p{L}/u;
 const PUNC_TEST_REGEXP = /[!?.]/;
@ -182,24 +261,7 @@ function isNumeric(value: string) {
 }

 function isUrl(value: string) {
-	let url: URL;
-	try {
-		url = new URL(value);
-	} catch {
-		return false;
-	}
-
-	// URL constructor tolerates missing `//` after protocol so check manually
-	for (const scheme of ['http:', 'https:']) {
-		if (
-			url.protocol === scheme &&
-			value.slice(scheme.length, scheme.length + '//'.length) === '//'
-		) {
-			return true;
-		}
-	}
-
-	return false;
+	return URL_REGEXP_EXACT.test(value);
 }

 function isDomain(value: string) {
@ -272,9 +334,13 @@ function extractDomain(value: string) {
 			return undefined;
 		}
 		return matched.groups?.domain;
-	} else if (isUrl(value)) {
-		return new URL(value).hostname;
 	}
+
+	const domainMatch = value.match(DOMAIN_EXTRACT_REGEXP);
+	if (domainMatch) {
+		return domainMatch[1];
+	}
+
 	return undefined;
 }

--- a/packages/workflow/test/ExpressionExtensions/StringExtensions.test.ts
+++ b/packages/workflow/test/ExpressionExtensions/StringExtensions.test.ts
@ -97,13 +97,43 @@ describe('Data Transformation Functions', () => {

 		test('.isUrl should work on a string', () => {
 			expect(evaluate('={{ "https://example.com/".isUrl() }}')).toEqual(true);
+			expect(evaluate('={{ "http://example.com/".isUrl() }}')).toEqual(true);
+			expect(evaluate('={{ "ftp://example.com/".isUrl() }}')).toEqual(true);
 			expect(evaluate('={{ "example.com".isUrl() }}')).toEqual(false);
+			expect(evaluate('={{ "www.example.com".isUrl() }}')).toEqual(false);
+			expect(evaluate('={{ "https://www.example.com/".isUrl() }}')).toEqual(true);
+			expect(evaluate('={{ "https://example.com/path".isUrl() }}')).toEqual(true);
+			expect(evaluate('={{ "https://example.com/path?query=1".isUrl() }}')).toEqual(true);
+			expect(evaluate('={{ "https://example.com/path#fragment".isUrl() }}')).toEqual(true);
+			expect(evaluate('={{ "https://example.com:8080".isUrl() }}')).toEqual(true);
+			expect(evaluate('={{ "https://example.com?query=1".isUrl() }}')).toEqual(true);
+			expect(evaluate('={{ "https://example.com#fragment".isUrl() }}')).toEqual(true);
+			expect(evaluate('={{ "example.com/path".isUrl() }}')).toEqual(false);
+			expect(evaluate('={{ "http:///".isUrl() }}')).toEqual(false);
+			expect(evaluate('={{ "https://".isUrl() }}')).toEqual(false);
+			expect(evaluate('={{ "example".isUrl() }}')).toEqual(false);
+			expect(evaluate('={{ "".isUrl() }}')).toEqual(false);
 		});

 		test('.isDomain should work on a string', () => {
 			expect(evaluate('={{ "example.com".isDomain() }}')).toEqual(true);
 			expect(evaluate('={{ "asdf".isDomain() }}')).toEqual(false);
 			expect(evaluate('={{ "https://example.com/".isDomain() }}')).toEqual(false);
+			expect(evaluate('={{ "www.example.com".isDomain() }}')).toEqual(true);
+			expect(evaluate('={{ "subdomain.example.com".isDomain() }}')).toEqual(true);
+			expect(evaluate('={{ "example.co.uk".isDomain() }}')).toEqual(true);
+			expect(evaluate('={{ "example".isDomain() }}')).toEqual(false);
+			expect(evaluate('={{ "example.".isDomain() }}')).toEqual(false);
+			expect(evaluate('={{ ".com".isDomain() }}')).toEqual(false);
+			expect(evaluate('={{ "example..com".isDomain() }}')).toEqual(false);
+			expect(evaluate('={{ "example_com".isDomain() }}')).toEqual(false);
+			expect(evaluate('={{ "example/com".isDomain() }}')).toEqual(false);
+			expect(evaluate('={{ "example com".isDomain() }}')).toEqual(false);
+			expect(evaluate('={{ "www.example..com".isDomain() }}')).toEqual(false);
+			expect(evaluate('={{ "123.com".isDomain() }}')).toEqual(true);
+			expect(evaluate('={{ "xn--80aswg.xn--p1ai".isDomain() }}')).toEqual(true); // Punycode domain
+			expect(evaluate('={{ "example.com:8080".isDomain() }}')).toEqual(true);
+			expect(evaluate('={{ "".isDomain() }}')).toEqual(false);
 		});

 		test('.toSnakeCase should work on a string', () => {
@ -127,11 +157,43 @@ describe('Data Transformation Functions', () => {
 					'={{ "I am a test with a url: https://example.net/ and I am a test with an email: test@example.org".extractUrl() }}',
 				),
 			).toEqual('https://example.net/');
+			expect(evaluate('={{ "Check this out: https://subdomain.example.com:3000/path?q=1#hash".extractUrl() }}')).toEqual('https://subdomain.example.com:3000/path?q=1#hash');
+			expect(evaluate('={{ "Invalid URL: http:///example.com".extractUrl() }}')).toEqual(undefined);
+			expect(evaluate('={{ "Mixed content: https://www.example.com and http://www.example.org".extractUrl() }}')).toEqual('https://www.example.com');
+			expect(evaluate('={{ "Text without URL: This is just a simple text".extractUrl() }}')).toEqual(undefined);
+			expect(evaluate('={{ "URL with Unicode: http://www.xn--80aswg.xn--j1amh".extractUrl() }}')).toEqual('http://www.xn--80aswg.xn--j1amh');
+			expect(evaluate('={{ "Localhost URL: http://localhost:8080/test?x=1".extractUrl() }}')).toEqual('http://localhost:8080/test?x=1');
+			expect(evaluate('={{ "IP URL: http://192.168.1.1:8000/path?q=value#frag".extractUrl() }}')).toEqual('http://192.168.1.1:8000/path?q=value#frag');
 		});

 		test('.extractDomain should work on a string', () => {
 			expect(evaluate('={{ "test@example.org".extractDomain() }}')).toEqual('example.org');
 			expect(evaluate('={{ "https://example.org/".extractDomain() }}')).toEqual('example.org');
+			expect(evaluate('={{ "https://www.google.com".extractDomain() }}')).toEqual('www.google.com');
+			expect(evaluate('={{ "http://example.org".extractDomain() }}')).toEqual('example.org');
+			expect(evaluate('={{ "ftp://ftp.example.com".extractDomain() }}')).toEqual('ftp.example.com');
+			expect(evaluate('={{ "google.com".extractDomain() }}')).toEqual('google.com');
+			expect(evaluate('={{ "www.example.net".extractDomain() }}')).toEqual('www.example.net');
+			expect(evaluate('={{ "//example.com".extractDomain() }}')).toEqual('example.com');
+			expect(evaluate('={{ "mailto:john.doe@example.com".extractDomain() }}')).toEqual('example.com');
+			expect(evaluate('={{ "tel:+1-555-123-4567".extractDomain() }}')).toEqual(undefined);
+			expect(evaluate('={{ "jane.doe@example.org".extractDomain() }}')).toEqual('example.org');
+			expect(evaluate('={{ "name+tag@example.com".extractDomain() }}')).toEqual('example.com');
+			expect(evaluate('={{ "first.last@example.co.uk".extractDomain() }}')).toEqual('example.co.uk');
+			expect(evaluate('={{ "user@subdomain.example.com".extractDomain() }}')).toEqual('subdomain.example.com');
+			expect(evaluate('={{ "www.example.net?test=1213".extractDomain() }}')).toEqual('www.example.net');
+			expect(evaluate('={{ "www.example.net?test".extractDomain() }}')).toEqual('www.example.net');
+			expect(evaluate('={{ "www.example.net#tesdt123".extractDomain() }}')).toEqual('www.example.net');
+			expect(evaluate('={{ "https://www.example.net?test=1213".extractDomain() }}')).toEqual('www.example.net');
+			expect(evaluate('={{ "https://www.example.net?test".extractDomain() }}')).toEqual('www.example.net');
+			expect(evaluate('={{ "https://www.example.net#tesdt123".extractDomain() }}')).toEqual('www.example.net');
+			expect(evaluate('={{ "https://192.168.1.1".extractDomain() }}')).toEqual('192.168.1.1');
+			expect(evaluate('={{ "http://www.xn--80aswg.xn--j1amh".extractDomain() }}')).toEqual('www.xn--80aswg.xn--j1amh');
+			expect(evaluate('={{ "https://localhost".extractDomain() }}')).toEqual('localhost');
+			expect(evaluate('={{ "https://localhost?test=123".extractDomain() }}')).toEqual('localhost');
+			expect(evaluate('={{ "https://www.example_with_underscore.com".extractDomain() }}')).toEqual('www.example_with_underscore.com');
+			expect(evaluate('={{ "https://www.example.com:8080".extractDomain() }}')).toEqual('www.example.com');
+			expect(evaluate('={{ "https://example.space".extractDomain() }}')).toEqual('example.space');
 		});

 		test('.extractEmail should work on a string', () => {