From dafe564e5c9291565e487da65a5110e7830b8bb3 Mon Sep 17 00:00:00 2001 From: Arturo Fonseca Date: Sat, 30 Aug 2025 00:31:44 -0300 Subject: [PATCH 1/2] Fix URL regex in api/rich-text --- packages/api/src/rich-text/detection.ts | 14 +++++++----- packages/api/src/rich-text/util.ts | 4 +++- .../api/tests/rich-text-detection.test.ts | 22 +++++++++++++++++++ 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/packages/api/src/rich-text/detection.ts b/packages/api/src/rich-text/detection.ts index f4b190e10fc..611979a3e19 100644 --- a/packages/api/src/rich-text/detection.ts +++ b/packages/api/src/rich-text/detection.ts @@ -41,17 +41,19 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined { // links const re = URL_REGEX while ((match = re.exec(text.utf16))) { - let uri = match[2] - if (!uri.startsWith('http')) { + let uri = match.groups?.uri + const protocol = match.groups?.protocol + const tld = match.groups?.tld + if (protocol === undefined) { const domain = match.groups?.domain - if (!domain || !isValidDomain(domain)) { + if (!domain || (tld !== undefined && !isValidDomain(domain))) { continue } uri = `https://${uri}` } - const start = text.utf16.indexOf(match[2], match.index) - const index = { start, end: start + match[2].length } - // strip ending puncuation + const start = text.utf16.indexOf(match.groups?.uri, match.index) + const index = { start, end: start + match.groups?.uri.length } + // strip ending punctuation if (/[.,;:!?]$/.test(uri)) { uri = uri.slice(0, -1) index.end-- diff --git a/packages/api/src/rich-text/util.ts b/packages/api/src/rich-text/util.ts index cafd93d84e0..185d3c45298 100644 --- a/packages/api/src/rich-text/util.ts +++ b/packages/api/src/rich-text/util.ts @@ -1,6 +1,8 @@ export const MENTION_REGEX = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g +// inspired by https://gist.github.com/dperini/729294 (2018/09/12 version) +// gist credit: Diego Perini export const URL_REGEX = - 
/(^|\s|\()((https?:\/\/[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))/gim + /(?:^|\s|\()(?<uri>(?<protocol>https?:\/\/)?(?<domain>(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}\.(?:1\d\d|2[0-4]\d|25[0-4]|[1-9]\d?)|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]*)?[a-z0-9\u00a1-\uffff]\.)+(?<tld>[a-z\u00a1-\uffff]{2,})\.?)(?::\d{2,5})?(?:[/?#]\S*)?)/gim export const TRAILING_PUNCTUATION_REGEX = /\p{P}+$/gu /** diff --git a/packages/api/tests/rich-text-detection.test.ts b/packages/api/tests/rich-text-detection.test.ts index bb55f8222b1..92b2054fda2 100644 --- a/packages/api/tests/rich-text-detection.test.ts +++ b/packages/api/tests/rich-text-detection.test.ts @@ -60,6 +60,15 @@ describe('detectFacets', () => { 'punctuation https://foo.com, https://bar.com/whatever; https://baz.com.', 'parenthentical (https://foo.com)', 'except for https://foo.com/thing_(cool)', + 'HTTPS://google.com', + 'https://google.COM', + 'ko-fi.com', + '日本語.jp', + 'GOOGLE.com', + 'https://34.64.0.52', + '198.185.159.145', + 'invalid IPs: http://127.0.0.1 https://255.255.255.255 https://0.0.0.0 https://169.254.1.1 https://1.1.1.011', + 'invalid URIs: https://google.a https://localhost', ] const outputs: string[][][] = [ [['no mention']], @@ -212,6 +221,19 @@ describe('detectFacets', () => { ['except for '], ['https://foo.com/thing_(cool)', 'https://foo.com/thing_(cool)'], ], + [['HTTPS://google.com', 'HTTPS://google.com']], + [['https://google.COM', 'https://google.COM']], + [['ko-fi.com', 'https://ko-fi.com']], + [['日本語.jp', 'https://日本語.jp']], + [['GOOGLE.com', 'https://GOOGLE.com']], + [['https://34.64.0.52', 'https://34.64.0.52']], + [['198.185.159.145', 'https://198.185.159.145']], + [ + [ + 'invalid IPs: http://127.0.0.1 https://255.255.255.255 https://0.0.0.0 https://169.254.1.1 https://1.1.1.011', + ], + ], + [['invalid URIs: https://google.a https://localhost']], ] 
it('correctly handles a set of text inputs', async () => { for (let i = 0; i < inputs.length; i++) { From 23009d2ebd92b1e545c1524b41bec9dadb2d00b1 Mon Sep 17 00:00:00 2001 From: Arturo Fonseca Date: Sat, 30 Aug 2025 22:10:19 -0300 Subject: [PATCH 2/2] add license text to url regex in api/rich-text --- packages/api/src/rich-text/util.ts | 40 +++++++++++++++++-- .../api/tests/rich-text-detection.test.ts | 6 +++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/packages/api/src/rich-text/util.ts b/packages/api/src/rich-text/util.ts index 185d3c45298..9ce24dd55c1 100644 --- a/packages/api/src/rich-text/util.ts +++ b/packages/api/src/rich-text/util.ts @@ -1,8 +1,5 @@ export const MENTION_REGEX = /(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)/g -// inspired by https://gist.github.com/dperini/729294 (2018/09/12 version) -// gist credit: Diego Perini -export const URL_REGEX = - /(?:^|\s|\()(?<uri>(?<protocol>https?:\/\/)?(?<domain>(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}\.(?:1\d\d|2[0-4]\d|25[0-4]|[1-9]\d?)|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]*)?[a-z0-9\u00a1-\uffff]\.)+(?<tld>[a-z\u00a1-\uffff]{2,})\.?)(?::\d{2,5})?(?:[/?#]\S*)?)/gim + export const TRAILING_PUNCTUATION_REGEX = /\p{P}+$/gu /** @@ -12,3 +9,38 @@ export const TRAILING_PUNCTUATION_REGEX = /\p{P}+$/gu export const TAG_REGEX = // eslint-disable-next-line no-misleading-character-class /(^|\s)[##]((?!\ufe0f)[^\s\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]*[^\d\s\p{P}\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]+[^\s\u00AD\u2060\u200A\u200B\u200C\u200D\u20e2]*)?/gu + +// The RegEx below is inspired by https://gist.github.com/dperini/729294 (accessed on 2025/08/30) +// Regular Expression for URL validation +// +// Author: Diego Perini +// Created: 2010/12/05 +// Updated: 2018/09/12 +// License: MIT +// +// Copyright (c) 2010-2018 Diego Perini (http://www.iport.it) +// +// Permission 
is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ +export const URL_REGEX = + /(?:^|\s|\()(?<uri>(?<protocol>https?:\/\/)?(?<domain>(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}\.(?:1\d\d|2[0-4]\d|25[0-4]|[1-9]\d?)|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]*)?[a-z0-9\u00a1-\uffff]\.)+(?<tld>[a-z\u00a1-\uffff]{2,}))(?::\d{2,5})?(?:[/?#]\S*)?)/gim +//-(-prefix--)(uri---(-------protocol-------)-(domain---(not-private-and-loopback-ips)(---not-system-and-class-c-private-ips--)(----------not-class-b-private-ips-----------)(----------ip-1st-oct------------)(----------ip-2nd-and-3rd-oct---------)--(-----------ip-4th-oct------------)-(--------------------------------dns-domain---------------------------------)-(-------------tld------------))(---port---)-(---path---)-) diff --git a/packages/api/tests/rich-text-detection.test.ts b/packages/api/tests/rich-text-detection.test.ts index 92b2054fda2..3c5a04aa06d 100644 --- a/packages/api/tests/rich-text-detection.test.ts +++ b/packages/api/tests/rich-text-detection.test.ts @@ -69,6 +69,7 @@ describe('detectFacets', () => { '198.185.159.145', 'invalid IPs: http://127.0.0.1 https://255.255.255.255 https://0.0.0.0 https://169.254.1.1 https://1.1.1.011', 'invalid URIs: https://google.a https://localhost', + 'this is a website: google.com. The final dot it is not part of it', ] const outputs: string[][][] = [ [['no mention']], @@ -234,6 +235,11 @@ describe('detectFacets', () => { ], ], [['invalid URIs: https://google.a https://localhost']], + [ + ['this is a website: '], + ['google.com', 'https://google.com'], + ['. The final dot it is not part of it'], + ], ] it('correctly handles a set of text inputs', async () => { for (let i = 0; i < inputs.length; i++) {