From 70e287343022205eb6bc61be42f9e866f7b3ac1b Mon Sep 17 00:00:00 2001 From: Ujjwal Sharma Date: Mon, 10 Feb 2025 16:29:43 +0100 Subject: [PATCH] Fix autolinking errors due to regex and email validation Fix some edge cases in the autolinking logic with the regex as well as validating email domains and add unit tests for them. Fixes: https://github.com/mozilla/pdf.js/issues/19462 --- test/unit/autolinker_spec.js | 4 ++++ web/autolinker.js | 32 +++++++++++++++++++------------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/test/unit/autolinker_spec.js b/test/unit/autolinker_spec.js index 23a30446c54b3..2d2e4d6731cf1 100644 --- a/test/unit/autolinker_spec.js +++ b/test/unit/autolinker_spec.js @@ -87,6 +87,9 @@ describe("autolinker", function () { "CAP.cap@Gmail.Com", // Keep the original case. "mailto:CAP.cap@Gmail.Com", ], + ["partl@mail.boku.ac.at", "mailto:partl@mail.boku.ac.at"], + ["Irene.Hyna@bmwf.ac.at", "mailto:Irene.Hyna@bmwf.ac.at"], + ["<hi@foo.bar.baz>", "mailto:hi@foo.bar.baz"], ]); }); @@ -140,6 +143,7 @@ describe("autolinker", function () { "http//[00:00:00:00:00:00", // Invalid IPv6 address. "http//[]", // Empty IPv6 address. "abc.example.com", // URL without scheme. + "JD?M$0QP)lKn06l1apKDC@\\qJ4B!!(5m+j.7F790m", // Not a valid email. ].join("\n") ); expect(matches.length).toEqual(0); diff --git a/web/autolinker.js b/web/autolinker.js index 098acb2739320..d3ee14c282265 100644 --- a/web/autolinker.js +++ b/web/autolinker.js @@ -96,31 +96,37 @@ class Autolinker { static #regex; static findLinks(text) { - // Regex can be tested and verified at https://regex101.com/r/zgDwPE/1. + // Regex can be tested and verified at https://regex101.com/r/rXoLiT/2. 
this.#regex ??= - /\b(?:https?:\/\/|mailto:|www\.)(?:[[\S--\[]--\p{P}]|\/|[\p{P}--\[]+[[\S--\[]--\p{P}])+|\b[[\S--@]--\{]+@[\S--.]+\.[[\S--\[]--\p{P}]{2,}/gmv; + /\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv; const [normalizedText, diffs] = normalize(text); const matches = normalizedText.matchAll(this.#regex); const links = []; for (const match of matches) { - const raw = - match[0].startsWith("www.") || - match[0].startsWith("mailto:") || - match[0].startsWith("http://") || - match[0].startsWith("https://") - ? match[0] - : `mailto:${match[0]}`; - const url = createValidAbsoluteUrl(raw, null, { + const [url, emailDomain] = match; + let raw; + if ( + url.startsWith("www.") || + url.startsWith("http://") || + url.startsWith("https://") + ) { + raw = url; + } else if (URL.canParse(`http://${emailDomain}`)) { + raw = url.startsWith("mailto:") ? url : `mailto:${url}`; + } else { + continue; + } + const absoluteURL = createValidAbsoluteUrl(raw, null, { addDefaultProtocol: true, }); - if (url) { + if (absoluteURL) { const [index, length] = getOriginalIndex( diffs, match.index, - match[0].length + url.length ); - links.push({ url: url.href, index, length }); + links.push({ url: absoluteURL.href, index, length }); } } return links;