Merge pull request #118 from matteomorari/fixing-emails-parse-as-mentions

mlomb · web-flow · commit 75a77bc1b6ad · 2025-03-28T10:09:29.000-05:00
- Tokenize emails under their own "email" tag, so they no longer appear as words or mentions
- Mentions must start with @, preceded only by whitespace or the start of the text
diff --git a/pipeline/process/nlp/Tokenizer.ts b/pipeline/process/nlp/Tokenizer.ts
@@ -24,7 +24,7 @@ import emojiRegex from "emoji-regex";
     https://i.imgflip.com/4hkogk.jpg
 */
 
-export type Tag = "code" | "url" | "mention" | "emoji" | "custom-emoji" | "word" | "unknown";
+export type Tag = "code" | "url" | "email" | "mention" | "emoji" | "custom-emoji" | "word" | "unknown";
 
 export interface Token {
     text: string;
@@ -53,12 +53,17 @@ const Matchers: Readonly<TokenMatcher[]> = [
         regex: /https?:\/\/[^\s<]+[^<.,:;"')\]\s]/g, // Discord's regex to match URLs
         tag: "url",
     },
-    // TODO: match emails, so they are not parsed as mentions (@gmail, @hotmail, etc)
+    // Emails matcher must be before @mentions matcher to avoid false positives
+    {
+        // Match emails; the TLD is letters only (a "|" inside [...] is a literal pipe, not alternation)
+        regex: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g,
+        tag: "email",
+    },
     {
         // match @mentions
-        regex: /@[\p{L}_0-9]+/giu,
+        regex: /(^|\s)@[\p{L}_0-9]+/giu,
         tag: "mention",
-        transform: (match) => match.slice(1), // remove @
+        transform: (match) => match.trim().slice(1), // drop the matched leading whitespace, then remove @
     },
     {
         // match emojis 🔥
diff --git a/tests/process/nlp/Tokenizer.test.ts b/tests/process/nlp/Tokenizer.test.ts
@@ -45,7 +45,9 @@ describe("should match the correct tag", () => {
         // urls
         ["http://example.com", "http://example.com", "url"],
         // mentions
+        ["email@gmail.com", "email@gmail.com", "email"],
         ["@mention", "mention", "mention"],
+        ["  @mention", "mention", "mention"], // with whitespace
         ["@123123123", "123123123", "mention"],
         // emojis
         ["🔥", "🔥", "emoji"],
@@ -61,12 +63,19 @@ describe("should match the correct tag", () => {
 
     test.each(cases)("%p → %p (tag=%p)", async (input, expectedText, expectedTag) => {
         const tokens = tokenize(input);
-        expect(tokens.length).toBe(1);
-        expect(tokens[0].text).toBe(expectedText);
-        expect(tokens[0].tag).toBe(expectedTag);
+        expect(tokens).toStrictEqual([{ text: expectedText, tag: expectedTag }]);
     });
 });
 
+it("should not classify 'abc@xyz' as an email or mention", () => {
+    const expected = [
+        { text: "abc", tag: "word" },
+        { text: "@", tag: "unknown" },
+        { text: "xyz", tag: "word" },
+    ];
+    expect(tokenize("abc@xyz")).toStrictEqual(expected); // no TLD → not an email; "@" follows "abc" → not a mention
+});
+
 it("exclude outside ' matching words", () => {
     const tokens = tokenize("'hello'");
     expect(tokens.length).toBe(3);