Skip to content

Commit 75a77bc

Browse files
authored
Merge pull request #118 from matteomorari/fixing-emails-parse-as-mentions
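- Tokenize emails with their own "email" tag, so they are no longer split into words and @mentions - Mentions must start with @ at the beginning of the input or right after whitespace (no other leading characters)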
2 parents e09cda3 + 6bb89f6 commit 75a77bc

File tree

2 files changed

+21
-7
lines changed

2 files changed

+21
-7
lines changed

pipeline/process/nlp/Tokenizer.ts

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ import emojiRegex from "emoji-regex";
2424
https://i.imgflip.com/4hkogk.jpg
2525
*/
2626

27-
export type Tag = "code" | "url" | "mention" | "emoji" | "custom-emoji" | "word" | "unknown";
27+
export type Tag = "code" | "url" | "email" | "mention" | "emoji" | "custom-emoji" | "word" | "unknown";
2828

2929
export interface Token {
3030
text: string;
@@ -53,12 +53,17 @@ const Matchers: Readonly<TokenMatcher[]> = [
5353
regex: /https?:\/\/[^\s<]+[^<.,:;"')\]\s]/g, // Discord's regex to match URLs
5454
tag: "url",
5555
},
56-
// TODO: match emails, so they are not parsed as mentions (@gmail, @hotmail, etc)
56+
// Emails matcher must be before @mentions matcher to avoid false positives
57+
{
58+
// Match emails
59+
regex: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g,
60+
tag: "email",
61+
},
5762
{
5863
// match @mentions
59-
regex: /@[\p{L}_0-9]+/giu,
64+
regex: /(^|\s)@[\p{L}_0-9]+/giu,
6065
tag: "mention",
61-
transform: (match) => match.slice(1), // remove @
66+
transform: (match) => match.trim().slice(1), // remove @
6267
},
6368
{
6469
// match emojis 🔥

tests/process/nlp/Tokenizer.test.ts

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ describe("should match the correct tag", () => {
4545
// urls
4646
["http://example.com", "http://example.com", "url"],
4747
// mentions
48+
4849
["@mention", "mention", "mention"],
50+
[" @mention", "mention", "mention"], // with whitespace
4951
["@123123123", "123123123", "mention"],
5052
// emojis
5153
["🔥", "🔥", "emoji"],
@@ -61,12 +63,19 @@ describe("should match the correct tag", () => {
6163

6264
test.each(cases)("%p → %p (tag=%p)", async (input, expectedText, expectedTag) => {
6365
const tokens = tokenize(input);
64-
expect(tokens.length).toBe(1);
65-
expect(tokens[0].text).toBe(expectedText);
66-
expect(tokens[0].tag).toBe(expectedTag);
66+
expect(tokens).toStrictEqual([{ text: expectedText, tag: expectedTag }]);
6767
});
6868
});
6969

70+
it("should not classify 'abc@xyz' as an email or mention", () => {
71+
const tokens = tokenize("abc@xyz");
72+
expect(tokens).toStrictEqual([
73+
{ text: "abc", tag: "word" },
74+
{ text: "@", tag: "unknown" },
75+
{ text: "xyz", tag: "word" },
76+
]);
77+
});
78+
7079
it("exclude outside ' matching words", () => {
7180
const tokens = tokenize("'hello'");
7281
expect(tokens.length).toBe(3);

0 commit comments

Comments
 (0)