diff --git a/Gopkg.lock b/Gopkg.lock index 8896d92..0d543e2 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -149,6 +149,14 @@ pruneopts = "UT" revision = "dc11ecdae0a9889dc81a343585516404e8dc6ead" +[[projects]] + branch = "master" + digest = "1:350070e36cb0725fb4fc8128c33f7a7ddba2be65e4d63bfb3daea8a2cdd01522" + name = "github.com/wangii/emoji" + packages = ["."] + pruneopts = "UT" + revision = "d15b69a4831e56a3fcf1fc990f5cbc247641b783" + [[projects]] branch = "master" digest = "1:8e4024a39f73657fda08fc46908003698955a5f1fdeba7ceb6801070720de922" @@ -219,6 +227,7 @@ "github.com/mozillazg/go-unidecode", "github.com/olebedev/emitter", "github.com/skip2/go-qrcode", + "github.com/wangii/emoji", "golang.org/x/crypto/hkdf", "gopkg.in/sorcix/irc.v2", "gopkg.in/sorcix/irc.v2/ctcp", diff --git a/Gopkg.toml b/Gopkg.toml index 5ca38c8..9364761 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -44,3 +44,7 @@ [[constraint]] branch = "master" name = "github.com/olebedev/emitter" + +[[constraint]] + name = "github.com/wangii/emoji" + branch = "master" \ No newline at end of file diff --git a/types.go b/types.go index c656efb..4325428 100644 --- a/types.go +++ b/types.go @@ -21,11 +21,11 @@ func (p *Participant) FullName() string { // SafeName returns the irc-safe name for the current Participant. func (p *Participant) SafeName() string { str := p.FullName() - if numberRegex.MatchString(str) && ircSafeString(p.Contact.PushName) != "" { + if numberRegex.MatchString(str) && IrcSafeString(p.Contact.PushName) != "" { str = p.Contact.PushName } - return ircSafeString(str) + return IrcSafeString(str) } // Chat represents a chat on the bridge. @@ -45,7 +45,7 @@ type Chat struct { // SafeName returns the IRC-safe name for the current chat. func (c *Chat) SafeName() string { - return ircSafeString(c.Name) + return IrcSafeString(c.Name) } // Identifier returns the safe IRC identifier for the current chat. 
diff --git a/util.go b/util.go
index 21555f2..23d7fbb 100644
--- a/util.go
+++ b/util.go
@@ -1,6 +1,8 @@
 package main
 
 import (
+	"encoding/hex"
+	"fmt"
 	"log"
 	"mime"
 	"os"
@@ -11,6 +13,12 @@ import (
 
 	"github.com/h2non/filetype"
 	"github.com/mozillazg/go-unidecode"
+	"github.com/wangii/emoji"
+)
+
+var (
+	identifiers = make(map[string]int)
+	unsafeRegex = regexp.MustCompile(`(?i)[^a-z\d+]`)
 )
 
 func strTimestamp() string {
@@ -51,11 +59,40 @@ func getExtensionByMimeOrBytes(mime string, bytes []byte) string {
 	return getExtension(bytes)
 }
 
-var unsafeRegex = regexp.MustCompile(`(?i)[^a-z\d+]`)
-
-func ircSafeString(str string) string {
-	str = unidecode.Unidecode(str)
-	return unsafeRegex.ReplaceAllLiteralString(str, "")
+// IrcSafeString converts any emoji unicode characters into emoji tags, then
+// converts any non-ASCII characters into their ASCII equivalents, then strips
+// characters that match unsafeRegex, and finally disambiguates the
+// identifier if required. When nothing survives the stripping, "x" followed
+// by the hex encoding of the original string is used as the identifier.
+func IrcSafeString(str string) string {
+	emojiTagged := emoji.UnicodeToEmojiTag(str)
+	decoded := unidecode.Unidecode(emojiTagged)
+	ircSafe := unsafeRegex.ReplaceAllLiteralString(decoded, "")
+
+	if ircSafe == "" {
+		// fall back to a hex rendering so the identifier is never empty
+		ircSafe = "x" + hex.EncodeToString([]byte(str))
+	}
+
+	return ensureIdentifierIsDistinct(ircSafe)
+}
+
+// ensureIdentifierIsDistinct returns identity unchanged the first time it is
+// seen; every later call with the same identity returns it with "_N" appended,
+// where N is the number of times it has been seen so far. The counts live in
+// the package-level identifiers map, which is accessed without locking.
+func ensureIdentifierIsDistinct(identity string) string {
+	// a missing key reads as zero, so one increment covers both cases
+	identifiers[identity]++
+	counter := identifiers[identity]
+
+	// it's the first time we're encountering this identifier
+	if counter == 1 {
+		return identity
+	}
+
+	// we've encountered this identifier before so append the new count
+	return fmt.Sprintf("%s_%d", identity, counter)
 }
 
 func onInterrupt(fn func()) {
diff --git a/util_test.go b/util_test.go
new file mode 100644
index 0000000..6a39551
--- /dev/null
+++ b/util_test.go
@@ -0,0 +1,102 @@
+package main
+
+import (
+	"encoding/hex"
+	"strings"
+	"testing"
+
+	"github.com/wangii/emoji"
+)
+
+// unsafeCharacters is a run of characters that IrcSafeString is expected to
+// strip from its input.
+const unsafeCharacters = "$&!:;/?^%#*~`"
+
+// TestIrcSafeStringSimpleNoEmoji checks that an input made up solely of
+// unsafe characters still yields a non-empty identifier.
+func TestIrcSafeStringSimpleNoEmoji(t *testing.T) {
+	simpleNoEmojiStr := IrcSafeString(unsafeCharacters)
+	if simpleNoEmojiStr == "" {
+		t.Fatal("expected simple no emoji string to not be empty string after invoking IrcSafeString but found empty string")
+	}
+	t.Logf("simple no emoji string is not empty after invoking IrcSafeString: %s", simpleNoEmojiStr)
+}
+
+// TestIrcSafeStringSimpleNoEmojiDecode checks that the hex fallback produced
+// for an all-unsafe input decodes back to the original string.
+func TestIrcSafeStringSimpleNoEmojiDecode(t *testing.T) {
+	simpleNoEmojiStr := IrcSafeString(unsafeCharacters)
+	if simpleNoEmojiStr == "" {
+		t.Fatal("expected simple no emoji string to not be empty string after invoking IrcSafeString but found empty string")
+	}
+
+	// the identifier counters are package state shared with the other tests,
+	// so the result may carry a "_N" suffix; drop it together with the
+	// leading "x" marker before decoding
+	parts := strings.Split(simpleNoEmojiStr, "_")
+	stripped := strings.TrimPrefix(parts[0], "x")
+	original, err := hex.DecodeString(stripped)
+	if err != nil {
+		// %v rather than %e: err is an error and %e is a floating-point verb
+		t.Fatalf("error decoding hex string %s: %v", stripped, err)
+	}
+	if string(original) != unsafeCharacters {
+		t.Fatalf("expected strings to match but found no match: %s %s", original, unsafeCharacters)
+	}
+	t.Logf("strings match after decoding back to original string value: %s %s", original, unsafeCharacters)
+}
+
+// assertDistinctSafeNames converts input with IrcSafeString three times in a
+// row and fails the test unless every result is non-empty, free of unsafe
+// characters, different from the others, and the second and third results end
+// in "_2" and "_3". It expects input not to have been converted earlier in the
+// test run, because the identifier counters are shared package state.
+func assertDistinctSafeNames(t *testing.T, label, input string) {
+	t.Helper()
+
+	// we invoke the test subject a few times in a
+	// row to test the map[string]int side effects
+	names := make([]string, 0, 3)
+	for i := 0; i < 3; i++ {
+		names = append(names, IrcSafeString(input))
+	}
+
+	for _, name := range names {
+		if name == "" {
+			t.Fatalf("expected %s emoji strings to not be empty strings after invoking IrcSafeString but found empty strings: %q", label, names)
+		}
+
+		// ContainsAny, because a single surviving unsafe character is
+		// already a failure
+		if strings.ContainsAny(name, unsafeCharacters) {
+			t.Fatalf("expected %s emoji strings to NOT contain any of unsafeCharacters after invoking IrcSafeString but found a match: %q", label, names)
+		}
+	}
+
+	// ensure none of the strings match
+	if names[0] == names[1] || names[0] == names[2] || names[1] == names[2] {
+		t.Fatalf("expected %s emoji strings to NOT match after invoking IrcSafeString but found matching strings: %q", label, names)
+	}
+
+	// ensure the duplicate strings have been made distinct with an
+	// incrementing integer
+	for i, suffix := range []string{"_2", "_3"} {
+		if name := names[i+1]; !strings.HasSuffix(name, suffix) {
+			t.Fatalf("expected duplicate %s emoji string to end with disambiguating suffix %s: %s", label, suffix, name)
+		}
+	}
+
+	t.Logf("%s emoji strings are unique after invoking IrcSafeString: %q", label, names)
+}
+
+// TestIrcSafeStringSimple exercises IrcSafeString with one :ok_hand: tag
+// passed through emoji.EmojiTagToUnicode, followed by the unsafe characters.
+func TestIrcSafeStringSimple(t *testing.T) {
+	assertDistinctSafeNames(t, "simple", emoji.EmojiTagToUnicode(":ok_hand:"+unsafeCharacters))
+}
+
+// TestIrcSafeStringComplex exercises IrcSafeString with a longer run of
+// :ok_hand: tags, followed by the unsafe characters.
+func TestIrcSafeStringComplex(t *testing.T) {
+	assertDistinctSafeNames(t, "complex", emoji.EmojiTagToUnicode(":ok_hand::ok_hand::ok_hand::ok_hand::ok_hand::ok_hand::ok_hand:"+unsafeCharacters))
+}