Skip to content
This repository has been archived by the owner on Oct 7, 2020. It is now read-only.

emoji/unsafe identifiers and identifier collisions #46

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Gopkg.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,7 @@
[[constraint]]
branch = "master"
name = "github.com/olebedev/emitter"

[[constraint]]
name = "github.com/wangii/emoji"
branch = "master"
6 changes: 3 additions & 3 deletions types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ func (p *Participant) FullName() string {
// SafeName returns the irc-safe name for the current Participant.
// It starts from FullName and switches to the contact's PushName when
// numberRegex matches the full name and the push name has a non-empty safe form.
// NOTE(review): IrcSafeString (util.go) falls back to "x"+hex and so never
// returns "", which makes the `!= ""` guard below always true; it also records
// the push name in the identifiers map, so the IrcSafeString call in the return
// statement then sees a duplicate and returns a "_N"-suffixed name. Confirm
// whether a side-effect-free check was intended here.
func (p *Participant) SafeName() string {
str := p.FullName()
if numberRegex.MatchString(str) && ircSafeString(p.Contact.PushName) != "" {
if numberRegex.MatchString(str) && IrcSafeString(p.Contact.PushName) != "" {
str = p.Contact.PushName
}

return ircSafeString(str)
return IrcSafeString(str)
}

// Chat represents a chat on the bridge.
Expand All @@ -45,7 +45,7 @@ type Chat struct {

// SafeName returns the IRC-safe name for the current chat.
// NOTE(review): IrcSafeString (util.go) appends a "_N" suffix whenever it has
// already produced the same cleaned name, so calling SafeName twice on the
// same chat returns two different strings — confirm callers store the result
// rather than re-deriving it.
func (c *Chat) SafeName() string {
return ircSafeString(c.Name)
return IrcSafeString(c.Name)
}

// Identifier returns the safe IRC identifier for the current chat.
Expand Down
41 changes: 37 additions & 4 deletions util.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package main

import (
"encoding/hex"
"fmt"
"log"
"mime"
"os"
Expand All @@ -11,6 +13,12 @@ import (

"github.com/h2non/filetype"
"github.com/mozillazg/go-unidecode"
"github.com/wangii/emoji"
)

var (
// identifiers counts how many times ensureIdentifierIsDistinct has been
// asked for each identifier, keyed by the identifier without any suffix.
identifiers = make(map[string]int)
// unsafeRegex matches, case-insensitively, any single character that is not
// a letter a-z, a digit, or a plus sign.
// NOTE(review): the '+' sits inside the character class, so it is a literal
// plus sign rather than a quantifier — confirm that keeping '+' is intended.
unsafeRegex = regexp.MustCompile(`(?i)[^a-z\d+]`)
)

func strTimestamp() string {
Expand Down Expand Up @@ -51,11 +59,36 @@ func getExtensionByMimeOrBytes(mime string, bytes []byte) string {
return getExtension(bytes)
}

var unsafeRegex = regexp.MustCompile(`(?i)[^a-z\d+]`)
// IrcSafeString converts any emoji unicode characters into emojitag, then
// converts any non-ascii characters into their ascii equivalents, then strips
// characters that satisfy unsafeRegex, and finally disambiguates the
// identifier if required
func IrcSafeString(str string) string {
emojiTagged := emoji.UnicodeToEmojiTag(str)
decoded := unidecode.Unidecode(emojiTagged)
ircSafe := unsafeRegex.ReplaceAllLiteralString(decoded, "")

if ircSafe == "" {
return ensureIdentifierIsDistinct("x" + hex.EncodeToString([]byte(str)))
}

return ensureIdentifierIsDistinct(ircSafe)
}

// ensureIdentifierIsDistinct returns identity unchanged the first time it is
// seen and records it in the package-level identifiers map with a count of 1.
// On every later call with the same identity it increments the stored count
// and returns identity followed by "_" and that count, so the second call
// yields the "_2" suffix.
// NOTE(review): the map is read and written here without any locking visible
// in this hunk — confirm this is never called from concurrent goroutines.
func ensureIdentifierIsDistinct(identity string) string {
// we've encountered this identifier before so
// increment the counter and append the new count
// to the identifier we return
if _, exists := identifiers[identity]; exists {
identifiers[identity]++
counter := identifiers[identity]
return fmt.Sprintf("%s_%d", identity, counter)
}

func ircSafeString(str string) string {
str = unidecode.Unidecode(str)
return unsafeRegex.ReplaceAllLiteralString(str, "")
// it's the first time we're encountering this identifier
// so we initialise the counter
identifiers[identity] = 1
return identity
}

func onInterrupt(fn func()) {
Expand Down
121 changes: 121 additions & 0 deletions util_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
package main

import (
"encoding/hex"
"github.com/wangii/emoji"
"strings"
"testing"
)

const unsafeCharacters = "$&!:;/?^%#*~`"

// TestIrcSafeStringSimpleNoEmoji checks that an input made up solely of
// unsafe characters still produces a non-empty identifier.
func TestIrcSafeStringSimpleNoEmoji(t *testing.T) {
	got := IrcSafeString(unsafeCharacters)

	if len(got) == 0 {
		t.Fatalf("expected simple no emoji string to not be empty string after invoking IrcSafeString but found empty string: %s", got)
	}

	t.Logf("simple no emoji string is unique after invoking IrcSafeString: %s", got)
}

// TestIrcSafeStringSimpleNoEmojiDecode checks that the hex fallback produced
// for an input made up solely of unsafe characters can be decoded back to the
// original string.
func TestIrcSafeStringSimpleNoEmojiDecode(t *testing.T) {
	simpleNoEmojiStr := IrcSafeString(unsafeCharacters)
	if simpleNoEmojiStr == "" {
		t.Fatalf("expected simple no emoji string to not be empty string after invoking IrcSafeString but found empty string: %s", simpleNoEmojiStr)
	}

	// Drop any "_N" disambiguation suffix (present when another test has
	// already converted the same input), then the leading "x" marker.
	parts := strings.Split(simpleNoEmojiStr, "_")
	if !strings.HasPrefix(parts[0], "x") {
		t.Fatalf("expected hex fallback identifier to start with \"x\" but found: %s", parts[0])
	}
	stripped := parts[0][1:]

	original, err := hex.DecodeString(stripped)
	if err != nil {
		// %v, not %e: %e is a floating-point verb and is rejected by go vet
		// for an error argument.
		t.Fatalf("error decoding hex string %q: %v", stripped, err)
	}
	if string(original) != unsafeCharacters {
		// Report the two values that were actually compared.
		t.Fatalf("expected strings to match but found no match: %q %q", original, unsafeCharacters)
	}
	t.Logf("strings match after decoding back to original string value: %s %s", original, unsafeCharacters)
}

// TestIrcSafeStringSimple converts the same short emoji string three times
// and checks that the results are non-empty, pairwise distinct, free of every
// unsafe character, and that the second and third carry the "_2" and "_3"
// disambiguation suffixes.
func TestIrcSafeStringSimple(t *testing.T) {
	// a basic emoji string
	simpleEmojiStr := emoji.EmojiTagToUnicode(":ok_hand:" + unsafeCharacters)

	// we invoke the test subject a few times in a
	// row to test the map[string]int side effects
	simpleSafeStr1 := IrcSafeString(simpleEmojiStr)
	simpleSafeStr2 := IrcSafeString(simpleEmojiStr)
	simpleSafeStr3 := IrcSafeString(simpleEmojiStr)

	if simpleSafeStr1 == "" ||
		simpleSafeStr2 == "" ||
		simpleSafeStr3 == "" {
		t.Fatalf("expected simple emoji strings to not be empty strings after invoking IrcSafeString but found empty strings: %s %s %s", simpleSafeStr1, simpleSafeStr2, simpleSafeStr3)
	}

	// ensure none of the strings match
	if simpleSafeStr1 == simpleSafeStr2 ||
		simpleSafeStr1 == simpleSafeStr3 ||
		simpleSafeStr2 == simpleSafeStr3 {
		t.Fatalf("expected simple emoji strings to NOT match after invoking IrcSafeString but found matching strings: %s %s %s", simpleSafeStr1, simpleSafeStr2, simpleSafeStr3)
	}

	// ensure none of them contain any of the characters we expected to be
	// stripped; ContainsAny tests each character individually, whereas
	// Contains would only match the whole sequence as a single substring
	if strings.ContainsAny(simpleSafeStr1, unsafeCharacters) ||
		strings.ContainsAny(simpleSafeStr2, unsafeCharacters) ||
		strings.ContainsAny(simpleSafeStr3, unsafeCharacters) {
		t.Fatalf("expected simple emoji strings to NOT contain unsafeCharacters substring after invoking IrcSafeString but found substring match: %s %s %s", simpleSafeStr1, simpleSafeStr2, simpleSafeStr3)
	}

	// ensure the duplicate strings have been made distinct with an incrementing integer
	if !strings.HasSuffix(simpleSafeStr2, "_2") {
		t.Fatalf("expected duplicate simple emoji string to end with a disambiguating numeric suffix: %s", simpleSafeStr2)
	}

	// ensure the duplicate strings have been made distinct with an incrementing integer
	if !strings.HasSuffix(simpleSafeStr3, "_3") {
		t.Fatalf("expected duplicate simple emoji string to end with a disambiguating numeric suffix: %s", simpleSafeStr3)
	}

	t.Logf("simple emoji strings are unique after invoking IrcSafeString: %s %s %s", simpleSafeStr1, simpleSafeStr2, simpleSafeStr3)
}

// TestIrcSafeStringComplex converts the same long emoji string three times
// and checks that the results are non-empty, pairwise distinct, free of every
// unsafe character, and that the second and third carry the "_2" and "_3"
// disambiguation suffixes.
func TestIrcSafeStringComplex(t *testing.T) {
	// a much longer one
	complexEmojiStr := emoji.EmojiTagToUnicode(":ok_hand::ok_hand::ok_hand::ok_hand::ok_hand::ok_hand::ok_hand:" + unsafeCharacters)

	// we invoke the test subject a few times in a
	// row to test the map[string]int side effects
	complexSafeStr1 := IrcSafeString(complexEmojiStr)
	complexSafeStr2 := IrcSafeString(complexEmojiStr)
	complexSafeStr3 := IrcSafeString(complexEmojiStr)

	if complexSafeStr1 == "" ||
		complexSafeStr2 == "" ||
		complexSafeStr3 == "" {
		t.Fatalf("expected complex emoji strings to not be empty strings after invoking IrcSafeString but found empty strings: %s %s %s", complexSafeStr1, complexSafeStr2, complexSafeStr3)
	}

	// ensure none of the strings match
	if complexSafeStr1 == complexSafeStr2 ||
		complexSafeStr1 == complexSafeStr3 ||
		complexSafeStr2 == complexSafeStr3 {
		t.Fatalf("expected complex emoji strings to NOT match after invoking IrcSafeString but found matching strings: %s %s %s", complexSafeStr1, complexSafeStr2, complexSafeStr3)
	}

	// ensure none of them contain any of the characters we expected to be
	// stripped; ContainsAny tests each character individually, whereas
	// Contains would only match the whole sequence as a single substring
	if strings.ContainsAny(complexSafeStr1, unsafeCharacters) ||
		strings.ContainsAny(complexSafeStr2, unsafeCharacters) ||
		strings.ContainsAny(complexSafeStr3, unsafeCharacters) {
		t.Fatalf("expected complex emoji strings to NOT contain unsafeCharacters substring after invoking IrcSafeString but found substring match: %s %s %s", complexSafeStr1, complexSafeStr2, complexSafeStr3)
	}

	// ensure the duplicate strings have been made distinct with an incrementing integer
	if !strings.HasSuffix(complexSafeStr2, "_2") {
		t.Fatalf("expected duplicate complex emoji string to end with a disambiguating numeric suffix: %s", complexSafeStr2)
	}

	// ensure the duplicate strings have been made distinct with an incrementing integer
	if !strings.HasSuffix(complexSafeStr3, "_3") {
		t.Fatalf("expected duplicate complex emoji string to end with a disambiguating numeric suffix: %s", complexSafeStr3)
	}

	t.Logf("complex emoji strings are unique after invoking IrcSafeString: %s %s %s", complexSafeStr1, complexSafeStr2, complexSafeStr3)
}