Skip to content

Commit

Permalink
fix: use indivisible line hashes
Browse files Browse the repository at this point in the history
Revert implementation of diffLines to use runes to fix sergi#140.  In order
to not regress sergi#89, skip invalid utf8 runes when munging lines.
  • Loading branch information
schroederc committed Mar 31, 2023
1 parent facec63 commit 4019a12
Show file tree
Hide file tree
Showing 3 changed files with 165 additions and 79 deletions.
85 changes: 54 additions & 31 deletions diffmatchpatch/diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ import (
"unicode/utf8"
)

// LineMap is a mapping from a line hash to its text.
// Each key is the rune that stands in for one unique line in the hashed texts
// returned by DiffLinesToChars and DiffLinesToRunes; the value is that line's
// original text. DiffCharsToLines looks hashes up here to restore the lines.
type LineMap map[rune]string

// Operation defines the operation of a diff item.
type Operation int8

Expand All @@ -34,8 +37,6 @@ const (
DiffInsert Operation = 1
// DiffEqual item represents an equal diff.
DiffEqual Operation = 0
// IndexSeparator is used to separate the array indexes in an index string.
IndexSeparator = ","
)

// Diff represents one diff operation
Expand Down Expand Up @@ -83,12 +84,16 @@ func splice(slice []Diff, index int, amount int, elements ...Diff) []Diff {

// DiffMain computes the differences between two texts by converting both to
// rune sequences and delegating to DiffMainRunes.
// Any invalid UTF-8 sequence in either input is replaced by the Unicode
// replacement character during that conversion.
//
// Note: if checklines is true, the limitation noted in DiffLinesToChars applies.
func (dmp *DiffMatchPatch) DiffMain(text1, text2 string, checklines bool) []Diff {
	runes1 := []rune(text1)
	runes2 := []rune(text2)
	return dmp.DiffMainRunes(runes1, runes2, checklines)
}

// DiffMainRunes finds the differences between two rune sequences.
// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
//
// Note: if checklines is true, the limitation noted in DiffLinesToRunes applies
func (dmp *DiffMatchPatch) DiffMainRunes(text1, text2 []rune, checklines bool) []Diff {
var deadline time.Time
if dmp.DiffTimeout > 0 {
Expand Down Expand Up @@ -391,29 +396,34 @@ func (dmp *DiffMatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int,

// DiffLinesToChars splits two texts into a list of strings, and reduces the texts to a string of hashes where each Unicode character represents one line.
// It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes.
func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, []string) {
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
return chars1, chars2, lineArray
//
// Note: since we hash lines to runes, there is an upper limit to the number of
// unique lines this algorithm can handle. That limit is 1,112,063 unique
// lines.
func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, LineMap) {
chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2)
return chars1, chars2, lineMap
}

// DiffLinesToRunes splits two texts into a list of runes.
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) {
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
return []rune(chars1), []rune(chars2), lineArray
//
// Note: since we hash lines to runes, there is an upper limit to the number of
// unique lines this algorithm can handle. That limit is 1,112,063 unique
// lines.
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, LineMap) {
chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2)
return []rune(chars1), []rune(chars2), lineMap
}

// DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text.
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineMap LineMap) []Diff {
hydrated := make([]Diff, 0, len(diffs))
for _, aDiff := range diffs {
chars := strings.Split(aDiff.Text, IndexSeparator)
text := make([]string, len(chars))
runes := []rune(aDiff.Text)
text := make([]string, len(runes))

for i, r := range chars {
i1, err := strconv.Atoi(r)
if err == nil {
text[i] = lineArray[i1]
}
for i, r := range runes {
text[i] = lineMap[r]
}

aDiff.Text = strings.Join(text, "")
Expand Down Expand Up @@ -1309,24 +1319,29 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di
}

// diffLinesToStrings splits two texts into a list of strings. Each string represents one line.
func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) {
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character.
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n'
func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, LineMap) {
lineMap := LineMap{} // e.g. lineMap[4] == 'Hello\n'

lineHash := make(map[string]int)
//Each string has the index of lineArray which it points to
strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray, lineHash)
strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray, lineHash)
lineHash := make(map[string]rune)
// Each rune is the key of the lineMap entry it stands for.
runes1 := dmp.diffLinesToRunesMunge(text1, lineMap, lineHash)
runes2 := dmp.diffLinesToRunesMunge(text2, lineMap, lineHash)

return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray
return string(runes1), string(runes2), lineMap
}

// diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string.
func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string, lineHash map[string]int) []uint32 {
// Code points in the surrogate range are not valid for UTF-8.
// A line hash that would fall inside this range is shifted past it so that
// every hash remains a rune that can be stored in a string.
const (
	surrogateMin = 0xD800 // first code point of the surrogate range
	surrogateMax = 0xDFFF // last code point of the surrogate range
)

// diffLinesToRunesMunge splits a text into lines, records each previously unseen line in lineMap and lineHash, and reduces the text to a []rune holding one line hash per line.
func (dmp *DiffMatchPatch) diffLinesToRunesMunge(text string, lineMap LineMap, lineHash map[string]rune) []rune {
// Walk the text, pulling out a substring for each line. text.split('\n') would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect.
lineStart := 0
lineEnd := -1
strs := []uint32{}
var strs []rune

for lineEnd < len(text)-1 {
lineEnd = indexOf(text, "\n", lineStart)
Expand All @@ -1340,11 +1355,19 @@ func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]str
lineValue, ok := lineHash[line]

if ok {
strs = append(strs, uint32(lineValue))
strs = append(strs, lineValue)
} else {
*lineArray = append(*lineArray, line)
lineHash[line] = len(*lineArray) - 1
strs = append(strs, uint32(len(*lineArray)-1))
nextRune := rune(len(lineMap) + 1)
if nextRune >= surrogateMin {
// Skip invalid utf8 runes, if needed.
nextRune += surrogateMax - surrogateMin + 1
}
if nextRune > utf8.MaxRune {
panic("too many unique lines to use rune hashing")
}
lineMap[nextRune] = line
lineHash[line] = nextRune
strs = append(strs, nextRune)
}
}

Expand Down
141 changes: 111 additions & 30 deletions diffmatchpatch/diff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -308,18 +308,16 @@ func TestDiffLinesToChars(t *testing.T) {

ExpectedChars1 string
ExpectedChars2 string
ExpectedLines []string
ExpectedLines LineMap
}

dmp := New()

for i, tc := range []TestCase{
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
{"a", "b", "1", "2", []string{"", "a", "b"}},
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\u0001\u0002\u0003\u0003", map[rune]string{1: "alpha\r\n", 2: "beta\r\n", 3: "\r\n"}},
{"a", "b", "\u0001", "\u0002", map[rune]string{1: "a", 2: "b"}},
// Omit final newline.
{"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}},
// Same lines in Text1 and Text2
{"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
{"alpha\nbeta\nalpha", "", "\u0001\u0002\u0003", "", map[rune]string{1: "alpha\n", 2: "beta\n", 3: "alpha"}},
} {
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2)
assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc))
Expand All @@ -329,28 +327,28 @@ func TestDiffLinesToChars(t *testing.T) {

// More than 256 to reveal any 8-bit limitations.
n := 300
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
var charList []string
var lines []string
lineMap := LineMap{}
var charList []rune
for x := 1; x < n+1; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
charList = append(charList, strconv.Itoa(x))
line := strconv.Itoa(x) + "\n"
lines = append(lines, line)
lineMap[rune(x)] = line
charList = append(charList, rune(x))
}
lines := strings.Join(lineList, "")
chars := strings.Join(charList[:], ",")
assert.Equal(t, n, len(strings.Split(chars, ",")))
chars := string(charList)
assert.Equal(t, n, utf8.RuneCountInString(chars))

actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "")
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(strings.Join(lines, ""), "")
assert.Equal(t, chars, actualChars1)
assert.Equal(t, "", actualChars2)
assert.Equal(t, lineList, actualLines)
assert.Equal(t, lineMap, actualLines)
}

func TestDiffCharsToLines(t *testing.T) {
type TestCase struct {
Diffs []Diff
Lines []string
Lines map[rune]string

Expected []Diff
}
Expand All @@ -360,10 +358,10 @@ func TestDiffCharsToLines(t *testing.T) {
for i, tc := range []TestCase{
{
Diffs: []Diff{
{DiffEqual, "1,2,1"},
{DiffInsert, "2,1,2"},
{DiffEqual, "\u0001\u0002\u0001"},
{DiffInsert, "\u0002\u0001\u0002"},
},
Lines: []string{"", "alpha\n", "beta\n"},
Lines: map[rune]string{1: "alpha\n", 2: "beta\n"},

Expected: []Diff{
{DiffEqual, "alpha\nbeta\nalpha\n"},
Expand All @@ -377,19 +375,19 @@ func TestDiffCharsToLines(t *testing.T) {

// More than 256 to reveal any 8-bit limitations.
n := 300
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
charList := []string{}
var lines []string
lineMap := LineMap{}
charList := []rune{}
for x := 1; x <= n; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
charList = append(charList, strconv.Itoa(x))
line := strconv.Itoa(x) + "\n"
lines = append(lines, line)
lineMap[rune(x)] = line
charList = append(charList, rune(x))
}
assert.Equal(t, n, len(charList))
chars := strings.Join(charList[:], ",")

actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList)
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual)
actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, string(charList)}}, lineMap)
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lines, "")}}, actual)
}

func TestDiffCleanupMerge(t *testing.T) {
Expand Down Expand Up @@ -1531,3 +1529,86 @@ func BenchmarkDiffMainRunesLargeDiffLines(b *testing.B) {
diffs = dmp.DiffCharsToLines(diffs, linearray)
}
}

// TestLineDiff checks that line-mode diffs reproduce both input texts exactly,
// via DiffMain with checklines enabled and via the explicit
// DiffLinesToChars / DiffLinesToRunes round trips.
func TestLineDiff(t *testing.T) {
	// Shared fixture for the round-trip subtests: nine single-digit lines
	// replaced by one two-digit line.
	const (
		smallBefore = "1\n2\n3\n4\n5\n6\n7\n8\n9\n"
		smallAfter  = "10\n"
	)

	t.Run("VeryLarge", func(t *testing.T) {
		var beforeBuf, afterBuf bytes.Buffer

		// Generate more unique lines than there are code points below the
		// end of the surrogate range, so line hashing has to skip that range.
		for i := 0; i <= surrogateMax+1; i++ {
			fmt.Fprintf(&beforeBuf, "%d\n", i)
			fmt.Fprintf(&afterBuf, "%d\n", i/2)
		}

		before := beforeBuf.String()
		after := afterBuf.String()

		checkDiffText(t, before, after, New().DiffMain(before, after, true))
	})

	t.Run("Chars", func(t *testing.T) {
		dmp := New()
		hashed1, hashed2, lineMap := dmp.DiffLinesToChars(smallBefore, smallAfter)
		hashedDiff := dmp.DiffMain(hashed1, hashed2, false)

		checkDiffText(t, smallBefore, smallAfter, dmp.DiffCharsToLines(hashedDiff, lineMap))
	})

	t.Run("Runes", func(t *testing.T) {
		dmp := New()
		hashed1, hashed2, lineMap := dmp.DiffLinesToRunes(smallBefore, smallAfter)
		hashedDiff := dmp.DiffMainRunes(hashed1, hashed2, false)

		checkDiffText(t, smallBefore, smallAfter, dmp.DiffCharsToLines(hashedDiff, lineMap))
	})
}

// checkDiffText verifies that diff describes the change from before to after.
// It rebuilds the "before" text from the equal and deleted segments and the
// "after" text from the equal and inserted segments, and reports a test error
// for each reconstruction that does not match its expected text.
func checkDiffText(t *testing.T, before, after string, diff []Diff) {
	t.Helper()

	// Use builders rather than += so that reconstructing very large diffs
	// does not re-copy the accumulated text for every segment.
	var foundBefore, foundAfter strings.Builder
	for _, d := range diff {
		switch d.Type {
		case DiffEqual:
			foundBefore.WriteString(d.Text)
			foundAfter.WriteString(d.Text)
		case DiffDelete:
			foundBefore.WriteString(d.Text)
		case DiffInsert:
			foundAfter.WriteString(d.Text)
		}
	}

	if got := foundBefore.String(); got != before {
		t.Errorf("Expected before %q; found %q", before, got)
	}
	if got := foundAfter.String(); got != after {
		t.Errorf("Expected after %q; found %q", after, got)
	}
}
18 changes: 0 additions & 18 deletions diffmatchpatch/stringutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
package diffmatchpatch

import (
"strconv"
"strings"
"unicode/utf8"
)
Expand Down Expand Up @@ -87,20 +86,3 @@ func runesIndex(r1, r2 []rune) int {
}
return -1
}

// intArrayToString renders ns as base-10 numbers joined by the first byte of
// IndexSeparator. It returns the empty string when ns is empty.
func intArrayToString(ns []uint32) string {
	if len(ns) == 0 {
		return ""
	}

	indexSeparator := IndexSeparator[0]

	// Reserve roughly 3 digits per number plus the separator up front so the
	// buffer is not regrown on every append; longer numbers still grow it.
	b := make([]byte, 0, len(ns)*4)
	for i, n := range ns {
		// Write the separator between elements only, so nothing has to be
		// trimmed off the end afterwards.
		if i > 0 {
			b = append(b, indexSeparator)
		}
		b = strconv.AppendUint(b, uint64(n), 10)
	}
	return string(b)
}

0 comments on commit 4019a12

Please sign in to comment.