diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go index 4f7b424..1523e9d 100644 --- a/diffmatchpatch/diff.go +++ b/diffmatchpatch/diff.go @@ -22,6 +22,9 @@ import ( "unicode/utf8" ) +// LineMap is a mapping from a line hash to its text. +type LineMap map[rune]string + // Operation defines the operation of a diff item. type Operation int8 @@ -34,8 +37,6 @@ const ( DiffInsert Operation = 1 // DiffEqual item represents an equal diff. DiffEqual Operation = 0 - //IndexSeparator is used to seperate the array indexes in an index string - IndexSeparator = "," ) // Diff represents one diff operation @@ -83,12 +84,16 @@ func splice(slice []Diff, index int, amount int, elements ...Diff) []Diff { // DiffMain finds the differences between two texts. // If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character. +// +// Note: if checklines is true, the limitation noted in DiffLinesToChars applies func (dmp *DiffMatchPatch) DiffMain(text1, text2 string, checklines bool) []Diff { return dmp.DiffMainRunes([]rune(text1), []rune(text2), checklines) } // DiffMainRunes finds the differences between two rune sequences. // If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character. +// +// Note: if checklines is true, the limitation noted in DiffLinesToRunes applies func (dmp *DiffMatchPatch) DiffMainRunes(text1, text2 []rune, checklines bool) []Diff { var deadline time.Time if dmp.DiffTimeout > 0 { @@ -391,29 +396,34 @@ func (dmp *DiffMatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int, // DiffLinesToChars splits two texts into a list of strings, and educes the texts to a string of hashes where each Unicode character represents one line. // It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes. 
-func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, []string) { - chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) - return chars1, chars2, lineArray +// +// Note: since we hash lines to runes, there is an upper limit to the number of +// unique lines this algorithm can handle. That limit is 1,112,063 unique +// lines. +func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, LineMap) { + chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2) + return chars1, chars2, lineMap } // DiffLinesToRunes splits two texts into a list of runes. -func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) { - chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) - return []rune(chars1), []rune(chars2), lineArray +// +// Note: since we hash lines to runes, there is an upper limit to the number of +// unique lines this algorithm can handle. That limit is 1,112,063 unique +// lines. +func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, LineMap) { + chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2) + return []rune(chars1), []rune(chars2), lineMap } // DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text. 
-func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff { +func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineMap LineMap) []Diff { hydrated := make([]Diff, 0, len(diffs)) for _, aDiff := range diffs { - chars := strings.Split(aDiff.Text, IndexSeparator) - text := make([]string, len(chars)) + runes := []rune(aDiff.Text) + text := make([]string, len(runes)) - for i, r := range chars { - i1, err := strconv.Atoi(r) - if err == nil { - text[i] = lineArray[i1] - } + for i, r := range runes { + text[i] = lineMap[r] } aDiff.Text = strings.Join(text, "") @@ -1309,24 +1319,29 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di } // diffLinesToStrings splits two texts into a list of strings. Each string represents one line. -func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) { - // '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character. - lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n' +func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, LineMap) { + lineMap := LineMap{} // e.g. lineMap[4] == 'Hello\n' - lineHash := make(map[string]int) - //Each string has the index of lineArray which it points to - strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray, lineHash) - strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray, lineHash) + lineHash := make(map[string]rune) + // Each rune in the results below is the lineMap key of the line it stands for. + runes1 := dmp.diffLinesToRunesMunge(text1, lineMap, lineHash) + runes2 := dmp.diffLinesToRunesMunge(text2, lineMap, lineHash) - return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray + return string(runes1), string(runes2), lineMap } -// diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string. 
-func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string, lineHash map[string]int) []uint32 { +// Code points in the surrogate range are not valid for UTF-8. +const ( + surrogateMin = 0xD800 + surrogateMax = 0xDFFF +) + +// diffLinesToRunesMunge splits a text into lines and reduces it to a []rune in which each rune is the lineMap key of one line; lines not seen before are added to lineMap and lineHash. +func (dmp *DiffMatchPatch) diffLinesToRunesMunge(text string, lineMap LineMap, lineHash map[string]rune) []rune { // Walk the text, pulling out a substring for each line. text.split('\n') would would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect. lineStart := 0 lineEnd := -1 - strs := []uint32{} + var strs []rune for lineEnd < len(text)-1 { lineEnd = indexOf(text, "\n", lineStart) @@ -1340,11 +1355,19 @@ func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]str lineValue, ok := lineHash[line] if ok { - strs = append(strs, uint32(lineValue)) + strs = append(strs, lineValue) } else { - *lineArray = append(*lineArray, line) - lineHash[line] = len(*lineArray) - 1 - strs = append(strs, uint32(len(*lineArray)-1)) + nextRune := rune(len(lineMap) + 1) + if nextRune >= surrogateMin { + // Skip invalid utf8 runes, if needed. 
+ nextRune += surrogateMax - surrogateMin + 1 + } + if nextRune > utf8.MaxRune { + panic("too many unique lines to use rune hashing") + } + lineMap[nextRune] = line + lineHash[line] = nextRune + strs = append(strs, nextRune) } } diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go index d6fed50..0d0bf09 100644 --- a/diffmatchpatch/diff_test.go +++ b/diffmatchpatch/diff_test.go @@ -308,18 +308,16 @@ func TestDiffLinesToChars(t *testing.T) { ExpectedChars1 string ExpectedChars2 string - ExpectedLines []string + ExpectedLines LineMap } dmp := New() for i, tc := range []TestCase{ - {"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}}, - {"a", "b", "1", "2", []string{"", "a", "b"}}, + {"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\u0001\u0002\u0003\u0003", map[rune]string{1: "alpha\r\n", 2: "beta\r\n", 3: "\r\n"}}, + {"a", "b", "\u0001", "\u0002", map[rune]string{1: "a", 2: "b"}}, // Omit final newline. - {"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}}, - // Same lines in Text1 and Text2 - {"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}}, + {"alpha\nbeta\nalpha", "", "\u0001\u0002\u0003", "", map[rune]string{1: "alpha\n", 2: "beta\n", 3: "alpha"}}, } { actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2) assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc)) @@ -329,28 +327,28 @@ func TestDiffLinesToChars(t *testing.T) { // More than 256 to reveal any 8-bit limitations. n := 300 - lineList := []string{ - "", // Account for the initial empty element of the lines array. 
- } - var charList []string + var lines []string + lineMap := LineMap{} + var charList []rune for x := 1; x < n+1; x++ { - lineList = append(lineList, strconv.Itoa(x)+"\n") - charList = append(charList, strconv.Itoa(x)) + line := strconv.Itoa(x) + "\n" + lines = append(lines, line) + lineMap[rune(x)] = line + charList = append(charList, rune(x)) } - lines := strings.Join(lineList, "") - chars := strings.Join(charList[:], ",") - assert.Equal(t, n, len(strings.Split(chars, ","))) + chars := string(charList) + assert.Equal(t, n, utf8.RuneCountInString(chars)) - actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "") + actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(strings.Join(lines, ""), "") assert.Equal(t, chars, actualChars1) assert.Equal(t, "", actualChars2) - assert.Equal(t, lineList, actualLines) + assert.Equal(t, lineMap, actualLines) } func TestDiffCharsToLines(t *testing.T) { type TestCase struct { Diffs []Diff - Lines []string + Lines map[rune]string Expected []Diff } @@ -360,10 +358,10 @@ func TestDiffCharsToLines(t *testing.T) { for i, tc := range []TestCase{ { Diffs: []Diff{ - {DiffEqual, "1,2,1"}, - {DiffInsert, "2,1,2"}, + {DiffEqual, "\u0001\u0002\u0001"}, + {DiffInsert, "\u0002\u0001\u0002"}, }, - Lines: []string{"", "alpha\n", "beta\n"}, + Lines: map[rune]string{1: "alpha\n", 2: "beta\n"}, Expected: []Diff{ {DiffEqual, "alpha\nbeta\nalpha\n"}, @@ -377,19 +375,19 @@ func TestDiffCharsToLines(t *testing.T) { // More than 256 to reveal any 8-bit limitations. n := 300 - lineList := []string{ - "", // Account for the initial empty element of the lines array. 
- } - charList := []string{} + var lines []string + lineMap := LineMap{} + charList := []rune{} for x := 1; x <= n; x++ { - lineList = append(lineList, strconv.Itoa(x)+"\n") - charList = append(charList, strconv.Itoa(x)) + line := strconv.Itoa(x) + "\n" + lines = append(lines, line) + lineMap[rune(x)] = line + charList = append(charList, rune(x)) } assert.Equal(t, n, len(charList)) - chars := strings.Join(charList[:], ",") - actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList) - assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual) + actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, string(charList)}}, lineMap) + assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lines, "")}}, actual) } func TestDiffCleanupMerge(t *testing.T) { @@ -1531,3 +1529,86 @@ func BenchmarkDiffMainRunesLargeDiffLines(b *testing.B) { diffs = dmp.DiffCharsToLines(diffs, linearray) } } + +func TestLineDiff(t *testing.T) { + t.Run("VeryLarge", func(t *testing.T) { + var beforeBuf, afterBuf bytes.Buffer + + for i := 0; i <= surrogateMax+1; i++ { + beforeBuf.WriteString(fmt.Sprintf("%d\n", i)) + afterBuf.WriteString(fmt.Sprintf("%d\n", i/2)) + } + + before, after := beforeBuf.String(), afterBuf.String() + + diff := New().DiffMain(before, after, true) + checkDiffText(t, before, after, diff) + }) + + t.Run("Chars", func(t *testing.T) { + before := `1 +2 +3 +4 +5 +6 +7 +8 +9 +` + after := `10 +` + + dmp := New() + txt1, txt2, lines := dmp.DiffLinesToChars(string(before), string(after)) + diff := dmp.DiffMain(txt1, txt2, false) + diff = dmp.DiffCharsToLines(diff, lines) + + checkDiffText(t, before, after, diff) + }) + + t.Run("Runes", func(t *testing.T) { + before := `1 +2 +3 +4 +5 +6 +7 +8 +9 +` + after := `10 +` + + dmp := New() + txt1, txt2, lines := dmp.DiffLinesToRunes(string(before), string(after)) + diff := dmp.DiffMainRunes(txt1, txt2, false) + diff = dmp.DiffCharsToLines(diff, lines) + + checkDiffText(t, before, after, diff) + }) +} + +func 
checkDiffText(t *testing.T, before, after string, diff []Diff) { + t.Helper() + var foundBefore, foundAfter string + for _, d := range diff { + switch d.Type { + case DiffEqual: + foundBefore += d.Text + foundAfter += d.Text + case DiffDelete: + foundBefore += d.Text + case DiffInsert: + foundAfter += d.Text + } + } + + if foundBefore != before { + t.Errorf("Expected before %q; found %q", before, foundBefore) + } + if foundAfter != after { + t.Errorf("Expected after %q; found %q", after, foundAfter) + } +} diff --git a/diffmatchpatch/stringutil.go b/diffmatchpatch/stringutil.go index 44c4359..265f29c 100644 --- a/diffmatchpatch/stringutil.go +++ b/diffmatchpatch/stringutil.go @@ -9,7 +9,6 @@ package diffmatchpatch import ( - "strconv" "strings" "unicode/utf8" ) @@ -87,20 +86,3 @@ func runesIndex(r1, r2 []rune) int { } return -1 } - -func intArrayToString(ns []uint32) string { - if len(ns) == 0 { - return "" - } - - indexSeparator := IndexSeparator[0] - - // Appr. 3 chars per num plus the comma. - b := []byte{} - for _, n := range ns { - b = strconv.AppendInt(b, int64(n), 10) - b = append(b, indexSeparator) - } - b = b[:len(b)-1] - return string(b) -}