/**
 * Rearrange diff boundaries that split Unicode surrogate pairs.
 *
 * A diff computed on UTF-16 code units may end one tuple with a high surrogate
 * and start the next with the matching low surrogate. Such halves cannot be
 * passed to encodeURI(), so the trailing high surrogate is stripped from the
 * tuple that ends with it and prepended to every directly following tuple
 * that starts with a low surrogate (for example both the DELETE and the
 * INSERT that follow an EQUAL). Tuples whose text is, or becomes, empty are
 * removed.
 *
 * The array and its tuples are modified in place; the same array is also
 * returned. patch_obj.prototype.toString calls this only for the side effect
 * and ignores the return value, so the in-place mutation is part of the
 * contract.
 *
 * Known limitation: only one pending high surrogate is tracked. When a DELETE
 * and an adjacent INSERT both start with a low surrogate and the first of them
 * also ends with a different high surrogate, the second receives that newer
 * high surrogate.
 *
 * @param {!Array.<!Array>} diffs Array of diff tuples ([operation, text]).
 * @return {!Array.<!Array>} The same array, after cleanup.
 */
diff_match_patch.prototype.diff_cleanupSplitSurrogates = function(diffs) {
  // High surrogate stripped from the end of a previous tuple that is still
  // waiting for the low surrogate(s) completing it.
  var pendingHigh = '';

  for (var x = 0; x < diffs.length; x++) {
    var thisDiff = diffs[x];
    var text = thisDiff[1];

    if (text.length === 0) {
      diffs.splice(x--, 1);
      continue;
    }

    if (this.isLowSurrogate(text.charAt(0))) {
      // Re-attach the carried half BEFORE looking at this tuple's own tail;
      // otherwise the tail would overwrite the half we still need.
      if (pendingHigh) {
        text = pendingHigh + text;
      }
    } else {
      // The run of tuples that could use the carried half has ended; never
      // glue it onto an unrelated low surrogate further down the list.
      pendingHigh = '';
    }

    var lastUnit = text.charAt(text.length - 1);
    if (this.isHighSurrogate(lastUnit)) {
      pendingHigh = lastUnit;
      text = text.slice(0, -1);
    }

    if (text.length === 0) {
      diffs.splice(x--, 1);
      continue;
    }

    thisDiff[1] = text;
  }

  return diffs;
};

/**
 * Test whether the first UTF-16 code unit of a string is a high (leading)
 * surrogate, U+D800..U+DBFF.
 *
 * @param {string} c String to inspect; only its first code unit is used.
 * @return {boolean} True for a high surrogate; false otherwise, including for
 *     empty or non-string input.
 */
diff_match_patch.prototype.isHighSurrogate = function(c) {
  if (typeof c !== 'string' || c.length === 0) {
    return false;
  }
  var v = c.charCodeAt(0);
  return v >= 0xD800 && v <= 0xDBFF;
};

/**
 * Test whether the first UTF-16 code unit of a string is a low (trailing)
 * surrogate, U+DC00..U+DFFF.
 *
 * @param {string} c String to inspect; only its first code unit is used.
 * @return {boolean} True for a low surrogate; false otherwise, including for
 *     empty or non-string input.
 */
diff_match_patch.prototype.isLowSurrogate = function(c) {
  if (typeof c !== 'string' || c.length === 0) {
    return false;
  }
  var v = c.charCodeAt(0);
  return v >= 0xDC00 && v <= 0xDFFF;
};

/**
 * Convert a single hexadecimal digit character to its numeric value.
 *
 * @param {string} c One character, 0-9, A-F or a-f.
 * @return {number} Value in the range 0..15.
 * @throws {Error} 'Invalid hex-code' if c is not exactly one hex digit.
 */
diff_match_patch.prototype.digit16 = function(c) {
  if (typeof c !== 'string' || !/^[0-9A-Fa-f]$/.test(c)) {
    throw new Error('Invalid hex-code');
  }
  return parseInt(c, 16);
};

/**
 * Decode URI-encoded string but allow for encoded surrogate halves
 *
 * diff_match_patch needs this relaxation of the requirements because
 * not all libraries and versions produce valid URI strings in toDelta
 * and we don't want to crash this code when the input is valid input
 * but at the same time invalid utf-8
 *
 * The native decodeURI() is tried first. Only when it throws is the text
 * decoded by hand, and the manual decoder mirrors decodeURI() in every
 * respect except that three-byte sequences encoding a lone surrogate are
 * accepted: escapes of the reserved characters ; / ? : @ & = + $ , # are
 * left untouched, and truncated, non-hex or overlong sequences raise URIError.
 *
 * @example: decodeURI( 'abcd%25 %F0%9F%85%B0' ) = 'abcd% \ud83c\udd70'
 * @example: decodeURI( 'abcd%3A %ED%A0%BC' ) = 'abcd%3A \ud83c'
 *
 * @cite: @mathiasbynens utf8.js at https://github.com/mathiasbynens/utf8.js
 *
 * @param {String} text input string encoded by encodeURI() or equivalent
 * @return {String}
 * @throws {URIError} If the text contains a malformed percent-sequence.
 */
diff_match_patch.prototype.decodeURI = function(text) {
  try {
    return decodeURI(text);
  } catch ( e ) {
    // Fall through to the relaxed decoder below.
  }

  var self = this;
  // Characters whose escapes decodeURI() deliberately keeps encoded.
  var reserved = ';/?:@&=+$,#';

  // Read the byte encoded by the '%XX' sequence starting at pos.
  var readByte = function(pos) {
    if (text.charAt(pos) !== '%') {
      throw new URIError('URI malformed');
    }
    try {
      return (self.digit16(text.charAt(pos + 1)) << 4) +
          self.digit16(text.charAt(pos + 2));
    } catch ( hexError ) {
      // Truncated or non-hex escape: report it the way decodeURI() would.
      throw new URIError('URI malformed');
    }
  };

  var decoded = '';
  var i = 0;

  while (i < text.length) {
    if (text.charAt(i) !== '%') {
      decoded += text.charAt(i++);
      continue;
    }

    // start a percent-sequence
    var byte1 = readByte(i);
    if ((byte1 & 0x80) === 0) {
      var ascii = String.fromCharCode(byte1);
      decoded += reserved.indexOf(ascii) === -1 ?
          ascii : text.substring(i, i + 3);
      i += 3;
      continue;
    }

    // Classify the lead byte: number of continuation bytes, payload bits and
    // the smallest code point that legitimately needs this many bytes.
    var continuationCount;
    var codePoint;
    var minCodePoint;
    if ((byte1 & 0xE0) === 0xC0) {
      continuationCount = 1;
      codePoint = byte1 & 0x1F;
      minCodePoint = 0x80;
    } else if ((byte1 & 0xF0) === 0xE0) {
      continuationCount = 2;
      codePoint = byte1 & 0x0F;
      minCodePoint = 0x800;
    } else if ((byte1 & 0xF8) === 0xF0) {
      continuationCount = 3;
      codePoint = byte1 & 0x07;
      minCodePoint = 0x10000;
    } else {
      throw new URIError('URI malformed');
    }

    for (var k = 1; k <= continuationCount; k++) {
      var continuation = readByte(i + 3 * k);
      if ((continuation & 0xC0) !== 0x80) {
        throw new URIError('URI malformed');
      }
      codePoint = (codePoint << 6) | (continuation & 0x3F);
    }

    if (codePoint < minCodePoint || codePoint > 0x10FFFF) {
      throw new URIError('URI malformed');
    }

    if (codePoint < 0x10000) {
      // unpaired surrogate are fine here
      decoded += String.fromCharCode(codePoint);
    } else {
      // UTF-16 surrogates encode the offset from U+10000, not the low 16 bits.
      var offset = codePoint - 0x10000;
      decoded += String.fromCharCode(
          0xD800 | (offset >>> 10), 0xDC00 | (offset & 0x3FF));
    }
    i += 3 * (continuationCount + 1);
  }

  return decoded;
};
+ diff_match_patch.prototype.diff_cleanupSplitSurrogates(this.diffs); for (var x = 0; x < this.diffs.length; x++) { switch (this.diffs[x][0]) { case DIFF_INSERT: diff --git a/test/index.js b/test/index.js index 16dbe21..17c7202 100644 --- a/test/index.js +++ b/test/index.js @@ -483,6 +483,137 @@ function testDiffDelta() { // Convert delta string into a diff. assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta)); + + diffs = [[DIFF_EQUAL, '\ud83d\ude4b\ud83d'], [DIFF_INSERT, '\ude4c\ud83d'], [DIFF_EQUAL, '\ude4b']]; + try { + delta = dmp.diff_toDelta(diffs); + assertEquals('=2\t+%F0%9F%99%8C\t=2', delta); + } catch ( e ) { + assertEquals(false, true); + } + + (function(){ + const originalText = `U+1F17x πŸ…°οΈ πŸ…±οΈ πŸ…ΎοΈ πŸ…ΏοΈ safhawifhkw + U+1F18x πŸ†Ž + 0 1 2 3 4 5 6 7 8 9 A B C D E F + U+1F19x πŸ†‘ πŸ†’ πŸ†“ πŸ†” πŸ†• πŸ†– πŸ†— πŸ†˜ πŸ†™ πŸ†š + U+1F20x 🈁 πŸˆ‚οΈ sfss.,_||saavvvbbds + U+1F21x 🈚 + U+1F22x 🈯 + U+1F23x 🈲 🈳 🈴 🈡 🈢 🈷️ 🈸 🈹 🈺 + U+1F25x πŸ‰ πŸ‰‘ + U+1F30x πŸŒ€ 🌁 πŸŒ‚ πŸŒƒ πŸŒ„ πŸŒ… πŸŒ† πŸŒ‡ 🌈 πŸŒ‰ 🌊 πŸŒ‹ 🌌 🌍 🌎 🌏 + U+1F31x 🌐 πŸŒ‘ πŸŒ’ πŸŒ“ πŸŒ” πŸŒ• πŸŒ– πŸŒ— 🌘 πŸŒ™ 🌚 πŸŒ› 🌜 🌝 🌞 `; + + // applies some random edits to string and returns new, edited string + function applyRandomTextEdit(text) { + let textArr = [...text]; + let r = Math.random(); + if(r < 1/3) { // swap + let swapCount = Math.floor(Math.random()*5); + for(let i = 0; i < swapCount; i++) { + let swapPos1 = Math.floor(Math.random()*textArr.length); + let swapPos2 = Math.floor(Math.random()*textArr.length); + let char1 = textArr[swapPos1]; + let char2 = textArr[swapPos2]; + textArr[swapPos1] = char2; + textArr[swapPos2] = char1; + } + } else if(r < 2/3) { // remove + let removeCount = Math.floor(Math.random()*5); + for(let i = 0; i < removeCount; i++) { + let removePos = Math.floor(Math.random()*textArr.length); + textArr[removePos] = ""; + } + } else { // add + let addCount = Math.floor(Math.random()*5); + for(let i = 0; i < addCount; i++) { + let addPos = 
Math.floor(Math.random()*textArr.length); + let addFromPos = Math.floor(Math.random()*textArr.length); + textArr[addPos] = textArr[addPos] + textArr[addFromPos]; + } + } + return textArr.join(""); + } + + for(let i = 0; i < 1000; i++) { + const newText = applyRandomTextEdit(originalText); + dmp.diff_toDelta(dmp.diff_main(originalText, newText)); + } + })(); + + // Unicode - splitting surrogates + try { + assertEquivalent( + dmp.diff_toDelta([[DIFF_INSERT,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]), + dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71')) + ); + } catch ( e ) { + assertEquals('Inserting similar surrogate pair at beginning', 'crashed'); + } + + try { + assertEquivalent( + dmp.diff_toDelta([[DIFF_EQUAL,'\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd70'], [DIFF_EQUAL, '\ud83c\udd71']]), + dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd70\ud83c\udd71')) + ); + } catch ( e ) { + assertEquals('Inserting similar surrogate pair in the middle', 'crashed'); + } + + try { + assertEquivalent( + dmp.diff_toDelta([[DIFF_DELETE,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]), + dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71')) + ); + } catch ( e ) { + assertEquals('Deleting similar surrogate pair at the beginning', 'crashed'); + } + + try { + assertEquivalent( + dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c\udd70'], [DIFF_DELETE,'\ud83c\udd72'], [DIFF_EQUAL, '\ud83c\udd71']]), + dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71')) + ); + } catch ( e ) { + assertEquals('Deleting similar surrogate pair in the middle', 'crashed'); + } + + try { + assertEquivalent( + dmp.diff_toDelta([[DIFF_DELETE, '\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd71']]), + dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_DELETE, '\udd70'], [DIFF_INSERT, '\udd71']]), + ); + } catch ( e ) { + 
assertEquals('Swap surrogate pair', 'crashed'); + } + + try { + assertEquivalent( + dmp.diff_toDelta([[DIFF_INSERT, '\ud83c\udd70'], [DIFF_DELETE, '\ud83c\udd71']]), + dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_INSERT, '\udd70'], [DIFF_DELETE, '\udd71']]), + ); + } catch ( e ) { + assertEquals('Swap surrogate pair', 'crashed'); + } + + // Empty diff groups + assertEquivalent( + dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']]), + dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_INSERT, 'ghijk']]), + ); + + // Different versions of the library may have created deltas with + // half of a surrogate pair encoded as if it were valid UTF-8 + try { + assertEquivalent( + dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')), + dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1')) + ); + } catch ( e ) { + assertEquals('Decode UTF8-encoded surrogate half', 'crashed'); + } + // Verify pool of unchanged characters. diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']]; var text2 = dmp.diff_text2(diffs); @@ -1019,4 +1150,4 @@ var tests = [ for (var x = 0; x < tests.length; x++) { test(tests[x], eval(tests[x])) - } \ No newline at end of file + }