From 80967168adb34fb16cc9c58a80d127d4f8e425ab Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 31 Aug 2025 17:21:07 -0700 Subject: [PATCH 01/14] Add regression tests, plus test of current unpaired surrogate behaviors. --- .../src/sjsonnet/UnicodeHandlingTests.scala | 174 ++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala new file mode 100644 index 00000000..097644ac --- /dev/null +++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala @@ -0,0 +1,174 @@ +package sjsonnet + +import utest._ +import TestUtils.{eval, evalErr} + +/** + * Tests for correct handling of Unicode strings, especially those that require surrogate pairs in + * UTF-16 (i.e., codepoints above U+FFFF). + */ +object UnicodeHandlingTests extends TestSuite { + def tests: Tests = Tests { + + test("stringLength") { + eval("std.length('🌍')") ==> ujson.Num(1) + eval("std.length('Hello 🌍')") ==> ujson.Num(7) + eval("std.length('👨‍👩‍👧‍👦')") ==> ujson.Num(7) // Family emoji with ZWJ sequences + } + + test("stringIndex") { + eval("'Hello 🌍 World'[6]") ==> ujson.Str("🌍") + eval("'A🌍B'[1]") ==> ujson.Str("🌍") + assert(evalErr("'A🌍B'[3]").contains("string bounds error")) + } + + test("codepoint") { + eval("std.codepoint('🌍')") ==> ujson.Num(127757) + assert(evalErr("std.codepoint('')").contains("expected a single character string")) + assert(evalErr("std.codepoint('🌍!')").contains("expected a single character string")) + assert(evalErr("std.codepoint('abc')").contains("expected a single character string")) + } + + test("char") { + eval("std.char(127757)") ==> ujson.Str("🌍") + } + + test("stringChars") { + eval("std.stringChars('🌍')") ==> ujson.Arr("🌍") + eval("std.stringChars('Hello 🌍')") ==> ujson.Arr("H", "e", "l", "l", "o", " ", "🌍") + } + + 
test("map") { + eval("std.map(function(x) std.codepoint(x), '🌍')") ==> ujson.Arr(127757) + eval("std.map(function(x) std.codepoint(x), 'A🌍B')") ==> ujson.Arr(65, 127757, 66) + } + + test("substr") { + eval("std.substr('A🌍B', 0, 1)") ==> ujson.Str("A") + eval("std.substr('A🌍B', 1, 1)") ==> ujson.Str("🌍") + eval("std.substr('A🌍B', 2, 1)") ==> ujson.Str("B") + eval("std.substr('Hello 🌍 World', 6, 100)") ==> ujson.Str("🌍 World") + eval("std.substr('🌍', 1, 5)") ==> ujson.Str("") // Beyond string length + } + + test("stringSlice") { + eval("'A🌍B'[0:1]") ==> ujson.Str("A") + eval("'A🌍B'[1:2]") ==> ujson.Str("🌍") + eval("'A🌍B🚀C'[0:5:2]") ==> ujson.Str("ABC") + eval("'ABC🚀'[-2:]") ==> ujson.Str("C🚀") + } + + test("codepointVsUtf16Ordering") { + // This test demonstrates why sjsonnet uses Unicode codepoint ordering instead of UTF-16 code unit ordering. + // + // The problem: UTF-16 encodes characters above U+FFFF as "surrogate pairs" - two 16-bit code units. + // When comparing strings by UTF-16 code units, we only look at the raw 16-bit values, not the + // actual Unicode codepoints they represent. + // + // Critical test case: + // - U+FFFF = "\uFFFF" → UTF-16: [0xFFFF] (single code unit) + // - U+10000 = "\uD800\uDC00" → UTF-16: [0xD800, 0xDC00] (surrogate pair: high + low surrogate) + // + // Correct Unicode codepoint order: U+FFFF (65535) < U+10000 (65536) → "\uFFFF" < "\uD800\uDC00" + // + // Wrong UTF-16 code unit order: When comparing "\uFFFF" vs "\uD800\uDC00", UTF-16 comparison sees: + // 0xFFFF (65535) vs 0xD800 (55296) ← only looks at first code unit of surrogate pair! + // Since 65535 > 55296, it incorrectly concludes "\uFFFF" > "\uD800\uDC00" + // + // This breaks ordering for ALL characters above U+FFFF (emojis, math symbols, etc.) 
+ + val testStrings = Array("\uFFFF", "\uD800\uDC00") // U+FFFF, U+10000 + + // Scala's default string ordering uses UTF-16 code units: "\uD800\uDC00" < "\uFFFF" + val utf16Sorted = testStrings.sorted.toList + utf16Sorted ==> List("\uD800\uDC00", "\uFFFF") + + // Our Unicode codepoint ordering: "\uFFFF" < "\uD800\uDC00" + val codepointSorted = testStrings.sorted(sjsonnet.Util.CodepointStringOrdering).toList + codepointSorted ==> List("\uFFFF", "\uD800\uDC00") + + // Critical: these produce different results! This test would fail with UTF-16 ordering. + assert(utf16Sorted != codepointSorted) + + // Jsonnet string operations should use Unicode codepoint ordering + eval("'\\uFFFF' < '\\uD800\\uDC00'") ==> ujson.Bool(true) + eval("std.sort(['\\uD800\\uDC00', '\\uFFFF'])") ==> ujson.Arr("\uFFFF", "\uD800\uDC00") + } + + // sjsonnet allows unpaired surrogates in Unicode escape sequences, + // unlike go-jsonnet and C++ jsonnet which reject them at parse time. + + test("unpairedSurrogatesInEscapes") { + // These should parse successfully (go-jsonnet/C++ jsonnet would fail) + eval("\"\\uD800\"") ==> ujson.Str("\uD800") // High surrogate + eval("\"\\uDC00\"") ==> ujson.Str("\uDC00") // Low surrogate + eval("\"\\uD83D\\uDE00\"") ==> ujson.Str("😀") // Valid pair + } + + test("stdCharPreservesRawSurrogates") { + // std.char() preserves raw surrogate codepoints (go-jsonnet replaces with U+FFFD) + eval("std.codepoint(std.char(55296))") ==> ujson.Num(55296) // 0xD800 high surrogate + eval("std.codepoint(std.char(56320))") ==> ujson.Num(56320) // 0xDC00 low surrogate + } + + test("stringComparisons") { + // Comprehensive test of ALL comparison operators with the critical UTF-16 vs Unicode boundary case + // This ensures mutation testing catches bugs in any specific comparison implementation + + val maxBmp = "'\\uFFFF'" // U+FFFF (max Basic Multilingual Plane) + val minSupplementary = "'\\uD800\\uDC00'" // U+10000 (min Supplementary Plane) + + // All comparison operators must use 
Unicode codepoint ordering + // With UTF-16 code unit ordering, these would ALL give wrong results + + // Less than: U+FFFF < U+10000 + eval(s"$maxBmp < $minSupplementary") ==> ujson.Bool(true) + eval(s"$minSupplementary < $maxBmp") ==> ujson.Bool(false) + + // Less than or equal: U+FFFF <= U+10000 + eval(s"$maxBmp <= $minSupplementary") ==> ujson.Bool(true) + eval(s"$minSupplementary <= $maxBmp") ==> ujson.Bool(false) + eval(s"$maxBmp <= $maxBmp") ==> ujson.Bool(true) // Reflexivity + + // Greater than: U+10000 > U+FFFF + eval(s"$minSupplementary > $maxBmp") ==> ujson.Bool(true) + eval(s"$maxBmp > $minSupplementary") ==> ujson.Bool(false) + + // Greater than or equal: U+10000 >= U+FFFF + eval(s"$minSupplementary >= $maxBmp") ==> ujson.Bool(true) + eval(s"$maxBmp >= $minSupplementary") ==> ujson.Bool(false) + eval(s"$maxBmp >= $maxBmp") ==> ujson.Bool(true) // Reflexivity + + // Equality and inequality + eval(s"$maxBmp == $minSupplementary") ==> ujson.Bool(false) + eval(s"$maxBmp != $minSupplementary") ==> ujson.Bool(true) + eval(s"$maxBmp == $maxBmp") ==> ujson.Bool(true) // Reflexivity + eval(s"$maxBmp != $maxBmp") ==> ujson.Bool(false) + + // Array sorting must also use codepoint ordering + eval(s"std.sort([$minSupplementary, $maxBmp, 'Z', 'A'])") ==> ujson.Arr("A", "Z", "\uFFFF", "\uD800\uDC00") + } + + + test("objectFieldOrdering") { + // Test object field ordering with the critical UTF-16 vs Unicode boundary case + // Input deliberately in REVERSE codepoint order to make the test non-trivial + val testObject = "{\"\uD800\uDC00\": 4, \"\uFFFF\": 3, \"z\": 2, \"a\": 1}" + + // All object field functions should sort by Unicode codepoint order + eval(s"std.objectFields($testObject)") ==> ujson.Arr("a", "z", "\uFFFF", "\uD800\uDC00") + eval(s"std.objectFieldsAll($testObject)") ==> ujson.Arr("a", "z", "\uFFFF", "\uD800\uDC00") + + // Default object rendering also uses codepoint ordering + eval(testObject).toString ==> 
"{\"a\":1,\"z\":2,\"\uFFFF\":3,\"\uD800\uDC00\":4}" + + // JSON manifest should maintain the same ordering + eval(s"std.manifestJsonMinified($testObject)") ==> + ujson.Str("{\"a\":1,\"z\":2,\"\uFFFF\":3,\"\uD800\uDC00\":4}") + + // TOML manifest should also use codepoint ordering (with escaped Unicode) + eval(s"std.manifestTomlEx($testObject, ' ')") ==> + ujson.Str("a = 1\nz = 2\n\"\\uffff\" = 3\n\"\\ud800\\udc00\" = 4") + } + } +} From 4e01cf76b370c2af0a07fb116954b2b33ea630a7 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 31 Aug 2025 17:24:37 -0700 Subject: [PATCH 02/14] Fix std.length --- sjsonnet/src/sjsonnet/Std.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index 047b5808..9b6cdd00 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -51,7 +51,7 @@ class Std( Val.Num( pos, x.force match { - case Val.Str(_, s) => s.length + case Val.Str(_, s) => s.codePointCount(0, s.length) case a: Val.Arr => a.length case o: Val.Obj => o.visibleKeyNames.length case o: Val.Func => o.params.names.length From e01705a64efacf7ef0787f195fcb88aa0ef4f049 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 31 Aug 2025 17:27:22 -0700 Subject: [PATCH 03/14] Fix codepoint. 
--- sjsonnet/src/sjsonnet/Std.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index 9b6cdd00..789886be 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -97,12 +97,14 @@ class Std( } private object Codepoint extends Val.Builtin1("codepoint", "str") { - def evalRhs(str: Lazy, ev: EvalScope, pos: Position): Val = if ( - str.force.asString.length != 1 || str.force.asString.codePointCount(0, 1) > 1 - ) { - Error.fail("expected a single character string, got " + str.force.asString) - } else { - Val.Num(pos, str.force.asString.codePointAt(0).toDouble) + def evalRhs(str: Lazy, ev: EvalScope, pos: Position): Val = { + val s = str.force.asString + val codePointCount = s.codePointCount(0, s.length) + if (codePointCount != 1) { + Error.fail("expected a single character string, got " + s) + } else { + Val.Num(pos, s.codePointAt(0).toDouble) + } } } From 742220a026a0ad005ed784e1736c8185465afe0f Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 31 Aug 2025 17:26:42 -0700 Subject: [PATCH 04/14] Fix slicing, indexing, substr --- sjsonnet/src/sjsonnet/Evaluator.scala | 8 ++- sjsonnet/src/sjsonnet/Std.scala | 15 ++++- sjsonnet/src/sjsonnet/Util.scala | 82 +++++++++++++++++++++------ 3 files changed, 82 insertions(+), 23 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Evaluator.scala b/sjsonnet/src/sjsonnet/Evaluator.scala index 4db8c676..891f7e4f 100644 --- a/sjsonnet/src/sjsonnet/Evaluator.scala +++ b/sjsonnet/src/sjsonnet/Evaluator.scala @@ -375,9 +375,11 @@ class Evaluator( case (v: Val.Str, i: Val.Num) => val int = i.asPositiveInt if (v.value.isEmpty) Error.fail("string bounds error: string is empty", pos) - if (int >= v.value.length) - Error.fail(s"string bounds error: $int not within [0, ${v.value.length})", pos) - Val.Str(pos, new String(Array(v.value(int)))) + val unicodeLength = v.value.codePointCount(0, v.value.length) + if (int >= 
unicodeLength) + Error.fail(s"string bounds error: $int not within [0, $unicodeLength)", pos) + val (startUtf16, endUtf16) = Util.codePointOffsetsToStringIndices(v.value, int, int + 1) + Val.Str(pos, v.value.substring(startUtf16, endUtf16)) case (v: Val.Obj, i: Val.Str) => v.value(i.value, pos) case (lhs, rhs) => diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index 789886be..a879b2bc 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -535,9 +535,18 @@ class Std( case v: Val.Num => v.asPositiveInt case _ => Error.fail("Expected a number for len in substr, got " + len.force.prettyName) } - val safeOffset = math.min(offset, str.length) - val safeLength = math.min(length, str.length - safeOffset) - Val.Str(pos, str.substring(safeOffset, safeOffset + safeLength)) + + val unicodeLength = str.codePointCount(0, str.length) + val safeOffset = math.min(offset, unicodeLength) + val safeLength = math.min(length, unicodeLength - safeOffset) + + if (safeLength <= 0) { + Val.Str(pos, "") + } else { + val (startUtf16, endUtf16) = + Util.codePointOffsetsToStringIndices(str, safeOffset, safeOffset + safeLength) + Val.Str(pos, str.substring(startUtf16, endUtf16)) + } } } diff --git a/sjsonnet/src/sjsonnet/Util.scala b/sjsonnet/src/sjsonnet/Util.scala index 9cc3e843..a4f86ff2 100644 --- a/sjsonnet/src/sjsonnet/Util.scala +++ b/sjsonnet/src/sjsonnet/Util.scala @@ -17,7 +17,7 @@ object Util { s"${line + 1}:${col + 1}" } - def sliceArr[T: scala.reflect.ClassTag]( + private def sliceArr[T: scala.reflect.ClassTag]( arr: Array[T], start: Int, end: Int, @@ -41,7 +41,7 @@ object Util { _end: Option[Int], _step: Option[Int]): Val = { def length0(e: Val): Int = e match { - case Val.Str(_, s) => s.length + case Val.Str(_, s) => s.codePointCount(0, s.length) case a: Val.Arr => a.length case x => Error.fail("Cannot get length of " + x.prettyName, e.pos)(ev) } @@ -72,26 +72,74 @@ object Util { res: Val } - def sliceArr[T: 
scala.reflect.ClassTag]( - arr: Array[T], - start: Option[Int], - end: Option[Int], - step: Option[Int]): Array[T] = { - sliceArr(arr, start.getOrElse(0), end.getOrElse(arr.length), step.getOrElse(1)) + /** + * Converts Unicode codepoint positions to Java String indices. For example, the string "🌍!" has + * a length of 3 UTF-16 code units, but only 2 Unicode codepoints, so this function would map the + * range (0, 2) to (0, 3). + */ + def codePointOffsetsToStringIndices( + s: String, + startCodePointOffset: Int, + endCodePointOffset: Int): (Int, Int) = { + val unicodeLength = s.codePointCount(0, s.length) + val safeStart = math.max(0, math.min(startCodePointOffset, unicodeLength)) + val safeEnd = math.max(safeStart, math.min(endCodePointOffset, unicodeLength)) + + if (safeStart == safeEnd) { + val utf16Pos = if (safeStart == 0) 0 else s.offsetByCodePoints(0, safeStart) + (utf16Pos, utf16Pos) + } else { + val startUtf16 = if (safeStart == 0) 0 else s.offsetByCodePoints(0, safeStart) + val endUtf16 = s.offsetByCodePoints(startUtf16, safeEnd - safeStart) + (startUtf16, endUtf16) + } } - def sliceStr(s: String, start: Int, end: Int, step: Int): String = { - if (start >= end || start >= s.length) { + + private def sliceStr(s: String, start: Int, end: Int, step: Int): String = { + val unicodeLength = s.codePointCount(0, s.length) + if (start >= end || start >= unicodeLength) { "" - } else + } else { step match { - case 1 => s.slice(start, end) + case 1 => + val (startUtf16, endUtf16) = codePointOffsetsToStringIndices(s, start, end) + s.substring(startUtf16, endUtf16) case _ => - val range = start until end by step - new String(range.dropWhile(_ < 0).takeWhile(_ < s.length).map(s).toArray) + val result = new java.lang.StringBuilder() + var sIdx = 0 + var codepointIndex = 0 + + // Skip to start codepoint position + while (sIdx < s.length && codepointIndex < start) { + val cp = s.codePointAt(sIdx) + sIdx += java.lang.Character.charCount(cp) + codepointIndex += 1 + } + + 
// Collect every `step`th codepoint until `end` + var rel = 0 // relative index from start + while (sIdx < s.length && codepointIndex < end) { + val c = s.charAt(sIdx) + if (java.lang.Character.isSurrogate(c)) { + // Handle surrogate pair + val cp = s.codePointAt(sIdx) + if (rel % step == 0) { + result.append(java.lang.Character.toChars(cp)) + } + sIdx += java.lang.Character.charCount(cp) + } else { + // Single char, non-surrogate + if (rel % step == 0) { + result.append(c) + } + sIdx += 1 + } + codepointIndex += 1 + rel += 1 + } + result.toString } - } - def sliceStr(s: String, start: Option[Int], end: Option[Int], step: Option[Int]): String = { - sliceStr(s, start.getOrElse(0), end.getOrElse(s.length), step.getOrElse(1)) + } } val isWindows: Boolean = { From 21050ea14055cb52e359ff662612aea00129342b Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 31 Aug 2025 17:27:33 -0700 Subject: [PATCH 05/14] Fix stringChars (and, by extension, map). --- sjsonnet/src/sjsonnet/Std.scala | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index a879b2bc..90ea62b9 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -2113,13 +2113,16 @@ class Std( } def stringChars(pos: Position, str: String): Val.Arr = { - val a = new Array[Lazy](str.length) + val chars = new Array[Lazy](str.codePointCount(0, str.length)) + var charIndex = 0 var i = 0 - while (i < a.length) { - a(i) = Val.Str(pos, String.valueOf(str.charAt(i))) - i += 1 + while (i < str.length) { + val codePoint = str.codePointAt(i) + chars(charIndex) = Val.Str(pos, new String(Character.toChars(codePoint))) + i += Character.charCount(codePoint) + charIndex += 1 } - Val.Arr(pos, a) + Val.Arr(pos, chars) } def getVisibleKeys(ev: EvalScope, v1: Val.Obj): Array[String] = From 0571d6e4d0a45414844258d403eff5648d4193e9 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 1 Sep 2025 17:24:47 -0700 Subject: 
[PATCH 06/14] Fix string comparison / ordering --- sjsonnet/src/sjsonnet/Evaluator.scala | 10 +++--- sjsonnet/src/sjsonnet/Materializer.scala | 2 +- sjsonnet/src/sjsonnet/Std.scala | 10 +++--- sjsonnet/src/sjsonnet/Util.scala | 41 ++++++++++++++++++++++++ sjsonnet/src/sjsonnet/Val.scala | 2 +- 5 files changed, 53 insertions(+), 12 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Evaluator.scala b/sjsonnet/src/sjsonnet/Evaluator.scala index 891f7e4f..25409ea7 100644 --- a/sjsonnet/src/sjsonnet/Evaluator.scala +++ b/sjsonnet/src/sjsonnet/Evaluator.scala @@ -524,7 +524,7 @@ class Evaluator( case Expr.BinaryOp.OP_< => (l, r) match { - case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l < r) + case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, Util.compareStringsByCodepoint(l, r) < 0) case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l < r) case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) < 0) case _ => fail() @@ -532,7 +532,7 @@ class Evaluator( case Expr.BinaryOp.OP_> => (l, r) match { - case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l > r) + case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, Util.compareStringsByCodepoint(l, r) > 0) case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l > r) case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) > 0) case _ => fail() @@ -540,7 +540,7 @@ class Evaluator( case Expr.BinaryOp.OP_<= => (l, r) match { - case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l <= r) + case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, Util.compareStringsByCodepoint(l, r) <= 0) case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l <= r) case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) <= 0) case _ => fail() @@ -548,7 +548,7 @@ class Evaluator( case Expr.BinaryOp.OP_>= => (l, r) match { - case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l >= r) + case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, Util.compareStringsByCodepoint(l, r) >= 0) case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, 
l >= r) case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) >= 0) case _ => fail() @@ -834,7 +834,7 @@ class Evaluator( def compare(x: Val, y: Val): Int = (x, y) match { case (_: Val.Null, _: Val.Null) => 0 case (x: Val.Num, y: Val.Num) => x.asDouble.compareTo(y.asDouble) - case (x: Val.Str, y: Val.Str) => x.value.compareTo(y.value) + case (x: Val.Str, y: Val.Str) => Util.compareStringsByCodepoint(x.value, y.value) case (x: Val.Bool, y: Val.Bool) => x.asBoolean.compareTo(y.asBoolean) case (x: Val.Arr, y: Val.Arr) => val len = math.min(x.length, y.length) diff --git a/sjsonnet/src/sjsonnet/Materializer.scala b/sjsonnet/src/sjsonnet/Materializer.scala index 77f21c5d..d1a3c614 100644 --- a/sjsonnet/src/sjsonnet/Materializer.scala +++ b/sjsonnet/src/sjsonnet/Materializer.scala @@ -34,7 +34,7 @@ abstract class Materializer { -1 ) if (sort) { - if (prevKey != null && k.compareTo(prevKey) <= 0) + if (prevKey != null && Util.compareStringsByCodepoint(k, prevKey) <= 0) Error.fail( s"""Internal error: Unexpected key "$k" after "$prevKey" in sorted object materialization""", v.pos diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index 90ea62b9..74ff8fdf 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -1042,7 +1042,7 @@ class Std( indexedPath: Seq[String])(implicit ev: EvalScope): StringWriter = { val (sections, nonSections) = v.visibleKeyNames.partition(k => isSection(v.value(k, v.pos)(ev))) - for (k <- nonSections.sorted) { + for (k <- nonSections.sorted(Util.CodepointStringOrdering)) { out.write(cumulatedIndent) out.write(TomlRenderer.escapeKey(k)) out.write(" = ") @@ -1052,7 +1052,7 @@ class Std( } out.write('\n') - for (k <- sections.sorted) { + for (k <- sections.sorted(Util.CodepointStringOrdering)) { val v0 = v.value(k, v.pos, v)(ev) if (isTableArray(v0)) { for (i <- 0 until v0.asArr.length) { @@ -2078,7 +2078,7 @@ class Std( val indices = Array.range(0, vs.length) val sortedIndices = if 
(keyType == classOf[Val.Str]) { - indices.sortBy(i => keys(i).cast[Val.Str].asString) + indices.sortBy(i => keys(i).cast[Val.Str].asString)(Util.CodepointStringOrdering) } else if (keyType == classOf[Val.Num]) { indices.sortBy(i => keys(i).cast[Val.Num].asDouble) } else if (keyType == classOf[Val.Arr]) { @@ -2097,7 +2097,7 @@ class Std( Error.fail("Cannot sort with values that are not all the same type") if (keyType == classOf[Val.Str]) { - vs.map(_.force.cast[Val.Str]).sortBy(_.asString) + vs.map(_.force.cast[Val.Str]).sortBy(_.asString)(Util.CodepointStringOrdering) } else if (keyType == classOf[Val.Num]) { vs.map(_.force.cast[Val.Num]).sortBy(_.asDouble) } else if (keyType == classOf[Val.Arr]) { @@ -2132,7 +2132,7 @@ class Std( maybeSortKeys(ev, v1.allKeyNames) @inline private def maybeSortKeys(ev: EvalScope, keys: Array[String]): Array[String] = - if (ev.settings.preserveOrder) keys else keys.sorted + if (ev.settings.preserveOrder) keys else keys.sorted(Util.CodepointStringOrdering) def getObjValuesFromKeys( pos: Position, diff --git a/sjsonnet/src/sjsonnet/Util.scala b/sjsonnet/src/sjsonnet/Util.scala index a4f86ff2..9bf9f1d3 100644 --- a/sjsonnet/src/sjsonnet/Util.scala +++ b/sjsonnet/src/sjsonnet/Util.scala @@ -142,6 +142,47 @@ object Util { } } + /** + * Compares two strings by Unicode codepoint values rather than UTF-16 code units. + * This ensures that strings with characters above U+FFFF (which require surrogate pairs + * in UTF-16) are compared correctly according to their Unicode codepoint values. 
+ */ + def compareStringsByCodepoint(s1: String, s2: String): Int = { + val n1 = s1.length + val n2 = s2.length + var i1 = 0 + var i2 = 0 + while (i1 < n1 && i2 < n2) { + val c1 = s1.charAt(i1) + val c2 = s2.charAt(i2) + val c1Sur = java.lang.Character.isSurrogate(c1) + val c2Sur = java.lang.Character.isSurrogate(c2) + + if (!c1Sur && !c2Sur) { + // Both are non-surrogates, compare directly + if (c1 != c2) return java.lang.Character.compare(c1, c2) + i1 += 1 + i2 += 1 + } else { + // At least one is a surrogate, use full codepoint logic + val cp1 = s1.codePointAt(i1) + val cp2 = s2.codePointAt(i2) + if (cp1 != cp2) return java.lang.Integer.compare(cp1, cp2) + i1 += java.lang.Character.charCount(cp1) + i2 += java.lang.Character.charCount(cp2) + } + } + if (i1 < n1) 1 else if (i2 < n2) -1 else 0 + } + + /** + * A reusable Ordering[String] that compares by Unicode codepoint values. + * Use this in place of default `.sorted` when ordering should be codepoint-aware. + */ + val CodepointStringOrdering: Ordering[String] = new Ordering[String] { + override def compare(x: String, y: String): Int = compareStringsByCodepoint(x, y) + } + val isWindows: Boolean = { // This is normally non-null on the JVM, but it might be null in ScalaJS hence the Option: Option(System.getProperty("os.name")).exists(_.toLowerCase.startsWith("windows")) diff --git a/sjsonnet/src/sjsonnet/Val.scala b/sjsonnet/src/sjsonnet/Val.scala index 1fd07a8e..c4216668 100644 --- a/sjsonnet/src/sjsonnet/Val.scala +++ b/sjsonnet/src/sjsonnet/Val.scala @@ -455,7 +455,7 @@ object Val { def foreachElement(sort: Boolean, pos: Position)(f: (String, Val) => Unit)(implicit ev: EvalScope): Unit = { - val keys = if (sort) visibleKeyNames.sorted else visibleKeyNames + val keys = if (sort) visibleKeyNames.sorted(Util.CodepointStringOrdering) else visibleKeyNames for (k <- keys) { val v = value(k, pos) f(k, v) From f48762e55292131ab374da8dd2b0e4962ca6b50f Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 3 Sep 2025 
11:29:31 -0700 Subject: [PATCH 07/14] Use Java 11 Character.toString(codepoint: int); minor import cleanup --- sjsonnet/src/sjsonnet/Std.scala | 2 +- sjsonnet/src/sjsonnet/Util.scala | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index 74ff8fdf..9e062cdb 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -2118,7 +2118,7 @@ class Std( var i = 0 while (i < str.length) { val codePoint = str.codePointAt(i) - chars(charIndex) = Val.Str(pos, new String(Character.toChars(codePoint))) + chars(charIndex) = Val.Str(pos, Character.toString(codePoint)) i += Character.charCount(codePoint) charIndex += 1 } diff --git a/sjsonnet/src/sjsonnet/Util.scala b/sjsonnet/src/sjsonnet/Util.scala index 9bf9f1d3..664d951f 100644 --- a/sjsonnet/src/sjsonnet/Util.scala +++ b/sjsonnet/src/sjsonnet/Util.scala @@ -112,7 +112,7 @@ object Util { // Skip to start codepoint position while (sIdx < s.length && codepointIndex < start) { val cp = s.codePointAt(sIdx) - sIdx += java.lang.Character.charCount(cp) + sIdx += Character.charCount(cp) codepointIndex += 1 } @@ -120,13 +120,13 @@ object Util { var rel = 0 // relative index from start while (sIdx < s.length && codepointIndex < end) { val c = s.charAt(sIdx) - if (java.lang.Character.isSurrogate(c)) { + if (Character.isSurrogate(c)) { // Handle surrogate pair val cp = s.codePointAt(sIdx) if (rel % step == 0) { - result.append(java.lang.Character.toChars(cp)) + result.append(Character.toString(cp)) } - sIdx += java.lang.Character.charCount(cp) + sIdx += Character.charCount(cp) } else { // Single char, non-surrogate if (rel % step == 0) { @@ -155,21 +155,21 @@ object Util { while (i1 < n1 && i2 < n2) { val c1 = s1.charAt(i1) val c2 = s2.charAt(i2) - val c1Sur = java.lang.Character.isSurrogate(c1) - val c2Sur = java.lang.Character.isSurrogate(c2) + val c1Sur = Character.isSurrogate(c1) + val c2Sur = 
Character.isSurrogate(c2) if (!c1Sur && !c2Sur) { // Both are non-surrogates, compare directly - if (c1 != c2) return java.lang.Character.compare(c1, c2) + if (c1 != c2) return Character.compare(c1, c2) i1 += 1 i2 += 1 } else { // At least one is a surrogate, use full codepoint logic val cp1 = s1.codePointAt(i1) val cp2 = s2.codePointAt(i2) - if (cp1 != cp2) return java.lang.Integer.compare(cp1, cp2) - i1 += java.lang.Character.charCount(cp1) - i2 += java.lang.Character.charCount(cp2) + if (cp1 != cp2) return Integer.compare(cp1, cp2) + i1 += Character.charCount(cp1) + i2 += Character.charCount(cp2) } } if (i1 < n1) 1 else if (i2 < n2) -1 else 0 From 1d461dd20901a84519354cfa9aa66d358ca3ae0d Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 3 Sep 2025 11:46:48 -0700 Subject: [PATCH 08/14] Fixes to flatMap and findSubstr (WIP) --- sjsonnet/src/sjsonnet/Std.scala | 21 ++++-- .../src/sjsonnet/UnicodeHandlingTests.scala | 64 +++++++++++++++++++ 2 files changed, 81 insertions(+), 4 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index 9e062cdb..060fca14 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -1416,10 +1416,13 @@ class Std( Val.Arr(pos, arrResults) case s: Val.Str => - val builder = new StringBuilder() - for (c: Char <- s.value) { + val builder = new java.lang.StringBuilder() + var i = 0 + while (i < s.value.length) { + val codePoint = s.value.codePointAt(i) + val codepointStr = Character.toString(codePoint) val fres = - func.apply1(Val.Str(pos, c.toString), pos.noOffset)(ev, TailstrictModeDisabled) + func.apply1(Val.Str(pos, codepointStr), pos.noOffset)(ev, TailstrictModeDisabled) builder.append( fres match { case fstr: Val.Str => fstr.value @@ -1433,6 +1436,7 @@ class Std( ) } ) + i += Character.charCount(codePoint) } Val.Str(pos, builder.toString) case _ => Error.fail("Argument must be either array or string") @@ -1462,8 +1466,17 @@ class Std( if (matchIndex == -1) 
Val.Arr(pos, emptyLazyArray) else { val indices = new mutable.ArrayBuilder.ofRef[Val.Num] + + // Compute codepoint indices incrementally, avoiding an O(n) calculation for each match. + var prevCharIndex = 0 + var prevCodePointIndex = 0 + while (0 <= matchIndex && matchIndex < str.length) { - indices.+=(Val.Num(pos, matchIndex)) + val codePointIndex = prevCodePointIndex + str.codePointCount(prevCharIndex, matchIndex) + indices.+=(Val.Num(pos, codePointIndex)) + + prevCharIndex = matchIndex + prevCodePointIndex = codePointIndex matchIndex = str.indexOf(pat, matchIndex + 1) } Val.Arr(pos, indices.result()) diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala index 097644ac..6ca5e6b2 100644 --- a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala +++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala @@ -170,5 +170,69 @@ object UnicodeHandlingTests extends TestSuite { eval(s"std.manifestTomlEx($testObject, ' ')") ==> ujson.Str("a = 1\nz = 2\n\"\\uffff\" = 3\n\"\\ud800\\udc00\" = 4") } + + test("flatMap") { + // Test that std.flatMap now uses code point semantics like std.map + // This tests the consistency fix for flatMap on strings + + // Basic ASCII test + eval("std.flatMap(function(x) x + x, 'ABC')") ==> ujson.Str("AABBCC") + + // Unicode emoji test - should handle each emoji as a single character + eval("std.flatMap(function(x) x + '!', '🌍🚀')") ==> ujson.Str("🌍!🚀!") + + // Critical test: surrogate pair boundary case + // U+FFFF followed by U+10000 (which needs surrogate pair) + eval("std.flatMap(function(x) x + '-', '\\uFFFF\\uD800\\uDC00')") ==> ujson.Str("\uFFFF-\uD800\uDC00-") + + // Test consistency with std.map behavior + val testStr = "'A🌍B'"; + eval(s"std.length(std.flatMap(function(x) x, $testStr))") ==> + eval(s"std.length(std.map(function(x) x, $testStr))") + + // Test with function that returns null (should be converted to empty string) + 
eval("std.flatMap(function(x) if x == '🌍' then null else x, 'A🌍B')") ==> ujson.Str("AB") + + // Array flatMap should still work (unchanged behavior) + eval("std.flatMap(function(x) [x, x], [1, 2])") ==> ujson.Arr(1, 1, 2, 2) + } + + test("findSubstr") { + // Test that std.findSubstr now returns code point offsets instead of code unit positions + // This tests the consistency fix for findSubstr indices + + // Basic ASCII test - should be unchanged + eval("std.findSubstr('l', 'hello')") ==> ujson.Arr(2, 3) + eval("std.findSubstr('o', 'hello world')") ==> ujson.Arr(4, 7) + + // Test with Unicode emojis - positions should be in code points + eval("std.findSubstr('🌍', 'Hello 🌍 World')") ==> ujson.Arr(6) + eval("std.findSubstr('o', 'Hello 🌍 World')") ==> ujson.Arr(4, 9) + + // Critical test: surrogate pair boundary case + // The string "A\uFFFF\uD800\uDC00B" has: + // - 'A' at code point 0 + // - U+FFFF at code point 1 + // - U+10000 (\uD800\uDC00) at code point 2 + // - 'B' at code point 3 + eval("std.findSubstr('\\uFFFF', 'A\\uFFFF\\uD800\\uDC00B')") ==> ujson.Arr(1) + eval("std.findSubstr('\\uD800\\uDC00', 'A\\uFFFF\\uD800\\uDC00B')") ==> ujson.Arr(2) + eval("std.findSubstr('B', 'A\\uFFFF\\uD800\\uDC00B')") ==> ujson.Arr(3) + + // Test with pattern that spans multiple code points + eval("std.findSubstr('🌍🚀', '🌍🚀 and more 🌍🚀')") ==> ujson.Arr(0, 12) + + // Empty pattern should return empty array + eval("std.findSubstr('', 'test')") ==> ujson.Arr() + + // Non-existent pattern should return empty array + eval("std.findSubstr('xyz', 'hello world')") ==> ujson.Arr() + + // Consistency check: positions returned by findSubstr should work with substr + val testString = "'Hello 🌍 World'"; + val searchPattern = "'🌍'"; + // This verifies that the index returned by findSubstr works with substr to extract the same pattern + eval(s"std.substr($testString, std.findSubstr($searchPattern, $testString)[0], std.length($searchPattern))") ==> 
ujson.Str("🌍") + } } } From 2d609947acdb8f818f577231377a2d4ec1493d43 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 3 Sep 2025 13:09:34 -0700 Subject: [PATCH 09/14] scalafmt --- sjsonnet/src/sjsonnet/Evaluator.scala | 12 ++++++---- sjsonnet/src/sjsonnet/Util.scala | 10 ++++---- .../src/sjsonnet/UnicodeHandlingTests.scala | 24 ++++++++++++------- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Evaluator.scala b/sjsonnet/src/sjsonnet/Evaluator.scala index 25409ea7..1c9522e0 100644 --- a/sjsonnet/src/sjsonnet/Evaluator.scala +++ b/sjsonnet/src/sjsonnet/Evaluator.scala @@ -524,7 +524,8 @@ class Evaluator( case Expr.BinaryOp.OP_< => (l, r) match { - case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, Util.compareStringsByCodepoint(l, r) < 0) + case (Val.Str(_, l), Val.Str(_, r)) => + Val.bool(pos, Util.compareStringsByCodepoint(l, r) < 0) case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l < r) case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) < 0) case _ => fail() @@ -532,7 +533,8 @@ class Evaluator( case Expr.BinaryOp.OP_> => (l, r) match { - case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, Util.compareStringsByCodepoint(l, r) > 0) + case (Val.Str(_, l), Val.Str(_, r)) => + Val.bool(pos, Util.compareStringsByCodepoint(l, r) > 0) case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l > r) case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) > 0) case _ => fail() @@ -540,7 +542,8 @@ class Evaluator( case Expr.BinaryOp.OP_<= => (l, r) match { - case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, Util.compareStringsByCodepoint(l, r) <= 0) + case (Val.Str(_, l), Val.Str(_, r)) => + Val.bool(pos, Util.compareStringsByCodepoint(l, r) <= 0) case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l <= r) case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) <= 0) case _ => fail() @@ -548,7 +551,8 @@ class Evaluator( case Expr.BinaryOp.OP_>= => (l, r) match { - case (Val.Str(_, l), Val.Str(_, r)) => 
Val.bool(pos, Util.compareStringsByCodepoint(l, r) >= 0) + case (Val.Str(_, l), Val.Str(_, r)) => + Val.bool(pos, Util.compareStringsByCodepoint(l, r) >= 0) case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l >= r) case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) >= 0) case _ => fail() diff --git a/sjsonnet/src/sjsonnet/Util.scala b/sjsonnet/src/sjsonnet/Util.scala index 664d951f..efa2eb3b 100644 --- a/sjsonnet/src/sjsonnet/Util.scala +++ b/sjsonnet/src/sjsonnet/Util.scala @@ -143,9 +143,9 @@ object Util { } /** - * Compares two strings by Unicode codepoint values rather than UTF-16 code units. - * This ensures that strings with characters above U+FFFF (which require surrogate pairs - * in UTF-16) are compared correctly according to their Unicode codepoint values. + * Compares two strings by Unicode codepoint values rather than UTF-16 code units. This ensures + * that strings with characters above U+FFFF (which require surrogate pairs in UTF-16) are + * compared correctly according to their Unicode codepoint values. */ def compareStringsByCodepoint(s1: String, s2: String): Int = { val n1 = s1.length @@ -176,8 +176,8 @@ object Util { } /** - * A reusable Ordering[String] that compares by Unicode codepoint values. - * Use this in place of default `.sorted` when ordering should be codepoint-aware. + * A reusable Ordering[String] that compares by Unicode codepoint values. Use this in place of + * default `.sorted` when ordering should be codepoint-aware. 
*/ val CodepointStringOrdering: Ordering[String] = new Ordering[String] { override def compare(x: String, y: String): Int = compareStringsByCodepoint(x, y) diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala index 6ca5e6b2..f5671848 100644 --- a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala +++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala @@ -115,7 +115,7 @@ object UnicodeHandlingTests extends TestSuite { // Comprehensive test of ALL comparison operators with the critical UTF-16 vs Unicode boundary case // This ensures mutation testing catches bugs in any specific comparison implementation - val maxBmp = "'\\uFFFF'" // U+FFFF (max Basic Multilingual Plane) + val maxBmp = "'\\uFFFF'" // U+FFFF (max Basic Multilingual Plane) val minSupplementary = "'\\uD800\\uDC00'" // U+10000 (min Supplementary Plane) // All comparison operators must use Unicode codepoint ordering @@ -146,10 +146,14 @@ object UnicodeHandlingTests extends TestSuite { eval(s"$maxBmp != $maxBmp") ==> ujson.Bool(false) // Array sorting must also use codepoint ordering - eval(s"std.sort([$minSupplementary, $maxBmp, 'Z', 'A'])") ==> ujson.Arr("A", "Z", "\uFFFF", "\uD800\uDC00") + eval(s"std.sort([$minSupplementary, $maxBmp, 'Z', 'A'])") ==> ujson.Arr( + "A", + "Z", + "\uFFFF", + "\uD800\uDC00" + ) } - test("objectFieldOrdering") { // Test object field ordering with the critical UTF-16 vs Unicode boundary case // Input deliberately in REVERSE codepoint order to make the test non-trivial @@ -164,11 +168,11 @@ object UnicodeHandlingTests extends TestSuite { // JSON manifest should maintain the same ordering eval(s"std.manifestJsonMinified($testObject)") ==> - ujson.Str("{\"a\":1,\"z\":2,\"\uFFFF\":3,\"\uD800\uDC00\":4}") + ujson.Str("{\"a\":1,\"z\":2,\"\uFFFF\":3,\"\uD800\uDC00\":4}") // TOML manifest should also use codepoint ordering (with escaped Unicode) eval(s"std.manifestTomlEx($testObject, ' ')") ==> - 
ujson.Str("a = 1\nz = 2\n\"\\uffff\" = 3\n\"\\ud800\\udc00\" = 4") + ujson.Str("a = 1\nz = 2\n\"\\uffff\" = 3\n\"\\ud800\\udc00\" = 4") } test("flatMap") { @@ -183,12 +187,14 @@ object UnicodeHandlingTests extends TestSuite { // Critical test: surrogate pair boundary case // U+FFFF followed by U+10000 (which needs surrogate pair) - eval("std.flatMap(function(x) x + '-', '\\uFFFF\\uD800\\uDC00')") ==> ujson.Str("\uFFFF-\uD800\uDC00-") + eval("std.flatMap(function(x) x + '-', '\\uFFFF\\uD800\\uDC00')") ==> ujson.Str( + "\uFFFF-\uD800\uDC00-" + ) // Test consistency with std.map behavior val testStr = "'A🌍B'"; eval(s"std.length(std.flatMap(function(x) x, $testStr))") ==> - eval(s"std.length(std.map(function(x) x, $testStr))") + eval(s"std.length(std.map(function(x) x, $testStr))") // Test with function that returns null (should be converted to empty string) eval("std.flatMap(function(x) if x == '🌍' then null else x, 'A🌍B')") ==> ujson.Str("AB") @@ -232,7 +238,9 @@ object UnicodeHandlingTests extends TestSuite { val testString = "'Hello 🌍 World'"; val searchPattern = "'🌍'"; // This verifies that the index returned by findSubstr works with substr to extract the same pattern - eval(s"std.substr($testString, std.findSubstr($searchPattern, $testString)[0], std.length($searchPattern))") ==> ujson.Str("🌍") + eval( + s"std.substr($testString, std.findSubstr($searchPattern, $testString)[0], std.length($searchPattern))" + ) ==> ujson.Str("🌍") } } } From 9497d570c6a4020c1b95602cf7c8be6bf3737572 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 8 Sep 2025 16:34:28 -0700 Subject: [PATCH 10/14] Extract v.value into a local --- sjsonnet/src/sjsonnet/Evaluator.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Evaluator.scala b/sjsonnet/src/sjsonnet/Evaluator.scala index 1c9522e0..1e430392 100644 --- a/sjsonnet/src/sjsonnet/Evaluator.scala +++ b/sjsonnet/src/sjsonnet/Evaluator.scala @@ -374,12 +374,13 @@ 
class Evaluator( v.force(int) case (v: Val.Str, i: Val.Num) => val int = i.asPositiveInt - if (v.value.isEmpty) Error.fail("string bounds error: string is empty", pos) - val unicodeLength = v.value.codePointCount(0, v.value.length) + val str = v.value + if (str.isEmpty) Error.fail("string bounds error: string is empty", pos) + val unicodeLength = str.codePointCount(0, str.length) if (int >= unicodeLength) Error.fail(s"string bounds error: $int not within [0, $unicodeLength)", pos) - val (startUtf16, endUtf16) = Util.codePointOffsetsToStringIndices(v.value, int, int + 1) - Val.Str(pos, v.value.substring(startUtf16, endUtf16)) + val (startUtf16, endUtf16) = Util.codePointOffsetsToStringIndices(str, int, int + 1) + Val.Str(pos, str.substring(startUtf16, endUtf16)) case (v: Val.Obj, i: Val.Str) => v.value(i.value, pos) case (lhs, rhs) => From 603aed9d4b5d0474408e531cce1e697867f5e581 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 8 Sep 2025 16:34:58 -0700 Subject: [PATCH 11/14] Pre-size string builder and avoid modulus. 
--- sjsonnet/src/sjsonnet/Util.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Util.scala b/sjsonnet/src/sjsonnet/Util.scala index efa2eb3b..454026c5 100644 --- a/sjsonnet/src/sjsonnet/Util.scala +++ b/sjsonnet/src/sjsonnet/Util.scala @@ -105,7 +105,8 @@ object Util { val (startUtf16, endUtf16) = codePointOffsetsToStringIndices(s, start, end) s.substring(startUtf16, endUtf16) case _ => - val result = new java.lang.StringBuilder() + val result = new java.lang.StringBuilder( + math.min(s.length, ((end - start) + step - 1) / step)) var sIdx = 0 var codepointIndex = 0 @@ -118,19 +119,22 @@ object Util { // Collect every `step`th codepoint until `end` var rel = 0 // relative index from start + var nextInclude = 0 // next relative index to include while (sIdx < s.length && codepointIndex < end) { val c = s.charAt(sIdx) if (Character.isSurrogate(c)) { - // Handle surrogate pair + // Handle surrogate pair (or unpaired surrogates) val cp = s.codePointAt(sIdx) - if (rel % step == 0) { + if (rel == nextInclude) { result.append(Character.toString(cp)) + nextInclude += step } sIdx += Character.charCount(cp) } else { // Single char, non-surrogate - if (rel % step == 0) { + if (rel == nextInclude) { result.append(c) + nextInclude += step } sIdx += 1 } From 733b6e956239d1caaaf4e9d07c1d0b1256868cf5 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 8 Sep 2025 17:02:36 -0700 Subject: [PATCH 12/14] scalafmt --- sjsonnet/src/sjsonnet/Util.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Util.scala b/sjsonnet/src/sjsonnet/Util.scala index 454026c5..1ed51e66 100644 --- a/sjsonnet/src/sjsonnet/Util.scala +++ b/sjsonnet/src/sjsonnet/Util.scala @@ -105,8 +105,8 @@ object Util { val (startUtf16, endUtf16) = codePointOffsetsToStringIndices(s, start, end) s.substring(startUtf16, endUtf16) case _ => - val result = new java.lang.StringBuilder( - math.min(s.length, ((end - start) + step 
- 1) / step)) + val result = + new java.lang.StringBuilder(math.min(s.length, ((end - start) + step - 1) / step)) var sIdx = 0 var codepointIndex = 0 From 8d5a95f932043646a60d0e5e7e7a09d3ccbbc3bf Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 8 Sep 2025 17:02:54 -0700 Subject: [PATCH 13/14] Test cleanups. --- .../src/sjsonnet/UnicodeHandlingTests.scala | 151 ++++++------------ 1 file changed, 47 insertions(+), 104 deletions(-) diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala index f5671848..dd4d1587 100644 --- a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala +++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala @@ -5,7 +5,7 @@ import TestUtils.{eval, evalErr} /** * Tests for correct handling of Unicode strings, especially those that require surrogate pairs in - * UTF-16 (i.e., codepoints above U+FFFF). + * UTF-16 (i.e., characters with codepoints above U+FFFF). */ object UnicodeHandlingTests extends TestSuite { def tests: Tests = Tests { @@ -13,7 +13,9 @@ object UnicodeHandlingTests extends TestSuite { test("stringLength") { eval("std.length('🌍')") ==> ujson.Num(1) eval("std.length('Hello 🌍')") ==> ujson.Num(7) - eval("std.length('👨‍👩‍👧‍👦')") ==> ujson.Num(7) // Family emoji with ZWJ sequences + // Jsonnet strings are defined over codepoints, not grapheme clusters, so the + // following "family" emoji has a length of 7 (because it has 7 codepoints): + eval("std.length('👨‍👩‍👧‍👦')") ==> ujson.Num(7) } test("stringIndex") { @@ -43,6 +45,10 @@ object UnicodeHandlingTests extends TestSuite { eval("std.map(function(x) std.codepoint(x), 'A🌍B')") ==> ujson.Arr(65, 127757, 66) } + test("flatMap") { + eval("std.flatMap(function(x) x + '!', '🌍🚀')") ==> ujson.Str("🌍!🚀!") + } + test("substr") { eval("std.substr('A🌍B', 0, 1)") ==> ujson.Str("A") eval("std.substr('A🌍B', 1, 1)") ==> ujson.Str("🌍") @@ -58,24 +64,17 @@ object 
UnicodeHandlingTests extends TestSuite { eval("'ABC🚀'[-2:]") ==> ujson.Str("C🚀") } - test("codepointVsUtf16Ordering") { - // This test demonstrates why sjsonnet uses Unicode codepoint ordering instead of UTF-16 code unit ordering. + test("codepointVsUtf16OrderingDemonstration") { + // This test demonstrates the difference between UTF-16 code unit ordering (wrong) + // and Unicode codepoint ordering (correct) at the critical boundary between + // the Basic Multilingual Plane (BMP) and Supplementary Planes. // - // The problem: UTF-16 encodes characters above U+FFFF as "surrogate pairs" - two 16-bit code units. - // When comparing strings by UTF-16 code units, we only look at the raw 16-bit values, not the - // actual Unicode codepoints they represent. + // Test case: U+FFFF (last BMP char) vs U+10000 (first supplementary char) + // - U+FFFF is represented as a single UTF-16 code unit: 0xFFFF + // - U+10000 requires a surrogate pair: 0xD800 0xDC00 // - // Critical test case: - // - U+FFFF = "\uFFFF" → UTF-16: [0xFFFF] (single code unit) - // - U+10000 = "\uD800\uDC00" → UTF-16: [0xD800, 0xDC00] (surrogate pair: high + low surrogate) - // - // Correct Unicode codepoint order: U+FFFF (65535) < U+10000 (65536) → "\uFFFF" < "\uD800\uDC00" - // - // Wrong UTF-16 code unit order: When comparing "\uFFFF" vs "\uD800\uDC00", UTF-16 comparison sees: - // 0xFFFF (65535) vs 0xD800 (55296) ← only looks at first code unit of surrogate pair! - // Since 65535 > 55296, it incorrectly concludes "\uFFFF" > "\uD800\uDC00" - // - // This breaks ordering for ALL characters above U+FFFF (emojis, math symbols, etc.) 
+ // UTF-16 comparison incorrectly compares 0xFFFF > 0xD800 (first code unit only) + // Unicode comparison correctly compares U+FFFF < U+10000 (actual codepoints) val testStrings = Array("\uFFFF", "\uD800\uDC00") // U+FFFF, U+10000 @@ -87,40 +86,42 @@ object UnicodeHandlingTests extends TestSuite { val codepointSorted = testStrings.sorted(sjsonnet.Util.CodepointStringOrdering).toList codepointSorted ==> List("\uFFFF", "\uD800\uDC00") - // Critical: these produce different results! This test would fail with UTF-16 ordering. + // These produce different results, demonstrating the bug that was fixed assert(utf16Sorted != codepointSorted) + } - // Jsonnet string operations should use Unicode codepoint ordering + test("codepointOrderingInJsonnet") { + // Verify that Jsonnet operations use Unicode codepoint ordering eval("'\\uFFFF' < '\\uD800\\uDC00'") ==> ujson.Bool(true) eval("std.sort(['\\uD800\\uDC00', '\\uFFFF'])") ==> ujson.Arr("\uFFFF", "\uD800\uDC00") } - // sjsonnet allows unpaired surrogates in Unicode escape sequences, - // unlike go-jsonnet and C++ jsonnet which reject them at parse time. + // Unpaired surrogate handling - sjsonnet-specific behavior + // + // Note: This is an intentional divergence from go-jsonnet and C++ jsonnet: + // - go/C++ reject unpaired surrogates in escape sequences at parse time + // - go-jsonnet's std.char() replaces surrogate codepoints with U+FFFD + // - sjsonnet preserves unpaired surrogates throughout + // + // This permissive behavior is maintained for backwards compatibility. 
test("unpairedSurrogatesInEscapes") { - // These should parse successfully (go-jsonnet/C++ jsonnet would fail) - eval("\"\\uD800\"") ==> ujson.Str("\uD800") // High surrogate - eval("\"\\uDC00\"") ==> ujson.Str("\uDC00") // Low surrogate - eval("\"\\uD83D\\uDE00\"") ==> ujson.Str("😀") // Valid pair + // sjsonnet parses these successfully (go/C++ would reject) + eval("\"\\uD800\"") ==> ujson.Str("\uD800") // High surrogate alone + eval("\"\\uDC00\"") ==> ujson.Str("\uDC00") // Low surrogate alone + eval("\"\\uD83D\\uDE00\"") ==> ujson.Str("😀") // Valid surrogate pair } test("stdCharPreservesRawSurrogates") { - // std.char() preserves raw surrogate codepoints (go-jsonnet replaces with U+FFFD) + // sjsonnet preserves raw surrogate codepoints (go-jsonnet would replace with U+FFFD) eval("std.codepoint(std.char(55296))") ==> ujson.Num(55296) // 0xD800 high surrogate eval("std.codepoint(std.char(56320))") ==> ujson.Num(56320) // 0xDC00 low surrogate } test("stringComparisons") { - // Comprehensive test of ALL comparison operators with the critical UTF-16 vs Unicode boundary case - // This ensures mutation testing catches bugs in any specific comparison implementation - val maxBmp = "'\\uFFFF'" // U+FFFF (max Basic Multilingual Plane) val minSupplementary = "'\\uD800\\uDC00'" // U+10000 (min Supplementary Plane) - // All comparison operators must use Unicode codepoint ordering - // With UTF-16 code unit ordering, these would ALL give wrong results - // Less than: U+FFFF < U+10000 eval(s"$maxBmp < $minSupplementary") ==> ujson.Bool(true) eval(s"$minSupplementary < $maxBmp") ==> ujson.Bool(false) @@ -128,7 +129,7 @@ object UnicodeHandlingTests extends TestSuite { // Less than or equal: U+FFFF <= U+10000 eval(s"$maxBmp <= $minSupplementary") ==> ujson.Bool(true) eval(s"$minSupplementary <= $maxBmp") ==> ujson.Bool(false) - eval(s"$maxBmp <= $maxBmp") ==> ujson.Bool(true) // Reflexivity + eval(s"$maxBmp <= $maxBmp") ==> ujson.Bool(true) // Greater than: U+10000 > U+FFFF 
eval(s"$minSupplementary > $maxBmp") ==> ujson.Bool(true) @@ -137,12 +138,12 @@ object UnicodeHandlingTests extends TestSuite { // Greater than or equal: U+10000 >= U+FFFF eval(s"$minSupplementary >= $maxBmp") ==> ujson.Bool(true) eval(s"$maxBmp >= $minSupplementary") ==> ujson.Bool(false) - eval(s"$maxBmp >= $maxBmp") ==> ujson.Bool(true) // Reflexivity + eval(s"$maxBmp >= $maxBmp") ==> ujson.Bool(true) // Equality and inequality eval(s"$maxBmp == $minSupplementary") ==> ujson.Bool(false) eval(s"$maxBmp != $minSupplementary") ==> ujson.Bool(true) - eval(s"$maxBmp == $maxBmp") ==> ujson.Bool(true) // Reflexivity + eval(s"$maxBmp == $maxBmp") ==> ujson.Bool(true) eval(s"$maxBmp != $maxBmp") ==> ujson.Bool(false) // Array sorting must also use codepoint ordering @@ -155,92 +156,34 @@ object UnicodeHandlingTests extends TestSuite { } test("objectFieldOrdering") { - // Test object field ordering with the critical UTF-16 vs Unicode boundary case - // Input deliberately in REVERSE codepoint order to make the test non-trivial val testObject = "{\"\uD800\uDC00\": 4, \"\uFFFF\": 3, \"z\": 2, \"a\": 1}" - // All object field functions should sort by Unicode codepoint order + // Object fields: eval(s"std.objectFields($testObject)") ==> ujson.Arr("a", "z", "\uFFFF", "\uD800\uDC00") eval(s"std.objectFieldsAll($testObject)") ==> ujson.Arr("a", "z", "\uFFFF", "\uD800\uDC00") - // Default object rendering also uses codepoint ordering + // Default object rendering: eval(testObject).toString ==> "{\"a\":1,\"z\":2,\"\uFFFF\":3,\"\uD800\uDC00\":4}" - // JSON manifest should maintain the same ordering + // JSON manifest variants: eval(s"std.manifestJsonMinified($testObject)") ==> ujson.Str("{\"a\":1,\"z\":2,\"\uFFFF\":3,\"\uD800\uDC00\":4}") - // TOML manifest should also use codepoint ordering (with escaped Unicode) - eval(s"std.manifestTomlEx($testObject, ' ')") ==> - ujson.Str("a = 1\nz = 2\n\"\\uffff\" = 3\n\"\\ud800\\udc00\" = 4") - } - - test("flatMap") { - // Test that 
std.flatMap now uses code point semantics like std.map - // This tests the consistency fix for flatMap on strings + eval(s"std.manifestJson($testObject)") ==> + ujson.Str("{\n \"a\": 1,\n \"z\": 2,\n \"\uFFFF\": 3,\n \"\uD800\uDC00\": 4\n}") - // Basic ASCII test - eval("std.flatMap(function(x) x + x, 'ABC')") ==> ujson.Str("AABBCC") + eval(s"std.manifestJsonEx($testObject, ' ')") ==> + ujson.Str("{\n \"a\": 1,\n \"z\": 2,\n \"\uFFFF\": 3,\n \"\uD800\uDC00\": 4\n}") - // Unicode emoji test - should handle each emoji as a single character - eval("std.flatMap(function(x) x + '!', '🌍🚀')") ==> ujson.Str("🌍!🚀!") - - // Critical test: surrogate pair boundary case - // U+FFFF followed by U+10000 (which needs surrogate pair) - eval("std.flatMap(function(x) x + '-', '\\uFFFF\\uD800\\uDC00')") ==> ujson.Str( - "\uFFFF-\uD800\uDC00-" - ) - - // Test consistency with std.map behavior - val testStr = "'A🌍B'"; - eval(s"std.length(std.flatMap(function(x) x, $testStr))") ==> - eval(s"std.length(std.map(function(x) x, $testStr))") - - // Test with function that returns null (should be converted to empty string) - eval("std.flatMap(function(x) if x == '🌍' then null else x, 'A🌍B')") ==> ujson.Str("AB") - - // Array flatMap should still work (unchanged behavior) - eval("std.flatMap(function(x) [x, x], [1, 2])") ==> ujson.Arr(1, 1, 2, 2) + // TOML manifest: + eval(s"std.manifestTomlEx($testObject, ' ')") ==> + ujson.Str("a = 1\nz = 2\n\"\\uffff\" = 3\n\"\\ud800\\udc00\" = 4") } test("findSubstr") { - // Test that std.findSubstr now returns code point offsets instead of code unit positions - // This tests the consistency fix for findSubstr indices - - // Basic ASCII test - should be unchanged - eval("std.findSubstr('l', 'hello')") ==> ujson.Arr(2, 3) - eval("std.findSubstr('o', 'hello world')") ==> ujson.Arr(4, 7) - - // Test with Unicode emojis - positions should be in code points eval("std.findSubstr('🌍', 'Hello 🌍 World')") ==> ujson.Arr(6) 
eval("std.findSubstr('o', 'Hello 🌍 World')") ==> ujson.Arr(4, 9) - - // Critical test: surrogate pair boundary case - // The string "A\uFFFF\uD800\uDC00B" has: - // - 'A' at code point 0 - // - U+FFFF at code point 1 - // - U+10000 (\uD800\uDC00) at code point 2 - // - 'B' at code point 3 - eval("std.findSubstr('\\uFFFF', 'A\\uFFFF\\uD800\\uDC00B')") ==> ujson.Arr(1) - eval("std.findSubstr('\\uD800\\uDC00', 'A\\uFFFF\\uD800\\uDC00B')") ==> ujson.Arr(2) - eval("std.findSubstr('B', 'A\\uFFFF\\uD800\\uDC00B')") ==> ujson.Arr(3) - - // Test with pattern that spans multiple code points eval("std.findSubstr('🌍🚀', '🌍🚀 and more 🌍🚀')") ==> ujson.Arr(0, 12) - - // Empty pattern should return empty array - eval("std.findSubstr('', 'test')") ==> ujson.Arr() - - // Non-existent pattern should return empty array - eval("std.findSubstr('xyz', 'hello world')") ==> ujson.Arr() - - // Consistency check: positions returned by findSubstr should work with substr - val testString = "'Hello 🌍 World'"; - val searchPattern = "'🌍'"; - // This verifies that the index returned by findSubstr works with substr to extract the same pattern - eval( - s"std.substr($testString, std.findSubstr($searchPattern, $testString)[0], std.length($searchPattern))" - ) ==> ujson.Str("🌍") } } } From 60454bc1aba676db14a958baba0f9318b7d9b5a4 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 8 Sep 2025 17:46:43 -0700 Subject: [PATCH 14/14] Remove codePointOffsetsToStringIndices and inline its logic: Due to checks performed at its callers, one of the branches in codePointOffsetsToStringIndices was unreachable and untested. It's clearer (and likely more performant) to eliminate this method and inline specialized versions of its logic at its former callsites. 
--- sjsonnet/src/sjsonnet/Evaluator.scala | 3 ++- sjsonnet/src/sjsonnet/Std.scala | 4 ++-- sjsonnet/src/sjsonnet/Util.scala | 29 +++++---------------------- 3 files changed, 9 insertions(+), 27 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Evaluator.scala b/sjsonnet/src/sjsonnet/Evaluator.scala index 1e430392..001001e8 100644 --- a/sjsonnet/src/sjsonnet/Evaluator.scala +++ b/sjsonnet/src/sjsonnet/Evaluator.scala @@ -379,7 +379,8 @@ class Evaluator( val unicodeLength = str.codePointCount(0, str.length) if (int >= unicodeLength) Error.fail(s"string bounds error: $int not within [0, $unicodeLength)", pos) - val (startUtf16, endUtf16) = Util.codePointOffsetsToStringIndices(str, int, int + 1) + val startUtf16 = if (int == 0) 0 else str.offsetByCodePoints(0, int) + val endUtf16 = str.offsetByCodePoints(startUtf16, 1) Val.Str(pos, str.substring(startUtf16, endUtf16)) case (v: Val.Obj, i: Val.Str) => v.value(i.value, pos) diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index 060fca14..67c07dd8 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -543,8 +543,8 @@ class Std( if (safeLength <= 0) { Val.Str(pos, "") } else { - val (startUtf16, endUtf16) = - Util.codePointOffsetsToStringIndices(str, safeOffset, safeOffset + safeLength) + val startUtf16 = if (safeOffset == 0) 0 else str.offsetByCodePoints(0, safeOffset) + val endUtf16 = str.offsetByCodePoints(startUtf16, safeLength) Val.Str(pos, str.substring(startUtf16, endUtf16)) } } diff --git a/sjsonnet/src/sjsonnet/Util.scala b/sjsonnet/src/sjsonnet/Util.scala index 1ed51e66..a50813e0 100644 --- a/sjsonnet/src/sjsonnet/Util.scala +++ b/sjsonnet/src/sjsonnet/Util.scala @@ -72,29 +72,6 @@ object Util { res: Val } - /** - * Converts Unicode codepoint positions to Java String indices. For example, the string "๐ŸŒ!" has - * a length of 3 UTF-16 code units, but only 2 Unicode codepoints, so this function would map the - * range (0, 2) to (0, 3). 
- */ def codePointOffsetsToStringIndices( - s: String, - startCodePointOffset: Int, - endCodePointOffset: Int): (Int, Int) = { - val unicodeLength = s.codePointCount(0, s.length) - val safeStart = math.max(0, math.min(startCodePointOffset, unicodeLength)) - val safeEnd = math.max(safeStart, math.min(endCodePointOffset, unicodeLength)) - - if (safeStart == safeEnd) { - val utf16Pos = if (safeStart == 0) 0 else s.offsetByCodePoints(0, safeStart) - (utf16Pos, utf16Pos) - } else { - val startUtf16 = if (safeStart == 0) 0 else s.offsetByCodePoints(0, safeStart) - val endUtf16 = s.offsetByCodePoints(startUtf16, safeEnd - safeStart) - (startUtf16, endUtf16) - } - } - private def sliceStr(s: String, start: Int, end: Int, step: Int): String = { val unicodeLength = s.codePointCount(0, s.length) if (start >= end || start >= unicodeLength) { @@ -102,7 +79,11 @@ object Util { } else { step match { case 1 => - val (startUtf16, endUtf16) = codePointOffsetsToStringIndices(s, start, end) + // Preconditions: start >= 0, start < end, start < unicodeLength + val safeEnd = math.min(end, unicodeLength) + val sliceLength = safeEnd - start + val startUtf16 = if (start == 0) 0 else s.offsetByCodePoints(0, start) + val endUtf16 = s.offsetByCodePoints(startUtf16, sliceLength) s.substring(startUtf16, endUtf16) case _ => val result =