databricks · stephenamar-db · Sep 17, 2025 · Sep 1, 2025 · Sep 1, 2025 · Sep 1, 2025
diff --git a/sjsonnet/src/sjsonnet/Evaluator.scala b/sjsonnet/src/sjsonnet/Evaluator.scala
@@ -374,10 +374,14 @@ class Evaluator(
         v.force(int)
       case (v: Val.Str, i: Val.Num) =>
         val int = i.asPositiveInt
-        if (v.value.isEmpty) Error.fail("string bounds error: string is empty", pos)
-        if (int >= v.value.length)
-          Error.fail(s"string bounds error: $int not within [0, ${v.value.length})", pos)
-        Val.Str(pos, new String(Array(v.value(int))))
+        val str = v.value
+        if (str.isEmpty) Error.fail("string bounds error: string is empty", pos)
+        val unicodeLength = str.codePointCount(0, str.length)
+        if (int >= unicodeLength)
+          Error.fail(s"string bounds error: $int not within [0, $unicodeLength)", pos)
+        val startUtf16 = if (int == 0) 0 else str.offsetByCodePoints(0, int)
+        val endUtf16 = str.offsetByCodePoints(startUtf16, 1)
+        Val.Str(pos, str.substring(startUtf16, endUtf16))
       case (v: Val.Obj, i: Val.Str) =>
         v.value(i.value, pos)
       case (lhs, rhs) =>
@@ -522,31 +526,35 @@ class Evaluator(
 
       case Expr.BinaryOp.OP_< =>
         (l, r) match {
-          case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l < r)
+          case (Val.Str(_, l), Val.Str(_, r)) =>
+            Val.bool(pos, Util.compareStringsByCodepoint(l, r) < 0)
           case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l < r)
           case (x: Val.Arr, y: Val.Arr)       => Val.bool(pos, compare(x, y) < 0)
           case _                              => fail()
         }
 
       case Expr.BinaryOp.OP_> =>
         (l, r) match {
-          case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l > r)
+          case (Val.Str(_, l), Val.Str(_, r)) =>
+            Val.bool(pos, Util.compareStringsByCodepoint(l, r) > 0)
           case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l > r)
           case (x: Val.Arr, y: Val.Arr)       => Val.bool(pos, compare(x, y) > 0)
           case _                              => fail()
         }
 
       case Expr.BinaryOp.OP_<= =>
         (l, r) match {
-          case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l <= r)
+          case (Val.Str(_, l), Val.Str(_, r)) =>
+            Val.bool(pos, Util.compareStringsByCodepoint(l, r) <= 0)
           case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l <= r)
           case (x: Val.Arr, y: Val.Arr)       => Val.bool(pos, compare(x, y) <= 0)
           case _                              => fail()
         }
 
       case Expr.BinaryOp.OP_>= =>
         (l, r) match {
-          case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l >= r)
+          case (Val.Str(_, l), Val.Str(_, r)) =>
+            Val.bool(pos, Util.compareStringsByCodepoint(l, r) >= 0)
           case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l >= r)
           case (x: Val.Arr, y: Val.Arr)       => Val.bool(pos, compare(x, y) >= 0)
           case _                              => fail()
@@ -832,7 +840,7 @@ class Evaluator(
   def compare(x: Val, y: Val): Int = (x, y) match {
     case (_: Val.Null, _: Val.Null) => 0
     case (x: Val.Num, y: Val.Num)   => x.asDouble.compareTo(y.asDouble)
-    case (x: Val.Str, y: Val.Str)   => x.value.compareTo(y.value)
+    case (x: Val.Str, y: Val.Str)   => Util.compareStringsByCodepoint(x.value, y.value)
     case (x: Val.Bool, y: Val.Bool) => x.asBoolean.compareTo(y.asBoolean)
     case (x: Val.Arr, y: Val.Arr)   =>
       val len = math.min(x.length, y.length)

diff --git a/sjsonnet/src/sjsonnet/Materializer.scala b/sjsonnet/src/sjsonnet/Materializer.scala
@@ -34,7 +34,7 @@ abstract class Materializer {
             -1
           )
           if (sort) {
-            if (prevKey != null && k.compareTo(prevKey) <= 0)
+            if (prevKey != null && Util.compareStringsByCodepoint(k, prevKey) <= 0)
               Error.fail(
                 s"""Internal error: Unexpected key "$k" after "$prevKey" in sorted object materialization""",
                 v.pos

diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala
@@ -51,7 +51,7 @@ class Std(
       Val.Num(
         pos,
         x.force match {
-          case Val.Str(_, s) => s.length
+          case Val.Str(_, s) => s.codePointCount(0, s.length)
           case a: Val.Arr    => a.length
           case o: Val.Obj    => o.visibleKeyNames.length
           case o: Val.Func   => o.params.names.length
@@ -97,12 +97,14 @@ class Std(
   }
 
   private object Codepoint extends Val.Builtin1("codepoint", "str") {
-    def evalRhs(str: Lazy, ev: EvalScope, pos: Position): Val = if (
-      str.force.asString.length != 1 || str.force.asString.codePointCount(0, 1) > 1
-    ) {
-      Error.fail("expected a single character string, got " + str.force.asString)
-    } else {
-      Val.Num(pos, str.force.asString.codePointAt(0).toDouble)
+    def evalRhs(str: Lazy, ev: EvalScope, pos: Position): Val = {
+      val s = str.force.asString // force the argument once and reuse the result
+      val codePointCount = s.codePointCount(0, s.length) // a surrogate pair counts as one
+      if (codePointCount != 1) { // rejects "" as well as multi-character strings
+        Error.fail("expected a single character string, got " + s)
+      } else {
+        Val.Num(pos, s.codePointAt(0).toDouble) // the code point as a number
+      }
     }
   }
 
@@ -533,9 +535,18 @@ class Std(
         case v: Val.Num => v.asPositiveInt
         case _ => Error.fail("Expected a number for len in substr, got " + len.force.prettyName)
       }
-      val safeOffset = math.min(offset, str.length)
-      val safeLength = math.min(length, str.length - safeOffset)
-      Val.Str(pos, str.substring(safeOffset, safeOffset + safeLength))
+
+      val unicodeLength = str.codePointCount(0, str.length)
+      val safeOffset = math.min(offset, unicodeLength)
+      val safeLength = math.min(length, unicodeLength - safeOffset)
+
+      if (safeLength <= 0) {
+        Val.Str(pos, "")
+      } else {
+        val startUtf16 = if (safeOffset == 0) 0 else str.offsetByCodePoints(0, safeOffset)
+        val endUtf16 = str.offsetByCodePoints(startUtf16, safeLength)
+        Val.Str(pos, str.substring(startUtf16, endUtf16))
+      }
     }
   }
 
@@ -1031,7 +1042,7 @@ class Std(
         indexedPath: Seq[String])(implicit ev: EvalScope): StringWriter = {
       val (sections, nonSections) =
         v.visibleKeyNames.partition(k => isSection(v.value(k, v.pos)(ev)))
-      for (k <- nonSections.sorted) {
+      for (k <- nonSections.sorted(Util.CodepointStringOrdering)) {
         out.write(cumulatedIndent)
         out.write(TomlRenderer.escapeKey(k))
         out.write(" = ")
@@ -1041,7 +1052,7 @@ class Std(
       }
       out.write('\n')
 
-      for (k <- sections.sorted) {
+      for (k <- sections.sorted(Util.CodepointStringOrdering)) {
         val v0 = v.value(k, v.pos, v)(ev)
         if (isTableArray(v0)) {
           for (i <- 0 until v0.asArr.length) {
@@ -1405,10 +1416,13 @@ class Std(
           Val.Arr(pos, arrResults)
 
         case s: Val.Str =>
-          val builder = new StringBuilder()
-          for (c: Char <- s.value) {
+          val builder = new java.lang.StringBuilder()
+          var i = 0
+          while (i < s.value.length) {
+            val codePoint = s.value.codePointAt(i)
+            val codepointStr = Character.toString(codePoint)
             val fres =
-              func.apply1(Val.Str(pos, c.toString), pos.noOffset)(ev, TailstrictModeDisabled)
+              func.apply1(Val.Str(pos, codepointStr), pos.noOffset)(ev, TailstrictModeDisabled)
             builder.append(
               fres match {
                 case fstr: Val.Str => fstr.value
@@ -1422,6 +1436,7 @@ class Std(
                   )
               }
             )
+            i += Character.charCount(codePoint)
           }
           Val.Str(pos, builder.toString)
         case _ => Error.fail("Argument must be either array or string")
@@ -1451,8 +1466,17 @@ class Std(
         if (matchIndex == -1) Val.Arr(pos, emptyLazyArray)
         else {
           val indices = new mutable.ArrayBuilder.ofRef[Val.Num]
+
+          // Compute codepoint indices incrementally, avoiding an O(n) calculation for each match.
+          var prevCharIndex = 0
+          var prevCodePointIndex = 0
+
           while (0 <= matchIndex && matchIndex < str.length) {
-            indices.+=(Val.Num(pos, matchIndex))
+            val codePointIndex = prevCodePointIndex + str.codePointCount(prevCharIndex, matchIndex)
+            indices.+=(Val.Num(pos, codePointIndex))
+
+            prevCharIndex = matchIndex
+            prevCodePointIndex = codePointIndex
             matchIndex = str.indexOf(pat, matchIndex + 1)
           }
           Val.Arr(pos, indices.result())
@@ -2067,7 +2091,7 @@ class Std(
           val indices = Array.range(0, vs.length)
 
           val sortedIndices = if (keyType == classOf[Val.Str]) {
-            indices.sortBy(i => keys(i).cast[Val.Str].asString)
+            indices.sortBy(i => keys(i).cast[Val.Str].asString)(Util.CodepointStringOrdering)
           } else if (keyType == classOf[Val.Num]) {
             indices.sortBy(i => keys(i).cast[Val.Num].asDouble)
           } else if (keyType == classOf[Val.Arr]) {
@@ -2086,7 +2110,7 @@ class Std(
             Error.fail("Cannot sort with values that are not all the same type")
 
           if (keyType == classOf[Val.Str]) {
-            vs.map(_.force.cast[Val.Str]).sortBy(_.asString)
+            vs.map(_.force.cast[Val.Str]).sortBy(_.asString)(Util.CodepointStringOrdering)
           } else if (keyType == classOf[Val.Num]) {
             vs.map(_.force.cast[Val.Num]).sortBy(_.asDouble)
           } else if (keyType == classOf[Val.Arr]) {
@@ -2102,13 +2126,16 @@ class Std(
   }
 
   def stringChars(pos: Position, str: String): Val.Arr = {
-    val a = new Array[Lazy](str.length)
+    val chars = new Array[Lazy](str.codePointCount(0, str.length)) // one slot per code point
+    var charIndex = 0 // next slot of `chars` to fill
     var i = 0
-    while (i < a.length) {
-      a(i) = Val.Str(pos, String.valueOf(str.charAt(i)))
-      i += 1
+    while (i < str.length) { // `i` walks UTF-16 code units
+      val codePoint = str.codePointAt(i)
+      chars(charIndex) = Val.Str(pos, Character.toString(codePoint))
+      i += Character.charCount(codePoint) // step over 1 or 2 chars
+      charIndex += 1
     }
-    Val.Arr(pos, a)
+    Val.Arr(pos, chars) // one single-code-point string per element
   }
 
   def getVisibleKeys(ev: EvalScope, v1: Val.Obj): Array[String] =
@@ -2118,7 +2145,7 @@ class Std(
     maybeSortKeys(ev, v1.allKeyNames)
 
   @inline private def maybeSortKeys(ev: EvalScope, keys: Array[String]): Array[String] =
-    if (ev.settings.preserveOrder) keys else keys.sorted
+    if (ev.settings.preserveOrder) keys else keys.sorted(Util.CodepointStringOrdering)
 
   def getObjValuesFromKeys(
       pos: Position,

diff --git a/sjsonnet/src/sjsonnet/Util.scala b/sjsonnet/src/sjsonnet/Util.scala
@@ -17,7 +17,7 @@ object Util {
     s"${line + 1}:${col + 1}"
   }
 
-  def sliceArr[T: scala.reflect.ClassTag](
+  private def sliceArr[T: scala.reflect.ClassTag](
       arr: Array[T],
       start: Int,
       end: Int,
@@ -41,7 +41,7 @@ object Util {
       _end: Option[Int],
       _step: Option[Int]): Val = {
     def length0(e: Val): Int = e match {
-      case Val.Str(_, s) => s.length
+      case Val.Str(_, s) => s.codePointCount(0, s.length)
       case a: Val.Arr    => a.length
       case x             => Error.fail("Cannot get length of " + x.prettyName, e.pos)(ev)
     }
@@ -72,26 +72,100 @@ object Util {
     res: Val
   }
 
-  def sliceArr[T: scala.reflect.ClassTag](
-      arr: Array[T],
-      start: Option[Int],
-      end: Option[Int],
-      step: Option[Int]): Array[T] = {
-    sliceArr(arr, start.getOrElse(0), end.getOrElse(arr.length), step.getOrElse(1))
-  }
-  def sliceStr(s: String, start: Int, end: Int, step: Int): String = {
-    if (start >= end || start >= s.length) {
+  private def sliceStr(s: String, start: Int, end: Int, step: Int): String = {
+    val unicodeLength = s.codePointCount(0, s.length) // indices below are code points, not chars
+    if (start >= end || start >= unicodeLength) {
       ""
-    } else
+    } else {
       step match {
-        case 1 => s.slice(start, end)
+        case 1 =>
+          // Preconditions: start >= 0, start < end, start < unicodeLength
+          val safeEnd = math.min(end, unicodeLength)
+          val sliceLength = safeEnd - start
+          val startUtf16 = if (start == 0) 0 else s.offsetByCodePoints(0, start)
+          val endUtf16 = s.offsetByCodePoints(startUtf16, sliceLength)
+          s.substring(startUtf16, endUtf16)
         case _ =>
-          val range = start until end by step
-          new String(range.dropWhile(_ < 0).takeWhile(_ < s.length).map(s).toArray)
+          val capacity = math.min(s.length.toLong, (end.toLong - start + step - 1L) / step).toInt
+          val result = new java.lang.StringBuilder(capacity) // Long math avoids Int overflow
+          var sIdx = 0
+          var codepointIndex = 0
+
+          // Advance sIdx (a UTF-16 offset) to the code point at index `start`
+          while (sIdx < s.length && codepointIndex < start) {
+            val cp = s.codePointAt(sIdx)
+            sIdx += Character.charCount(cp)
+            codepointIndex += 1
+          }
+
+          // Collect every `step`th codepoint until `end`
+          var rel = 0 // relative index from start
+          var nextInclude = 0 // next relative index to include
+          while (sIdx < s.length && codepointIndex < end) {
+            val c = s.charAt(sIdx)
+            if (Character.isSurrogate(c)) {
+              // Handle surrogate pair (or unpaired surrogates)
+              val cp = s.codePointAt(sIdx)
+              if (rel == nextInclude) {
+                result.append(Character.toString(cp))
+                nextInclude += step
+              }
+              sIdx += Character.charCount(cp)
+            } else {
+              // Single char, non-surrogate
+              if (rel == nextInclude) {
+                result.append(c)
+                nextInclude += step
+              }
+              sIdx += 1
+            }
+            codepointIndex += 1
+            rel += 1
+          }
+          result.toString
+      }
+    }
+  }
+
+  /**
+   * Compares two strings by Unicode code point rather than by UTF-16 code unit, so any character
+   * above U+FFFF (a surrogate pair) sorts after every BMP character. Returns a negative, zero or
+   * positive Int; if one string is a prefix of the other, the shorter string is the smaller one.
+   */
+  def compareStringsByCodepoint(s1: String, s2: String): Int = {
+    val n1 = s1.length
+    val n2 = s2.length
+    var i1 = 0 // UTF-16 index into s1
+    var i2 = 0 // UTF-16 index into s2
+    while (i1 < n1 && i2 < n2) {
+      val c1 = s1.charAt(i1)
+      val c2 = s2.charAt(i2)
+      val c1Sur = Character.isSurrogate(c1)
+      val c2Sur = Character.isSurrogate(c2)
+
+      if (!c1Sur && !c2Sur) {
+        // Neither char is a surrogate, so each char is a whole code point: compare directly
+        if (c1 != c2) return Character.compare(c1, c2)
+        i1 += 1
+        i2 += 1
+      } else {
+        // At least one is a surrogate: decode full code points on both sides before comparing
+        val cp1 = s1.codePointAt(i1)
+        val cp2 = s2.codePointAt(i2)
+        if (cp1 != cp2) return Integer.compare(cp1, cp2)
+        i1 += Character.charCount(cp1) // 1 or 2 chars
+        i2 += Character.charCount(cp2)
       }
+    }
+    if (i1 < n1) 1 else if (i2 < n2) -1 else 0 // common prefix equal: longer string is greater
   }
-  def sliceStr(s: String, start: Option[Int], end: Option[Int], step: Option[Int]): String = {
-    sliceStr(s, start.getOrElse(0), end.getOrElse(s.length), step.getOrElse(1))
+
+  /**
+   * An Ordering[String] that delegates to `compareStringsByCodepoint`. Pass it explicitly to
+   * `.sorted` / `.sortBy` when strings must be ordered by code point, not by UTF-16 code unit.
+   */
+  val CodepointStringOrdering: Ordering[String] = new Ordering[String] {
+    override def compare(x: String, y: String): Int = compareStringsByCodepoint(x, y)
   }
 
   val isWindows: Boolean = {

diff --git a/sjsonnet/src/sjsonnet/Val.scala b/sjsonnet/src/sjsonnet/Val.scala
@@ -455,7 +455,7 @@ object Val {
 
     def foreachElement(sort: Boolean, pos: Position)(f: (String, Val) => Unit)(implicit
         ev: EvalScope): Unit = {
-      val keys = if (sort) visibleKeyNames.sorted else visibleKeyNames
+      val keys = if (sort) visibleKeyNames.sorted(Util.CodepointStringOrdering) else visibleKeyNames
       for (k <- keys) {
         val v = value(k, pos)
         f(k, v)