Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 17 additions & 9 deletions sjsonnet/src/sjsonnet/Evaluator.scala
Original file line number Diff line number Diff line change
Expand Up @@ -374,10 +374,14 @@ class Evaluator(
v.force(int)
case (v: Val.Str, i: Val.Num) =>
val int = i.asPositiveInt
if (v.value.isEmpty) Error.fail("string bounds error: string is empty", pos)
if (int >= v.value.length)
Error.fail(s"string bounds error: $int not within [0, ${v.value.length})", pos)
Val.Str(pos, new String(Array(v.value(int))))
val str = v.value
if (str.isEmpty) Error.fail("string bounds error: string is empty", pos)
val unicodeLength = str.codePointCount(0, str.length)
if (int >= unicodeLength)
Error.fail(s"string bounds error: $int not within [0, $unicodeLength)", pos)
val startUtf16 = if (int == 0) 0 else str.offsetByCodePoints(0, int)
val endUtf16 = str.offsetByCodePoints(startUtf16, 1)
Val.Str(pos, str.substring(startUtf16, endUtf16))
case (v: Val.Obj, i: Val.Str) =>
v.value(i.value, pos)
case (lhs, rhs) =>
Expand Down Expand Up @@ -522,31 +526,35 @@ class Evaluator(

case Expr.BinaryOp.OP_< =>
(l, r) match {
case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l < r)
case (Val.Str(_, l), Val.Str(_, r)) =>
Val.bool(pos, Util.compareStringsByCodepoint(l, r) < 0)
case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l < r)
case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) < 0)
case _ => fail()
}

case Expr.BinaryOp.OP_> =>
(l, r) match {
case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l > r)
case (Val.Str(_, l), Val.Str(_, r)) =>
Val.bool(pos, Util.compareStringsByCodepoint(l, r) > 0)
case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l > r)
case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) > 0)
case _ => fail()
}

case Expr.BinaryOp.OP_<= =>
(l, r) match {
case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l <= r)
case (Val.Str(_, l), Val.Str(_, r)) =>
Val.bool(pos, Util.compareStringsByCodepoint(l, r) <= 0)
case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l <= r)
case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) <= 0)
case _ => fail()
}

case Expr.BinaryOp.OP_>= =>
(l, r) match {
case (Val.Str(_, l), Val.Str(_, r)) => Val.bool(pos, l >= r)
case (Val.Str(_, l), Val.Str(_, r)) =>
Val.bool(pos, Util.compareStringsByCodepoint(l, r) >= 0)
case (Val.Num(_, l), Val.Num(_, r)) => Val.bool(pos, l >= r)
case (x: Val.Arr, y: Val.Arr) => Val.bool(pos, compare(x, y) >= 0)
case _ => fail()
Expand Down Expand Up @@ -832,7 +840,7 @@ class Evaluator(
def compare(x: Val, y: Val): Int = (x, y) match {
case (_: Val.Null, _: Val.Null) => 0
case (x: Val.Num, y: Val.Num) => x.asDouble.compareTo(y.asDouble)
case (x: Val.Str, y: Val.Str) => x.value.compareTo(y.value)
case (x: Val.Str, y: Val.Str) => Util.compareStringsByCodepoint(x.value, y.value)
case (x: Val.Bool, y: Val.Bool) => x.asBoolean.compareTo(y.asBoolean)
case (x: Val.Arr, y: Val.Arr) =>
val len = math.min(x.length, y.length)
Expand Down
2 changes: 1 addition & 1 deletion sjsonnet/src/sjsonnet/Materializer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ abstract class Materializer {
-1
)
if (sort) {
if (prevKey != null && k.compareTo(prevKey) <= 0)
if (prevKey != null && Util.compareStringsByCodepoint(k, prevKey) <= 0)
Error.fail(
s"""Internal error: Unexpected key "$k" after "$prevKey" in sorted object materialization""",
v.pos
Expand Down
75 changes: 51 additions & 24 deletions sjsonnet/src/sjsonnet/Std.scala
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class Std(
Val.Num(
pos,
x.force match {
case Val.Str(_, s) => s.length
case Val.Str(_, s) => s.codePointCount(0, s.length)
case a: Val.Arr => a.length
case o: Val.Obj => o.visibleKeyNames.length
case o: Val.Func => o.params.names.length
Expand Down Expand Up @@ -97,12 +97,14 @@ class Std(
}

private object Codepoint extends Val.Builtin1("codepoint", "str") {
def evalRhs(str: Lazy, ev: EvalScope, pos: Position): Val = if (
str.force.asString.length != 1 || str.force.asString.codePointCount(0, 1) > 1
) {
Error.fail("expected a single character string, got " + str.force.asString)
} else {
Val.Num(pos, str.force.asString.codePointAt(0).toDouble)
def evalRhs(str: Lazy, ev: EvalScope, pos: Position): Val = {
val s = str.force.asString
val codePointCount = s.codePointCount(0, s.length)
if (codePointCount != 1) {
Error.fail("expected a single character string, got " + s)
} else {
Val.Num(pos, s.codePointAt(0).toDouble)
}
}
}

Expand Down Expand Up @@ -533,9 +535,18 @@ class Std(
case v: Val.Num => v.asPositiveInt
case _ => Error.fail("Expected a number for len in substr, got " + len.force.prettyName)
}
val safeOffset = math.min(offset, str.length)
val safeLength = math.min(length, str.length - safeOffset)
Val.Str(pos, str.substring(safeOffset, safeOffset + safeLength))

val unicodeLength = str.codePointCount(0, str.length)
val safeOffset = math.min(offset, unicodeLength)
val safeLength = math.min(length, unicodeLength - safeOffset)

if (safeLength <= 0) {
Val.Str(pos, "")
} else {
val startUtf16 = if (safeOffset == 0) 0 else str.offsetByCodePoints(0, safeOffset)
val endUtf16 = str.offsetByCodePoints(startUtf16, safeLength)
Val.Str(pos, str.substring(startUtf16, endUtf16))
}
}
}

Expand Down Expand Up @@ -1031,7 +1042,7 @@ class Std(
indexedPath: Seq[String])(implicit ev: EvalScope): StringWriter = {
val (sections, nonSections) =
v.visibleKeyNames.partition(k => isSection(v.value(k, v.pos)(ev)))
for (k <- nonSections.sorted) {
for (k <- nonSections.sorted(Util.CodepointStringOrdering)) {
out.write(cumulatedIndent)
out.write(TomlRenderer.escapeKey(k))
out.write(" = ")
Expand All @@ -1041,7 +1052,7 @@ class Std(
}
out.write('\n')

for (k <- sections.sorted) {
for (k <- sections.sorted(Util.CodepointStringOrdering)) {
val v0 = v.value(k, v.pos, v)(ev)
if (isTableArray(v0)) {
for (i <- 0 until v0.asArr.length) {
Expand Down Expand Up @@ -1405,10 +1416,13 @@ class Std(
Val.Arr(pos, arrResults)

case s: Val.Str =>
val builder = new StringBuilder()
for (c: Char <- s.value) {
val builder = new java.lang.StringBuilder()
var i = 0
while (i < s.value.length) {
val codePoint = s.value.codePointAt(i)
val codepointStr = Character.toString(codePoint)
val fres =
func.apply1(Val.Str(pos, c.toString), pos.noOffset)(ev, TailstrictModeDisabled)
func.apply1(Val.Str(pos, codepointStr), pos.noOffset)(ev, TailstrictModeDisabled)
builder.append(
fres match {
case fstr: Val.Str => fstr.value
Expand All @@ -1422,6 +1436,7 @@ class Std(
)
}
)
i += Character.charCount(codePoint)
}
Val.Str(pos, builder.toString)
case _ => Error.fail("Argument must be either array or string")
Expand Down Expand Up @@ -1451,8 +1466,17 @@ class Std(
if (matchIndex == -1) Val.Arr(pos, emptyLazyArray)
else {
val indices = new mutable.ArrayBuilder.ofRef[Val.Num]

// Compute codepoint indices incrementally, avoiding an O(n) calculation for each match.
var prevCharIndex = 0
var prevCodePointIndex = 0

while (0 <= matchIndex && matchIndex < str.length) {
indices.+=(Val.Num(pos, matchIndex))
val codePointIndex = prevCodePointIndex + str.codePointCount(prevCharIndex, matchIndex)
indices.+=(Val.Num(pos, codePointIndex))

prevCharIndex = matchIndex
prevCodePointIndex = codePointIndex
matchIndex = str.indexOf(pat, matchIndex + 1)
}
Val.Arr(pos, indices.result())
Expand Down Expand Up @@ -2067,7 +2091,7 @@ class Std(
val indices = Array.range(0, vs.length)

val sortedIndices = if (keyType == classOf[Val.Str]) {
indices.sortBy(i => keys(i).cast[Val.Str].asString)
indices.sortBy(i => keys(i).cast[Val.Str].asString)(Util.CodepointStringOrdering)
} else if (keyType == classOf[Val.Num]) {
indices.sortBy(i => keys(i).cast[Val.Num].asDouble)
} else if (keyType == classOf[Val.Arr]) {
Expand All @@ -2086,7 +2110,7 @@ class Std(
Error.fail("Cannot sort with values that are not all the same type")

if (keyType == classOf[Val.Str]) {
vs.map(_.force.cast[Val.Str]).sortBy(_.asString)
vs.map(_.force.cast[Val.Str]).sortBy(_.asString)(Util.CodepointStringOrdering)
} else if (keyType == classOf[Val.Num]) {
vs.map(_.force.cast[Val.Num]).sortBy(_.asDouble)
} else if (keyType == classOf[Val.Arr]) {
Expand All @@ -2102,13 +2126,16 @@ class Std(
}

def stringChars(pos: Position, str: String): Val.Arr = {
val a = new Array[Lazy](str.length)
val chars = new Array[Lazy](str.codePointCount(0, str.length))
var charIndex = 0
var i = 0
while (i < a.length) {
a(i) = Val.Str(pos, String.valueOf(str.charAt(i)))
i += 1
while (i < str.length) {
val codePoint = str.codePointAt(i)
chars(charIndex) = Val.Str(pos, Character.toString(codePoint))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a `Character.toString(int codePoint)` overload in Java 11 :(, but those two are the same anyway.

i += Character.charCount(codePoint)
charIndex += 1
}
Val.Arr(pos, a)
Val.Arr(pos, chars)
}

def getVisibleKeys(ev: EvalScope, v1: Val.Obj): Array[String] =
Expand All @@ -2118,7 +2145,7 @@ class Std(
maybeSortKeys(ev, v1.allKeyNames)

@inline private def maybeSortKeys(ev: EvalScope, keys: Array[String]): Array[String] =
if (ev.settings.preserveOrder) keys else keys.sorted
if (ev.settings.preserveOrder) keys else keys.sorted(Util.CodepointStringOrdering)

def getObjValuesFromKeys(
pos: Position,
Expand Down
108 changes: 91 additions & 17 deletions sjsonnet/src/sjsonnet/Util.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ object Util {
s"${line + 1}:${col + 1}"
}

def sliceArr[T: scala.reflect.ClassTag](
private def sliceArr[T: scala.reflect.ClassTag](
arr: Array[T],
start: Int,
end: Int,
Expand All @@ -41,7 +41,7 @@ object Util {
_end: Option[Int],
_step: Option[Int]): Val = {
def length0(e: Val): Int = e match {
case Val.Str(_, s) => s.length
case Val.Str(_, s) => s.codePointCount(0, s.length)
case a: Val.Arr => a.length
case x => Error.fail("Cannot get length of " + x.prettyName, e.pos)(ev)
}
Expand Down Expand Up @@ -72,26 +72,100 @@ object Util {
res: Val
}

def sliceArr[T: scala.reflect.ClassTag](
arr: Array[T],
start: Option[Int],
end: Option[Int],
step: Option[Int]): Array[T] = {
sliceArr(arr, start.getOrElse(0), end.getOrElse(arr.length), step.getOrElse(1))
}
def sliceStr(s: String, start: Int, end: Int, step: Int): String = {
if (start >= end || start >= s.length) {
private def sliceStr(s: String, start: Int, end: Int, step: Int): String = {
val unicodeLength = s.codePointCount(0, s.length)
if (start >= end || start >= unicodeLength) {
""
} else
} else {
step match {
case 1 => s.slice(start, end)
case 1 =>
// Preconditions: start >= 0, start < end, start < unicodeLength
val safeEnd = math.min(end, unicodeLength)
val sliceLength = safeEnd - start
val startUtf16 = if (start == 0) 0 else s.offsetByCodePoints(0, start)
val endUtf16 = s.offsetByCodePoints(startUtf16, sliceLength)
s.substring(startUtf16, endUtf16)
case _ =>
val range = start until end by step
new String(range.dropWhile(_ < 0).takeWhile(_ < s.length).map(s).toArray)
val result =
new java.lang.StringBuilder(math.min(s.length, ((end - start) + step - 1) / step))
var sIdx = 0
var codepointIndex = 0

// Skip to start codepoint position
while (sIdx < s.length && codepointIndex < start) {
val cp = s.codePointAt(sIdx)
sIdx += Character.charCount(cp)
codepointIndex += 1
}

// Collect every `step`th codepoint until `end`
var rel = 0 // relative index from start
var nextInclude = 0 // next relative index to include
while (sIdx < s.length && codepointIndex < end) {
val c = s.charAt(sIdx)
if (Character.isSurrogate(c)) {
// Handle surrogate pair (or unpaired surrogates)
val cp = s.codePointAt(sIdx)
if (rel == nextInclude) {
result.append(Character.toString(cp))
nextInclude += step
}
sIdx += Character.charCount(cp)
} else {
// Single char, non-surrogate
if (rel == nextInclude) {
result.append(c)
nextInclude += step
}
sIdx += 1
}
codepointIndex += 1
rel += 1
}
result.toString
}
}
}

/**
 * Orders two strings by Unicode code point rather than by raw UTF-16 code unit.
 *
 * A plain `String.compareTo` looks at UTF-16 code units, so a supplementary character (encoded as
 * a surrogate pair starting at 0xD800..0xDBFF) would sort before BMP characters in the range
 * U+E000..U+FFFF even though its code point is larger. This comparison decodes surrogates first so
 * the result follows code point order.
 *
 * @param s1
 *   left-hand string
 * @param s2
 *   right-hand string
 * @return
 *   a negative value if `s1` sorts before `s2`, a positive value if it sorts after, and 0 when
 *   both strings contain the same sequence of code points
 */
def compareStringsByCodepoint(s1: String, s2: String): Int = {
  val len1 = s1.length
  val len2 = s2.length

  // p1 / p2 are UTF-16 indices into s1 / s2; they advance by one or two code units per step.
  @scala.annotation.tailrec
  def loop(p1: Int, p2: Int): Int =
    if (p1 >= len1 || p2 >= len2) {
      // One side is exhausted: the string with code points left over is the greater one.
      if (p1 < len1) 1 else if (p2 < len2) -1 else 0
    } else {
      val unit1 = s1.charAt(p1)
      val unit2 = s2.charAt(p2)
      if (Character.isSurrogate(unit1) || Character.isSurrogate(unit2)) {
        // A surrogate is involved on at least one side: decode full code points before comparing.
        val cp1 = s1.codePointAt(p1)
        val cp2 = s2.codePointAt(p2)
        if (cp1 != cp2) Integer.compare(cp1, cp2)
        else loop(p1 + Character.charCount(cp1), p2 + Character.charCount(cp2))
      } else if (unit1 != unit2) {
        // Fast path: two non-surrogate code units compare the same way as their code points.
        Character.compare(unit1, unit2)
      } else {
        loop(p1 + 1, p2 + 1)
      }
    }

  loop(0, 0)
}
def sliceStr(s: String, start: Option[Int], end: Option[Int], step: Option[Int]): String = {
sliceStr(s, start.getOrElse(0), end.getOrElse(s.length), step.getOrElse(1))

/**
 * Shared `Ordering[String]` whose `compare` delegates to `compareStringsByCodepoint`. Pass it
 * explicitly to `.sorted` / `.sortBy` wherever strings must be ordered by Unicode code point
 * instead of by the default string ordering.
 */
val CodepointStringOrdering: Ordering[String] =
  (left: String, right: String) => compareStringsByCodepoint(left, right)

val isWindows: Boolean = {
Expand Down
2 changes: 1 addition & 1 deletion sjsonnet/src/sjsonnet/Val.scala
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ object Val {

def foreachElement(sort: Boolean, pos: Position)(f: (String, Val) => Unit)(implicit
ev: EvalScope): Unit = {
val keys = if (sort) visibleKeyNames.sorted else visibleKeyNames
val keys = if (sort) visibleKeyNames.sorted(Util.CodepointStringOrdering) else visibleKeyNames
for (k <- keys) {
val v = value(k, pos)
f(k, v)
Expand Down
Loading