From 1fbeaf95c2a59c95ba833a7755bcb6515519d84e Mon Sep 17 00:00:00 2001 From: Toine Hartman Date: Wed, 27 Aug 2025 17:14:39 +0200 Subject: [PATCH 1/5] Add string format tools to library. --- src/org/rascalmpl/library/String.rsc | 119 +++++++++++++++++- .../lang/rascal/tests/library/String.rsc | 21 ++++ 2 files changed, 139 insertions(+), 1 deletion(-) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index a9fe8ce8ade..3a7b225ccfa 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -20,7 +20,9 @@ module String extend Exception; import List; +import Map; import ParseTree; +import Set; @synopsis{All functions in this module that have a charset parameter use this as default.} private str DEFAULT_CHARSET = "UTF-8"; @@ -680,4 +682,119 @@ or the indentation. * This function works fine if `indentation` is not spaces or tabs; but it does not make much sense. } @javaClass{org.rascalmpl.library.Prelude} -java str indent(str indentation, str content, bool indentFirstLine=false); \ No newline at end of file +java str indent(str indentation, str content, bool indentFirstLine=false); + +list[str] newLineCharacters = [ + "\u000A", // LF + "\u000B", // VT + "\u000C", // FF + "\u000D", // CR + "\u000D\u000A", // CRLF + "\u0085", // NEL + "\u2028", // LS + "\u2029" // PS +]; + +@synopsis{Comparator to sort strings by length (ascending).} +private bool bySize(str a, str b) = size(a) < size(b); + +@synopsis{Comparator to sort strings by relative position in a reference list.} +private bool(str, str) byIndex(list[str] indices) { + return bool(str a, str b) { + return indexOf(indices, a) < indexOf(indices, b); + }; +} + +@synopsis{Determine the most-used newline character in a string.} +str mostUsedNewline(str input, list[str] lineseps = newLineCharacters, str(list[str]) tieBreaker = getFirstFrom) { + linesepCounts = (nl: 0 | nl <- lineseps); + for (nl <- sort(lineseps, bySize)) { + int count = size(findAll(input, nl)); + linesepCounts[nl] = count; + // subtract all occurrences of substrings of newline characters that we counted before + for (str snl <- substrings(nl), linesepCounts[snl]?) { + linesepCounts[snl] = linesepCounts[snl] - count; + } + } + + byCount = invert(linesepCounts); + return tieBreaker(sort(byCount[max(domain(byCount))], byIndex(lineseps))); +} + +@synopsis{Split a string to an indentation prefix and the remainder of the string.} +tuple[str indentation, str rest] splitIndentation(/^/) + = ; + +str(str) indentSpacesAsTabs(int tabSize) { + str spaces = ("" | it + " " | _ <- [0..tabSize]); + return str(str line) { + parts = splitIndentation(line); + return ""; + }; +} + +str(str) indentTabsAsSpaces(int tabSize) { + str spaces = ("" | it + " " | _ <- [0..tabSize]); + return str(str line) { + parts = splitIndentation(line); + return ""; + }; +} + +@synopsis{Compute all possible strict substrings of a string.} +set[str] substrings(str input) + = {input[i..i+l] | int i <- [0..size(input)], int l <- [1..size(input)], i + l <= size(input)}; + +@synopsis{If a string does not end with a newline character, append one. } +str insertFinalNewline(str input, list[str] lineseps = newLineCharacters) + = any(nl <- lineseps, endsWith(input, nl)) + ? input + : input + mostUsedNewline(input, lineseps=lineseps) + ; + +@synopsis{Remove all newlines from the end of a string.} +str trimFinalNewlines(str input, list[str] lineseps = newLineCharacters) { + orderedSeps = reverse(sort(lineseps, bySize)); + while (nl <- orderedSeps, endsWith(input, nl)) { + input = input[0..-size(nl)]; + } + return input; +} + +@synopsis{Split a string in pairs for each line.} +list[tuple[str, str]] separateLines(str input, list[str] lineseps = newLineCharacters) { + orderedSeps = reverse(sort(lineseps, bySize)); + + list[tuple[str, str]] lines = []; + int next = 0; + for (int i <- [0..size(input)]) { + // greedily match line separators (longest first) + if (i >= next, str nl <- orderedSeps, nl == input[i..i+size(nl)]) { + lines += ; + next = i + size(nl); // skip to the start of the next line + } + } + + // last line + if (str nl <- orderedSeps, nl == input[-size(nl)..]) { + lines += ; + } + + return lines; +} + +@synopsis{Concatenate a list of pairs to form a single string.} +str mergeLines(list[tuple[str, str]] lines) + = ("" | it + line + sep | <- lines); + +@synopsis{Process the text of a string per line, maintaining the original newline characters.} +str perLine(str input, str(str) lineFunc, list[str] lineseps = newLineCharacters) + = mergeLines([ | <- separateLines(input, lineseps=lineseps)]); + +@synopsis{Trim trailing non-newline whitespace from each line in a multi-line string.} +str trimTrailingWhitespace(str input) { + str trimLineTrailingWs(/^\s*$/) = nonWhiteSpace; + default str trimLineTrailingWs(/^\s*$/) = ""; + + return perLine(input, trimLineTrailingWs); +} diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc index f806220cb4b..0ee6c951b26 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc @@ -261,3 +261,24 @@ test bool testBase32AllChars1() = testBase32("`1234567890-=~!@#$%^&*"); test bool testBase32AllChars2() = testBase32("()_+qwertyuiop[]\\QWERTYUIOP"); test bool testBase32AllChars3() = testBase32("{}|asdfghjkl;\'ASDFGHJKL:\""); test bool testBase32AllChars4() = testBase32("zxcvbnm,./ZXCVBNM\<\>? "); + +// mostUsedNewLline +test bool mostUsedNewlineTestMixed() = mostUsedNewline("\r\n\n\r\n\t\t\t\t") == "\r\n"; +test bool mostUsedNewlineTestTie() = mostUsedNewline("\n\n\r\n\r\n") == "\n"; +test bool mostUsedNewlineTestGreedy() = mostUsedNewline("\r\n\r\n\n") == "\r\n"; + +// insertFinalNewline +test bool insertFinalNewlineTestSimple() = insertFinalNewline("a\nb") == "a\nb\n"; +test bool insertFinalNewlineTestNoop() = insertFinalNewline("a\nb\n") == "a\nb\n"; +test bool insertFinalNewlineTestMixed() = insertFinalNewline("a\nb\r\n") == "a\nb\r\n"; + +// trimFinalNewlines +test bool trimFinalNewlineTestSimple() = trimFinalNewlines("a\n\n\n") == "a"; +test bool trimFinalNewlineTestEndOnly() = trimFinalNewlines("a\n\n\nb\n\n") == "a\n\n\nb"; +test bool trimFinalNewlineTestWhiteSpace() = trimFinalNewlines("a\n\n\nb\n\n ") == "a\n\n\nb\n\n "; + +// trimTrailingWhitespace +test bool trimTrailingWhitespaceTest() = trimTrailingWhitespace("a \nb\t\n c \n") == "a\nb\n c\n"; + +// perLine +test bool perLineTest() = perLine("a\nb\r\nc\n\r\n", str(str line) { return line + "x"; }) == "ax\nbx\r\ncx\nx\r\nx"; From 9c395848fa21282577acc461cd3aff113f13fd88 Mon Sep 17 00:00:00 2001 From: Toine Hartman Date: Thu, 28 Aug 2025 17:18:51 +0200 Subject: [PATCH 2/5] Document `substrings` pitfalls. --- src/org/rascalmpl/library/String.rsc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index 3a7b225ccfa..5533e8f2d13 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -742,6 +742,11 @@ str(str) indentTabsAsSpaces(int tabSize) { } @synopsis{Compute all possible strict substrings of a string.} +@pitfalls{ +* Does not include the empty string. +* Does not include the input string itself. +* The number of substrings is quadratic in the size of the string; expensive to compute. +} set[str] substrings(str input) = {input[i..i+l] | int i <- [0..size(input)], int l <- [1..size(input)], i + l <= size(input)}; From 9754bd44b6dfbe0b8b82e8f5e30dacebea6b7910 Mon Sep 17 00:00:00 2001 From: Toine Hartman Date: Thu, 28 Aug 2025 17:20:26 +0200 Subject: [PATCH 3/5] Small fixes (h/t @rodinaarssen) --- src/org/rascalmpl/library/String.rsc | 8 ++++---- .../library/lang/rascal/tests/library/String.rsc | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index 5533e8f2d13..5e3176bc2df 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -772,9 +772,9 @@ list[tuple[str, str]] separateLines(str input, list[str] lineseps = newLineChara list[tuple[str, str]] lines = []; int next = 0; - for (int i <- [0..size(input)]) { + for (int i <- [0..size(input)], i >= next) { // greedily match line separators (longest first) - if (i >= next, str nl <- orderedSeps, nl == input[i..i+size(nl)]) { + if (str nl <- orderedSeps, nl == input[i..i+size(nl)]) { lines += ; next = i + size(nl); // skip to the start of the next line } @@ -797,9 +797,9 @@ str perLine(str input, str(str) lineFunc, list[str] lineseps = newLineCharacters = mergeLines([ | <- separateLines(input, lineseps=lineseps)]); @synopsis{Trim trailing non-newline whitespace from each line in a multi-line string.} -str trimTrailingWhitespace(str input) { +str trimTrailingWhitespace(str input, list[str] lineseps = newLineCharacters) { str trimLineTrailingWs(/^\s*$/) = nonWhiteSpace; default str trimLineTrailingWs(/^\s*$/) = ""; - return perLine(input, trimLineTrailingWs); + return perLine(input, trimLineTrailingWs, lineseps=lineseps); } diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc index 0ee6c951b26..ed3ebb9220b 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc @@ -265,6 +265,7 @@ test bool testBase32AllChars4() = testBase32("zxcvbnm,./ZXCVBNM\<\>? "); // mostUsedNewLline test bool mostUsedNewlineTestMixed() = mostUsedNewline("\r\n\n\r\n\t\t\t\t") == "\r\n"; test bool mostUsedNewlineTestTie() = mostUsedNewline("\n\n\r\n\r\n") == "\n"; +test bool mostUsedNewlineTestNone() = mostUsedNewline("abcdefg") == "\n"; test bool mostUsedNewlineTestGreedy() = mostUsedNewline("\r\n\r\n\n") == "\r\n"; // insertFinalNewline From 4be1f35ff80a3b69b4e4dc7cc351a9ee90b8ecff Mon Sep 17 00:00:00 2001 From: Toine Hartman Date: Thu, 28 Aug 2025 17:46:17 +0200 Subject: [PATCH 4/5] Optionally include empty last line when string ends with newline. --- src/org/rascalmpl/library/String.rsc | 10 +++++----- .../library/lang/rascal/tests/library/String.rsc | 16 +++++++++++++++- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index 5e3176bc2df..d1ce23ca556 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -767,7 +767,7 @@ str trimFinalNewlines(str input, list[str] lineseps = newLineCharacters) { } @synopsis{Split a string in pairs for each line.} -list[tuple[str, str]] separateLines(str input, list[str] lineseps = newLineCharacters) { +list[tuple[str, str]] separateLines(str input, bool includeEmptyLastLine = false, list[str] lineseps = newLineCharacters) { orderedSeps = reverse(sort(lineseps, bySize)); list[tuple[str, str]] lines = []; @@ -781,8 +781,8 @@ list[tuple[str, str]] separateLines(str input, list[str] lineseps = newLineChara } // last line - if (str nl <- orderedSeps, nl == input[-size(nl)..]) { - lines += ; + if (next < size(input) || includeEmptyLastLine) { + lines += ; } return lines; @@ -793,8 +793,8 @@ str mergeLines(list[tuple[str, str]] lines) = ("" | it + line + sep | <- lines); @synopsis{Process the text of a string per line, maintaining the original newline characters.} -str perLine(str input, str(str) lineFunc, list[str] lineseps = newLineCharacters) - = mergeLines([ | <- separateLines(input, lineseps=lineseps)]); +str perLine(str input, str(str) lineFunc, bool includeEmptyLastLine = false, list[str] lineseps = newLineCharacters) + = mergeLines([ | <- separateLines(input, includeEmptyLastLine=includeEmptyLastLine, lineseps=lineseps)]); @synopsis{Trim trailing non-newline whitespace from each line in a multi-line string.} str trimTrailingWhitespace(str input, list[str] lineseps = newLineCharacters) { diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc index ed3ebb9220b..ea53c6e655f 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc @@ -282,4 +282,18 @@ test bool trimFinalNewlineTestWhiteSpace() = trimFinalNewlines("a\n\n\nb\n\n ") test bool trimTrailingWhitespaceTest() = trimTrailingWhitespace("a \nb\t\n c \n") == "a\nb\n c\n"; // perLine -test bool perLineTest() = perLine("a\nb\r\nc\n\r\n", str(str line) { return line + "x"; }) == "ax\nbx\r\ncx\nx\r\nx"; +test bool perLineTest() = perLine("a\nb\r\nc\n\r\n", str(str line) { return line + "x"; }) == "ax\nbx\r\ncx\nx\r\n"; + +// separateLines +test bool separateLinesTestSimple() = separateLines("a\nb\r\nc\n\r\n") == [<"a", "\n">, <"b", "\r\n">, <"c", "\n">, <"", "\r\n">]; +test bool separateLinesTestSimpleWithLast() = separateLines("a\nb\r\nc\n\r\n", includeEmptyLastLine=true) == [<"a", "\n">, <"b", "\r\n">, <"c", "\n">, <"", "\r\n">, <"", "">]; +test bool separateLinesTestNoFinalNewline() = separateLines("a\nb\r\nc") == [<"a", "\n">, <"b", "\r\n">, <"c", "">]; +test bool separateLinesTestNoFinalNewlineNoEmpty() = separateLines("a\nb\r\nc", includeEmptyLastLine=true) == [<"a", "\n">, <"b", "\r\n">, <"c", "">]; +test bool separateLinesTestOnlyNewlines() = separateLines("\n\r\n\n\r\n") == [<"", "\n">, <"", "\r\n">, <"", "\n">, <"", "\r\n">]; +test bool separateLinesTestNoNewlines() = separateLines("abc") == [<"abc", "">]; + +// substrings +test bool substringsTestEmpty() = substrings("") == {}; +test bool substringsTestSingle() = substrings("a") == {}; +test bool substringsTestTwo() = substrings("ab") == {"a", "b"}; +test bool substringsTestThree() = substrings("abc") == {"a", "b", "c", "ab", "bc"}; From 8d8de3e5e07037188f88ea0bc2995d5c0b3ffeb2 Mon Sep 17 00:00:00 2001 From: Toine Hartman Date: Fri, 29 Aug 2025 13:15:14 +0200 Subject: [PATCH 5/5] Optimize `substrings` for performance (h/t @rodinaarssen) --- src/org/rascalmpl/library/String.rsc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc index d1ce23ca556..73ddc3264ee 100644 --- a/src/org/rascalmpl/library/String.rsc +++ b/src/org/rascalmpl/library/String.rsc @@ -748,7 +748,7 @@ str(str) indentTabsAsSpaces(int tabSize) { * The number of substrings is quadratic in the size of the string; expensive to compute. } set[str] substrings(str input) - = {input[i..i+l] | int i <- [0..size(input)], int l <- [1..size(input)], i + l <= size(input)}; + = {input[i..i+l] | int i <- [0..size(input)], int l <- [1..size(input)-i+1]} - input; @synopsis{If a string does not end with a newline character, append one. } str insertFinalNewline(str input, list[str] lineseps = newLineCharacters)