usethesource · toinehartman · Aug 27, 2025 · Aug 28, 2025 · Aug 28, 2025 · Aug 28, 2025
diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc
@@ -20,7 +20,9 @@ module String
 
 extend Exception;
 import List;
+import Map;
 import ParseTree;
+import Set;
 
 @synopsis{All functions in this module that have a charset parameter use this as default.}
 private str DEFAULT_CHARSET = "UTF-8";
@@ -680,4 +682,124 @@ or the indentation.
 * This function works fine if `indentation` is not spaces or tabs; but it does not make much sense. 
 }
 @javaClass{org.rascalmpl.library.Prelude}
-java str indent(str indentation, str content, bool indentFirstLine=false);
+java str indent(str indentation, str content, bool indentFirstLine=false);
+
+// Line separators that the line-oriented functions below use by default (their `lineseps` default).
+// The order of the entries matters: `mostUsedNewline` sorts equally frequent separators by their
+// position in the separator list before handing them to its tie-breaker.
+list[str] newLineCharacters = [
+    "\u000A", // LF
+    "\u000B", // VT
+    "\u000C", // FF
+    "\u000D", // CR
+    "\u000D\u000A", // CRLF
+    "\u0085", // NEL
+    "\u2028", // LS
+    "\u2029" // PS
+];
+
+@synopsis{Strict "is shorter than" ordering on strings; as a `sort` comparator it orders by ascending length.}
+private bool bySize(str x, str y) {
+    return size(x) < size(y);
+}
+
+@synopsis{Build a `sort` comparator that orders strings by their position in a reference list.}
+private bool(str, str) byIndex(list[str] indices)
+    = bool(str x, str y) { return indexOf(indices, x) < indexOf(indices, y); };
+
+@synopsis{Determine the most-used newline character in a string.}
+@description{
+Separators are counted the way `separateLines` splits the input: scanning from left to right and
+matching the longest separator first. An occurrence of a longer separator (e.g. CRLF) therefore
+never also counts towards a shorter separator that it contains (CR, LF).
+
+The separators with the highest count are ordered by their position in `lineseps` and handed to
+`tieBreaker`, which picks the result. Separators that do not occur at all take part with count 0.
+}
+@pitfalls{
+* `lineseps` should not be empty; there is no separator to return in that case.
+}
+str mostUsedNewline(str input, list[str] lineseps = newLineCharacters, str(list[str]) tieBreaker = getFirstFrom) {
+    // every candidate starts at zero, so separators that never occur can still be selected
+    map[str, int] linesepCounts = (nl: 0 | nl <- lineseps);
+
+    // count the separators that actually terminate a line; the final line has "" as its separator
+    for (<_, str nl> <- separateLines(input, lineseps=lineseps), nl != "") {
+        linesepCounts[nl] += 1;
+    }
+
+    // group the separators by count and keep the group with the highest count
+    byCount = invert(linesepCounts);
+    return tieBreaker(sort(byCount[max(domain(byCount))], byIndex(lineseps)));
+}
+
+@synopsis{Split a string to an indentation prefix and the remainder of the string.}
+@description{
+`indentation` is the (possibly empty) run of whitespace at the start of the string; `rest` is
+everything that follows it, so `indentation + rest` is the original string.
+
+The pattern uses the `s` modifier so that `.` also matches line terminators. Without it, `rest`
+would stop at the first line terminator and the text after it would be dropped.
+}
+tuple[str indentation, str rest] splitIndentation(/^<indentation:\s*><rest:.*>/s)
+    = <indentation, rest>;
+
+@synopsis{Create a line function that replaces each run of `tabSize` spaces in the indentation of a line by a tab.}
+@description{
+Only the indentation part reported by `splitIndentation` is rewritten; the `rest` part is appended as-is.
+}
+str(str) indentSpacesAsTabs(int tabSize) {
+    // one space per element of the range [0..tabSize]
+    str tabWidth = ("" | it + " " | _ <- [0..tabSize]);
+    return str(str line) {
+        tuple[str indentation, str rest] pieces = splitIndentation(line);
+        return replaceAll(pieces.indentation, tabWidth, "\t") + pieces.rest;
+    };
+}
+
+@synopsis{Create a line function that replaces each tab in the indentation of a line by `tabSize` spaces.}
+@description{
+Only the indentation part reported by `splitIndentation` is rewritten; the `rest` part is appended as-is.
+}
+str(str) indentTabsAsSpaces(int tabSize) {
+    // one space per element of the range [0..tabSize]
+    str tabWidth = ("" | it + " " | _ <- [0..tabSize]);
+    return str(str line) {
+        tuple[str indentation, str rest] pieces = splitIndentation(line);
+        return replaceAll(pieces.indentation, "\t", tabWidth) + pieces.rest;
+    };
+}
+
+@synopsis{Compute all strict, non-empty substrings of a string.}
+@pitfalls{
+* The empty string is never part of the result.
+* The input string itself is never part of the result.
+* A string has quadratically many substrings in its size, so this is expensive for long inputs.
+}
+set[str] substrings(str input) {
+    int n = size(input);
+    // every slice [from..to] with from < to, minus the one slice that spans the whole input
+    return {input[from..to] | int from <- [0..n], int to <- [from + 1..n + 1]} - {input};
+}
+
+@synopsis{Make sure a string ends with a newline: append one if the string does not already end with any of `lineseps`.}
+@description{
+The appended newline is the one that `mostUsedNewline` selects for the input.
+}
+str insertFinalNewline(str input, list[str] lineseps = newLineCharacters) {
+    // already terminated by one of the separators: nothing to do
+    if (str nl <- lineseps, endsWith(input, nl)) {
+        return input;
+    }
+    return input + mostUsedNewline(input, lineseps=lineseps);
+}
+
+@synopsis{Remove all newlines from the end of a string.}
+@description{
+Repeatedly strips a trailing separator from `input` until it no longer ends with any of `lineseps`.
+Longer separators are tried first, so a trailing CRLF is removed as one unit.
+Empty strings in `lineseps` are ignored.
+}
+str trimFinalNewlines(str input, list[str] lineseps = newLineCharacters) {
+    // An empty separator is a suffix of every string, so the loop below would never terminate on it.
+    orderedSeps = reverse(sort([nl | str nl <- lineseps, nl != ""], bySize));
+    while (nl <- orderedSeps, endsWith(input, nl)) {
+        // drop the matched separator from the end
+        input = input[0..-size(nl)];
+    }
+    return input;
+}
+
+@synopsis{Split a string in <text, newline> pairs for each line.}
+// Each resulting pair holds the text of a line and the separator that terminated it.
+// Text after the last separator is returned with "" as its separator; when there is no such text,
+// an empty <"", ""> pair is only added if `includeEmptyLastLine` is true.
+list[tuple[str, str]] separateLines(str input, bool includeEmptyLastLine = false, list[str] lineseps = newLineCharacters) {
+    // longest separators first, so that e.g. CRLF is preferred over a lone CR
+    orderedSeps = reverse(sort(lineseps, bySize));
+
+    list[tuple[str, str]] lines = [];
+    // index at which the current (not yet emitted) line starts
+    int next = 0;
+    // positions before `next` belong to a separator that was already consumed and are skipped
+    for (int i <- [0..size(input)], i >= next) {
+        // greedily match line separators (longest first)
+        // NOTE(review): near the end of the input the slice can be shorter than `nl`;
+        // presumably the slice is clamped and simply compares unequal - confirm
+        if (str nl <- orderedSeps, nl == input[i..i+size(nl)]) {
+            lines += <input[next..i], nl>;
+            next = i + size(nl); // skip to the start of the next line
+        }
+    }
+
+    // last line
+    // text that is not followed by a separator, or an explicitly requested empty last line
+    if (next < size(input) || includeEmptyLastLine) {
+        lines += <input[next..], "">;
+    }
+
+    return lines;
+}
+
+@synopsis{Concatenate a list of <line, newline> pairs to form a single string.}
+@description{
+The pairs are appended in list order: first the text of a line, then its newline.
+}
+str mergeLines(list[tuple[str, str]] lines) {
+    str merged = "";
+    for (<str text, str newline> <- lines) {
+        merged = merged + text + newline;
+    }
+    return merged;
+}
+
+@synopsis{Apply a function to the text of every line of a string, keeping the original newline characters.}
+@description{
+The input is split by `separateLines`, `lineFunc` is applied to the text of each line, and the
+results are glued back together with `mergeLines`.
+}
+str perLine(str input, str(str) lineFunc, bool includeEmptyLastLine = false, list[str] lineseps = newLineCharacters) {
+    separated = separateLines(input, includeEmptyLastLine=includeEmptyLastLine, lineseps=lineseps);
+    return mergeLines([<lineFunc(text), sep> | <text, sep> <- separated]);
+}
+
+@synopsis{Trim trailing non-newline whitespace from each line in a multi-line string.}
+@description{
+Lines are determined by `lineseps`; the separators themselves are kept as they are.
+A line that consists of whitespace only becomes empty.
+}
+str trimTrailingWhitespace(str input, list[str] lineseps = newLineCharacters) {
+    // Keep everything up to and including the last non-whitespace character.
+    // The `s` modifier lets `.` match line terminators as well: with custom `lineseps` a line can
+    // still contain such characters, and without the modifier neither alternative would match it.
+    str trimLineTrailingWs(/^<nonWhiteSpace:.*\S>\s*$/s) = nonWhiteSpace;
+    // no non-whitespace character at all: nothing remains of the line
+    default str trimLineTrailingWs(/^\s*$/) = "";
+
+    return perLine(input, trimLineTrailingWs, lineseps=lineseps);
+}
diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc
@@ -261,3 +261,39 @@ test bool testBase32AllChars1() = testBase32("`1234567890-=~!@#$%^&*");
 test bool testBase32AllChars2() = testBase32("()_+qwertyuiop[]\\QWERTYUIOP");
 test bool testBase32AllChars3() = testBase32("{}|asdfghjkl;\'ASDFGHJKL:\"");
 test bool testBase32AllChars4() = testBase32("zxcvbnm,./ZXCVBNM\<\>? ");
+
+// mostUsedNewline
+// two CRLF against a single LF
+test bool mostUsedNewlineTestMixed() = mostUsedNewline("\r\n\n\r\n\t\t\t\t") == "\r\n";
+// LF and CRLF both occur twice; LF is expected to win the tie
+test bool mostUsedNewlineTestTie() = mostUsedNewline("\n\n\r\n\r\n") == "\n";
+// no newline in the input at all; LF is the expected fallback
+test bool mostUsedNewlineTestNone() = mostUsedNewline("abcdefg") == "\n";
+// the CR and LF inside a CRLF must not be counted as separate newlines
+test bool mostUsedNewlineTestGreedy() = mostUsedNewline("\r\n\r\n\n") == "\r\n";
+
+// insertFinalNewline
+// no trailing newline: the newline used in the input (LF) is appended
+test bool insertFinalNewlineTestSimple() = insertFinalNewline("a\nb") == "a\nb\n";
+// already ends with a newline: unchanged
+test bool insertFinalNewlineTestNoop() = insertFinalNewline("a\nb\n") == "a\nb\n";
+// ends with CRLF although LF is used elsewhere: still unchanged
+test bool insertFinalNewlineTestMixed() = insertFinalNewline("a\nb\r\n") == "a\nb\r\n";
+
+// trimFinalNewlines
+// all trailing newlines are removed, not just one
+test bool trimFinalNewlineTestSimple() = trimFinalNewlines("a\n\n\n") == "a";
+// newlines in the middle of the string are kept
+test bool trimFinalNewlineTestEndOnly() = trimFinalNewlines("a\n\n\nb\n\n") == "a\n\n\nb";
+// the string ends with a space, not a newline: nothing is trimmed
+test bool trimFinalNewlineTestWhiteSpace() = trimFinalNewlines("a\n\n\nb\n\n ") == "a\n\n\nb\n\n ";
+
+// trimTrailingWhitespace
+// trailing spaces and tabs are removed per line; leading whitespace and the newlines are kept
+test bool trimTrailingWhitespaceTest() = trimTrailingWhitespace("a  \nb\t\n  c  \n") == "a\nb\n  c\n";
+
+// perLine
+// "x" is appended to the text of every line (including the empty line before the final CRLF),
+// while each line keeps its own newline
+test bool perLineTest() = perLine("a\nb\r\nc\n\r\n", str(str line) { return line + "x"; }) == "ax\nbx\r\ncx\nx\r\n";
+
+// separateLines
+// mixed LF/CRLF input that ends with a newline: no entry for the empty text after the last newline
+test bool separateLinesTestSimple() = separateLines("a\nb\r\nc\n\r\n") == [<"a", "\n">, <"b", "\r\n">, <"c", "\n">, <"", "\r\n">];
+// same input, but the empty text after the last newline is requested as <"", "">
+test bool separateLinesTestSimpleWithLast() = separateLines("a\nb\r\nc\n\r\n", includeEmptyLastLine=true) == [<"a", "\n">, <"b", "\r\n">, <"c", "\n">, <"", "\r\n">, <"", "">];
+// the last line has no newline: it is returned with "" as its newline
+test bool separateLinesTestNoFinalNewline() = separateLines("a\nb\r\nc") == [<"a", "\n">, <"b", "\r\n">, <"c", "">];
+// includeEmptyLastLine does not add an extra entry when the last line is not empty
+test bool separateLinesTestNoFinalNewlineNoEmpty() = separateLines("a\nb\r\nc", includeEmptyLastLine=true) == [<"a", "\n">, <"b", "\r\n">, <"c", "">];
+// only newlines: every line has empty text
+test bool separateLinesTestOnlyNewlines() = separateLines("\n\r\n\n\r\n") == [<"", "\n">, <"", "\r\n">, <"", "\n">, <"", "\r\n">];
+// no newline at all: a single line with "" as its newline
+test bool separateLinesTestNoNewlines() = separateLines("abc") == [<"abc", "">];
+
+// substrings
+// neither the empty string nor the input itself is part of the result
+test bool substringsTestEmpty() = substrings("") == {};
+test bool substringsTestSingle() = substrings("a") == {};
+test bool substringsTestTwo() = substrings("ab") == {"a", "b"};
+test bool substringsTestThree() = substrings("abc") == {"a", "b", "c", "ab", "bc"};