Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 123 additions & 1 deletion src/org/rascalmpl/library/String.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ module String

extend Exception;
import List;
import Map;
import ParseTree;
import Set;

@synopsis{All functions in this module that have a charset parameter use this as default.}
private str DEFAULT_CHARSET = "UTF-8";
Expand Down Expand Up @@ -680,4 +682,124 @@ or the indentation.
* This function works fine if `indentation` is not spaces or tabs; but it does not make much sense.
}
@javaClass{org.rascalmpl.library.Prelude}
java str indent(str indentation, str content, bool indentFirstLine=false);
java str indent(str indentation, str content, bool indentFirstLine=false);

// Default set of line separators used by the newline-aware functions in this module
// (it is the default value of their `lineseps` keyword parameter).
// The order of the entries is significant: `mostUsedNewline` orders equally frequent
// candidates with `byIndex(lineseps)` before handing them to its tie breaker.
list[str] newLineCharacters = [
"\u000A", // LF
"\u000B", // VT
"\u000C", // FF
"\u000D", // CR
"\u000D\u000A", // CRLF
"\u0085", // NEL
"\u2028", // LS
"\u2029" // PS
];

@synopsis{Strict "is shorter than" ordering on strings, for use as a sort comparator.}
private bool bySize(str a, str b) {
    return size(a) < size(b);
}

@synopsis{Build a comparator that orders strings by their position in a reference list.}
private bool(str, str) byIndex(list[str] indices)
    = bool(str x, str y) { return indexOf(indices, x) < indexOf(indices, y); };

@synopsis{Determine the most-used newline character in a string.}
str mostUsedNewline(str input, list[str] lineseps = newLineCharacters, str(list[str]) tieBreaker = getFirstFrom) {
linesepCounts = (nl: 0 | nl <- lineseps);
for (nl <- sort(lineseps, bySize)) {
int count = size(findAll(input, nl));
linesepCounts[nl] = count;
// subtract all occurrences of substrings of newline characters that we counted before
for (str snl <- substrings(nl), linesepCounts[snl]?) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this almost looks like pattern matching on strings? (which we only have reasonable support over)

for example:

rascal>visit("abcd") { case str m : println(m); }
abcd
bcd
cd
d

Come to think of it, this whole function smells like a parsing automaton: we build one big state table of all the possible matches, and then iterate through all the chars and count the matches based on their state.

In java this would be 20/30 lines, but in rascal we might be missing some primitives (as we don't have a character loop).

Copy link
Member Author

@toinehartman toinehartman Sep 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we hard-code the set of newline characters (e.g. to all Unicode newline chars), we could write it as a grammar and use the parser generator. Downside (as we discussed) is that all (transitive) imports of this module will trigger generation of a parser. We could also move some of those to a specific Format module.

linesepCounts[snl] = linesepCounts[snl] - count;
}
}

byCount = invert(linesepCounts);
return tieBreaker(sort(byCount[max(domain(byCount))], byIndex(lineseps)));
}

@synopsis{Split a string to an indentation prefix and the remainder of the string.}
tuple[str indentation, str rest] splitIndentation(/^<indentation:\s*><rest:.*>/)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if the string contains multiple lines?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that should be an explicit exception (invalid argument or similar)

= <indentation, rest>;

@synopsis{Create a line function that replaces every `tabSize` consecutive spaces in the leading whitespace of a line by a tab.}
str(str) indentSpacesAsTabs(int tabSize) {
    // one space for every element of the range [0..tabSize]
    str tabWidth = "";
    for (_ <- [0..tabSize]) {
        tabWidth += " ";
    }

    return str(str line) {
        // only the leading whitespace is rewritten; the rest of the line is kept as is
        pieces = splitIndentation(line);
        return replaceAll(pieces.indentation, tabWidth, "\t") + pieces.rest;
    };
}

@synopsis{Create a line function that replaces every tab in the leading whitespace of a line by `tabSize` spaces.}
str(str) indentTabsAsSpaces(int tabSize) {
    // one space for every element of the range [0..tabSize]
    str tabWidth = "";
    for (_ <- [0..tabSize]) {
        tabWidth += " ";
    }

    return str(str line) {
        // only the leading whitespace is rewritten; the rest of the line is kept as is
        pieces = splitIndentation(line);
        return replaceAll(pieces.indentation, "\t", tabWidth) + pieces.rest;
    };
}

@synopsis{Collect every non-empty substring of a string, except the string itself.}
@pitfalls{
* The empty string is never part of the result.
* The input string itself is never part of the result.
* The number of candidate substrings grows quadratically with the size of the input; this is expensive for long strings.
}
set[str] substrings(str input) {
    int n = size(input);
    set[str] found = {};
    // lo is the start offset, hi the (exclusive) end offset of each candidate
    for (int lo <- [0..n], int hi <- [lo + 1..n + 1]) {
        found += input[lo..hi];
    }
    // the full input is a candidate as well (lo == 0, hi == n); remove it
    return found - input;
}

@synopsis{Make sure a string ends with a newline, appending its most-used newline if it does not.}
str insertFinalNewline(str input, list[str] lineseps = newLineCharacters) {
    // already terminated by one of the separators: nothing to do
    if (str nl <- lineseps, endsWith(input, nl)) {
        return input;
    }
    return input + mostUsedNewline(input, lineseps=lineseps);
}

@synopsis{Remove all newlines from the end of a string.}
@description{
Repeatedly removes a separator from `lineseps` that `input` ends with, trying longer separators first,
until `input` no longer ends with any of them.
}
@pitfalls{
* Empty strings in `lineseps` are ignored. Every string ends with the empty string, so matching it could never terminate.
}
str trimFinalNewlines(str input, list[str] lineseps = newLineCharacters) {
    // Drop empty separators up front: `endsWith(input, "")` always holds, and slicing off
    // zero characters with `input[0..-0]` does not shorten towards a fixpoint, so an empty
    // separator would keep the loop below running forever.
    orderedSeps = reverse(sort([nl | str nl <- lineseps, nl != ""], bySize));

    // the condition is re-evaluated from scratch each round; the first matching separator is cut off
    while (nl <- orderedSeps, endsWith(input, nl)) {
        input = input[0..-size(nl)];
    }
    return input;
}

@synopsis{Break a string into one <text, newline> pair per line.}
list[tuple[str, str]] separateLines(str input, bool includeEmptyLastLine = false, list[str] lineseps = newLineCharacters) {
    // longest separators first, so matching below is greedy
    longestFirst = reverse(sort(lineseps, bySize));

    list[tuple[str, str]] result = [];
    int lineStart = 0;
    for (int pos <- [0..size(input)]) {
        // positions inside a separator that was already consumed are skipped
        if (pos < lineStart) {
            continue;
        }
        if (str sep <- longestFirst, input[pos..pos + size(sep)] == sep) {
            result += <input[lineStart..pos], sep>;
            // the next line starts right after this separator
            lineStart = pos + size(sep);
        }
    }

    // whatever follows the last separator is a line without a newline
    if (includeEmptyLastLine || lineStart < size(input)) {
        result += <input[lineStart..], "">;
    }

    return result;
}

@synopsis{Glue a list of <line, newline> pairs back together into one string.}
str mergeLines(list[tuple[str, str]] lines) {
    str merged = "";
    for (<str text, str nl> <- lines) {
        merged += text + nl;
    }
    return merged;
}

@synopsis{Apply a function to the text of every line of a string, keeping each line's original newline.}
str perLine(str input, str(str) lineFunc, bool includeEmptyLastLine = false, list[str] lineseps = newLineCharacters) {
    separated = separateLines(input, includeEmptyLastLine=includeEmptyLastLine, lineseps=lineseps);
    // only the text part goes through lineFunc; the separator is passed along untouched
    return mergeLines([<lineFunc(text), sep> | <text, sep> <- separated]);
}

@synopsis{Trim trailing non-newline whitespace from each line in a multi-line string.}
// The input is processed with `perLine`, so each line's newline is re-attached unchanged;
// `lineseps` is forwarded to `perLine` to determine where lines end.
str trimTrailingWhitespace(str input, list[str] lineseps = newLineCharacters) {
// A line with at least one non-whitespace character: keep everything up to and including
// the last non-whitespace character, drop the whitespace after it.
str trimLineTrailingWs(/^<nonWhiteSpace:.*\S>\s*$/) = nonWhiteSpace;
// A line that is empty or consists of whitespace only becomes the empty string.
// NOTE(review): a line matching neither pattern has no alternative to fall back on;
// presumably that can only happen with a custom `lineseps` that leaves line terminators
// inside a line (where `.` may not match) - TODO confirm.
default str trimLineTrailingWs(/^\s*$/) = "";

// `includeEmptyLastLine` is not passed, so `perLine` uses its default (false)
return perLine(input, trimLineTrailingWs, lineseps=lineseps);
}
36 changes: 36 additions & 0 deletions src/org/rascalmpl/library/lang/rascal/tests/library/String.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -261,3 +261,39 @@ test bool testBase32AllChars1() = testBase32("`1234567890-=~!@#$%^&*");
test bool testBase32AllChars2() = testBase32("()_+qwertyuiop[]\\QWERTYUIOP");
test bool testBase32AllChars3() = testBase32("{}|asdfghjkl;\'ASDFGHJKL:\"");
test bool testBase32AllChars4() = testBase32("zxcvbnm,./ZXCVBNM\<\>? ");

// mostUsedNewline
test bool mostUsedNewlineTestMixed() = mostUsedNewline("\r\n\n\r\n\t\t\t\t") == "\r\n";
// equal counts for "\n" and "\r\n": "\n" is expected
test bool mostUsedNewlineTestTie() = mostUsedNewline("\n\n\r\n\r\n") == "\n";
// no newline in the input at all: "\n" is expected
test bool mostUsedNewlineTestNone() = mostUsedNewline("abcdefg") == "\n";
test bool mostUsedNewlineTestGreedy() = mostUsedNewline("\r\n\r\n\n") == "\r\n";

// insertFinalNewline
test bool insertFinalNewlineTestSimple() = insertFinalNewline("a\nb") == "a\nb\n";
test bool insertFinalNewlineTestNoop() = insertFinalNewline("a\nb\n") == "a\nb\n";
// input already ends with a newline (of a different kind than the first one): unchanged
test bool insertFinalNewlineTestMixed() = insertFinalNewline("a\nb\r\n") == "a\nb\r\n";

// trimFinalNewlines
test bool trimFinalNewlineTestSimple() = trimFinalNewlines("a\n\n\n") == "a";
test bool trimFinalNewlineTestEndOnly() = trimFinalNewlines("a\n\n\nb\n\n") == "a\n\n\nb";
// a trailing space after the newlines means the input does not end with a newline: unchanged
test bool trimFinalNewlineTestWhiteSpace() = trimFinalNewlines("a\n\n\nb\n\n ") == "a\n\n\nb\n\n ";

// trimTrailingWhitespace
test bool trimTrailingWhitespaceTest() = trimTrailingWhitespace("a \nb\t\n c \n") == "a\nb\n c\n";

// perLine
test bool perLineTest() = perLine("a\nb\r\nc\n\r\n", str(str line) { return line + "x"; }) == "ax\nbx\r\ncx\nx\r\n";

// separateLines
test bool separateLinesTestSimple() = separateLines("a\nb\r\nc\n\r\n") == [<"a", "\n">, <"b", "\r\n">, <"c", "\n">, <"", "\r\n">];
test bool separateLinesTestSimpleWithLast() = separateLines("a\nb\r\nc\n\r\n", includeEmptyLastLine=true) == [<"a", "\n">, <"b", "\r\n">, <"c", "\n">, <"", "\r\n">, <"", "">];
test bool separateLinesTestNoFinalNewline() = separateLines("a\nb\r\nc") == [<"a", "\n">, <"b", "\r\n">, <"c", "">];
// the last line is not empty here, so includeEmptyLastLine makes no difference
test bool separateLinesTestNoFinalNewlineNoEmpty() = separateLines("a\nb\r\nc", includeEmptyLastLine=true) == [<"a", "\n">, <"b", "\r\n">, <"c", "">];
test bool separateLinesTestOnlyNewlines() = separateLines("\n\r\n\n\r\n") == [<"", "\n">, <"", "\r\n">, <"", "\n">, <"", "\r\n">];
test bool separateLinesTestNoNewlines() = separateLines("abc") == [<"abc", "">];

// substrings
test bool substringsTestEmpty() = substrings("") == {};
test bool substringsTestSingle() = substrings("a") == {};
test bool substringsTestTwo() = substrings("ab") == {"a", "b"};
test bool substringsTestThree() = substrings("abc") == {"a", "b", "c", "ab", "bc"};