-
Notifications
You must be signed in to change notification settings - Fork 81
Add string format tools to library. #2373
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
1fbeaf9
9c39584
9754bd4
4be1f35
8d8de3e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,9 @@ module String | |
|
||
extend Exception; | ||
import List; | ||
import Map; | ||
import ParseTree; | ||
import Set; | ||
|
||
@synopsis{All functions in this module that have a charset parameter use this as default.} | ||
private str DEFAULT_CHARSET = "UTF-8"; | ||
|
@@ -680,4 +682,124 @@ or the indentation. | |
* This function works fine if `indentation` is not spaces or tabs; but it does not make much sense. | ||
} | ||
@javaClass{org.rascalmpl.library.Prelude} | ||
java str indent(str indentation, str content, bool indentFirstLine=false); | ||
java str indent(str indentation, str content, bool indentFirstLine=false); | ||
|
||
// Line separator strings used as the default value of the `lineseps` parameter by the newline-aware functions in this module. | ||
// CRLF is a two-character entry; its two characters are also listed individually (CR and LF). | ||
list[str] newLineCharacters = [ | ||
"\u000A", // LF | ||
"\u000B", // VT | ||
"\u000C", // FF | ||
"\u000D", // CR | ||
"\u000D\u000A", // CRLF | ||
"\u0085", // NEL | ||
"\u2028", // LS | ||
"\u2029" // PS | ||
]; | ||
|
||
@synopsis{Ordering predicate for sorting strings by ascending length: holds when `a` is strictly shorter than `b`.} | ||
private bool bySize(str a, str b) { | ||
    return size(b) > size(a); | ||
} | ||
|
||
@synopsis{Build an ordering predicate that ranks strings by their position in a reference list.} | ||
private bool(str, str) byIndex(list[str] indices) { | ||
    // The returned predicate holds when `lhs` occurs at a smaller index in `indices` than `rhs`. | ||
    bool earlier(str lhs, str rhs) = indexOf(indices, lhs) < indexOf(indices, rhs); | ||
    return earlier; | ||
} | ||
|
||
@synopsis{Determine the most-used newline character in a string.} | ||
// Counts how often each separator in `lineseps` occurs in `input` and returns one with the highest count. | ||
// Separators that share the highest count are ordered by their position in `lineseps` and handed to `tieBreaker`. | ||
// NOTE(review): with an empty `lineseps`, `domain(byCount)` is empty; what `max` does then is not visible here - confirm. | ||
str mostUsedNewline(str input, list[str] lineseps = newLineCharacters, str(list[str]) tieBreaker = getFirstFrom) { | ||
toinehartman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Start every candidate separator at zero occurrences. | ||
linesepCounts = (nl: 0 | nl <- lineseps); | ||
// Visit separators from shortest to longest, so the counts of a separator's substrings already exist when it is processed. | ||
for (nl <- sort(lineseps, bySize)) { | ||
int count = size(findAll(input, nl)); | ||
linesepCounts[nl] = count; | ||
// subtract all occurrences of substrings of newline characters that we counted before | ||
// (only substrings that are themselves keys of linesepCounts are adjusted; `count` is subtracted once per such substring) | ||
for (str snl <- substrings(nl), linesepCounts[snl]?) { | ||
rodinaarssen marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this almost looks like pattern matching on strings? (which we only have reasonable support over) for example:
come to think of it, this whole function smells like an parsing automata. Where we build a big state table of all the possible matches and then iterate through all the chars and count the matches based on their state. In java this would be 20/30 lines, but in rascal we might be missing some primitives (as we don't have a character loop). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we hard-code the set of newline characters (e.g. to all Unicode newline chars), we could write it as a grammar and use the parser generator. Downside (as we discussed) is that all (transitive) imports of this module will trigger generation of a parser. We could also move some of those to a specific |
||
linesepCounts[snl] = linesepCounts[snl] - count; | ||
} | ||
} | ||
|
||
// presumably `invert` turns the separator-to-count map into a count-to-separators map; verify against the Map module | ||
byCount = invert(linesepCounts); | ||
// Take the separators with the highest count, order them as in `lineseps`, and let `tieBreaker` pick the result. | ||
return tieBreaker(sort(byCount[max(domain(byCount))], byIndex(lineseps))); | ||
} | ||
|
||
@synopsis{Split a string to an indentation prefix and the remainder of the string.} | ||
@description{ | ||
`indentation` is the longest run of whitespace at the start of the string and `rest` is everything after it, so `indentation + rest` equals the input. | ||
} | ||
@pitfalls{ | ||
* Intended for a single line. For multi-line input, `rest` holds all remaining text including line breaks, and leading line breaks count as indentation because `\s` matches them. | ||
} | ||
// The `s` (dot-all) modifier lets `.` match line terminators; without it `rest` silently stopped at the first line break. | ||
tuple[str indentation, str rest] splitIndentation(/^<indentation:\s*><rest:.*>/s) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what if the string contains multiple lines? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that should be an explicit exception (invalid argument or similar) |
||
= <indentation, rest>; | ||
|
||
// Returns a line transformer that rewrites, inside the leading whitespace of a line only, | ||
// every run of `tabSize` spaces into one tab; the text after the indentation is left as is. | ||
str(str) indentSpacesAsTabs(int tabSize) { | ||
rodinaarssen marked this conversation as resolved.
Show resolved
Hide resolved
|
||
    // Build the run of spaces that stands for one tab stop. | ||
    str tabWidthSpaces = ""; | ||
    for (_ <- [0..tabSize]) { | ||
        tabWidthSpaces = tabWidthSpaces + " "; | ||
    } | ||
    return str(str line) { | ||
        split = splitIndentation(line); | ||
        return replaceAll(split.indentation, tabWidthSpaces, "\t") + split.rest; | ||
    }; | ||
} | ||
|
||
// Returns a line transformer that rewrites, inside the leading whitespace of a line only, | ||
// every tab into `tabSize` spaces; the text after the indentation is left as is. | ||
str(str) indentTabsAsSpaces(int tabSize) { | ||
    // Build the run of spaces that replaces one tab. | ||
    str tabWidthSpaces = ""; | ||
    for (_ <- [0..tabSize]) { | ||
        tabWidthSpaces = tabWidthSpaces + " "; | ||
    } | ||
    return str(str line) { | ||
        split = splitIndentation(line); | ||
        return replaceAll(split.indentation, "\t", tabWidthSpaces) + split.rest; | ||
    }; | ||
} | ||
|
||
@synopsis{Collect every non-empty contiguous slice of a string, except the string itself.} | ||
@pitfalls{ | ||
* The empty string is never part of the result. | ||
* The input string itself is removed from the result. | ||
* The number of slices grows quadratically with the size of the string; expensive to compute. | ||
} | ||
set[str] substrings(str input) | ||
    // `begin` is the first index of a slice, `end` the (exclusive) index just past its last character. | ||
    = {input[begin..end] | int begin <- [0..size(input)], int end <- [begin+1..size(input)+1]} - input; | ||
|
||
@synopsis{Make sure a string ends with a newline character, appending one when it does not.} | ||
str insertFinalNewline(str input, list[str] lineseps = newLineCharacters) { | ||
    // Already terminated by one of the separators: return the input untouched. | ||
    if (str sep <- lineseps, endsWith(input, sep)) { | ||
        return input; | ||
    } | ||
    // Otherwise append the separator that mostUsedNewline selects for this input. | ||
    return input + mostUsedNewline(input, lineseps=lineseps); | ||
} | ||
|
||
@synopsis{Remove all newlines from the end of a string.} | ||
@description{ | ||
Repeatedly strips any separator from `lineseps` found at the end of `input`, trying longer separators first so that a multi-character separator such as CRLF is removed as one unit. | ||
} | ||
@pitfalls{ | ||
* Empty strings in `lineseps` are ignored, since an empty suffix would never stop matching. | ||
} | ||
str trimFinalNewlines(str input, list[str] lineseps = newLineCharacters) { | ||
    // Drop empty separators up front: an empty suffix matches every string and removes nothing, so the loop below would never terminate. | ||
    orderedSeps = reverse(sort([sep | sep <- lineseps, sep != ""], bySize)); | ||
    // Each iteration binds `nl` to the first (longest) separator that `input` currently ends with and cuts it off. | ||
    while (nl <- orderedSeps, endsWith(input, nl)) { | ||
        input = input[0..-size(nl)]; | ||
    } | ||
    return input; | ||
} | ||
|
||
@synopsis{Split a string in <text, newline> pairs for each line.} | ||
// Each result tuple is <text of the line, separator that ended it>; the tuple for trailing text without a separator has "" as its second element. | ||
list[tuple[str, str]] separateLines(str input, bool includeEmptyLastLine = false, list[str] lineseps = newLineCharacters) { | ||
// Longest separators first, so a multi-character separator is tried before the shorter ones. | ||
orderedSeps = reverse(sort(lineseps, bySize)); | ||
|
||
list[tuple[str, str]] lines = []; | ||
rodinaarssen marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Index in `input` where the line that has not been emitted yet starts. | ||
int next = 0; | ||
// Indices below `next` lie inside a separator that was already consumed and are skipped. | ||
for (int i <- [0..size(input)], i >= next) { | ||
// greedily match line separators (longest first) | ||
if (str nl <- orderedSeps, nl == input[i..i+size(nl)]) { | ||
lines += <input[next..i], nl>; | ||
next = i + size(nl); // skip to the start of the next line | ||
} | ||
} | ||
|
||
// last line | ||
// Text after the last separator is always emitted; an empty tail is only emitted when includeEmptyLastLine is set. | ||
if (next < size(input) || includeEmptyLastLine) { | ||
lines += <input[next..], "">; | ||
} | ||
|
||
return lines; | ||
} | ||
|
||
@synopsis{Join a list of <line, newline> pairs back into one string, in list order.} | ||
str mergeLines(list[tuple[str, str]] lines) { | ||
    str merged = ""; | ||
    // Append each line's text followed by its own separator. | ||
    for (<str text, str terminator> <- lines) { | ||
        merged = merged + text + terminator; | ||
    } | ||
    return merged; | ||
} | ||
|
||
@synopsis{Apply a function to the text of every line of a string while keeping each line's original newline characters.} | ||
str perLine(str input, str(str) lineFunc, bool includeEmptyLastLine = false, list[str] lineseps = newLineCharacters) { | ||
    // Split into <text, separator> pairs, transform only the text, then glue everything back together. | ||
    pairs = separateLines(input, includeEmptyLastLine=includeEmptyLastLine, lineseps=lineseps); | ||
    return mergeLines([<lineFunc(text), terminator> | <text, terminator> <- pairs]); | ||
} | ||
|
||
@synopsis{Trim trailing non-newline whitespace from each line in a multi-line string.} | ||
@description{ | ||
Lines are determined by `lineseps`; the separators themselves are kept as they are, only whitespace directly before them (or at the end of the input) is removed. | ||
} | ||
str trimTrailingWhitespace(str input, list[str] lineseps = newLineCharacters) { | ||
    // Keep everything up to and including the last non-whitespace character of the line. | ||
    // The `s` (dot-all) modifier lets `.` match line terminators: with a custom `lineseps`, a line may still | ||
    // contain e.g. a carriage return, and without the modifier neither alternative would match such a line. | ||
    str trimLineTrailingWs(/^<nonWhiteSpace:.*\S>\s*$/s) = nonWhiteSpace; | ||
    // A line that consists of whitespace only (or is empty) becomes empty. | ||
    default str trimLineTrailingWs(/^\s*$/) = ""; | ||
|
||
    return perLine(input, trimLineTrailingWs, lineseps=lineseps); | ||
} |
Uh oh!
There was an error while loading. Please reload this page.