feat: Add a new by_regex option. (#62)

google · Jan 17, 2025 · ba6ba52 · ba6ba52
1 parent 7ae1dfa
commit ba6ba52
Show file tree

Hide file tree

Showing 11 changed files with 504 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -240,7 +240,7 @@ allows for sorting data such as Go structs and JSON objects.
 
 #### Custom grouping
 
-Another way to group lines together is with the `group_prefixes` argument. This
+Another way to group lines together is with the `group_prefixes` option. This
 takes a comma-separated list of prefixes. Any line beginning with one of those
 prefixes will be treated as a continuation line.
 
@@ -323,7 +323,7 @@ username: ch3
 </tr>
 </table>
 
-More prefixes can be made to stick with their successor. The argument
+More prefixes can be made to stick with their successor. The option
 `sticky_prefixes` takes a comma-separated list of prefixes that will all be
 treated as sticky. These prefixes cannot contain space characters.
 
@@ -464,13 +464,91 @@ progress = (
 </tr>
 </table>
 
+#### Regular Expressions
+
+It can be useful to sort an entire group based on a non-prefix substring. The
+option `by_regex=…` takes a comma-separated list of [regular
+expressions] that will be applied to the group, and then sorting
+will take place on just the results of the regular expressions.
+
+> [!TIP]
+> Regular expressions often need special characters. See [Syntax](#syntax) below
+> for how to include special characters in the `by_regex` option.
+
+By default, all characters that the regular expression matches will be
+considered for sorting. If the regular expression contains any capturing groups,
+only the characters matched by the capturing groups will be considered for
+sorting. The result from each regular expression will be concatenated into a
+list of results, and that list of results will be sorted [lexicographically].
+
+Regular expressions are applied **after** pre-sorting options.
+[`group_prefixes`](#custom-grouping) will consider the content of the file
+before any regular expression has been applied to it.
+
+Regular expressions are applied **before** other sorting options.
+[`case`](#case-sensitivity), [`numeric`](#numeric-sorting), and
+[`prefix_order`](#prefix-sorting) will only apply to the characters matched by
+your regular expressions.
+
+> [!TIP]
+> If you want your regular expression itself to be case insensitive, consider
+> setting the case-insensitive flag `(?i)` at the start of your expression.
+
+[regular expressions]: https://pkg.go.dev/regexp/syntax
+[lexicographically]: https://en.wikipedia.org/wiki/Lexicographic_order
+
+<table border="0">
+<tr>
+<td>
+
+```java
+// keep-sorted start
+List<String> foo;
+Object baz;
+String bar;
+// keep-sorted end
+```
+
+```java
+// keep-sorted start
+List<String> foo;
+Object baz;
+String bar;
+// keep-sorted end
+```
+
+</td>
+<td>
+
+```diff
++// keep-sorted start by_regex=\w+;
+ String bar;
+ Object baz;
+ List<String> foo;
+ // keep-sorted end
+```
+
+```diff
++// keep-sorted start by_regex=\w+; prefix_order=foo
+ List<String> foo;
+ String bar;
+ Object baz;
+ // keep-sorted end
+```
+
+</td>
+</tr>
+</table>
+
 #### Prefix sorting
 
 Sometimes, it is useful to specify a custom ordering for some elements. The
-argument `prefix_order=…` takes a comma-separated list of prefixes that is
+option `prefix_order=…` takes a comma-separated list of prefixes that is
 matched against the lines to be sorted: if the line starts with one of the
-specified values, it is put at the corresponding position. If an empty prefix is
-specified, any line not covered by other prefixes is matched.
+specified values, it is put at the corresponding position. Lines that don't
+match any of the prefixes are put after all lines that have a matching prefix.
+You can use an empty prefix to place non-matching lines in between non-empty
+prefixes.
 
 <table border="0">
 <tr>
@@ -532,7 +610,7 @@ droid_components = [
 #### Ignore prefixes
 
 For some use cases, there are prefix strings that would be best ignored when
-trying to keep items in an order. The argument `ignore_prefixes=…` takes a
+trying to keep items in an order. The option `ignore_prefixes=…` takes a
 comma-separated list of prefixes that are ignored for sorting purposes. If the
 line starts with any or no whitespace followed by one of the listed prefixes,
 the prefix is treated as the empty string for sorting purposes.
@@ -656,9 +734,9 @@ Pineapples
 
 ### Syntax
 
-If you find yourself wanting to include special characters in the value (spaces,
-commas, left brackets) of one of the options, you can do so with a YAML [flow
-sequence](https://yaml.org/spec/1.2.2/#flow-sequences).
+If you find yourself wanting to include special characters (spaces, commas, left
+brackets) in a comma-separated list of one of the options, you can do so with a
+YAML [flow sequence](https://yaml.org/spec/1.2.2/#flow-sequences).
 
 ```md
 <!-- keep-sorted start prefix_order=["* ", "* ["] -->
@@ -668,4 +746,4 @@ sequence](https://yaml.org/spec/1.2.2/#flow-sequences).
 <!-- keep-sorted end -->
 ```
 
-This works for any option that accepts more than one value.
+This works for all options that accept multiple values.
diff --git a/goldens/by_regex.err b/goldens/by_regex.err
@@ -0,0 +1,2 @@
+WRN while parsing option "by_regex": error parsing regexp: missing argument to repetition operator: `*` line=85
+WRN by_regex cannot be used with ignore_prefixes (consider adding a non-capturing group to the start of your regex instead of ignore_prefixes: "(?:foo|bar)") line=92
diff --git a/goldens/by_regex.in b/goldens/by_regex.in
@@ -0,0 +1,96 @@
+No capturing group
+  keep-sorted-test start by_regex=int|bool|long
+  int baz
+  bool foo
+  long bar
+  keep-sorted-test end
+
+Capturing group
+  keep-sorted-test start by_regex=['[^ ]+ (.*)']
+  int baz
+  bool foo
+  long bar
+  keep-sorted-test end
+
+Capturing group and non-capturing group
+  keep-sorted-test start by_regex=['(?:int|bool|long) (.*)']
+  int baz
+  bool foo
+  long bar
+  keep-sorted-test end
+
+Numeric
+  keep-sorted-test start by_regex=\d+ numeric=yes
+  bar 40
+  foo 7
+  baz 01
+  keep-sorted-test end
+
+Case insensitive sorting
+  keep-sorted-test start by_regex=\D+ case=no
+  1 FOO
+  2 bar
+  3 bAz
+  keep-sorted-test end
+
+Prefix order
+  // keep-sorted-test start by_regex=\w+_(\w+) prefix_order=INIT,,FINAL
+  FOO_INIT,
+  FOO_FINAL,
+  BAR_INIT,
+  BAR_FINAL,
+  DO_STUFF,
+  DO_MORE_STUFF,
+  ZAP_THINGS
+  // keep-sorted-test end
+
+Multiple regexes
+  keep-sorted-test start by_regex=['(?:int|bool|long) (.*)', 'int|bool|long']
+  int baz
+  long baz
+  bool baz
+  bool foo
+  long foo
+  int foo
+  long bar
+  bool bar
+  int bar
+  keep-sorted-test end
+
+Multiline blocks
+  keep-sorted-test start block=yes newline_separated=yes by_regex=(\w+)\(\)\s+{
+  bool func2() {
+    return true;
+  }
+  int func1() {
+    return 1;
+  }
+  List<SomeReallyLongTypeParameterThatWouldForceTheFunctionNameOntoTheNextLine>
+      func0() {
+    return List.of(whatever);
+  }
+  keep-sorted-test end
+
+Regex doesn't match every line
+  keep-sorted-test start by_regex=\d+
+  3
+  baz
+  2
+  foo
+  1
+  bar
+  keep-sorted-test end
+
+Invalid regex
+  keep-sorted-test start by_regex=*
+  2
+  1
+  3
+  keep-sorted-test end
+
+Cannot combine with ignore_prefixes
+  keep-sorted-test start by_regex=.* ignore_prefixes=foo,bar
+  2
+  1
+  3
+  keep-sorted-test end
diff --git a/goldens/by_regex.out b/goldens/by_regex.out
@@ -0,0 +1,98 @@
+No capturing group
+  keep-sorted-test start by_regex=int|bool|long
+  bool foo
+  int baz
+  long bar
+  keep-sorted-test end
+
+Capturing group
+  keep-sorted-test start by_regex=['[^ ]+ (.*)']
+  long bar
+  int baz
+  bool foo
+  keep-sorted-test end
+
+Capturing group and non-capturing group
+  keep-sorted-test start by_regex=['(?:int|bool|long) (.*)']
+  long bar
+  int baz
+  bool foo
+  keep-sorted-test end
+
+Numeric
+  keep-sorted-test start by_regex=\d+ numeric=yes
+  baz 01
+  foo 7
+  bar 40
+  keep-sorted-test end
+
+Case insensitive sorting
+  keep-sorted-test start by_regex=\D+ case=no
+  2 bar
+  3 bAz
+  1 FOO
+  keep-sorted-test end
+
+Prefix order
+  // keep-sorted-test start by_regex=\w+_(\w+) prefix_order=INIT,,FINAL
+  BAR_INIT,
+  FOO_INIT,
+  DO_MORE_STUFF,
+  DO_STUFF,
+  ZAP_THINGS,
+  BAR_FINAL,
+  FOO_FINAL
+  // keep-sorted-test end
+
+Multiple regexes
+  keep-sorted-test start by_regex=['(?:int|bool|long) (.*)', 'int|bool|long']
+  bool bar
+  int bar
+  long bar
+  bool baz
+  int baz
+  long baz
+  bool foo
+  int foo
+  long foo
+  keep-sorted-test end
+
+Multiline blocks
+  keep-sorted-test start block=yes newline_separated=yes by_regex=(\w+)\(\)\s+{
+  List<SomeReallyLongTypeParameterThatWouldForceTheFunctionNameOntoTheNextLine>
+      func0() {
+    return List.of(whatever);
+  }
+
+  int func1() {
+    return 1;
+  }
+
+  bool func2() {
+    return true;
+  }
+  keep-sorted-test end
+
+Regex doesn't match every line
+  keep-sorted-test start by_regex=\d+
+  1
+  2
+  3
+  bar
+  baz
+  foo
+  keep-sorted-test end
+
+Invalid regex
+  keep-sorted-test start by_regex=*
+  1
+  2
+  3
+  keep-sorted-test end
+
+Cannot combine with ignore_prefixes
+  keep-sorted-test start by_regex=.* ignore_prefixes=foo,bar
+  1
+  2
+  3
+  keep-sorted-test end
diff --git a/goldens/golden_test.go b/goldens/golden_test.go
@@ -75,6 +75,9 @@ func TestGoldens(t *testing.T) {
 
 				wantOut, err := os.ReadFile(filepath.Join(dir, tc+".out"))
 				if err != nil {
+					if errors.Is(err, os.ErrNotExist) {
+						needsRegen <- inFile
+					}
 					t.Fatalf("Could not read .out file: %v", err)
 				}
 				wantErr, err := os.ReadFile(filepath.Join(dir, tc+".err"))

diff --git a/keepsorted/block.go b/keepsorted/block.go
@@ -351,7 +351,6 @@ func handleTrailingComma(lgs []lineGroup) (trimTrailingComma func([]lineGroup))
 				}
 			}
 		}
-
 	}
 
 	return func([]lineGroup) {}
@@ -375,6 +374,10 @@ func (b block) lessFn() cmpFunc[lineGroup] {
 		return 1
 	})
 
+	regexTransform := func(lg lineGroup) []regexToken {
+		return b.metadata.opts.regexTransform(lg.joinedLines())
+	}
+
 	// Assign a weight to each prefix so that they will be sorted into their
 	// predetermined order.
 	// Weights are negative so that entries with matching prefixes are put before
@@ -390,8 +393,11 @@ func (b block) lessFn() cmpFunc[lineGroup] {
 	longestFirst := comparing(func(s string) int { return len(s) }).reversed()
 	prefixes := slices.SortedStableFunc(slices.Values(b.metadata.opts.PrefixOrder), longestFirst)
 
-	prefixOrder := comparing(func(s string) int {
-		p, ok := b.metadata.opts.hasPrefix(s, slices.Values(prefixes))
+	prefixOrder := comparing(func(s []string) int {
+		if len(s) == 0 {
+			return 0
+		}
+		p, ok := b.metadata.opts.hasPrefix(s[0], slices.Values(prefixes))
 		if !ok {
 			return 0
 		}
@@ -425,6 +431,6 @@ func (b block) lessFn() cmpFunc[lineGroup] {
 	}, numericTokens.compare)
 
 	return commentOnlyBlock.
-		andThen(comparingFunc(lineGroup.joinedLines, prefixOrder.andThen(transformOrder))).
+		andThen(comparingFunc(regexTransform, compareRegexTokens(prefixOrder.andThen(lexicographically(transformOrder))))).
 		andThen(lineGroup.less)
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		WRN while parsing option "by_regex": error parsing regexp: missing argument to repetition operator: `*` line=85
		WRN by_regex cannot be used with ignore_prefixes (consider adding a non-capturing group to the start of your regex instead of ignore_prefixes: "(?:foo\|bar)") line=92