-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconvert_execute.go
3348 lines (2933 loc) · 132 KB
/
convert_execute.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package main
import (
"bytes"
"fmt"
"math"
"slices"
"strconv"
"strings"
"github.com/dlclark/regexp2/syntax"
)
// Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated
// code with other costs, like the (small) overhead of slicing to create the temp span to iterate.
const MaxUnrollSize = 16
func (c *converter) emitExecute(rm *regexpData) {
c.writeLineFmt("func (%s_Engine) Execute(r *regexp2.Runner) error {", rm.GeneratedName)
//c.writeLine(`fmt.Println("Execute")`)
defer func() {
c.writeLine("}\n")
}()
rtl := rm.Options&syntax.RightToLeft != 0
root := rm.Tree.Root.Children[0]
if root.T == syntax.NtEmpty {
// Emit a capture for the current position of length 0. This is rare to see with a real-world pattern,
// but it's very common as part of exploring the source generator, because it's what you get when you
// start out with an empty pattern.
c.writeLine("// The pattern matches the empty string")
c.writeLine("var pos = r.Runtextpos")
c.writeLine("r.Capture(0, pos, pos)")
c.writeLine("return nil")
return
}
if root.T == syntax.NtNothing {
// Emit nothing. This is rare in production and not something to we need optimize for, but as with
// empty, it's helpful as a learning exposition tool.
c.writeLine("return nil")
return
}
if root.T == syntax.NtMulti || root.T == syntax.NtOne || root.T == syntax.NtNotone || root.T == syntax.NtSet {
// If the whole expression is just one or more characters, we can rely on the FindOptimizations spitting out
// an IndexOf that will find the exact sequence or not, and we don't need to do additional checking beyond that.
op := "+"
if rtl {
op = "-"
}
jmp := 1
if root.T == syntax.NtMulti {
jmp = len(root.Str)
}
c.writeLine("// the search in findFirstChar did the entire match")
c.writeLine("var start = r.Runtextpos")
c.writeLineFmt("var end = r.Runtextpos %s %v", op, jmp)
c.writeLine("r.Runtextpos = end")
c.writeLine("r.Capture(0, start, end)")
c.writeLine("return nil")
return
}
// In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled
// version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via
// RegexWriter; the interpreter would then sit in a loop processing those opcodes, and the RegexCompiler iterated through the
// opcodes generating code for each equivalent to what the interpreter would do albeit with some decisions made at compile-time
// rather than at run-time. This approach, however, lead to complicated code that wasn't pay-for-play (e.g. a big backtracking
// jump table that all compilations went through even if there was no backtracking), that didn't factor in the shape of the
// tree (e.g. it's difficult to add optimizations based on interactions between nodes in the graph), and that didn't read well
// when decompiled from IL to C# or when directly emitted as C# as part of a source generator.
//
// This implementation is instead based on directly walking the RegexNode tree and outputting code for each node in the graph.
// A dedicated for each kind of RegexNode emits the code necessary to handle that node's processing, including recursively
// calling the relevant function for any of its children nodes. Backtracking is handled not via a giant jump table, but instead
// by emitting direct jumps to each backtracking construct. This is achieved by having all match failures jump to a "done"
// label that can be changed by a previous emitter, e.g. before EmitLoop returns, it ensures that "doneLabel" is set to the
// label that code should jump back to when backtracking. That way, a subsequent EmitXx function doesn't need to know exactly
// where to jump: it simply always jumps to "doneLabel" on match failure, and "doneLabel" is always configured to point to
// the right location. In an expression without backtracking, or before any backtracking constructs have been encountered,
// "doneLabel" is simply the final return location from the TryMatchAtCurrentPosition method that will undo any captures and exit, signaling to
// the calling scan loop that nothing was matched.
regexTree := rm.Tree
// Helper to define names. Names start unadorned, but as soon as there's repetition,
// they begin to have a numbered suffix.
rm.usedNames = make(map[string]int)
// Every RegexTree is rooted in the implicit Capture for the whole expression.
// Skip the Capture node. We handle the implicit root capture specially.
node := regexTree.Root
node = node.Children[0]
// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
// To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
// and then insert them at that position once everything else has been output.
oldOut := c.buf
buf := &bytes.Buffer{}
c.buf = buf
defer func() {
// lets clean this up at the end
c.buf = oldOut
// write additionalDeclarations
for _, l := range rm.additionalDeclarations {
c.writeLine(l)
}
//reset
rm.additionalDeclarations = []string{}
// then write our temp out buffer into our saved buffer
c.buf.Write(buf.Bytes())
}()
// Declare some locals.
rm.sliceSpan = "slice"
c.writeLine(`pos := r.Runtextpos
matchStart := pos
`)
// The implementation tries to use const indexes into the span wherever possible, which we can do
// for all fixed-length constructs. In such cases (e.g. single chars, repeaters, strings, etc.)
// we know at any point in the regex exactly how far into it we are, and we can use that to index
// into the span created at the beginning of the routine to begin at exactly where we're starting
// in the input. When we encounter a variable-length construct, we transfer the static value to
// pos, slicing the inputSpan appropriately, and then zero out the static position.
rm.sliceStaticPos = 0
c.sliceInputSpan(rm, true)
c.writeLine("")
// doneLabel starts out as the top-level label for the whole expression failing to match. However,
// it may be changed by the processing of a node to point to whereever subsequent match failures
// should jump to, in support of backtracking or other constructs. For example, before emitting
// the code for a branch N, an alternation will set the doneLabel to point to the label for
// processing the next branch N+1: that way, any failures in the branch N's processing will
// implicitly end up jumping to the right location without needing to know in what context it's used.
rm.doneLabel = rm.reserveName("NoMatch")
rm.topLevelDoneLabel = rm.doneLabel
// Check whether there are captures anywhere in the expression. If there isn't, we can skip all
// the boilerplate logic around uncapturing, as there won't be anything to uncapture.
rm.expressionHasCaptures = rm.Analysis.MayContainCapture(node)
// Emit the code for all nodes in the tree.
c.emitExecuteNode(rm, node, nil, true)
// If we fall through to this place in the code, we've successfully matched the expression.
c.writeLine("\n// The input matched.")
if rm.sliceStaticPos > 0 {
// TransferSliceStaticPosToPos would also slice, which isn't needed here
c.emitAddStmt("pos", rm.sliceStaticPos)
}
c.writeLine(`r.Runtextpos = pos
r.Capture(0, matchStart, pos)
// just to prevent an unused var error in certain regex's
var _ = slice
return nil`)
// We're done with the match.
}
// Emits the code for the node.
// subsequent = nil, emitLengthChecksIfRequired = True
func (c *converter) emitExecuteNode(rm *regexpData, node *syntax.RegexNode, subsequent *syntax.RegexNode, emitLengthChecksIfRequired bool) {
// Before we handle general-purpose matching logic for nodes, handle any special-casing.
if rm.Tree.FindOptimizations.FindMode == syntax.LiteralAfterLoop_LeftToRight &&
rm.Tree.FindOptimizations.LiteralAfterLoop.LoopNode == node {
// This is the set loop that's part of the literal-after-loop optimization: the end of the loop
// is stored in runtrackpos, so we just need to transfer that to pos. The optimization is only
// selected if the shape of the tree is amenable.
c.writeLine(`// Skip loop already matched in TryFindNextPossibleStartingPosition.
pos = r.Runtrackpos`)
c.sliceInputSpan(rm, false)
return
}
if node.Options&syntax.RightToLeft != 0 {
// RightToLeft doesn't take advantage of static positions. While RightToLeft won't update static
// positions, a previous operation may have left us with a non-zero one. Make sure it's zero'd out
// such that pos and slice are up-to-date. Note that RightToLeft also shouldn't use the slice span,
// as it's not kept up-to-date; any RightToLeft implementation that wants to use it must first update
// it from pos.
c.transferSliceStaticPosToPos(rm, false)
}
//TODO: debug
c.writeLineFmt("// Node: %s", node.Description())
// Separate out several node types that, for conciseness, don't need a header nor scope written into the source.
// Effectively these either evaporate, are completely self-explanatory, or only exist for their children to be rendered.
switch node.T {
// Nothing is written for an empty.
case syntax.NtEmpty:
return
// A single-line goto for a failure doesn't need a scope or comment.
case syntax.NtNothing:
c.emitExecuteGoto(rm, rm.doneLabel)
return
// Skip atomic nodes that wrap non-backtracking children; in such a case there's nothing to be made atomic.
case syntax.NtAtomic:
if !rm.Analysis.MayBacktrack(node.Children[0]) {
c.emitExecuteNode(rm, node.Children[0], nil, true)
return
}
// Concatenate is a simplification in the node tree so that a series of children can be represented as one.
// We don't need its presence visible in the source.
case syntax.NtConcatenate:
c.emitExecuteConcatenation(rm, node, subsequent, emitLengthChecksIfRequired)
return
}
// For everything else, output a comment about what the node is.
c.writeLineFmt("// %s", describeNode(rm, node))
// Separate out several node types that, for conciseness, don't need a scope written into the source as they're
// always a single statement / block.
switch node.T {
case syntax.NtBeginning, syntax.NtStart, syntax.NtBol, syntax.NtEol, syntax.NtEnd, syntax.NtEndZ:
c.emitExecuteAnchors(rm, node)
return
case syntax.NtBoundary, syntax.NtNonboundary, syntax.NtECMABoundary, syntax.NtNonECMABoundary:
c.emitExecuteBoundary(rm, node)
return
case syntax.NtOne, syntax.NtNotone, syntax.NtSet:
c.emitExecuteSingleChar(rm, node, emitLengthChecksIfRequired, nil, false)
return
case syntax.NtMulti:
if node.Options&syntax.RightToLeft == 0 {
c.emitExecuteMultiCharString(rm, node.Str, emitLengthChecksIfRequired, false, node.Options&syntax.RightToLeft != 0)
return
}
case syntax.NtUpdateBumpalong:
c.emitExecuteUpdateBumpalong(rm, node)
return
}
// For everything else, put the node's code into its own scope, purely for readability. If the node contains labels
// that may need to be visible outside of its scope, the scope is still emitted for clarity but is commented out.
//using (EmitBlock(writer, null, faux: rm.Analysis.MayBacktrack(node)))
//{
switch node.T {
case syntax.NtMulti:
c.emitExecuteMultiCharString(rm, node.Str, emitLengthChecksIfRequired, false, node.Options&syntax.RightToLeft != 0)
return
case syntax.NtOneloop, syntax.NtNotoneloop, syntax.NtSetloop:
c.emitExecuteSingleCharLoop(rm, node, subsequent, emitLengthChecksIfRequired)
return
case syntax.NtOnelazy, syntax.NtNotonelazy, syntax.NtSetlazy:
c.emitExecuteSingleCharLazy(rm, node, subsequent, emitLengthChecksIfRequired)
return
case syntax.NtOneloopatomic, syntax.NtNotoneloopatomic, syntax.NtSetloopatomic:
c.emitExecuteSingleCharAtomicLoop(rm, node, emitLengthChecksIfRequired)
return
case syntax.NtLoop:
c.emitExecuteLoop(rm, node)
return
case syntax.NtLazyloop:
c.emitExecuteLazy(rm, node)
return
case syntax.NtAlternate:
c.emitExecuteAlternation(rm, node)
return
case syntax.NtRef:
c.emitExecuteBackreference(rm, node)
return
case syntax.NtBackRefCond:
c.emitExecuteBackreferenceConditional(rm, node)
return
case syntax.NtExprCond:
c.emitExecuteExpressionConditional(rm, node)
return
case syntax.NtAtomic:
c.emitExecuteAtomic(rm, node, subsequent)
return
case syntax.NtCapture:
c.emitExecuteCapture(rm, node, subsequent)
return
case syntax.NtPosLook:
c.emitExecutePositiveLookaroundAssertion(rm, node)
return
case syntax.NtNegLook:
c.emitExecuteNegativeLookaroundAssertion(rm, node)
return
}
//}
panic(fmt.Sprintf("unhandled node: %v", node))
}
// Emits the node for an atomic.
func (c *converter) emitExecuteAtomic(rm *regexpData, node *syntax.RegexNode, subsequent *syntax.RegexNode) {
// Grab the current done label and the current backtracking position. The purpose of the atomic node
// is to ensure that nodes after it that might backtrack skip over the atomic, which means after
// rendering the atomic's child, we need to reset the label so that subsequent backtracking doesn't
// see any label left set by the atomic's child. We also need to reset the backtracking stack position
// so that the state on the stack remains consistent.
originalDoneLabel := rm.doneLabel
startingStackpos := rm.reserveName("atomic_stackpos")
rm.addLocalDec(fmt.Sprint(startingStackpos, " := 0"))
c.writeLineFmt("%s = r.Runstackpos\n", startingStackpos)
// Emit the child.
c.emitExecuteNode(rm, node.Children[0], subsequent, true)
// Reset the stack position and done label.
c.writeLineFmt("\nr.Runstackpos = %s", startingStackpos)
rm.doneLabel = originalDoneLabel
}
// Emits the code to handle updating r.Runtextpos to pos in response to
// an UpdateBumpalong node. This is used when we want to inform the scan loop that
// it should bump from this location rather than from the original location.
func (c *converter) emitExecuteUpdateBumpalong(rm *regexpData, node *syntax.RegexNode) {
c.transferSliceStaticPosToPos(rm, false)
c.writeLine(`if r.Runtextpos < pos {
r.Runtextpos = pos
}`)
}
// Emits code for a concatenation
func (c *converter) emitExecuteConcatenation(rm *regexpData, node *syntax.RegexNode, subsequent *syntax.RegexNode, emitLengthChecksIfRequired bool) {
// Emit the code for each child one after the other.
var prevDescription *string
for i := 0; i < len(node.Children); i++ {
// If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence
// and then skip the individual length checks for each. We can also discover case-insensitive sequences that
// can be checked efficiently with methods like StartsWith. We also want to minimize the repetition of if blocks,
// and so we try to emit a series of clauses all part of the same if block rather than one if block per child.
var requiredLength, exclusiveEnd int
if node.Options&syntax.RightToLeft == 0 &&
emitLengthChecksIfRequired &&
node.TryGetJoinableLengthCheckChildRange(i, &requiredLength, &exclusiveEnd) {
wroteClauses := true
writePrefix := func() {
if wroteClauses {
if prevDescription != nil {
c.writeLineFmt(" || /* %s */ ", *prevDescription)
} else {
c.writeLine(" || ")
}
} else {
c.writeLine("if ")
}
}
c.write(fmt.Sprintf("if %s", spanLengthCheck(rm, requiredLength, nil)))
for i < exclusiveEnd {
for ; i < exclusiveEnd; i++ {
child := node.Children[i]
if ok, nodesConsumed, caseInsensitiveString := node.TryGetOrdinalCaseInsensitiveString(i, exclusiveEnd, false); ok {
writePrefix()
sourceSpan := rm.sliceSpan
if rm.sliceStaticPos > 0 {
sourceSpan = fmt.Sprintf("%s[%v:]", rm.sliceSpan, rm.sliceStaticPos)
}
c.write(fmt.Sprintf("!helpers.StartsWithIgnoreCase(%s, %s)", sourceSpan, getRuneSliceLiteral(strings.ToLower(caseInsensitiveString))))
desc := fmt.Sprintf("Match the string %#v (case-insensitive)", caseInsensitiveString)
prevDescription = &desc
wroteClauses = true
rm.sliceStaticPos += len(caseInsensitiveString)
i += nodesConsumed - 1
} else if child.T == syntax.NtMulti {
writePrefix()
c.emitExecuteMultiCharString(rm, child.Str, false, true, false)
desc := describeNode(rm, child)
prevDescription = &desc
wroteClauses = true
} else if (child.IsOneFamily() || child.IsNotoneFamily() || child.IsSetFamily()) &&
child.M == child.N &&
child.M <= MaxUnrollSize {
repeatCount := child.M
if child.T == syntax.NtOne || child.T == syntax.NtNotone || child.T == syntax.NtSet {
repeatCount = 1
}
for x := 0; x < repeatCount; x++ {
writePrefix()
c.emitExecuteSingleChar(rm, child, false, nil, true)
if x == 0 {
desc := describeNode(rm, child)
prevDescription = &desc
} else {
prevDescription = nil
}
wroteClauses = true
}
} else {
break
}
}
if wroteClauses {
if prevDescription != nil {
c.writeLineFmt("/* %s */ {", *prevDescription)
} else {
c.writeLine(" {")
}
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLine("}")
if i < len(node.Children) {
c.writeLine("")
}
wroteClauses = false
prevDescription = nil
}
if i < exclusiveEnd {
c.emitExecuteNode(rm, node.Children[i], getSubsequentOrDefault(i, node, subsequent), false)
if i < len(node.Children)-1 {
c.writeLine("")
}
i++
}
}
i--
continue
}
c.emitExecuteNode(rm, node.Children[i], getSubsequentOrDefault(i, node, subsequent), emitLengthChecksIfRequired)
if i < len(node.Children)-1 {
c.writeLine("")
}
}
}
// Gets the node to treat as the subsequent one to node.Child(index)
func getSubsequentOrDefault(index int, node *syntax.RegexNode, defaultNode *syntax.RegexNode) *syntax.RegexNode {
for i := index + 1; i < len(node.Children); i++ {
next := node.Children[i]
// skip node types that don't have a semantic impact
if next.T != syntax.NtUpdateBumpalong {
return next
}
}
return defaultNode
}
func (c *converter) emitExecuteMultiCharString(rm *regexpData, str []rune, emitLengthCheck bool, clauseOnly bool, rightToLeft bool) {
if rightToLeft {
c.writeLineFmt("if lastIdx := pos - %v; lastIdx < 0 || lastIdx >= len(r.Runtext) {", len(str))
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLine("}\n")
c.writeLineFmt(`for i:=0; i < %v; i++ {
pos--
if r.Runtext[pos] != %s[%v - i] {`, len(str), getRuneSliceLiteral(str), len(str)-1)
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLine("}\n}")
return
}
sourceSpan := rm.sliceSpan
if rm.sliceStaticPos > 0 {
sourceSpan = fmt.Sprintf("%s[%v:]", rm.sliceSpan, rm.sliceStaticPos)
}
clause := fmt.Sprintf("!helpers.StartsWith(%s, %s)", sourceSpan, getRuneSliceLiteral(str))
if clauseOnly {
c.write(clause)
} else {
c.writeLineFmt("if %s {", clause)
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLine("}")
}
rm.sliceStaticPos += len(str)
}
// Emits the code to handle a single-character match.
// emitLengthCheck = true, offset = nil, clauseOnly = false
func (c *converter) emitExecuteSingleChar(rm *regexpData, node *syntax.RegexNode, emitLengthCheck bool, offset *string, clauseOnly bool) {
rtl := node.Options&syntax.RightToLeft != 0
expr := "r.Runtext[pos-1]"
if !rtl {
expr = fmt.Sprintf("%s[%s]", rm.sliceSpan, sum(rm.sliceStaticPos, offset))
}
if node.IsSetFamily() {
expr = c.emitMatchCharacterClass(rm, node.Set, true, expr)
} else if node.IsOneFamily() {
expr = fmt.Sprintf("%s != %q", expr, node.Ch)
} else {
expr = fmt.Sprintf("%s == %q", expr, node.Ch)
}
if clauseOnly {
c.write(expr)
} else {
var clause string
if !emitLengthCheck {
clause = fmt.Sprintf("if %s {", expr)
} else if !rtl {
clause = fmt.Sprintf("if %s || %s {", spanLengthCheck(rm, 1, offset), expr)
} else {
clause = fmt.Sprintf("if newIdx := pos - 1; newIdx < 0 || newIdx >= len(r.Runtext) || %s {", expr)
}
c.writeLine(clause)
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLine("}")
}
if !rtl {
rm.sliceStaticPos++
} else {
c.writeLine("pos--")
}
}
// emitLengthChecksIfRequired=true
func (c *converter) emitExecuteSingleCharLoop(rm *regexpData, node *syntax.RegexNode, subsequent *syntax.RegexNode, emitLengthChecksIfRequired bool) {
// If this is actually atomic based on its parent, emit it as atomic instead; no backtracking necessary.
if rm.Analysis.IsAtomicByAncestor(node) {
c.emitExecuteSingleCharAtomicLoop(rm, node, true)
return
}
// If this is actually a repeater, emit that instead; no backtracking necessary.
if node.M == node.N {
c.emitExecuteSingleCharRepeater(rm, node, emitLengthChecksIfRequired)
return
}
// Emit backtracking around an atomic single char loop. We can then implement the backtracking
// as an afterthought, since we know exactly how many characters are accepted by each iteration
// of the wrapped loop (1) and that there's nothing captured by the loop.
backtrackingLabel := rm.reserveName("CharLoopBacktrack")
endLoop := rm.reserveName("CharLoopEnd")
startingPos := rm.reserveName("charloop_starting_pos")
endingPos := rm.reserveName("charloop_ending_pos")
rm.addLocalDec(fmt.Sprintf("var %s, %s = 0, 0", startingPos, endingPos))
rtl := node.Options&syntax.RightToLeft != 0
isInLoop := rm.Analysis.IsInLoop(node)
// We're about to enter a loop, so ensure our text position is 0.
c.transferSliceStaticPosToPos(rm, false)
// Grab the current position, then emit the loop as atomic, and then
// grab the current position again. Even though we emit the loop without
// knowledge of backtracking, we can layer it on top by just walking back
// through the individual characters (a benefit of the loop matching exactly
// one character per iteration, no possible captures within the loop, etc.)
c.writeLineFmt("%s = pos\n", startingPos)
c.emitExecuteSingleCharAtomicLoop(rm, node, true)
c.writeLine("")
c.transferSliceStaticPosToPos(rm, false)
c.writeLineFmt("%s = pos", endingPos)
if !rtl {
c.emitAddStmt(startingPos, node.M)
} else {
c.emitAddStmt(startingPos, -node.M)
}
c.emitExecuteGoto(rm, endLoop)
c.writeLine("")
// Backtracking section. Subsequent failures will jump to here, at which
// point we decrement the matched count as long as it's above the minimum
// required, and try again by flowing to everything that comes after this.
c.emitMarkLabel(rm, backtrackingLabel, false)
stackCookie := c.createStackCookie()
var capturePos string
if isInLoop {
// This loop is inside of another loop, which means we persist state
// on the backtracking stack rather than relying on locals to always
// hold the right state (if we didn't do that, another iteration of the
// outer loop could have resulted in the locals being overwritten).
// Pop the relevant state from the stack.
if rm.expressionHasCaptures {
c.emitUncaptureUntil("r.StackPop()")
}
c.emitStackPop(stackCookie, endingPos, startingPos)
} else if rm.expressionHasCaptures {
// Since we're not in a loop, we're using a local to track the crawl position.
// Unwind back to the position we were at prior to running the code after this loop.
capturePos = rm.reserveName("charloop_capture_pos")
rm.addLocalDec(fmt.Sprintf("%s := 0", capturePos))
c.emitUncaptureUntil(capturePos)
}
c.writeLine("")
// We're backtracking. Check the timeout.
c.emitTimeoutCheckIfNeeded(rm)
var literalNode *syntax.RegexNode
if subsequent != nil {
literalNode = subsequent.FindStartingLiteralNode(true)
}
var literalLength int
var indexOfExpr string
if !rtl &&
node.N > 1 && // no point in using IndexOf for small loops, in particular optionals
literalNode != nil &&
c.tryEmitExecuteIndexOf(rm, literalNode, fmt.Sprintf("r.Runtext[%s:%%s]", startingPos), true, false, &literalLength, &indexOfExpr) {
//hack -- if the indexOfExpr comes back it'll be a format string (notice the double %)
//for the final index into the slice so we can populate it here
//e.g. r.Runtext[startingPos:%s]
c.writeLineFmt(`if %s >= %s {`, startingPos, endingPos)
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLine("}")
if literalLength > 1 {
indexOfExpr = fmt.Sprintf(indexOfExpr, fmt.Sprintf("helpers.Min(len(r.Runtext), %s+%v)", endingPos, literalLength-1))
} else {
indexOfExpr = fmt.Sprintf(indexOfExpr, endingPos)
}
c.writeLineFmt("%s = %s", endingPos, indexOfExpr)
c.writeLineFmt("if %s < 0 { // miss", endingPos)
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLine("}")
c.writeLineFmt(`%s += %s
pos = %[1]s`, endingPos, startingPos)
} else {
op := ">="
if rtl {
op = "<="
}
c.writeLineFmt("if %s %s %s {", startingPos, op, endingPos)
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLine("}")
if !rtl {
c.writeLineFmt(`%s--
pos = %[1]s`, endingPos)
} else {
c.writeLineFmt(`%s++
pos = %[1]s`, endingPos)
}
}
if !rtl {
c.sliceInputSpan(rm, false)
}
c.writeLine("")
c.emitMarkLabel(rm, endLoop, false)
if isInLoop {
// We're in a loop and thus can't rely on locals correctly holding the state we
// need (the locals could be overwritten by a subsequent iteration). Push the state
// on to the backtracking stack.
if rm.expressionHasCaptures {
c.emitStackPush(stackCookie, startingPos, endingPos, "r.Crawlpos()")
} else {
c.emitStackPush(stackCookie, startingPos, endingPos)
}
} else if len(capturePos) > 0 {
// We're not in a loop and so can trust our locals. Store the current capture position
// into the capture position local; we'll uncapture back to this when backtracking to
// remove any captures from after this loop that we need to throw away.
c.writeLineFmt("%s = r.Crawlpos()", capturePos)
}
rm.doneLabel = backtrackingLabel // leave set to the backtracking label for all subsequent nodes
}
func (c *converter) emitMarkLabel(rm *regexpData, label string, emitSemiColon bool) {
rm.emittedLabels = append(rm.emittedLabels, label)
if emitSemiColon {
c.writeLineFmt("%s: ;", label)
} else {
c.writeLineFmt("%s:", label)
}
}
// emitLengthChecksIfRequired=true
func (c *converter) emitExecuteSingleCharLazy(rm *regexpData, node *syntax.RegexNode, subsequent *syntax.RegexNode, emitLengthChecksIfRequired bool) {
// Emit the min iterations as a repeater. Any failures here don't necessitate backtracking,
// as the lazy itself failed to match, and there's no backtracking possible by the individual
// characters/iterations themselves.
if node.M > 0 {
c.emitExecuteSingleCharRepeater(rm, node, emitLengthChecksIfRequired)
}
// If the whole thing was actually that repeater, we're done. Similarly, if this is actually an atomic
// lazy loop, nothing will ever backtrack into this node, so we never need to iterate more than the minimum.
if node.M == node.N || rm.Analysis.IsAtomicByAncestor(node) {
return
}
if node.M > 0 {
// We emitted a repeater to handle the required iterations; add a newline after it.
c.writeLine("")
}
// We now need to match one character at a time, each time allowing the remainder of the expression
// to try to match, and only matching another character if the subsequent expression fails to match.
// We're about to enter a loop, so ensure our text position is 0.
c.transferSliceStaticPosToPos(rm, false)
// If the loop isn't unbounded, track the number of iterations and the max number to allow.
var iterationCount, maxIterations string
if node.N != math.MaxInt32 {
maxIterations = strconv.Itoa(node.N - node.M)
iterationCount = rm.reserveName("lazyloop_iteration")
rm.addLocalDec(fmt.Sprint(iterationCount, " := 0"))
c.writeLineFmt("%s = 0", iterationCount)
}
// Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point.
var capturePos string
if rm.expressionHasCaptures {
capturePos = rm.reserveName("lazyloop_capturepos")
rm.addLocalDec(fmt.Sprint(capturePos, " := 0"))
}
// Track the current pos. Each time we backtrack, we'll reset to the stored position, which
// is also incremented each time we match another character in the loop.
startingPos := rm.reserveName("lazyloop_pos")
rm.addLocalDec(fmt.Sprint(startingPos, " := 0"))
c.writeLineFmt("%s = pos", startingPos)
// Skip the backtracking section for the initial subsequent matching. We've already matched the
// minimum number of iterations, which means we can successfully match with zero additional iterations.
endLoopLabel := rm.reserveName("LazyLoopEnd")
c.emitExecuteGoto(rm, endLoopLabel)
c.writeLine("")
// Backtracking section. Subsequent failures will jump to here.
backtrackingLabel := rm.reserveName("LazyLoopBacktrack")
c.emitMarkLabel(rm, backtrackingLabel, false)
// Uncapture any captures if the expression has any. It's possible the captures it has
// are before this node, in which case this is wasted effort, but still functionally correct.
if len(capturePos) > 0 {
c.emitUncaptureUntil(capturePos)
}
// If there's a max number of iterations, see if we've exceeded the maximum number of characters
// to match. If we haven't, increment the iteration count.
if len(maxIterations) > 0 {
c.writeLineFmt("if %s >= %s {", iterationCount, maxIterations)
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLineFmt("}\n%s++", iterationCount)
}
// We're backtracking. Check the timeout.
c.emitTimeoutCheckIfNeeded(rm)
// Now match the next item in the lazy loop. We need to reset the pos to the position
// just after the last character in this loop was matched, and we need to store the resulting position
// for the next time we backtrack.
c.writeLineFmt("pos = %s", startingPos)
c.sliceInputSpan(rm, false)
c.emitExecuteSingleChar(rm, node, true, nil, false)
c.transferSliceStaticPosToPos(rm, false)
// Now that we've appropriately advanced by one character and are set for what comes after the loop,
// see if we can skip ahead more iterations by doing a search for a following literal.
if (node.Options & syntax.RightToLeft) == 0 {
var literal *syntax.StartingLiteral
var literalNode *syntax.RegexNode
var indexOfExpr string
if subsequent != nil {
literal = subsequent.FindStartingLiteral()
literalNode = subsequent.FindStartingLiteralNode(true)
}
if len(iterationCount) == 0 && node.T == syntax.NtNotonelazy &&
literal != nil &&
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
(len(literal.String) > 0 ||
len(literal.SetChars) > 0 ||
literal.Range.First == literal.Range.Last ||
(literal.Range.First <= node.Ch && node.Ch <= literal.Range.Last)) {
// for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
// e.g. "<[^>]*?>"
// Whether the not'd character matches the subsequent literal. This impacts whether we need to search
// for both or just the literal, as well as what assumptions we can make once a match is found.
var overlap bool
// This lazy loop will consume all characters other than node.Ch until the subsequent literal.
// We can implement it to search for either that char or the literal, whichever comes first.
if len(literal.String) > 0 {
// string literal
overlap = (literal.String[0] == node.Ch)
if overlap {
c.writeLineFmt("%s = helpers.IndexOfAny1(%s, %q)", startingPos, rm.sliceSpan, node.Ch)
} else {
c.writeLineFmt("%s = helpers.IndexOfAny2(%s, %q, %q)", startingPos, rm.sliceSpan, node.Ch, literal.String[0])
}
} else if len(literal.SetChars) > 0 {
// set literal
overlap = slices.Contains(literal.SetChars, node.Ch)
var chars []rune
if !overlap {
chars = []rune{node.Ch}
}
chars = append(chars, literal.SetChars...)
c.writeLineFmt("%s = %s", startingPos, c.emitIndexOfChars(chars, false, rm.sliceSpan))
} else if literal.Range.First == literal.Range.Last {
// single char from a RegexNode.One
overlap = (literal.Range.First == node.Ch)
if overlap {
c.writeLineFmt("%s = helpers.IndexOfAny1(%s, %q)", startingPos, rm.sliceSpan, node.Ch)
} else {
c.writeLineFmt("%s = helpers.IndexOfAny2(%s, %q, %q)", startingPos, rm.sliceSpan, node.Ch, literal.Range.First)
}
} else {
// char range
overlap = true
c.writeLineFmt("%s = helpers.IndexOfAnyInRange(%s, %q, %q)", startingPos, rm.sliceSpan, literal.Range.First, literal.Range.Last)
}
// If the search didn't find anything, fail the match. If it did find something, then we need to consider whether
// that something is the loop character. If it's not, we've successfully backtracked to the next lazy location
// where we should evaluate the rest of the pattern. If it does match, then we need to consider whether there's
// overlap between the loop character and the literal. If there is overlap, this is also a place to check. But
// if there's not overlap, and if the found character is the loop character, we also want to fail the match here
// and now, as this means the loop ends before it gets to what needs to come after the loop, and thus the pattern
// can't possibly match here.
if overlap {
c.writeLineFmt("if %s < 0 {", startingPos)
} else {
c.writeLineFmt("if %s >= len(%s) || %[2]s[%[1]s] == %[3]q {", startingPos, rm.sliceSpan, node.Ch)
}
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLineFmt(`}
pos += %s`, startingPos)
c.sliceInputSpan(rm, false)
} else if len(iterationCount) == 0 &&
node.T == syntax.NtSetlazy &&
node.Set.IsAnything() &&
literalNode != nil &&
c.tryEmitExecuteIndexOf(rm, literalNode, rm.sliceSpan, false, false, new(int), &indexOfExpr) {
// e.g. ".*?string" with RegexOptions.Singleline
// This lazy loop will consume all characters until the subsequent literal. If the subsequent literal
// isn't found, the loop fails. We can implement it to just search for that literal.
c.writeLineFmt("%s = %s", startingPos, indexOfExpr)
c.writeLineFmt("if %s < 0 {", startingPos)
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLineFmt(`}
pos += %s`, startingPos)
c.sliceInputSpan(rm, false)
}
}
// Store the position we've left off at in case we need to iterate again.
c.writeLineFmt("%s = pos", startingPos)
// Update the done label for everything that comes after this node. This is done after we emit the single char
// matching, as that failing indicates the loop itself has failed to match.
rm.doneLabel = backtrackingLabel // leave set to the backtracking label for all subsequent nodes
c.writeLine("")
isInLoop := rm.Analysis.IsInLoop(node)
c.emitMarkLabel(rm, endLoopLabel, !(len(capturePos) > 0 || isInLoop))
if len(capturePos) != 0 {
c.writeLineFmt("%s = r.Crawlpos()", capturePos)
}
// If this loop is itself not in another loop, nothing more needs to be done:
// upon backtracking, locals being used by this loop will have retained their
// values and be up-to-date. But if this loop is inside another loop, multiple
// iterations of this loop each need their own state, so we need to use the stack
// to hold it, and we need a dedicated backtracking section to handle restoring
// that state before jumping back into the loop itself.
if isInLoop {
c.writeLine("")
stackCookie := c.createStackCookie()
// Store the loop's state.
args := []string{startingPos}
if len(capturePos) > 0 {
args = append(args, capturePos)
}
if len(iterationCount) > 0 {
args = append(args, iterationCount)
}
c.emitStackPush(stackCookie, args...)
// Skip past the backtracking section.
end := rm.reserveName("LazyLoopSkipBacktrack")
c.emitExecuteGoto(rm, end)
c.writeLine("")
// Emit a backtracking section that restores the loop's state and then jumps to the previous done label.
backtrack := rm.reserveName("CharLazyBacktrack")
c.emitMarkLabel(rm, backtrack, false)
// Restore the loop's state.
// pop in reverse order
slices.Reverse(args)
c.emitStackPop(stackCookie, args...)
c.emitExecuteGoto(rm, rm.doneLabel)
c.writeLine("")
rm.doneLabel = backtrack
c.emitMarkLabel(rm, end, false)
}
}
// Emits the code to handle a non-backtracking, variable-length loop around a single character comparison.
// emitLengthChecksIfRequired=true
func (c *converter) emitExecuteSingleCharAtomicLoop(rm *regexpData, node *syntax.RegexNode, emitLengthChecksIfRequired bool) {
// If this is actually a repeater, emit that instead.
if node.M == node.N {
c.emitExecuteSingleCharRepeater(rm, node, emitLengthChecksIfRequired)
return
}
// If this is actually an optional single char, emit that instead.
if node.M == 0 && node.N == 1 {
c.emitExecuteAtomicSingleCharZeroOrOne(rm, node)
return
}
minIterations := node.M
maxIterations := node.N
rtl := (node.Options & syntax.RightToLeft) != 0
iterationLocal := rm.reserveName("iteration")
// needs to be function-wide in Go otherwise our Goto statements will fail because
// Go doesn't support jumping over a variable decl
rm.addLocalDec(fmt.Sprint(iterationLocal, " := 0"))
var indexOfExpr string
if rtl {
c.transferSliceStaticPosToPos(rm, false) // we don't use static position for rtl
if node.IsSetFamily() && maxIterations == math.MaxInt32 && node.Set.IsAnything() {
// If this loop will consume the remainder of the input, just set the iteration variable
// to pos directly rather than looping to get there.
c.writeLineFmt("%s = pos", iterationLocal)
} else {
c.writeLineFmt("%s = 0", iterationLocal)
expr := fmt.Sprintf("r.Runtext[pos - %s - 1]", iterationLocal)
if node.IsSetFamily() {
expr = c.emitMatchCharacterClass(rm, node.Set, false, expr)
} else {
op := "!="
if node.IsOneFamily() {
op = "=="
}
expr = fmt.Sprintf("%s %s %q", expr, op, node.Ch)
}
maxClause := ""
if maxIterations != math.MaxInt32 {
maxClause = fmt.Sprintf("%s && ", countIsLessThan(iterationLocal, maxIterations))
}
c.writeLineFmt(`for %spos > %s && %s {
%[2]s++
}
`, maxClause, iterationLocal, expr)
}
} else if node.IsSetFamily() && maxIterations == math.MaxInt32 && node.Set.IsAnything() {
// .* was used with RegexOptions.Singleline, which means it'll consume everything. Just jump to the end.
// The unbounded constraint is the same as in the Notone case above, done purely for simplicity.
c.transferSliceStaticPosToPos(rm, false)
c.writeLineFmt("%s = len(r.Runtext) - pos", iterationLocal)
} else if c.tryEmitExecuteIndexOf(rm, node, "%s", false, true, new(int), &indexOfExpr) {
// We can use an IndexOf method to perform the search. If the number of iterations is unbounded, we can just search the whole span.
// If, however, it's bounded, we need to slice the span to the min(remainingSpan.Length, maxIterations) so that we don't
// search more than is necessary.
// If maxIterations is 0, the node should have been optimized away. If it's 1 and min is 0, it should
// have been handled as an optional loop above, and if it's 1 and min is 1, it should have been transformed
// into a single char match. So, we should only be here if maxIterations is greater than 1. And that's relevant,