-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathregexp_nfa.c
7650 lines (6990 loc) · 194 KB
/
regexp_nfa.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* vi:set ts=8 sts=4 sw=4 noet:
*
* NFA regular expression implementation.
*
* This file is included in "regexp.c".
*/
/*
* Logging of NFA engine.
*
* The NFA engine can write four log files:
* - Error log: Contains NFA engine's fatal errors.
* - Dump log: Contains compiled NFA state machine's information.
* - Run log: Contains information of matching procedure.
* - Debug log: Contains detailed information of matching procedure. Can be
* disabled by undefining NFA_REGEXP_DEBUG_LOG.
* The first one can also be used without debug mode.
* The last three are enabled when compiled as debug mode and individually
* disabled by commenting them out.
* The log files can get quite big!
* To disable all of this when compiling Vim for debugging, undefine DEBUG in
* regexp.c
*/
#ifdef DEBUG
# define NFA_REGEXP_ERROR_LOG "nfa_regexp_error.log"
# define ENABLE_LOG
# define NFA_REGEXP_DUMP_LOG "nfa_regexp_dump.log"
# define NFA_REGEXP_RUN_LOG "nfa_regexp_run.log"
# define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log"
#endif
// Added to NFA_ANY - NFA_NUPPER_IC to include a NL.
#define NFA_ADD_NL 31
enum
{
NFA_SPLIT = -1024,
NFA_MATCH,
NFA_EMPTY, // matches 0-length
NFA_START_COLL, // [abc] start
NFA_END_COLL, // [abc] end
NFA_START_NEG_COLL, // [^abc] start
NFA_END_NEG_COLL, // [^abc] end (postfix only)
NFA_RANGE, // range of the two previous items
// (postfix only)
NFA_RANGE_MIN, // low end of a range
NFA_RANGE_MAX, // high end of a range
NFA_CONCAT, // concatenate two previous items (postfix
// only)
NFA_OR, // \| (postfix only)
NFA_STAR, // greedy * (postfix only)
NFA_STAR_NONGREEDY, // non-greedy * (postfix only)
NFA_QUEST, // greedy \? (postfix only)
NFA_QUEST_NONGREEDY, // non-greedy \? (postfix only)
NFA_BOL, // ^ Begin line
NFA_EOL, // $ End line
NFA_BOW, // \< Begin word
NFA_EOW, // \> End word
NFA_BOF, // \%^ Begin file
NFA_EOF, // \%$ End file
NFA_NEWL,
NFA_ZSTART, // Used for \zs
NFA_ZEND, // Used for \ze
NFA_NOPEN, // Start of subexpression marked with \%(
NFA_NCLOSE, // End of subexpr. marked with \%( ... \)
NFA_START_INVISIBLE,
NFA_START_INVISIBLE_FIRST,
NFA_START_INVISIBLE_NEG,
NFA_START_INVISIBLE_NEG_FIRST,
NFA_START_INVISIBLE_BEFORE,
NFA_START_INVISIBLE_BEFORE_FIRST,
NFA_START_INVISIBLE_BEFORE_NEG,
NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
NFA_START_PATTERN,
NFA_END_INVISIBLE,
NFA_END_INVISIBLE_NEG,
NFA_END_PATTERN,
NFA_COMPOSING, // Next nodes in NFA are part of the
// composing multibyte char
NFA_END_COMPOSING, // End of a composing char in the NFA
NFA_ANY_COMPOSING, // \%C: Any composing characters.
NFA_OPT_CHARS, // \%[abc]
// The following are used only in the postfix form, not in the NFA
NFA_PREV_ATOM_NO_WIDTH, // Used for \@=
NFA_PREV_ATOM_NO_WIDTH_NEG, // Used for \@!
NFA_PREV_ATOM_JUST_BEFORE, // Used for \@<=
NFA_PREV_ATOM_JUST_BEFORE_NEG, // Used for \@<!
NFA_PREV_ATOM_LIKE_PATTERN, // Used for \@>
NFA_BACKREF1, // \1
NFA_BACKREF2, // \2
NFA_BACKREF3, // \3
NFA_BACKREF4, // \4
NFA_BACKREF5, // \5
NFA_BACKREF6, // \6
NFA_BACKREF7, // \7
NFA_BACKREF8, // \8
NFA_BACKREF9, // \9
#ifdef FEAT_SYN_HL
NFA_ZREF1, // \z1
NFA_ZREF2, // \z2
NFA_ZREF3, // \z3
NFA_ZREF4, // \z4
NFA_ZREF5, // \z5
NFA_ZREF6, // \z6
NFA_ZREF7, // \z7
NFA_ZREF8, // \z8
NFA_ZREF9, // \z9
#endif
NFA_SKIP, // Skip characters
NFA_MOPEN,
NFA_MOPEN1,
NFA_MOPEN2,
NFA_MOPEN3,
NFA_MOPEN4,
NFA_MOPEN5,
NFA_MOPEN6,
NFA_MOPEN7,
NFA_MOPEN8,
NFA_MOPEN9,
NFA_MCLOSE,
NFA_MCLOSE1,
NFA_MCLOSE2,
NFA_MCLOSE3,
NFA_MCLOSE4,
NFA_MCLOSE5,
NFA_MCLOSE6,
NFA_MCLOSE7,
NFA_MCLOSE8,
NFA_MCLOSE9,
#ifdef FEAT_SYN_HL
NFA_ZOPEN,
NFA_ZOPEN1,
NFA_ZOPEN2,
NFA_ZOPEN3,
NFA_ZOPEN4,
NFA_ZOPEN5,
NFA_ZOPEN6,
NFA_ZOPEN7,
NFA_ZOPEN8,
NFA_ZOPEN9,
NFA_ZCLOSE,
NFA_ZCLOSE1,
NFA_ZCLOSE2,
NFA_ZCLOSE3,
NFA_ZCLOSE4,
NFA_ZCLOSE5,
NFA_ZCLOSE6,
NFA_ZCLOSE7,
NFA_ZCLOSE8,
NFA_ZCLOSE9,
#endif
// NFA_FIRST_NL
NFA_ANY, // Match any one character.
NFA_IDENT, // Match identifier char
NFA_SIDENT, // Match identifier char but no digit
NFA_KWORD, // Match keyword char
NFA_SKWORD, // Match word char but no digit
NFA_FNAME, // Match file name char
NFA_SFNAME, // Match file name char but no digit
NFA_PRINT, // Match printable char
NFA_SPRINT, // Match printable char but no digit
NFA_WHITE, // Match whitespace char
NFA_NWHITE, // Match non-whitespace char
NFA_DIGIT, // Match digit char
NFA_NDIGIT, // Match non-digit char
NFA_HEX, // Match hex char
NFA_NHEX, // Match non-hex char
NFA_OCTAL, // Match octal char
NFA_NOCTAL, // Match non-octal char
NFA_WORD, // Match word char
NFA_NWORD, // Match non-word char
NFA_HEAD, // Match head char
NFA_NHEAD, // Match non-head char
NFA_ALPHA, // Match alpha char
NFA_NALPHA, // Match non-alpha char
NFA_LOWER, // Match lowercase char
NFA_NLOWER, // Match non-lowercase char
NFA_UPPER, // Match uppercase char
NFA_NUPPER, // Match non-uppercase char
NFA_LOWER_IC, // Match [a-z]
NFA_NLOWER_IC, // Match [^a-z]
NFA_UPPER_IC, // Match [A-Z]
NFA_NUPPER_IC, // Match [^A-Z]
NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,
NFA_CURSOR, // Match cursor pos
NFA_LNUM, // Match line number
NFA_LNUM_GT, // Match > line number
NFA_LNUM_LT, // Match < line number
NFA_COL, // Match cursor column
NFA_COL_GT, // Match > cursor column
NFA_COL_LT, // Match < cursor column
NFA_VCOL, // Match cursor virtual column
NFA_VCOL_GT, // Match > cursor virtual column
NFA_VCOL_LT, // Match < cursor virtual column
NFA_MARK, // Match mark
NFA_MARK_GT, // Match > mark
NFA_MARK_LT, // Match < mark
NFA_VISUAL, // Match Visual area
// Character classes [:alnum:] etc
NFA_CLASS_ALNUM,
NFA_CLASS_ALPHA,
NFA_CLASS_BLANK,
NFA_CLASS_CNTRL,
NFA_CLASS_DIGIT,
NFA_CLASS_GRAPH,
NFA_CLASS_LOWER,
NFA_CLASS_PRINT,
NFA_CLASS_PUNCT,
NFA_CLASS_SPACE,
NFA_CLASS_UPPER,
NFA_CLASS_XDIGIT,
NFA_CLASS_TAB,
NFA_CLASS_RETURN,
NFA_CLASS_BACKSPACE,
NFA_CLASS_ESCAPE,
NFA_CLASS_IDENT,
NFA_CLASS_KEYWORD,
NFA_CLASS_FNAME
};
// Keep in sync with classchars.
static int nfa_classcodes[] = {
NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
NFA_UPPER, NFA_NUPPER
};
// Variables only used in nfa_regcomp() and descendants.
static int nfa_re_flags; // re_flags passed to nfa_regcomp()
static int *post_start; // holds the postfix form of r.e.
static int *post_end;
static int *post_ptr;
// Set when the pattern should use the NFA engine.
// E.g. [[:upper:]] only allows 8bit characters for BT engine,
// while NFA engine handles multibyte characters correctly.
static int wants_nfa;
static int nstate; // Number of states in the NFA.
static int istate; // Index in the state vector, used in alloc_state()
// If not NULL match must end at this position
static save_se_T *nfa_endp = NULL;
// 0 for first call to nfa_regmatch(), 1 for recursive call.
static int nfa_ll_index = 0;
static int realloc_post_list(void);
static int nfa_reg(int paren);
#ifdef DEBUG
static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent);
#endif
static int match_follows(nfa_state_T *startstate, int depth);
static int failure_chance(nfa_state_T *state, int depth);
// helper functions used when doing re2post() ... regatom() parsing
#define EMIT(c) do { \
if (post_ptr >= post_end && realloc_post_list() == FAIL) \
return FAIL; \
*post_ptr++ = c; \
} while (0)
/*
* Initialize internal variables before NFA compilation.
* Return OK on success, FAIL otherwise.
*/
static int
nfa_regcomp_start(
char_u *expr,
int re_flags) // see vim_regcomp()
{
size_t postfix_size;
int nstate_max;
nstate = 0;
istate = 0;
// A reasonable estimation for maximum size
nstate_max = (int)(STRLEN(expr) + 1) * 25;
// Some items blow up in size, such as [A-z]. Add more space for that.
// When it is still not enough realloc_post_list() will be used.
nstate_max += 1000;
// Size for postfix representation of expr.
postfix_size = sizeof(int) * nstate_max;
post_start = alloc(postfix_size);
if (post_start == NULL)
return FAIL;
post_ptr = post_start;
post_end = post_start + nstate_max;
wants_nfa = FALSE;
rex.nfa_has_zend = FALSE;
rex.nfa_has_backref = FALSE;
// shared with BT engine
regcomp_start(expr, re_flags);
return OK;
}
/*
* Figure out if the NFA state list starts with an anchor, must match at start
* of the line.
*/
static int
nfa_get_reganch(nfa_state_T *start, int depth)
{
nfa_state_T *p = start;
if (depth > 4)
return 0;
while (p != NULL)
{
switch (p->c)
{
case NFA_BOL:
case NFA_BOF:
return 1; // yes!
case NFA_ZSTART:
case NFA_ZEND:
case NFA_CURSOR:
case NFA_VISUAL:
case NFA_MOPEN:
case NFA_MOPEN1:
case NFA_MOPEN2:
case NFA_MOPEN3:
case NFA_MOPEN4:
case NFA_MOPEN5:
case NFA_MOPEN6:
case NFA_MOPEN7:
case NFA_MOPEN8:
case NFA_MOPEN9:
case NFA_NOPEN:
#ifdef FEAT_SYN_HL
case NFA_ZOPEN:
case NFA_ZOPEN1:
case NFA_ZOPEN2:
case NFA_ZOPEN3:
case NFA_ZOPEN4:
case NFA_ZOPEN5:
case NFA_ZOPEN6:
case NFA_ZOPEN7:
case NFA_ZOPEN8:
case NFA_ZOPEN9:
#endif
p = p->out;
break;
case NFA_SPLIT:
return nfa_get_reganch(p->out, depth + 1)
&& nfa_get_reganch(p->out1, depth + 1);
default:
return 0; // noooo
}
}
return 0;
}
/*
* Figure out if the NFA state list starts with a character which must match
* at start of the match.
*/
static int
nfa_get_regstart(nfa_state_T *start, int depth)
{
nfa_state_T *p = start;
if (depth > 4)
return 0;
while (p != NULL)
{
switch (p->c)
{
// all kinds of zero-width matches
case NFA_BOL:
case NFA_BOF:
case NFA_BOW:
case NFA_EOW:
case NFA_ZSTART:
case NFA_ZEND:
case NFA_CURSOR:
case NFA_VISUAL:
case NFA_LNUM:
case NFA_LNUM_GT:
case NFA_LNUM_LT:
case NFA_COL:
case NFA_COL_GT:
case NFA_COL_LT:
case NFA_VCOL:
case NFA_VCOL_GT:
case NFA_VCOL_LT:
case NFA_MARK:
case NFA_MARK_GT:
case NFA_MARK_LT:
case NFA_MOPEN:
case NFA_MOPEN1:
case NFA_MOPEN2:
case NFA_MOPEN3:
case NFA_MOPEN4:
case NFA_MOPEN5:
case NFA_MOPEN6:
case NFA_MOPEN7:
case NFA_MOPEN8:
case NFA_MOPEN9:
case NFA_NOPEN:
#ifdef FEAT_SYN_HL
case NFA_ZOPEN:
case NFA_ZOPEN1:
case NFA_ZOPEN2:
case NFA_ZOPEN3:
case NFA_ZOPEN4:
case NFA_ZOPEN5:
case NFA_ZOPEN6:
case NFA_ZOPEN7:
case NFA_ZOPEN8:
case NFA_ZOPEN9:
#endif
p = p->out;
break;
case NFA_SPLIT:
{
int c1 = nfa_get_regstart(p->out, depth + 1);
int c2 = nfa_get_regstart(p->out1, depth + 1);
if (c1 == c2)
return c1; // yes!
return 0;
}
default:
if (p->c > 0)
return p->c; // yes!
return 0;
}
}
return 0;
}
/*
* Figure out if the NFA state list contains just literal text and nothing
* else. If so return a string in allocated memory with what must match after
* regstart. Otherwise return NULL.
*/
static char_u *
nfa_get_match_text(nfa_state_T *start)
{
nfa_state_T *p = start;
int len = 0;
char_u *ret;
char_u *s;
if (p->c != NFA_MOPEN)
return NULL; // just in case
p = p->out;
while (p->c > 0)
{
len += MB_CHAR2LEN(p->c);
p = p->out;
}
if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
return NULL;
ret = alloc(len);
if (ret == NULL)
return NULL;
p = start->out->out; // skip first char, it goes into regstart
s = ret;
while (p->c > 0)
{
if (has_mbyte)
s += (*mb_char2bytes)(p->c, s);
else
*s++ = p->c;
p = p->out;
}
*s = NUL;
return ret;
}
/*
* Allocate more space for post_start. Called when
* running above the estimated number of states.
*/
static int
realloc_post_list(void)
{
int nstate_max = (int)(post_end - post_start);
int new_max;
int *new_start;
int *old_start;
// For weird patterns the number of states can be very high. Increasing by
// 50% seems a reasonable compromise between memory use and speed.
new_max = nstate_max * 3 / 2;
new_start = ALLOC_MULT(int, new_max);
if (new_start == NULL)
return FAIL;
mch_memmove(new_start, post_start, nstate_max * sizeof(int));
old_start = post_start;
post_start = new_start;
post_ptr = new_start + (post_ptr - old_start);
post_end = post_start + new_max;
vim_free(old_start);
return OK;
}
/*
* Search between "start" and "end" and try to recognize a
* character class in expanded form. For example [0-9].
* On success, return the id the character class to be emitted.
* On failure, return 0 (=FAIL)
* Start points to the first char of the range, while end should point
* to the closing brace.
* Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
* need to be interpreted as [a-zA-Z].
*/
static int
nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
{
# define CLASS_not 0x80
# define CLASS_af 0x40
# define CLASS_AF 0x20
# define CLASS_az 0x10
# define CLASS_AZ 0x08
# define CLASS_o7 0x04
# define CLASS_o9 0x02
# define CLASS_underscore 0x01
int newl = FALSE;
char_u *p;
int config = 0;
if (extra_newl == TRUE)
newl = TRUE;
if (*end != ']')
return FAIL;
p = start;
if (*p == '^')
{
config |= CLASS_not;
p++;
}
while (p < end)
{
if (p + 2 < end && *(p + 1) == '-')
{
switch (*p)
{
case '0':
if (*(p + 2) == '9')
{
config |= CLASS_o9;
break;
}
if (*(p + 2) == '7')
{
config |= CLASS_o7;
break;
}
return FAIL;
case 'a':
if (*(p + 2) == 'z')
{
config |= CLASS_az;
break;
}
if (*(p + 2) == 'f')
{
config |= CLASS_af;
break;
}
return FAIL;
case 'A':
if (*(p + 2) == 'Z')
{
config |= CLASS_AZ;
break;
}
if (*(p + 2) == 'F')
{
config |= CLASS_AF;
break;
}
return FAIL;
default:
return FAIL;
}
p += 3;
}
else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
{
newl = TRUE;
p += 2;
}
else if (*p == '_')
{
config |= CLASS_underscore;
p ++;
}
else if (*p == '\n')
{
newl = TRUE;
p ++;
}
else
return FAIL;
} // while (p < end)
if (p != end)
return FAIL;
if (newl == TRUE)
extra_newl = NFA_ADD_NL;
switch (config)
{
case CLASS_o9:
return extra_newl + NFA_DIGIT;
case CLASS_not | CLASS_o9:
return extra_newl + NFA_NDIGIT;
case CLASS_af | CLASS_AF | CLASS_o9:
return extra_newl + NFA_HEX;
case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
return extra_newl + NFA_NHEX;
case CLASS_o7:
return extra_newl + NFA_OCTAL;
case CLASS_not | CLASS_o7:
return extra_newl + NFA_NOCTAL;
case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
return extra_newl + NFA_WORD;
case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
return extra_newl + NFA_NWORD;
case CLASS_az | CLASS_AZ | CLASS_underscore:
return extra_newl + NFA_HEAD;
case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
return extra_newl + NFA_NHEAD;
case CLASS_az | CLASS_AZ:
return extra_newl + NFA_ALPHA;
case CLASS_not | CLASS_az | CLASS_AZ:
return extra_newl + NFA_NALPHA;
case CLASS_az:
return extra_newl + NFA_LOWER_IC;
case CLASS_not | CLASS_az:
return extra_newl + NFA_NLOWER_IC;
case CLASS_AZ:
return extra_newl + NFA_UPPER_IC;
case CLASS_not | CLASS_AZ:
return extra_newl + NFA_NUPPER_IC;
}
return FAIL;
}
/*
* Produce the bytes for equivalence class "c".
* Currently only handles latin1, latin9 and utf-8.
* Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
* equivalent to 'a OR b OR c'
*
* NOTE! When changing this function, also update reg_equi_class()
*/
static int
nfa_emit_equi_class(int c)
{
#define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
|| STRCMP(p_enc, "iso-8859-15") == 0)
{
#define A_grave 0xc0
#define A_acute 0xc1
#define A_circumflex 0xc2
#define A_virguilla 0xc3
#define A_diaeresis 0xc4
#define A_ring 0xc5
#define C_cedilla 0xc7
#define E_grave 0xc8
#define E_acute 0xc9
#define E_circumflex 0xca
#define E_diaeresis 0xcb
#define I_grave 0xcc
#define I_acute 0xcd
#define I_circumflex 0xce
#define I_diaeresis 0xcf
#define N_virguilla 0xd1
#define O_grave 0xd2
#define O_acute 0xd3
#define O_circumflex 0xd4
#define O_virguilla 0xd5
#define O_diaeresis 0xd6
#define O_slash 0xd8
#define U_grave 0xd9
#define U_acute 0xda
#define U_circumflex 0xdb
#define U_diaeresis 0xdc
#define Y_acute 0xdd
#define a_grave 0xe0
#define a_acute 0xe1
#define a_circumflex 0xe2
#define a_virguilla 0xe3
#define a_diaeresis 0xe4
#define a_ring 0xe5
#define c_cedilla 0xe7
#define e_grave 0xe8
#define e_acute 0xe9
#define e_circumflex 0xea
#define e_diaeresis 0xeb
#define i_grave 0xec
#define i_acute 0xed
#define i_circumflex 0xee
#define i_diaeresis 0xef
#define n_virguilla 0xf1
#define o_grave 0xf2
#define o_acute 0xf3
#define o_circumflex 0xf4
#define o_virguilla 0xf5
#define o_diaeresis 0xf6
#define o_slash 0xf8
#define u_grave 0xf9
#define u_acute 0xfa
#define u_circumflex 0xfb
#define u_diaeresis 0xfc
#define y_acute 0xfd
#define y_diaeresis 0xff
switch (c)
{
case 'A': case A_grave: case A_acute: case A_circumflex:
case A_virguilla: case A_diaeresis: case A_ring:
case 0x100: case 0x102: case 0x104: case 0x1cd:
case 0x1de: case 0x1e0: case 0x1fa: case 0x200:
case 0x202: case 0x226: case 0x23a: case 0x1e00:
case 0x1ea0: case 0x1ea2: case 0x1ea4: case 0x1ea6:
case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
EMIT2(A_circumflex) EMIT2(A_virguilla)
EMIT2(A_diaeresis) EMIT2(A_ring)
EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
return OK;
case 'B': case 0x181: case 0x243: case 0x1e02:
case 0x1e04: case 0x1e06:
EMIT2('B')
EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
EMIT2(0x1e04) EMIT2(0x1e06)
return OK;
case 'C': case C_cedilla: case 0x106: case 0x108:
case 0x10a: case 0x10c: case 0x187: case 0x23b:
case 0x1e08: case 0xa792:
EMIT2('C') EMIT2(C_cedilla)
EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
EMIT2(0x1e08) EMIT2(0xa792)
return OK;
case 'D': case 0x10e: case 0x110: case 0x18a:
case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
case 0x1e12:
EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
EMIT2(0x1e10) EMIT2(0x1e12)
return OK;
case 'E': case E_grave: case E_acute: case E_circumflex:
case E_diaeresis: case 0x112: case 0x114: case 0x116:
case 0x118: case 0x11a: case 0x204: case 0x206:
case 0x228: case 0x246: case 0x1e14: case 0x1e16:
case 0x1e18: case 0x1e1a: case 0x1e1c: case 0x1eb8:
case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0:
case 0x1ec2: case 0x1ec4: case 0x1ec6:
EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
EMIT2(E_circumflex) EMIT2(E_diaeresis)
EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
EMIT2(0x1ec6)
return OK;
case 'F': case 0x191: case 0x1e1e: case 0xa798:
EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
return OK;
case 'G': case 0x11c: case 0x11e: case 0x120:
case 0x122: case 0x193: case 0x1e4: case 0x1e6:
case 0x1f4: case 0x1e20: case 0xa7a0:
EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
EMIT2(0xa7a0)
return OK;
case 'H': case 0x124: case 0x126: case 0x21e:
case 0x1e22: case 0x1e24: case 0x1e26: case 0x1e28:
case 0x1e2a: case 0x2c67:
EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
return OK;
case 'I': case I_grave: case I_acute: case I_circumflex:
case I_diaeresis: case 0x128: case 0x12a: case 0x12c:
case 0x12e: case 0x130: case 0x197: case 0x1cf:
case 0x208: case 0x20a: case 0x1e2c: case 0x1e2e:
case 0x1ec8: case 0x1eca:
EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
EMIT2(I_circumflex) EMIT2(I_diaeresis)
EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
EMIT2(0x1eca)
return OK;
case 'J': case 0x134: case 0x248:
EMIT2('J') EMIT2(0x134) EMIT2(0x248)
return OK;
case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
EMIT2(0x2c69) EMIT2(0xa740)
return OK;
case 'L': case 0x139: case 0x13b: case 0x13d:
case 0x13f: case 0x141: case 0x23d: case 0x1e36:
case 0x1e38: case 0x1e3a: case 0x1e3c: case 0x2c60:
EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
return OK;
case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
EMIT2(0x1e42)
return OK;
case 'N': case N_virguilla:
case 0x143: case 0x145: case 0x147: case 0x1f8:
case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
case 0xa7a4:
EMIT2('N') EMIT2(N_virguilla)
EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
return OK;
case 'O': case O_grave: case O_acute: case O_circumflex:
case O_virguilla: case O_diaeresis: case O_slash:
case 0x14c: case 0x14e: case 0x150: case 0x19f:
case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec:
case 0x1fe: case 0x20c: case 0x20e: case 0x22a:
case 0x22c: case 0x22e: case 0x230: case 0x1e4c:
case 0x1e4e: case 0x1e50: case 0x1e52: case 0x1ecc:
case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc:
case 0x1ede: case 0x1ee0: case 0x1ee2:
EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
EMIT2(O_circumflex) EMIT2(O_virguilla)
EMIT2(O_diaeresis) EMIT2(O_slash)
EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
EMIT2(0x1ee2)
return OK;
case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
EMIT2(0x2c63)
return OK;
case 'Q': case 0x24a:
EMIT2('Q') EMIT2(0x24a)
return OK;
case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
EMIT2(0xa7a6)
return OK;
case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
EMIT2(0xa7a8)
return OK;
case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
case 0x1e6e: case 0x1e70:
EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
return OK;
case 'U': case U_grave: case U_acute: case U_diaeresis:
case U_circumflex: case 0x168: case 0x16a: case 0x16c:
case 0x16e: case 0x170: case 0x172: case 0x1af:
case 0x1d3: case 0x1d5: case 0x1d7: case 0x1d9:
case 0x1db: case 0x214: case 0x216: case 0x244:
case 0x1e72: case 0x1e74: case 0x1e76: case 0x1e78:
case 0x1e7a: case 0x1ee4: case 0x1ee6: case 0x1ee8:
case 0x1eea: case 0x1eec: case 0x1eee: case 0x1ef0:
EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
EMIT2(U_diaeresis) EMIT2(U_circumflex)
EMIT2(0x168) EMIT2(0x16a)
EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
EMIT2(0x1ef0)
return OK;
case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
return OK;
case 'W': case 0x174: case 0x1e80: case 0x1e82: case 0x1e84:
case 0x1e86: case 0x1e88:
EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
return OK;
case 'X': case 0x1e8a: case 0x1e8c:
EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
return OK;
case 'Y': case Y_acute: case 0x176: case 0x178:
case 0x1b3: case 0x232: case 0x24e: case 0x1e8e:
case 0x1ef2: case 0x1ef4: case 0x1ef6: case 0x1ef8:
EMIT2('Y') EMIT2(Y_acute)
EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
EMIT2(0x1ef8)
return OK;
case 'Z': case 0x179: case 0x17b: case 0x17d:
case 0x1b5: case 0x1e90: case 0x1e92: case 0x1e94: