-
Notifications
You must be signed in to change notification settings - Fork 0
/
ec_glob.c
422 lines (385 loc) · 14.7 KB
/
ec_glob.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
*
* Copyright 2024 Mike Becker - All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "ec_glob.h"
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#ifdef EC_GLOB_USE_PCRE
#include <pcre2posix.h>
#else
#include <regex.h>
#endif
struct numpair_s {
long min;
long max;
};
struct ec_glob_re {
char *str;
unsigned len;
unsigned capacity;
};
#ifndef EC_GLOB_STACK_CAPACITY
#define EC_GLOB_STACK_CAPACITY 64
#endif
static void ec_glob_pattern_increase_capacity(struct ec_glob_re *re) {
unsigned newcap = re->capacity * 2;
char *newmem;
if (re->capacity == EC_GLOB_STACK_CAPACITY) {
newmem = malloc(newcap);
if (newmem == NULL) abort();
memcpy(newmem, re->str, re->len);
} else {
newmem = realloc(re->str, newcap);
if (newmem == NULL) abort();
}
re->capacity = newcap;
re->str = newmem;
}
#define ec_glob_pattern_ensure_capacity(re, n) \
if (re.len + n > re.capacity) ec_glob_pattern_increase_capacity(&re)
#define ec_glob_cats(re, s) \
ec_glob_pattern_ensure_capacity(re, sizeof(s) - 1); \
memcpy((re).str + (re).len, s, sizeof(s) - 1); (re).len += sizeof(s)-1
#define ec_glob_catc(re, c) \
ec_glob_pattern_ensure_capacity(re, 1); (re).str[(re).len++] = (c)
int ec_glob(const char *pattern, const char *string) {
char stack[EC_GLOB_STACK_CAPACITY];
struct ec_glob_re re_pattern = {
stack, 1, EC_GLOB_STACK_CAPACITY
};
re_pattern.str[0] = '^';
unsigned scanidx = 0;
unsigned inputlen = strlen(pattern);
char c;
// maintain information about braces
const unsigned brace_stack_size = 32;
char brace_stack[brace_stack_size];
int depth_brace = 0;
_Bool braces_valid = 1;
// first, check if braces are syntactically valid
for (unsigned i = 0 ; i < inputlen ; i++) {
// skip potentially escaped braces
if (pattern[i] == '\\') {
i++;
} else if (pattern[i] == '{') {
depth_brace++;
} else if (pattern[i] == '}') {
if (depth_brace > 0) {
depth_brace--;
} else {
braces_valid = 0;
break;
}
}
}
if (depth_brace > 0) {
braces_valid = 0;
depth_brace = 0;
}
// prepare what we think is more than enough memory for matches
const unsigned numrange_max = 32;
unsigned numrange_grp_count = 0;
unsigned numrange_grp_idx[numrange_max];
struct numpair_s numrange_pairs[numrange_max];
regmatch_t numrange_matches[numrange_max];
// initialize first group number with zero
// and increment whenever we create a new group
numrange_grp_idx[0] = 0;
// now translate the editorconfig pattern to a POSIX regular expression
while (scanidx < inputlen) {
c = pattern[scanidx++];
// escape
if (c == '\\') {
if (strchr("?{}[]*\\-,", pattern[scanidx]) != NULL) {
// also escape in regex when required
if (strchr("?{}[]*\\", pattern[scanidx]) != NULL) {
ec_glob_catc(re_pattern, '\\');
}
c = pattern[scanidx++];
ec_glob_catc(re_pattern, c);
}
// otherwise, it's just a backslash
else {
ec_glob_cats(re_pattern, "\\\\");
}
}
// wildcard
else if (c == '*') {
// is double-wildcard?
if (pattern[scanidx] == '*') {
scanidx++;
// check for collapsible slashes
if (pattern[scanidx] == '/' && scanidx >= 3
&& pattern[scanidx - 3] == '/') {
// the collapsible slash is simply discarded
scanidx++;
}
ec_glob_cats(re_pattern, ".*");
} else {
ec_glob_cats(re_pattern, "[^/]*");
}
}
// arbitrary character
else if (c == '?') {
ec_glob_catc(re_pattern, '.');
}
// start of alternatives
else if (c == '{') {
// if braces are not syntactically valid, treat them as literals
if (!braces_valid) {
ec_glob_cats(re_pattern, "\\{");
continue;
}
// check brace stack
depth_brace++;
if (depth_brace > brace_stack_size) {
// treat brace literally when stacked too many of them
ec_glob_cats(re_pattern, "\\{");
continue;
}
// check if {single} or {num1..num2}
_Bool single = 1;
_Bool dotdot = strchr("+-0123456789", pattern[scanidx]) != NULL;
_Bool dotdot_seen = 0;
for (unsigned fw = scanidx; fw < inputlen; fw++) {
if (pattern[fw] == ',') {
single = 0;
dotdot = 0;
break;
}
else if (pattern[fw] == '}') {
// check if this is a {num1..num2} pattern
if (dotdot) {
_Bool ok = 1;
unsigned ngc = numrange_grp_count;
char *chk;
errno = 0;
numrange_pairs[ngc].min = strtol(
&pattern[scanidx], &chk, 10);
ok &= *chk == '.' && 0 == errno;
numrange_pairs[ngc].max = strtol(
strrchr(&pattern[scanidx], '.')+1, &chk, 10);
ok &= *chk == '}' && 0 == errno;
if (ok) {
// a dotdot is not a single
single = 0;
// skip this subpattern later on
scanidx = fw+1;
} else {
// not ok, we could not parse the numbers
dotdot = 0;
}
}
break;
} else if (dotdot) {
// check for dotdot separator
if (pattern[fw] == '.') {
if (!dotdot_seen &&
fw + 2 < inputlen && pattern[fw + 1] == '.' &&
strchr("+-0123456789", pattern[fw + 2]) != NULL) {
fw += 2;
dotdot_seen = 1;
} else {
dotdot = 0;
}
}
// everything must be a digit, otherwise
else if (!strchr("0123456789", pattern[fw])) {
dotdot = 0;
}
}
}
if (single) {
// push literal brace
ec_glob_cats(re_pattern, "\\{");
brace_stack[depth_brace-1] = '}';
} else {
// open choice and push parenthesis
ec_glob_catc(re_pattern, '(');
// increase the current group number
numrange_grp_idx[numrange_grp_count]++;
if (dotdot) {
// add the number matching pattern
ec_glob_cats(re_pattern, "[-+]?[0-9]+)");
// increase group counter and initialize
// next index with current group number
numrange_grp_count++;
if (numrange_grp_count < numrange_max) {
numrange_grp_idx[numrange_grp_count] =
numrange_grp_idx[numrange_grp_count - 1];
}
// we already took care of the closing brace
depth_brace--;
} else {
// remember that we need to close the choice eventually
brace_stack[depth_brace - 1] = ')';
}
}
}
// end of alternatives
else if (depth_brace > 0 && c == '}') {
depth_brace--;
if (depth_brace < brace_stack_size
&& brace_stack[depth_brace] == ')') {
ec_glob_catc(re_pattern, ')');
} else {
ec_glob_cats(re_pattern, "\\}");
}
}
// separator of alternatives
else if (depth_brace > 0 && c == ',') {
ec_glob_catc(re_pattern, '|');
}
// brackets
else if (c == '[') {
// check if we have a corresponding closing bracket
_Bool valid = 0;
_Bool closing_bracket_literal = 0;
unsigned newidx;
for (unsigned fw = scanidx ; fw < inputlen ; fw++) {
if (pattern[fw] == ']') {
// only terminating if it's not the first char
if (fw == scanidx) {
// otherwise, auto-escaped
closing_bracket_literal = 1;
} else {
valid = 1;
newidx = fw+1;
break;
}
} else if (pattern[fw] == '/') {
// special (undocumented) case: slash breaks
// https://github.com/editorconfig/editorconfig/issues/499
break;
} else if (pattern[fw] == '\\') {
// skip escaped characters as usual
fw++;
closing_bracket_literal |= pattern[fw] == ']';
}
}
if (valid) {
// first of all, check, if the sequence is negated
if (pattern[scanidx] == '!') {
scanidx++;
ec_glob_cats(re_pattern, "[^");
} else {
ec_glob_catc(re_pattern, '[');
}
// if we have a closing bracket as literal, it must appear first
if (closing_bracket_literal) {
ec_glob_catc(re_pattern, ']');
// but if the minus operator wanted to be there
// we need to move it to the end
if (pattern[scanidx] == '-') {
scanidx++;
}
}
// everything within brackets is treated as a literal character
// we have to parse them one by one, though, because we might
// need to escape regex-relevant stuff
for (unsigned fw = scanidx ; ; fw++) {
if (pattern[fw] == '\\') {
// skip to next char
continue;
}
// check for terminating bracket
else if (pattern[fw] == ']') {
if (fw > scanidx && pattern[fw-1] != '\\') {
break;
}
}
// include literal character
else {
if (strchr(".(){}[]", pattern[fw]) != NULL) {
ec_glob_catc(re_pattern, '\\');
}
ec_glob_catc(re_pattern, pattern[fw]);
}
}
// did we promise the minus a seat in the last row?
if (pattern[scanidx-1] == '-') {
ec_glob_cats(re_pattern, "-]");
} else {
ec_glob_catc(re_pattern, ']');
}
scanidx = newidx;
} else {
// literal bracket
ec_glob_cats(re_pattern, "\\[");
}
}
// escape special chars
else if (strchr(".(){}[]", c) != NULL) {
ec_glob_catc(re_pattern, '\\');
ec_glob_catc(re_pattern, c);
}
// literal (includes path separators)
else {
ec_glob_catc(re_pattern, c);
}
}
// terminate the regular expression
ec_glob_catc(re_pattern, '$');
ec_glob_catc(re_pattern, '\0');
// compile pattern and execute matching
regex_t re;
int status;
int flags = REG_EXTENDED;
// when we don't have a num-pattern, don't capture anything
if (numrange_grp_count == 0) {
flags |= REG_NOSUB;
}
if ((status = regcomp(&re, re_pattern.str, flags)) == 0) {
status = regexec(&re, string, numrange_max, numrange_matches, 0);
// check num ranges
for (unsigned i = 0 ; status == 0 && i < numrange_grp_count ; i++) {
regmatch_t nm = numrange_matches[numrange_grp_idx[i]];
int nmlen = nm.rm_eo-nm.rm_so;
char *nmatch = malloc(nmlen+1);
memcpy(nmatch, string+nm.rm_so, nmlen);
nmatch[nmlen] = '\0';
errno = 0;
char *chk;
long num = strtol(nmatch, &chk, 10);
if (*chk == '\0' && 0 == errno) {
// check if the matched number is within the range
status |= !(numrange_pairs[i].min <= num
&& num <= numrange_pairs[i].max);
} else {
// number not processable, return error
status = 1;
}
free(nmatch);
}
regfree(&re);
}
if (re_pattern.capacity > EC_GLOB_STACK_CAPACITY) {
free(re_pattern.str);
}
return status;
}