-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathregex.go
277 lines (226 loc) · 6.89 KB
/
regex.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
package onigmo
/*
#cgo CFLAGS: -I/usr/local/include
#cgo LDFLAGS: -L/usr/local/lib -lonigmo
#include <stdlib.h>
#include "chelper.h"
*/
import "C"
import (
"errors"
"fmt"
"runtime"
"strings"
"sync"
"unicode/utf8"
"unsafe"
)
// numMatchStartSize is the initial capacity of the result slice that findAll
// allocates before it starts collecting matches.
const numMatchStartSize = 4
// mutex serializes the C calls that create (initRegexp) and release (Free)
// regex handles.
var mutex sync.Mutex
// Compile-time check that *Regexp satisfies the compliance interface, which is
// declared elsewhere in this package.
var _ compliance = &Regexp{}
// Regexp is the representation of a compiled regular expression. A Regexp is
// safe for concurrent use by multiple goroutines, with the exception of
// Longest, which recompiles the expression in place.
type Regexp struct {
// pattern is the source text handed to NewRegexp; String returns it.
pattern string
// options, encoding and syntax are forwarded to the C helper every time
// initRegexp compiles the pattern.
options Option
encoding Encoding
syntax Syntax
// regex is the compiled handle filled in by the C helper and released with
// onig_free in Free.
regex C.OnigRegex
// errorInfo and errorBuf are C-side buffers filled in by the C helper and
// released with C.free in Free. errorBuf is read as the error message when
// compilation fails.
errorInfo *C.OnigErrorInfo
errorBuf *C.char
// numSubexp caches the value of onig_number_of_captures.
numSubexp int
// subexpNames holds the group names reported by GetCaptureNames.
subexpNames []string
// idxSubexpNames maps each group name to the group number reported next to
// it by GetCaptureNames.
idxSubexpNames map[string]int
// hasMetacharacters is true when QuoteMeta(pattern) differs from pattern.
hasMetacharacters bool
}
// NewRegexp compiles pattern with the given encoding, options and syntax and
// returns the resulting Regexp.
//
// On success a finalizer is registered so that the C resources are released
// when the Regexp is garbage collected; callers may also call Free
// explicitly. On failure the C resources allocated so far are released
// immediately and a nil *Regexp is returned together with the compile error.
func NewRegexp(pattern string, encoding Encoding, options Option, syntax Syntax) (*Regexp, error) {
	re := &Regexp{
		pattern:  pattern,
		encoding: encoding,
		options:  options,
		syntax:   syntax,
	}
	if err := re.initRegexp(); err != nil {
		// Nothing usable was built: free whatever the C helper allocated and
		// do not hand a half-initialized value back to the caller.
		re.Free()
		return nil, err
	}
	runtime.SetFinalizer(re, (*Regexp).Free)
	return re, nil
}
// initRegexp compiles re.pattern through the C helper and caches the metadata
// derived from the compiled expression: the capture count, whether the
// pattern contains metacharacters, and the named groups. It is called by
// NewRegexp and again by Longest after the options change.
//
// Handles left over from a previous compilation are released before the new
// ones are stored; without that, every re-initialization would overwrite the
// old regex and error buffers and leak them.
//
// It returns an error carrying the message found in the C error buffer when
// compilation fails, or the error from loadSubexpNames.
func (re *Regexp) initRegexp() error {
	patternCharPtr := C.CString(re.pattern)
	defer C.free(unsafe.Pointer(patternCharPtr))

	mutex.Lock()
	defer mutex.Unlock()

	// Release the results of an earlier compilation, mirroring what Free does.
	// The lock is already held here, so Free itself cannot be called.
	if re.regex != nil {
		C.onig_free(re.regex)
		re.regex = nil
	}
	if re.errorInfo != nil {
		C.free(unsafe.Pointer(re.errorInfo))
		re.errorInfo = nil
	}
	if re.errorBuf != nil {
		C.free(unsafe.Pointer(re.errorBuf))
		re.errorBuf = nil
	}

	errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(re.options), &re.regex, &re.encoding, re.syntax, &re.errorInfo, &re.errorBuf)
	if errorCode != 0 {
		return errors.New(C.GoString(re.errorBuf))
	}

	re.numSubexp = int(C.onig_number_of_captures(re.regex))
	re.hasMetacharacters = QuoteMeta(re.pattern) != re.pattern
	return re.loadSubexpNames()
}
// loadSubexpNames reads the named capture groups of the compiled expression
// and fills re.subexpNames and re.idxSubexpNames. It does nothing when the
// expression has no named groups.
//
// The C helper writes the names into a scratch buffer, which is split on ';'
// here, and writes one group number per name into a parallel array. An error
// is returned when the helper reports an unusable length or when the number
// of names does not match onig_number_of_names.
func (re *Regexp) loadSubexpNames() error {
	count := int(C.onig_number_of_names(re.regex))
	if count <= 0 {
		return nil
	}

	// Scratch space for the joined names; the sizing rule (twice the pattern
	// length) is kept from the original implementation.
	bufferSize := len(re.pattern) * 2
	nameBuffer := make([]byte, bufferSize)
	// Use C.int directly so the element width always matches what the C side
	// writes, instead of assuming int is 32 bits wide.
	groupNumbers := make([]C.int, count)

	length := int(C.GetCaptureNames(
		re.regex,
		unsafe.Pointer(&nameBuffer[0]),
		C.int(bufferSize),
		&groupNumbers[0],
	))
	// A non-positive or out-of-range length would otherwise make the slice
	// expression below panic.
	if length <= 0 || length > bufferSize {
		return errors.New("could not get the capture group names")
	}

	re.subexpNames = strings.Split(string(nameBuffer[:length]), ";")
	if len(re.subexpNames) != count {
		return fmt.Errorf(
			"unexpected number of capture group names, got %d, expected %d",
			len(re.subexpNames), count,
		)
	}

	re.idxSubexpNames = make(map[string]int, count)
	for i, idx := range groupNumbers {
		re.idxSubexpNames[re.subexpNames[i]] = int(idx)
	}
	return nil
}
// Compile parses a regular expression and returns, if successful, a Regexp
// object that can be used to match against text. It calls NewRegexp with
// EncodingUTF8, OptionNone and SyntaxPerl; the Perl 5.10+ syntax is the one
// most compatible with Go.
func Compile(str string) (*Regexp, error) {
return NewRegexp(str, EncodingUTF8, OptionNone, SyntaxPerl)
}
// MustCompile behaves like Compile except that it panics when the expression
// cannot be parsed. It is meant for initializing package-level variables that
// hold compiled regular expressions.
func MustCompile(str string) *Regexp {
	re, err := Compile(str)
	if err == nil {
		return re
	}
	panic("regexp: compiling " + str + ": " + err.Error())
}
// Free releases every cgo resource owned by the Regexp: the compiled regex
// and the error-info and error-message buffers. NewRegexp registers it as the
// finalizer, and it may also be called explicitly. Calling it more than once
// is harmless because each handle is set to nil after it has been freed.
//
// The whole release runs under the package mutex so that concurrent calls
// cannot free the same buffer twice.
func (re *Regexp) Free() {
	mutex.Lock()
	defer mutex.Unlock()

	if re.regex != nil {
		C.onig_free(re.regex)
		re.regex = nil
	}
	if re.errorInfo != nil {
		C.free(unsafe.Pointer(re.errorInfo))
		re.errorInfo = nil
	}
	if re.errorBuf != nil {
		C.free(unsafe.Pointer(re.errorBuf))
		re.errorBuf = nil
	}
}
// find runs one search through the C helper over n bytes of b, starting at
// byte position offset.
//
// It returns nil when the helper reports a negative position. Otherwise it
// returns a slice of (numSubexp+1)*2 ints: a start/end pair for the whole
// match followed by one pair per capture group, copied from the values the C
// side wrote. It panics if a match is reported with no captures.
func (re *Regexp) find(b []byte, n int, offset int) []int {
	slots := (re.numSubexp + 1) * 2

	// An empty pattern against an empty input is answered without calling
	// into C: every position is zero.
	if len(re.pattern) == 0 && len(b) == 0 {
		return make([]int, slots)
	}

	// &b[0] below needs at least one element even when there is nothing to
	// scan.
	if n == 0 {
		b = []byte{0}
	}

	// regs receives a start and an end per group, hence two slots each.
	regs := make([]C.int, slots)
	var found int32

	status := C.SearchOnigRegex(
		unsafe.Pointer(&b[0]), C.int(n), C.int(offset), C.int(OptionNone),
		re.regex, re.errorInfo, (*C.char)(nil),
		(*C.int)(unsafe.Pointer(&regs[0])), (*C.int)(unsafe.Pointer(&found)),
	)
	if int(status) < 0 {
		return nil
	}
	if found <= 0 {
		panic("cannot have 0 captures when processing a match")
	}

	loc := make([]int, slots)
	for i, v := range regs {
		loc[i] = int(v)
	}
	return loc
}
// match reports whether the C search helper returns a non-negative position
// for n bytes of b when started at byte position offset. No capture
// information is requested from the C side.
func (re *Regexp) match(b []byte, n int, offset int) bool {
	// &b[0] below needs at least one element even when there is nothing to
	// scan.
	if n == 0 {
		b = []byte{0}
	}
	status := C.SearchOnigRegex(
		unsafe.Pointer(&b[0]), C.int(n), C.int(offset), C.int(OptionNone),
		re.regex, re.errorInfo, nil, nil, nil,
	)
	return int(status) >= 0
}
// findAll collects successive matches in b by calling find repeatedly, each
// time resuming at the end of the previous match. A negative n means "use the
// whole slice". Each element of the result is the index slice returned by
// find for one match.
func (re *Regexp) findAll(b []byte, n int) [][]int {
	if n < 0 {
		n = len(b)
	}

	results := make([][]int, 0, numMatchStartSize)
	for pos := 0; pos <= n; {
		loc := re.find(b, n, pos)
		if loc == nil {
			break
		}
		results = append(results, loc)

		// Resume right after the match that was just recorded.
		pos = loc[1]
		if loc[0] != loc[1] {
			continue
		}

		// An empty match does not advance the search by itself. Stop when
		// the input is exhausted; otherwise step over one rune so the loop
		// cannot get stuck on the same position.
		if pos < 0 || pos >= n {
			break
		}
		_, size := utf8.DecodeRune(b[pos:])
		pos += size
	}
	return results
}
// NumSubexp reports how many parenthesized subexpressions the Regexp has, as
// cached from onig_number_of_captures when the pattern was compiled.
func (re *Regexp) NumSubexp() int {
	return re.numSubexp
}
// SubexpNames returns the names of the parenthesized subexpressions
// in this Regexp. The name for the first sub-expression is names[1],
// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
// Since the Regexp as a whole cannot be named, names[0] is always
// the empty string. The slice should not be modified.
//
// NOTE(review): the returned slice is the one built by loadSubexpNames from
// the names reported by the C helper, and it is nil when the pattern has no
// named groups. Whether it really carries an empty names[0] and lines up with
// match indexes as described above is not visible here — confirm against
// GetCaptureNames.
func (re *Regexp) SubexpNames() []string {
return re.subexpNames
}
// String returns the source text that was used to compile the regular
// expression.
func (re *Regexp) String() string {
return re.pattern
}
// LiteralPrefix is not implemented; it always panics when called.
//
// NOTE(review): the method is presumably kept so that *Regexp offers the same
// method set as the standard library regexp type — confirm against the
// compliance interface.
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
	panic("function not implemented")
}
// Copy returns a new Regexp compiled from the same pattern, encoding, options
// and syntax as re.
func (re *Regexp) Copy() *Regexp {
	// The compile error is discarded.
	// NOTE(review): presumably safe because re was already compiled from the
	// same inputs — confirm.
	dup, _ := NewRegexp(re.pattern, re.encoding, re.options, re.syntax)
	return dup
}
// Longest switches the Regexp to leftmost-longest matching for all future
// searches: among the matches that begin as early as possible in the input,
// the longest one is chosen.
// This method modifies the Regexp and may not be called concurrently
// with any other methods.
func (re *Regexp) Longest() {
	re.options |= OptionFindLongest
	// Recompile so the updated options reach the C library. The method has no
	// error result, so a failed recompilation is not reported to the caller.
	_ = re.initRegexp()
}