@@ -21,78 +21,127 @@ const NER = "not enough bytes in record"
21
21
// does mean that the contents of the field are only valid until you call getKey again, and also that
22
22
// the keyFinder type is not thread-safe
23
23
type keyFinder struct {
	// fields holds the zero-based numbers of the fields that make up the key,
	// in the order they were supplied to newKeyFinder.
	fields []uint
	// key is the scratch buffer that getKey truncates and refills on every call.
	key []byte
	// separator, when non-nil, is the regexp used to split a record into fields;
	// when nil, fields are separated by spaces and tabs.
	separator *regexp.Regexp
	// quotedFields selects, for space-separated records, the mode in which a
	// "-quoted run is treated as a single field.
	quotedFields bool
}

// newKeyFinder creates a new keyFinder for the supplied field numbers.
//
// keys are 1-based field numbers and are stored 0-based. A field number of 0 is
// not valid 1-based input; it is treated as the first field instead of being
// allowed to wrap around to the largest uint when 1 is subtracted from it.
// separator may be nil, in which case records are split on spaces and tabs, and
// quotedFields then controls whether "-quoted fields are honoured.
//
// keyFinder is not thread-safe, you should clone it for each goroutine that uses it.
func newKeyFinder(keys []uint, separator *regexp.Regexp, quotedFields bool) *keyFinder {
	kf := &keyFinder{
		key:          make([]byte, 0, 128),
		separator:    separator,
		quotedFields: quotedFields,
	}
	if len(keys) > 0 {
		// the final length is known, so allocate once instead of growing
		kf.fields = make([]uint, 0, len(keys))
	}
	for _, knum := range keys {
		if knum > 0 {
			knum-- // convert 1-based to 0-based; 0 stays 0 rather than underflowing
		}
		kf.fields = append(kf.fields, knum)
	}
	return kf
}

// clone returns a new keyFinder with the same configuration. Each goroutine should use its own
// keyFinder instance. The clone gets its own key buffer; the fields slice and the separator are
// shared with the receiver.
func (kf *keyFinder) clone() *keyFinder {
	return &keyFinder{
		fields:       kf.fields,
		key:          make([]byte, 0, 128),
		separator:    kf.separator,
		quotedFields: kf.quotedFields,
	}
}
51
54
52
55
// getKey extracts a key from the supplied record. This is applied to every record,
53
56
// so efficiency matters.
54
57
func (kf * keyFinder ) getKey (record []byte ) ([]byte , error ) {
55
- // if there are no Key-finders just return the record, minus any trailing newlines
58
+ // chomp
59
+ if record [len (record )- 1 ] == '\n' {
60
+ record = record [:len (record )- 1 ]
61
+ }
62
+ // if there are no Key-finders the key is the record
56
63
if len (kf .fields ) == 0 {
57
- if record [len (record )- 1 ] == '\n' {
58
- record = record [0 : len (record )- 1 ]
59
- }
60
64
return record , nil
61
65
}
62
66
var err error
63
67
kf .key = kf .key [:0 ]
64
68
if kf .separator == nil {
65
- field := 0
66
- index := 0
67
- first := true
68
-
69
- // for each field in the Key
70
- for _ , keyField := range kf .fields {
71
- // bypass fields before the one we want
72
- for field < int (keyField ) {
73
- index , err = pass (record , index )
69
+ // no regex provided, we're doing space-separation
70
+ if kf .quotedFields {
71
+ // if we're doing apache httpd style access_log files, with some "-quoted fields
72
+ field := 0
73
+ index := 0
74
+ first := true
75
+
76
+ // for each field in the key
77
+ for _ , keyField := range kf .fields {
78
+ // bypass fields before the one we want
79
+ for field < int (keyField ) {
80
+ index , err = passQuoted (record , index )
81
+ if err != nil {
82
+ return nil , err
83
+ }
84
+ // in the special case where we might have just passed a quoted field, we will
85
+ // advance index past the closing quote
86
+ if index < len (record ) && record [index ] == '"' {
87
+ index ++
88
+ }
89
+ field ++
90
+ }
91
+
92
+ // join(' ', kf)
93
+ if first {
94
+ first = false
95
+ } else {
96
+ kf .key = append (kf .key , ' ' )
97
+ }
98
+
99
+ kf .key , index , err = gatherQuoted (kf .key , record , index )
74
100
if err != nil {
75
101
return nil , err
76
102
}
103
+ // in the special case where we might have just gathered a quoted field, we will
104
+ // advance index past the closing quote
105
+ if index < len (record ) && record [index ] == '"' {
106
+ index ++
107
+ }
77
108
field ++
78
109
}
110
+ } else {
111
+ // basic space-separation
112
+ field := 0
113
+ index := 0
114
+ first := true
79
115
80
- // join(' ', kf)
81
- if first {
82
- first = false
83
- } else {
84
- kf .key = append (kf .key , ' ' )
85
- }
116
+ // for each field in the Key
117
+ for _ , keyField := range kf .fields {
118
+ // bypass fields before the one we want
119
+ for field < int (keyField ) {
120
+ index , err = pass (record , index )
121
+ if err != nil {
122
+ return nil , err
123
+ }
124
+ field ++
125
+ }
86
126
87
- // attach desired field to Key
88
- kf .key , index , err = gather (kf .key , record , index )
89
- if err != nil {
90
- return nil , err
91
- }
127
+ // join(' ', kf)
128
+ if first {
129
+ first = false
130
+ } else {
131
+ kf .key = append (kf .key , ' ' )
132
+ }
92
133
93
- field ++
134
+ // attach desired field to Key
135
+ kf .key , index , err = gather (kf .key , record , index )
136
+ if err != nil {
137
+ return nil , err
138
+ }
139
+
140
+ field ++
141
+ }
94
142
}
95
143
} else {
144
+ // regex separator provided, less code but probably slower
96
145
allFields := kf .separator .Split (string (record ), - 1 )
97
146
for i , field := range kf .fields {
98
147
if int (field ) >= len (allFields ) {
@@ -107,9 +156,10 @@ func (kf *keyFinder) getKey(record []byte) ([]byte, error) {
107
156
return kf .key , err
108
157
}
109
158
110
- // pull in the bytes from a desired field
159
+ // gather pulls in the bytes from a desired field, and leaves index positioned at the first white-space
160
+ // character following the field, or at the end of the record, i.e. len(record)
111
161
func gather (key []byte , record []byte , index int ) ([]byte , int , error ) {
112
- // eat leading space
162
+ // eat leading space - if we're already at the end of the record, the loop is a no-op
113
163
for index < len (record ) && (record [index ] == ' ' || record [index ] == '\t' ) {
114
164
index ++
115
165
}
@@ -118,13 +168,49 @@ func gather(key []byte, record []byte, index int) ([]byte, int, error) {
118
168
}
119
169
120
170
// copy Key bytes
121
- for index < len ( record ) && record [ index ] != ' ' && record [ index ] != '\t' && record [ index ] != '\n' {
122
- key = append ( key , record [index ])
171
+ startAt := index
172
+ for index < len ( record ) && record [index ] != ' ' && record [ index ] != '\t' {
123
173
index ++
124
174
}
175
+ key = append (key , record [startAt :index ]... )
125
176
return key , index , nil
126
177
}
127
178
179
+ // same semantics as gather, but respects quoted fields that might create spaces. Leaves the index
180
+ // value pointing at the closing quote
181
+ func gatherQuoted (key []byte , record []byte , index int ) ([]byte , int , error ) {
182
+ // eat leading space
183
+ for index < len (record ) && (record [index ] == ' ' || record [index ] == '\t' ) {
184
+ index ++
185
+ }
186
+ if index >= len (record ) {
187
+ return nil , 0 , errors .New (NER )
188
+ }
189
+
190
+ if record [index ] == '"' {
191
+ index ++
192
+ startAt := index
193
+ for index < len (record ) && record [index ] != '"' {
194
+ index ++
195
+ }
196
+ key = append (key , record [startAt :index ]... )
197
+ // if we hit end-of-record before the closing quote, that's an error
198
+ if index == len (record ) {
199
+ return nil , 0 , errors .New (NER )
200
+ }
201
+ } else {
202
+ startAt := index
203
+ for index < len (record ) && record [index ] != ' ' && record [index ] != '\t' {
204
+ index ++
205
+ }
206
+ key = append (key , record [startAt :index ]... )
207
+ }
208
+ return key , index , nil
209
+ }
210
+
211
+ // pass moves the index variable past any white space and a space-separated field,
212
+ // leaving index pointing at the first white-space character after the field or
213
+ // at the end of record, i.e. == len(record)
128
214
func pass (record []byte , index int ) (int , error ) {
129
215
// eat leading space
130
216
for index < len (record ) && (record [index ] == ' ' || record [index ] == '\t' ) {
@@ -138,3 +224,30 @@ func pass(record []byte, index int) (int, error) {
138
224
}
139
225
return index , nil
140
226
}
227
+
228
+ // same semantics as pass, but for quoted fields. Leaves the index value pointing at the
229
+ // closing "
230
+ func passQuoted (record []byte , index int ) (int , error ) {
231
+ // eat leading space
232
+ for index < len (record ) && (record [index ] == ' ' || record [index ] == '\t' ) {
233
+ index ++
234
+ }
235
+ if index == len (record ) {
236
+ return 0 , errors .New (NER )
237
+ }
238
+ if record [index ] == '"' {
239
+ index ++
240
+ for index < len (record ) && record [index ] != '"' {
241
+ index ++
242
+ }
243
+ // if we hit end of record before the closing quote, that's a bug
244
+ if index >= len (record ) {
245
+ return 0 , errors .New (NER )
246
+ }
247
+ } else {
248
+ for index < len (record ) && record [index ] != ' ' && record [index ] != '\t' {
249
+ index ++
250
+ }
251
+ }
252
+ return index , nil
253
+ }
0 commit comments