@@ -23,9 +23,11 @@ import (
23
23
"bufio"
24
24
"compress/gzip"
25
25
"container/heap"
26
+ "context"
26
27
"errors"
27
28
"io"
28
29
"os"
30
+ "sync"
29
31
)
30
32
31
33
// Action defined the read log behaviour.
@@ -43,17 +45,22 @@ const (
43
45
var (
44
46
// NEED_TIMEHANDLER returned when the getTime function is nil.
45
47
NEED_TIMEHANDLER = errors .New ("need time handler" )
48
+ // NEED_ERRCHAN returned when using quick merge without err channel.
49
+ NEED_ERRCHAN = errors .New ("need error channel" )
46
50
)
47
51
48
52
/*
49
53
TimeHandler defined handlers for getting timestamp from each line.
50
54
*/
51
55
type TimeHandler = func ([]byte ) (int64 , Action , error )
52
56
57
+ /*
58
+ FilterHandler defined handlers for modifying each line.
59
+ */
60
+
53
61
type FilterHandler = func ([]byte ) ([]byte , Action , error )
54
62
55
63
type fileReader struct {
56
- filename string
57
64
scanner * bufio.Scanner
58
65
timestamp int64
59
66
line []byte
@@ -66,15 +73,26 @@ type fileReader struct {
66
73
Option defined some option can set for merging.
67
74
*/
68
75
type Option struct {
69
- SrcPath []string // Merge src File Path
70
- DstPath string // The filePath merge to
71
- SrcReader []io.Reader // DstReader io.Reader
72
- DstWriter io.Writer
73
- SrcGzip bool // Whether src file is in gzip format
74
- DstGzip bool // Merge file in gzip format
75
- DeleteSrc bool // Delete src file
76
- GetTime TimeHandler // The function to getTime from each line
77
- Filter FilterHandler // The function to process each line
76
+ SrcPath []string // Merge src File Path
77
+ DstPath string // The filePath merge to
78
+ SrcReader []io.Reader // Src files' io.Reader
79
+ DstWriter io.Writer // Destinated file's io.Writer
80
+ SrcGzip bool // Whether src file is in gzip format
81
+ DstGzip bool // Merge file in gzip format
82
+ DeleteSrc bool // Delete src file
83
+ GetTime TimeHandler // The function to getTime from each line
84
+ Filter FilterHandler // The function to process each line
85
+ Goroutine int // Quick merge's worker number
86
+ ErrChan chan error // Quick merge's error return
87
+ CTX context.Context // Quick merge's context
88
+ }
89
+
90
+ type quickMergeJob struct {
91
+ scanner * bufio.Scanner
92
+ writer chan * []byte
93
+ filter FilterHandler
94
+ errChan chan error
95
+ ctx context.Context
78
96
}
79
97
80
98
type fileHeap struct {
@@ -84,7 +102,9 @@ type fileHeap struct {
84
102
85
103
func (fh fileHeap ) Len () int { return len (fh .readers ) }
86
104
87
- func (fh fileHeap ) Less (i , j int ) bool { return fh .readers [i ].timestamp < fh .readers [j ].timestamp }
105
+ func (fh fileHeap ) Less (i , j int ) bool {
106
+ return fh .readers [i ].timestamp < fh .readers [j ].timestamp
107
+ }
88
108
89
109
func (fh fileHeap ) Swap (i , j int ) {
90
110
fh .readers [i ], fh .readers [j ] = fh .readers [j ], fh .readers [i ]
@@ -170,17 +190,6 @@ func (fh *fileHeap) merge() error {
170
190
return nil
171
191
}
172
192
173
- // Merge files to output file, and use getTime function to get timestamp.
174
- func Merge (srcPath []string , dstPath string , getTime TimeHandler ) error {
175
- option := Option {
176
- SrcPath : srcPath ,
177
- DstPath : dstPath ,
178
- GetTime : getTime ,
179
- }
180
-
181
- return MergeByOption (option )
182
- }
183
-
184
193
func merge (readers []* bufio.Scanner , writer * bufio.Writer , getTime TimeHandler , filter FilterHandler ) error {
185
194
fHeap := new (fileHeap )
186
195
@@ -208,6 +217,55 @@ func merge(readers []*bufio.Scanner, writer *bufio.Writer, getTime TimeHandler,
208
217
return fHeap .merge ()
209
218
}
210
219
220
+ func quickMerge (job * quickMergeJob ) {
221
+ scanner := job .scanner
222
+ writer := job .writer
223
+ filter := job .filter
224
+ errChan := job .errChan
225
+
226
+ for {
227
+ select {
228
+ case <- job .ctx .Done ():
229
+ return
230
+ default :
231
+ if ok := scanner .Scan (); ! ok {
232
+ if err := scanner .Err (); err != nil {
233
+ errChan <- err
234
+ }
235
+
236
+ // EOF
237
+ return
238
+ }
239
+
240
+ line := scanner .Bytes ()
241
+ if filter != nil {
242
+ newline , action , err := job .filter (line )
243
+ if action == SKIP {
244
+ continue
245
+ } else if action == STOP {
246
+ errChan <- err
247
+ return
248
+ }
249
+
250
+ line = newline
251
+ }
252
+
253
+ writer <- & line
254
+ }
255
+ }
256
+ }
257
+
258
+ // Merge files to output file, and use getTime function to get timestamp.
259
+ func Merge (srcPath []string , dstPath string , getTime TimeHandler ) error {
260
+ option := Option {
261
+ SrcPath : srcPath ,
262
+ DstPath : dstPath ,
263
+ GetTime : getTime ,
264
+ }
265
+
266
+ return MergeByOption (option )
267
+ }
268
+
211
269
// Use option to control merge behaviour.
212
270
func MergeByOption (option Option ) error {
213
271
if option .GetTime == nil {
@@ -286,3 +344,110 @@ func MergeByOption(option Option) error {
286
344
287
345
return nil
288
346
}
347
+
348
+ // Quick merge used for without sorting
349
+ func QuickMerge (option Option ) error {
350
+ var wg sync.WaitGroup
351
+ jobChan := make (chan * quickMergeJob , len (option .SrcPath ))
352
+ writerChan := make (chan * []byte , len (option .SrcPath )* 100 )
353
+
354
+ if option .ErrChan == nil {
355
+ return NEED_ERRCHAN
356
+ }
357
+
358
+ if option .CTX == nil {
359
+ option .CTX = context .Background ()
360
+ }
361
+
362
+ finishedCount := 0
363
+ var mutex sync.Mutex
364
+ for i := 0 ; i < option .Goroutine ; i ++ {
365
+ wg .Add (1 )
366
+ go func () {
367
+ for job := range jobChan {
368
+ quickMerge (job )
369
+ }
370
+ wg .Done ()
371
+
372
+ mutex .Lock ()
373
+ finishedCount ++
374
+ if finishedCount == option .Goroutine {
375
+ close (writerChan )
376
+ }
377
+ mutex .Unlock ()
378
+ }()
379
+ }
380
+
381
+ for _ , fp := range option .SrcPath {
382
+ fd , err := os .Open (fp )
383
+ if err != nil {
384
+ option .ErrChan <- err
385
+ }
386
+
387
+ defer fd .Close ()
388
+
389
+ var scanner * bufio.Scanner
390
+ if option .SrcGzip {
391
+ gzReader , err := gzip .NewReader (fd )
392
+ if err != nil {
393
+ option .ErrChan <- err
394
+ }
395
+
396
+ defer gzReader .Close ()
397
+
398
+ scanner = bufio .NewScanner (gzReader )
399
+ } else {
400
+ scanner = bufio .NewScanner (fd )
401
+ }
402
+
403
+ jobChan <- & quickMergeJob {
404
+ scanner : scanner ,
405
+ writer : writerChan ,
406
+ filter : option .Filter ,
407
+ errChan : option .ErrChan ,
408
+ ctx : option .CTX ,
409
+ }
410
+ }
411
+ close (jobChan )
412
+
413
+ fd , err := os .Create (option .DstPath )
414
+ if err != nil {
415
+ option .ErrChan <- err
416
+ return nil
417
+ }
418
+
419
+ defer fd .Close ()
420
+
421
+ var writer * bufio.Writer
422
+ if option .DstGzip {
423
+ gzWriter := gzip .NewWriter (fd )
424
+ defer gzWriter .Close ()
425
+
426
+ writer = bufio .NewWriter (gzWriter )
427
+ } else {
428
+ writer = bufio .NewWriter (fd )
429
+ }
430
+
431
+ loop:
432
+ for {
433
+ select {
434
+ case <- option .CTX .Done ():
435
+ return nil
436
+ case line , ok := <- writerChan :
437
+ // chan closed
438
+ if ! ok {
439
+ break loop
440
+ }
441
+
442
+ if _ , err := writer .Write (append (* line , '\n' )); err != nil {
443
+ option .ErrChan <- err
444
+ continue
445
+ }
446
+
447
+ writer .Flush ()
448
+ }
449
+ }
450
+
451
+ wg .Wait ()
452
+ return nil
453
+ }
0 commit comments