diff --git a/README.md b/README.md index 37db5b3..eedd5f5 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ go get -u github.com/askeladdk/fastcdc ## Quickstart -The package provides `Copy` and `CopyBuffer` functions modeled after the `io` package with identical signatures. The difference is that these Copy functions copy in content-defined chunks instead of fixed-size chunks. Chunks are sized between 32KB and 128KB with an average of about 64KB. +The package provides `Copy` and `CopyBuffer` functions modeled after the `io` package with identical signatures. The difference is that these Copy functions copy in content-defined chunks instead of fixed-size chunks. Chunks are sized between 8KB and 32KB with an average of about 16KB. Use `Copy` to copy data from a `io.Reader` to an `io.Writer` in content-defined chunks. @@ -23,17 +23,33 @@ Use `Copy` to copy data from a `io.Reader` to an `io.Writer` in content-defined n, err := fastcdc.Copy(w, r) ``` -Use `CopyBuffer` to pass a buffer. The buffer size should be 128KB or larger for best results, although it can be smaller. `Copy` allocates a buffer of 256KB. A larger buffer may provide a performance boost by reducing the number of reads. +Use `CopyBuffer` to pass a buffer. The buffer size should be 64KB or larger for best results, although it can be smaller. `Copy` allocates a buffer of 64KB. A larger buffer may provide a performance boost by reducing the number of reads. ```go n, err := fastcdc.CopyBuffer(w, r, make([]byte, 256 << 10)) ``` +Use `Chunker` to customize the parameters: + +```go +chunker := fastcdc.Chunker{ + MinSize: 1 << 20, + AvgSize: 2 << 20, + MaxSize: 4 << 20, + Norm: 2, +} + +buf := make([]byte, 2*chunker.MaxSize) +n, err := chunker.CopyBuffer(dst, src, buf) +``` + Read the rest of the [documentation on pkg.go.dev](https://godoc.org/github.com/askeladdk/fastcdc). It's easy-peasy! 
## Performance -Unscientific benchmarks suggest that this implementation is about 5-10% slower than the fastest implementation (PlakarLabs). As far as I can tell the performance difference is caused by PlakarLabs producing smaller chunks on average which means that it spends less time in the inner loop. Whether that makes it better or worse for deduplication purposes is unclear. However, this implementation makes zero allocations and has the simplest implementation, being less than 100 lines of code including comments. +Unscientific benchmarks suggest that this implementation is about as fast as Tigerwill90 but produces larger chunks. This is due to Tigerwill90's slightly different fingerprint calculation (they shift right instead of left). PlakarLabs has much higher performance but this is because it produces smaller chunks, meaning that it spends less time in the inner loop. + +Unlike the others, this implementation makes zero allocations and has the fewest lines of code. ```sh % cd _bench_test @@ -42,15 +58,15 @@ goos: darwin goarch: amd64 pkg: bench_test cpu: Intel(R) Core(TM) i5-5287U CPU @ 2.90GHz -BenchmarkAskeladdk-4 20 59166260 ns/op 2268.48 MB/s 54142 avgsz 2479 chunks 52430 B/op 0 allocs/op -BenchmarkTigerwill90-4 15 98254349 ns/op 1366.02 MB/s 66477 avgsz 2019 chunks 17536 B/op 1 allocs/op -BenchmarkJotFS-4 10 111913617 ns/op 1199.30 MB/s 76828 avgsz 1747 chunks 262256 B/op 2 allocs/op -BenchmarkPlakarLabs-4 19 53045331 ns/op 2530.25 MB/s 47679 avgsz 2815 chunks 262272 B/op 4 allocs/op +BenchmarkAskeladdk-4 14 78664269 ns/op 1706.21 MB/s 2485513 avgsz 54.00 chunks 599188 B/op 0 allocs/op +BenchmarkTigerwill90-4 13 77380696 ns/op 1734.51 MB/s 2064888 avgsz 65.00 chunks 645339 B/op 1 allocs/op +BenchmarkJotFS-4 10 103483790 ns/op 1296.99 MB/s 2396745 avgsz 56.00 chunks 8388720 B/op 2 allocs/op +BenchmarkPlakarLabs-4 31 36523149 ns/op 3674.87 MB/s 1065220 avgsz 126.0 chunks 8388736 B/op 4 allocs/op PASS -ok bench_test 6.947s +ok bench_test 5.136s ``` 
-More benchmarks: +More unscientific benchmarks: ```sh % go test -run=^$ -bench ^Benchmark$ @@ -58,19 +74,19 @@ goos: darwin goarch: amd64 pkg: github.com/askeladdk/fastcdc cpu: Intel(R) Core(TM) i5-5287U CPU @ 2.90GHz -Benchmark/1KB-4 19108564 60.02 ns/op 17059.60 MB/s -Benchmark/4KB-4 12981624 89.85 ns/op 45589.12 MB/s -Benchmark/16KB-4 3305914 357.1 ns/op 45876.47 MB/s -Benchmark/64KB-4 41148 29139 ns/op 2249.09 MB/s -Benchmark/256KB-4 10000 113107 ns/op 2317.66 MB/s -Benchmark/1MB-4 2394 462801 ns/op 2265.72 MB/s -Benchmark/4MB-4 636 1805544 ns/op 2323.01 MB/s -Benchmark/16MB-4 165 7189987 ns/op 2333.41 MB/s -Benchmark/64MB-4 38 29806177 ns/op 2251.51 MB/s -Benchmark/256MB-4 9 120255293 ns/op 2232.21 MB/s -Benchmark/1GB-4 3 479891694 ns/op 2237.47 MB/s +Benchmark/1KB-4 8513276 120.5 ns/op 8497.58 MB/s +Benchmark/4KB-4 6978042 153.9 ns/op 26619.10 MB/s +Benchmark/16KB-4 166795 7117 ns/op 2302.14 MB/s +Benchmark/64KB-4 53578 22183 ns/op 2954.29 MB/s +Benchmark/256KB-4 9573 122433 ns/op 2141.11 MB/s +Benchmark/1MB-4 2134 521845 ns/op 2009.36 MB/s +Benchmark/4MB-4 534 2116966 ns/op 1981.28 MB/s +Benchmark/16MB-4 140 8525421 ns/op 1967.90 MB/s +Benchmark/64MB-4 33 34171293 ns/op 1963.90 MB/s +Benchmark/256MB-4 8 135296222 ns/op 1984.06 MB/s +Benchmark/1GB-4 2 548831781 ns/op 1956.41 MB/s PASS -ok github.com/askeladdk/fastcdc 28.965s +ok github.com/askeladdk/fastcdc 22.673s ``` ## License diff --git a/_bench_test/bench_test.go b/_bench_test/bench_test.go index a2872ff..263ec74 100644 --- a/_bench_test/bench_test.go +++ b/_bench_test/bench_test.go @@ -14,9 +14,9 @@ import ( ) const ( - minsize = 32 << 10 - avgsize = 64 << 10 - maxsize = 128 << 10 + minsize = 1 << 20 + avgsize = 2 << 20 + maxsize = 4 << 20 norm = 2 datalen = 128 << 20 ) @@ -30,17 +30,24 @@ func (fn writerFunc) Write(p []byte) (int, error) { var rb, _ = io.ReadAll(io.LimitReader(rand.New(rand.NewSource(0)), datalen)) func BenchmarkAskeladdk(b *testing.B) { + c := askeladdk.Chunker{ + MinSize: minsize, + 
AvgSize: avgsize, + MaxSize: maxsize, + Norm: norm, + } + r := bytes.NewReader(rb) b.SetBytes(int64(r.Len())) b.ResetTimer() - buf := make([]byte, 1<<20) + buf := make([]byte, maxsize<<1) nchunks := 0 w := writerFunc(func(p []byte) (int, error) { nchunks++ return len(p), nil }) for i := 0; i < b.N; i++ { - _, _ = askeladdk.CopyBuffer(w, r, buf) + _, _ = c.CopyBuffer(w, r, buf) r.Reset(rb) } b.ReportMetric(float64(nchunks)/float64(b.N), "chunks") diff --git a/_bench_test/go.mod b/_bench_test/go.mod index e0f1781..c68418e 100644 --- a/_bench_test/go.mod +++ b/_bench_test/go.mod @@ -1,6 +1,8 @@ module bench_test -go 1.17 +go 1.21 + +toolchain go1.21.1 require ( github.com/PlakarLabs/go-fastcdc v0.5.0 diff --git a/fastcdc.go b/fastcdc.go index 2970cc3..a24a139 100644 --- a/fastcdc.go +++ b/fastcdc.go @@ -7,39 +7,59 @@ package fastcdc import ( "io" + "math" ) -const ( - minsize = 32 << 10 - avgsize = 64 << 10 - maxsize = 128 << 10 - bufsize = maxsize << 1 - maskL = 0x0000d90003530000 - maskS = 0x0003590703530000 -) +// Chunker is a configurable content defined chunker. +type Chunker struct { + // MinSize is the minimum chunk size in bytes. + MinSize int + // AvgSize is the average chunk size in bytes. + AvgSize int + // MaxSize is the maximum chunk size in bytes. + MaxSize int + // Norm is the normalization factor. Set to zero to disable normalization. + Norm int +} -func min(a, b int) int { - if a < b { - return a +// Copy copies from src to dst in content-defined chunk sizes. +// A successful Copy returns err == nil. +func (c Chunker) Copy(dst io.Writer, src io.Reader) (n int64, err error) { + return c.copyBuffer(dst, src, make([]byte, c.MaxSize<<1)) +} + +// CopyBuffer is identical to Copy except that it stages through the +// provided buffer rather than allocating a temporary one. 
+func (c Chunker) CopyBuffer(dst io.Writer, src io.Reader, buf []byte) (n int64, err error) { + if buf == nil { + buf = make([]byte, c.MaxSize<<1) + } else if len(buf) == 0 { + panic("fastcdc: empty buffer in CopyBuffer") } - return b + return c.copyBuffer(dst, src, buf) } -func copyBuffer(dst io.Writer, src io.Reader, buf []byte) (n int64, err error) { +func (c Chunker) copyBuffer(dst io.Writer, src io.Reader, buf []byte) (n int64, err error) { + bits := int(math.Floor(math.Log2(float64(c.AvgSize)))) + maskS := uint64(1)< 0 || err == nil { - i, fp := min(head, tail+minsize), uint64(0) + i := min(head, tail+c.MinSize) + fp := uint64(0) - for end := min(head, tail+avgsize); i < end; i++ { - if fp = fp<<1 + gear[buf[i]]; fp&maskS == 0 { + for m, j := maskS, min(head, tail+c.AvgSize); i < j; i++ { + if fp = fp<<1 + gear[buf[i]]; fp&m == 0 { goto emitchunk } } - for end := min(head, tail+maxsize); i < end; i++ { - if fp = fp<<1 + gear[buf[i]]; fp&maskL == 0 { + for m, j := maskL, min(head, tail+c.MaxSize); i < j; i++ { + if fp = fp<<1 + gear[buf[i]]; fp&m == 0 { break } } @@ -51,7 +71,7 @@ func copyBuffer(dst io.Writer, src io.Reader, buf []byte) (n int64, err error) { n, tail = n+int64(i-tail), i - if unread := head - tail; unread < maxsize { + if unread := head - tail; unread < c.MaxSize { copy(buf, buf[tail:head]) var k int if err != io.EOF { @@ -68,6 +88,13 @@ func copyBuffer(dst io.Writer, src io.Reader, buf []byte) (n int64, err error) { return n, err } +var defaultChunker = Chunker{ + MinSize: 8 << 10, + AvgSize: 16 << 10, + MaxSize: 32 << 10, + Norm: 2, +} + // Copy copies from src to dst in content-defined chunk sizes, // as opposed to io.Copy which copies in fixed-sized chunks. // @@ -79,7 +106,7 @@ func copyBuffer(dst io.Writer, src io.Reader, buf []byte) (n int64, err error) { // Because Copy is defined to read from src until EOF, it does // not treat an EOF from Read as an error to be reported. 
func Copy(dst io.Writer, src io.Reader) (n int64, err error) { - return copyBuffer(dst, src, make([]byte, bufsize)) + return defaultChunker.Copy(dst, src) } // CopyBuffer is identical to Copy except that it stages through the @@ -87,10 +114,10 @@ func Copy(dst io.Writer, src io.Reader) (n int64, err error) { // If buf is nil, one is allocated; otherwise if it has // zero length, CopyBuffer panics. func CopyBuffer(dst io.Writer, src io.Reader, buf []byte) (n int64, err error) { - if buf == nil { - buf = make([]byte, bufsize) - } else if len(buf) == 0 { - panic("fastcdc: empty buffer in CopyBuffer") - } - return copyBuffer(dst, src, buf) + return defaultChunker.CopyBuffer(dst, src, buf) +} + +// DefaultChunker returns the chunker used by [Copy] and [CopyBuffer]. +func DefaultChunker() Chunker { + return defaultChunker } diff --git a/fastcdc_test.go b/fastcdc_test.go index ff4a99f..6f62e1a 100644 --- a/fastcdc_test.go +++ b/fastcdc_test.go @@ -34,10 +34,8 @@ func TestCopyErrReader(t *testing.T) { } func TestCopyRobustness(t *testing.T) { - data := make([]byte, (1<<20)-1) rnd := rand.New(rand.NewSource(0)) - _, _ = io.ReadFull(rnd, data) - + data, _ := io.ReadAll(io.LimitReader(rnd, (1<<20)-1)) buf := make([]byte, 128<<10) for _, testCase := range []struct { @@ -58,6 +56,10 @@ func TestCopyRobustness(t *testing.T) { } func Benchmark(b *testing.B) { + rnd := rand.New(rand.NewSource(0)) + data, _ := io.ReadAll(io.LimitReader(rnd, int64(1<<30))) + buf := make([]byte, 256<<10) + for _, x := range []struct { Size int Name string @@ -76,14 +78,11 @@ func Benchmark(b *testing.B) { } { x := x b.Run(x.Name, func(b *testing.B) { - buf := make([]byte, bufsize) - rnd := rand.New(rand.NewSource(0)) - data, _ := io.ReadAll(io.LimitReader(rnd, int64(x.Size))) - r := bytes.NewReader(data) + r := bytes.NewReader(data[:x.Size]) b.ResetTimer() b.SetBytes(int64(x.Size)) for i := 0; i < b.N; i++ { - r.Reset(data) + r.Reset(data[:x.Size]) _, _ = CopyBuffer(io.Discard, r, buf) } }) diff 
--git a/go.mod b/go.mod index e6929ee..4e3840d 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ module github.com/askeladdk/fastcdc -go 1.17 +go 1.21