Use 64-bit gear and put normalized chunking back
askeladdk committed Sep 24, 2023
1 parent 971bec9 commit f4db658
Showing 5 changed files with 120 additions and 91 deletions.
README.md: 43 changes (32 additions, 11 deletions)
@@ -15,27 +15,25 @@ go get -u github.com/askeladdk/fastcdc

## Quickstart

The package provides `Copy` and `CopyBuffer` functions modeled after the `io` package with identical signatures. The difference is that these Copy functions copy in content-defined chunks instead of fixed-size chunks.
The package provides `Copy` and `CopyBuffer` functions modeled after the `io` package with identical signatures. The difference is that these Copy functions copy in content-defined chunks instead of fixed-size chunks. Chunks are sized between 32KB and 128KB with an average of about 64KB.

Use `Copy` to copy data from an `io.Reader` to an `io.Writer` in content-defined chunks.

```go
n, err := fastcdc.Copy(w, r)
```
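
Each call to the destination's `Write` receives exactly one chunk, so chunk boundaries can be observed with a small adapter in the style of the `writerFunc` helper from `_bench_test/bench_test.go` further down. A minimal sketch, not part of this commit:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/askeladdk/fastcdc"
)

// writerFunc adapts a plain function to io.Writer.
type writerFunc func(p []byte) (int, error)

func (f writerFunc) Write(p []byte) (int, error) { return f(p) }

func main() {
	data := bytes.Repeat([]byte("some sample data"), 1<<16) // 1MB of input

	var chunks int
	n, err := fastcdc.Copy(writerFunc(func(p []byte) (int, error) {
		chunks++ // one Write call per content-defined chunk
		return len(p), nil
	}), bytes.NewReader(data))

	fmt.Println(n, chunks, err)
}
```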

Use `CopyBuffer` to pass a buffer. The buffer size should be 64KB or larger for best results, although it can be smaller. `Copy` allocates a buffer of 128KB.
Use `CopyBuffer` to pass a buffer. The buffer size should be 128KB or larger for best results, although it can be smaller. `Copy` allocates a buffer of 256KB. A larger buffer may provide a performance boost by reducing the number of reads.

```go
n, err := fastcdc.CopyBuffer(w, r, make([]byte, 128 << 10))
n, err := fastcdc.CopyBuffer(w, r, make([]byte, 256 << 10))
```

Unlike other implementations, it is not possible to tweak the parameters. This is not needed because there is a sweet spot of practical chunk sizes that enables efficient deduplication: too small reduces performance due to overhead, and too large reduces deduplication due to overly coarse chunks. Hence, chunks are sized between 2KB and 64KB with an average of about 10KB (2KB + 8KB). The final chunk can be smaller than 2KB. Normalized chunking as described in the paper is not implemented because it does not appear to improve deduplication.

Read the rest of the [documentation on pkg.go.dev](https://godoc.org/github.com/askeladdk/fastcdc). It's easy-peasy!

## Performance

Unscientific benchmarks on go1.21.1 suggest that this implementation is roughly 15% faster than the next best implementation and is the only one that makes zero allocations. Our implementation is also much simpler than the others, being less than 100 lines of code including comments. The number of generated chunks is roughly the same.
Unscientific benchmarks suggest that this implementation is about 5–10% slower than the fastest implementation (PlakarLabs). As far as I can tell, the difference comes from PlakarLabs producing smaller chunks on average, which means it spends less time in the inner loop (every chunk begins with a minimum-size stretch that is skipped without hashing, so smaller chunks mean fewer hashed bytes). Whether that makes it better or worse for deduplication purposes is unclear. However, this implementation makes zero allocations and is the simplest, at less than 100 lines of code including comments.

```sh
% cd _bench_test
@@ -44,12 +42,35 @@ goos: darwin
goarch: amd64
pkg: bench_test
cpu: Intel(R) Core(TM) i5-5287U CPU @ 2.90GHz
BenchmarkAskeladdk-4 14 96068850 ns/op 1397.10 MB/s 13014 chunks 9364 B/op 0 allocs/op
BenchmarkTigerwill90-4 10 100792045 ns/op 1331.63 MB/s 16027 chunks 13172 B/op 1 allocs/op
BenchmarkJotFS-4 9 128299966 ns/op 1046.12 MB/s 14651 chunks 131184 B/op 2 allocs/op
BenchmarkPlakarLabs-4 12 97721841 ns/op 1373.47 MB/s 15406 chunks 131200 B/op 4 allocs/op
BenchmarkAskeladdk-4 20 59166260 ns/op 2268.48 MB/s 54142 avgsz 2479 chunks 52430 B/op 0 allocs/op
BenchmarkTigerwill90-4 15 98254349 ns/op 1366.02 MB/s 66477 avgsz 2019 chunks 17536 B/op 1 allocs/op
BenchmarkJotFS-4 10 111913617 ns/op 1199.30 MB/s 76828 avgsz 1747 chunks 262256 B/op 2 allocs/op
BenchmarkPlakarLabs-4 19 53045331 ns/op 2530.25 MB/s 47679 avgsz 2815 chunks 262272 B/op 4 allocs/op
PASS
ok bench_test 6.947s
ok bench_test 7.703s
```

More benchmarks:

```sh
% go test -run=^$ -bench ^Benchmark$
goos: darwin
goarch: amd64
pkg: github.com/askeladdk/fastcdc
cpu: Intel(R) Core(TM) i5-5287U CPU @ 2.90GHz
Benchmark/1KB-4 19108564 60.02 ns/op 17059.60 MB/s
Benchmark/4KB-4 12981624 89.85 ns/op 45589.12 MB/s
Benchmark/16KB-4 3305914 357.1 ns/op 45876.47 MB/s
Benchmark/64KB-4 41148 29139 ns/op 2249.09 MB/s
Benchmark/256KB-4 10000 113107 ns/op 2317.66 MB/s
Benchmark/1MB-4 2394 462801 ns/op 2265.72 MB/s
Benchmark/4MB-4 636 1805544 ns/op 2323.01 MB/s
Benchmark/16MB-4 165 7189987 ns/op 2333.41 MB/s
Benchmark/64MB-4 38 29806177 ns/op 2251.51 MB/s
Benchmark/256MB-4 9 120255293 ns/op 2232.21 MB/s
Benchmark/1GB-4 3 479891694 ns/op 2237.47 MB/s
PASS
ok github.com/askeladdk/fastcdc 28.965s
```
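
An editor's note on the shape of these numbers: the order-of-magnitude drop between the 16KB and 64KB rows is expected. With the new parameters, any input smaller than `minsize` (32KB) is emitted as a single chunk without a single gear-hash step, so the small-input rows measure little more than a buffered copy.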

## License
Expand Down
_bench_test/bench_test.go: 10 changes (5 additions, 5 deletions)
@@ -14,10 +14,10 @@ import (
)

const (
minsize = 2 << 10
avgsize = 8 << 10
maxsize = 64 << 10
norm = 0
minsize = 32 << 10
avgsize = 64 << 10
maxsize = 128 << 10
norm = 2
datalen = 128 << 20
)
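
A quick consistency check on the numbers (not part of the diff): the benchmark chunks `datalen` = 128 << 20 = 134,217,728 bytes, and the updated README table above reports 2479 chunks with a 54142-byte average for this package; 134,217,728 / 2479 ≈ 54,142, so the avgsz and chunks columns agree.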

@@ -33,7 +33,7 @@ func BenchmarkAskeladdk(b *testing.B) {
r := bytes.NewReader(rb)
b.SetBytes(int64(r.Len()))
b.ResetTimer()
buf := make([]byte, maxsize<<1)
buf := make([]byte, 1<<20)
nchunks := 0
w := writerFunc(func(p []byte) (int, error) {
nchunks++
_gen/gear.go: 6 changes (3 additions, 3 deletions)
@@ -22,13 +22,13 @@ func main() {
fmt.Fprintln(f, "// Code generated with _gen/gear.go. DO NOT EDIT.\n")
fmt.Fprintln(f, "package fastcdc\n")
fmt.Fprintf(f, "// gear is generated from rand.New(rand.NewSource(%d)).\n", *seed)
fmt.Fprintln(f, "var gear [256]uint16 = [...]uint16{")
fmt.Fprintln(f, "var gear [256]uint64 = [...]uint64{")
for i := 0; i < 64; i++ {
fmt.Fprintf(f, "\t")
for j := 0; j < 3; j++ {
fmt.Fprintf(f, "0x%04x, ", rnd.Uint64()>>48)
fmt.Fprintf(f, "0x%016x, ", rnd.Uint64())
}
fmt.Fprintf(f, "0x%04x,\n", rnd.Uint64()>>48)
fmt.Fprintf(f, "0x%016x,\n", rnd.Uint64())
}
fmt.Fprintln(f, "}")
}
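
For orientation (an editor's note, not part of the commit): the table feeds the rolling fingerprint in `fastcdc.go`, one shift and one lookup per byte. The previous `uint16` entries could never set the bits above position 15 that the new 64-bit masks test, which is why the gear table and the fingerprint widen together. A sketch of the update step, assuming the package's `gear` table is in scope:

```go
// update advances the rolling gear fingerprint by one byte. With
// 64-bit gear entries, every byte immediately contributes entropy to
// the upper bit positions where maskS and maskL have their set bits;
// a 16-bit fingerprint has no such bits to test.
func update(fp uint64, b byte) uint64 {
	return fp<<1 + gear[b]
}
```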
fastcdc.go: 22 changes (15 additions, 7 deletions)
@@ -10,11 +10,12 @@ import (
)

const (
minsize = 2 << 10
avgsize = 8 << 10
maxsize = 64 << 10
minsize = 32 << 10
avgsize = 64 << 10
maxsize = 128 << 10
bufsize = maxsize << 1
maskL = avgsize - 1
maskL = 0x0000d90003530000
maskS = 0x0003590703530000
)
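
An editor's note on the two new masks: `maskS` has 15 set bits and `maskL` has 11, all above bit 15 (hence the need for a 64-bit fingerprint). A cut point before `avgsize` requires a rare 15-bit match, while a cut point after `avgsize` needs only an 11-bit match, which pulls chunk sizes toward the average. This two-mask scheme is the normalized chunking the commit message refers to.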

func min(a, b int) int {
@@ -29,14 +30,21 @@ func copyBuffer(dst io.Writer, src io.Reader, buf []byte) (n int64, err error) {
head, err := io.ReadFull(src, buf)

for head > 0 || err == nil {
i, fp := min(head, tail+minsize), uint16(0)
i, fp := min(head, tail+minsize), uint64(0)

for end := min(head, tail+avgsize); i < end; i++ {
if fp = fp<<1 + gear[buf[i]]; fp&maskS == 0 {
goto emitchunk
}
}

for end := min(head, tail+maxsize); i < end; i++ {
if fp = (fp << 1) + gear[buf[i]]; fp&maskL == 0 {
if fp = fp<<1 + gear[buf[i]]; fp&maskL == 0 {
break
}
}

emitchunk:
if x, err := dst.Write(buf[tail:i]); err != nil {
return n + int64(x), err
}
@@ -82,7 +90,7 @@ func CopyBuffer(dst io.Writer, src io.Reader, buf []byte) (n int64, err error) {
if buf == nil {
buf = make([]byte, bufsize)
} else if len(buf) == 0 {
panic("empty buffer in CopyBuffer")
panic("fastcdc: empty buffer in CopyBuffer")
}
return copyBuffer(dst, src, buf)
}
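
Taken together, the new loops implement the following decision procedure. A standalone sketch of the same logic, assuming the package's constants, `min` helper, and `gear` table (the `cutPoint` name is hypothetical; the package inlines this into `copyBuffer`):

```go
// cutPoint returns the exclusive end index of the next chunk in buf,
// mirroring the two-phase scan above: skip the first minsize bytes,
// scan with the strict maskS up to avgsize, then with the loose maskL
// up to maxsize, and fall back to maxsize (or end of input) if
// neither mask matches.
func cutPoint(buf []byte) int {
	fp := uint64(0)
	i := min(len(buf), minsize)
	for end := min(len(buf), avgsize); i < end; i++ {
		if fp = fp<<1 + gear[buf[i]]; fp&maskS == 0 {
			return i
		}
	}
	for end := min(len(buf), maxsize); i < end; i++ {
		if fp = fp<<1 + gear[buf[i]]; fp&maskL == 0 {
			return i
		}
	}
	return i
}
```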
gear.go: 130 changes (65 additions, 65 deletions)

Some generated files are not rendered by default.
