diff --git a/http-benchmark/fasthttp/go.mod b/http-benchmark/fasthttp/go.mod
index d1e525f1..ce9a5d97 100644
--- a/http-benchmark/fasthttp/go.mod
+++ b/http-benchmark/fasthttp/go.mod
@@ -2,4 +2,4 @@ module github.com/bigwhite/benchmark-http
 
 go 1.16
 
-require github.com/valyala/fasthttp v1.23.0
+require github.com/valyala/fasthttp v1.34.0
diff --git a/http-benchmark/fasthttp/go.sum b/http-benchmark/fasthttp/go.sum
index bdb25b52..f63e7c9d 100644
--- a/http-benchmark/fasthttp/go.sum
+++ b/http-benchmark/fasthttp/go.sum
@@ -1,23 +1,22 @@
-github.com/andybalholm/brotli v1.0.1 h1:KqhlKozYbRtJvsPrrEeXcO+N2l6NYT5A2QAFmSULpEc=
-github.com/andybalholm/brotli v1.0.1/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y=
-github.com/klauspost/compress v1.11.8 h1:difgzQsp5mdAz9v8lm3P/I+EpDKMU/6uTMw1y1FObuo=
-github.com/klauspost/compress v1.11.8/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
+github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
+github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
+github.com/klauspost/compress v1.15.0 h1:xqfchp4whNFxn5A4XFyyYtitiWI8Hy5EW59jEwcyL6U=
+github.com/klauspost/compress v1.15.0/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
 github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
 github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
-github.com/valyala/fasthttp v1.23.0 h1:0ufwSD9BhWa6f8HWdmdq4FHQ23peRo3Ng/Qs8m5NcFs=
-github.com/valyala/fasthttp v1.23.0/go.mod h1:0mw2RjXGOzxf4NL2jni3gUQ7LfjjUSiG5sskOUUSEpU=
-github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio=
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
-golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I=
-golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/net v0.0.0-20210226101413-39120d07d75e/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
-golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+github.com/valyala/fasthttp v1.34.0 h1:d3AAQJ2DRcxJYHm7OXNXtXt2as1vMDfxeIcFvhmGGm4=
+github.com/valyala/fasthttp v1.34.0/go.mod h1:epZA5N+7pY6ZaEKRmstzOuYJx9HI8DI1oaCGZpdH4h0=
+github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
+golang.org/x/crypto v0.0.0-20220214200702-86341886e292/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
+golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210225134936-a50acf3fe073/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220227234510-4e6760a101f9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go
index 2470f84e..7acfb180 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go
@@ -121,7 +121,7 @@ func encodeMlen(length uint, bits *uint64, numbits *uint, nibblesbits *uint64) {
 	*bits = uint64(length) - 1
 }
 
-func storeCommandExtra(cmd *command, bw *bitWriter) {
+func storeCommandExtra(cmd *command, storage_ix *uint, storage []byte) {
 	var copylen_code uint32 = commandCopyLenCode(cmd)
 	var inscode uint16 = getInsertLengthCode(uint(cmd.insert_len_))
 	var copycode uint16 = getCopyLengthCode(uint(copylen_code))
@@ -129,7 +129,7 @@ func storeCommandExtra(cmd *command, bw *bitWriter) {
 	var insextraval uint64 = uint64(cmd.insert_len_) - uint64(getInsertBase(inscode))
 	var copyextraval uint64 = uint64(copylen_code) - uint64(getCopyBase(copycode))
 	var bits uint64 = copyextraval<<insnumextra | insextraval
-	bw.writeBits(uint(insnumextra+getCopyExtra(copycode)), bits)
+	writeBits(uint(insnumextra+getCopyExtra(copycode)), bits, storage_ix, storage)
 }
 
 /* Data structure that stores almost everything that is needed to encode each
@@ -143,21 +143,21 @@ type blockSplitCode struct {
 }
 
 /* Stores a number between 0 and 255. */
-func storeVarLenUint8(n uint, bw *bitWriter) {
+func storeVarLenUint8(n uint, storage_ix *uint, storage []byte) {
 	if n == 0 {
-		bw.writeBits(1, 0)
+		writeBits(1, 0, storage_ix, storage)
 	} else {
 		var nbits uint = uint(log2FloorNonZero(n))
-		bw.writeBits(1, 1)
-		bw.writeBits(3, uint64(nbits))
-		bw.writeBits(nbits, uint64(n)-(uint64(uint(1))<<nbits))
+		writeBits(1, 1, storage_ix, storage)
+		writeBits(3, uint64(nbits), storage_ix, storage)
+		writeBits(nbits, uint64(n)-(uint64(uint(1))<<nbits), storage_ix, storage)
 	}
 }
 
 /* Stores the compressed meta-block header.
    REQUIRES: length > 0
    REQUIRES: length <= (1 << 24) */
-func storeCompressedMetaBlockHeader(is_final_block bool, length uint, bw *bitWriter) {
+func storeCompressedMetaBlockHeader(is_final_block bool, length uint, storage_ix *uint, storage []byte) {
 	var lenbits uint64
 	var nlenbits uint
 	var nibblesbits uint64
@@ -169,41 +169,41 @@ func storeCompressedMetaBlockHeader(is_final_block bool, length uint, bw *bitWri
 	}
 
 	/* Write ISLAST bit. */
-	bw.writeBits(1, is_final)
+	writeBits(1, is_final, storage_ix, storage)
 
 	/* Write ISEMPTY bit. */
 	if is_final_block {
-		bw.writeBits(1, 0)
+		writeBits(1, 0, storage_ix, storage)
 	}
 
 	encodeMlen(length, &lenbits, &nlenbits, &nibblesbits)
-	bw.writeBits(2, nibblesbits)
-	bw.writeBits(nlenbits, lenbits)
+	writeBits(2, nibblesbits, storage_ix, storage)
+	writeBits(nlenbits, lenbits, storage_ix, storage)
 
 	if !is_final_block {
 		/* Write ISUNCOMPRESSED bit. */
-		bw.writeBits(1, 0)
+		writeBits(1, 0, storage_ix, storage)
 	}
 }
 
 /* Stores the uncompressed meta-block header.
    REQUIRES: length > 0
    REQUIRES: length <= (1 << 24) */
-func storeUncompressedMetaBlockHeader(length uint, bw *bitWriter) {
+func storeUncompressedMetaBlockHeader(length uint, storage_ix *uint, storage []byte) {
 	var lenbits uint64
 	var nlenbits uint
 	var nibblesbits uint64
 
 	/* Write ISLAST bit.
 	   Uncompressed block cannot be the last one, so set to 0. */
-	bw.writeBits(1, 0)
+	writeBits(1, 0, storage_ix, storage)
 
 	encodeMlen(length, &lenbits, &nlenbits, &nibblesbits)
-	bw.writeBits(2, nibblesbits)
-	bw.writeBits(nlenbits, lenbits)
+	writeBits(2, nibblesbits, storage_ix, storage)
+	writeBits(nlenbits, lenbits, storage_ix, storage)
 
 	/* Write ISUNCOMPRESSED bit. */
-	bw.writeBits(1, 1)
+	writeBits(1, 1, storage_ix, storage)
 }
 
 var storeHuffmanTreeOfHuffmanTreeToBitMask_kStorageOrder = [codeLengthCodes]byte{1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15}
@@ -211,7 +211,7 @@ var storeHuffmanTreeOfHuffmanTreeToBitMask_kStorageOrder = [codeLengthCodes]byte
 var storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeSymbols = [6]byte{0, 7, 3, 2, 1, 15}
 var storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeBitLengths = [6]byte{2, 4, 3, 2, 2, 4}
 
-func storeHuffmanTreeOfHuffmanTreeToBitMask(num_codes int, code_length_bitdepth []byte, bw *bitWriter) {
+func storeHuffmanTreeOfHuffmanTreeToBitMask(num_codes int, code_length_bitdepth []byte, storage_ix *uint, storage []byte) {
 	var skip_some uint = 0
 	var codes_to_store uint = codeLengthCodes
 	/* The bit lengths of the Huffman code over the code length alphabet
@@ -241,38 +241,38 @@ func storeHuffmanTreeOfHuffmanTreeToBitMask(num_codes int, code_length_bitdepth
 		}
 	}
 
-	bw.writeBits(2, uint64(skip_some))
+	writeBits(2, uint64(skip_some), storage_ix, storage)
 	{
 		var i uint
 		for i = skip_some; i < codes_to_store; i++ {
 			var l uint = uint(code_length_bitdepth[storeHuffmanTreeOfHuffmanTreeToBitMask_kStorageOrder[i]])
-			bw.writeBits(uint(storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeBitLengths[l]), uint64(storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeSymbols[l]))
+			writeBits(uint(storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeBitLengths[l]), uint64(storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeSymbols[l]), storage_ix, storage)
 		}
 	}
 }
 
-func storeHuffmanTreeToBitMask(huffman_tree_size uint, huffman_tree []byte, huffman_tree_extra_bits []byte, code_length_bitdepth []byte, code_length_bitdepth_symbols []uint16, bw *bitWriter) {
+func storeHuffmanTreeToBitMask(huffman_tree_size uint, huffman_tree []byte, huffman_tree_extra_bits []byte, code_length_bitdepth []byte, code_length_bitdepth_symbols []uint16, storage_ix *uint, storage []byte) {
 	var i uint
 	for i = 0; i < huffman_tree_size; i++ {
 		var ix uint = uint(huffman_tree[i])
-		bw.writeBits(uint(code_length_bitdepth[ix]), uint64(code_length_bitdepth_symbols[ix]))
+		writeBits(uint(code_length_bitdepth[ix]), uint64(code_length_bitdepth_symbols[ix]), storage_ix, storage)
 
 		/* Extra bits */
 		switch ix {
 		case repeatPreviousCodeLength:
-			bw.writeBits(2, uint64(huffman_tree_extra_bits[i]))
+			writeBits(2, uint64(huffman_tree_extra_bits[i]), storage_ix, storage)
 
 		case repeatZeroCodeLength:
-			bw.writeBits(3, uint64(huffman_tree_extra_bits[i]))
+			writeBits(3, uint64(huffman_tree_extra_bits[i]), storage_ix, storage)
 		}
 	}
 }
 
-func storeSimpleHuffmanTree(depths []byte, symbols []uint, num_symbols uint, max_bits uint, bw *bitWriter) {
+func storeSimpleHuffmanTree(depths []byte, symbols []uint, num_symbols uint, max_bits uint, storage_ix *uint, storage []byte) {
 	/* value of 1 indicates a simple Huffman code */
-	bw.writeBits(2, 1)
+	writeBits(2, 1, storage_ix, storage)
 
-	bw.writeBits(2, uint64(num_symbols)-1) /* NSYM - 1 */
+	writeBits(2, uint64(num_symbols)-1, storage_ix, storage) /* NSYM - 1 */
 	{
 		/* Sort */
 		var i uint
@@ -289,17 +289,17 @@ func storeSimpleHuffmanTree(depths []byte, symbols []uint, num_symbols uint, max
 	}
 
 	if num_symbols == 2 {
-		bw.writeBits(max_bits, uint64(symbols[0]))
-		bw.writeBits(max_bits, uint64(symbols[1]))
+		writeBits(max_bits, uint64(symbols[0]), storage_ix, storage)
+		writeBits(max_bits, uint64(symbols[1]), storage_ix, storage)
 	} else if num_symbols == 3 {
-		bw.writeBits(max_bits, uint64(symbols[0]))
-		bw.writeBits(max_bits, uint64(symbols[1]))
-		bw.writeBits(max_bits, uint64(symbols[2]))
+		writeBits(max_bits, uint64(symbols[0]), storage_ix, storage)
+		writeBits(max_bits, uint64(symbols[1]), storage_ix, storage)
+		writeBits(max_bits, uint64(symbols[2]), storage_ix, storage)
 	} else {
-		bw.writeBits(max_bits, uint64(symbols[0]))
-		bw.writeBits(max_bits, uint64(symbols[1]))
-		bw.writeBits(max_bits, uint64(symbols[2]))
-		bw.writeBits(max_bits, uint64(symbols[3]))
+		writeBits(max_bits, uint64(symbols[0]), storage_ix, storage)
+		writeBits(max_bits, uint64(symbols[1]), storage_ix, storage)
+		writeBits(max_bits, uint64(symbols[2]), storage_ix, storage)
+		writeBits(max_bits, uint64(symbols[3]), storage_ix, storage)
 
 		/* tree-select */
 		var tmp int
@@ -308,13 +308,13 @@ func storeSimpleHuffmanTree(depths []byte, symbols []uint, num_symbols uint, max
 		} else {
 			tmp = 0
 		}
-		bw.writeBits(1, uint64(tmp))
+		writeBits(1, uint64(tmp), storage_ix, storage)
 	}
 }
 
 /* num = alphabet size
    depths = symbol depths */
-func storeHuffmanTree(depths []byte, num uint, tree []huffmanTree, bw *bitWriter) {
+func storeHuffmanTree(depths []byte, num uint, tree []huffmanTree, storage_ix *uint, storage []byte) {
 	var huffman_tree [numCommandSymbols]byte
 	var huffman_tree_extra_bits [numCommandSymbols]byte
 	var huffman_tree_size uint = 0
@@ -357,19 +357,19 @@ func storeHuffmanTree(depths []byte, num uint, tree []huffmanTree, bw *bitWriter
 	convertBitDepthsToSymbols(code_length_bitdepth[:], codeLengthCodes, code_length_bitdepth_symbols[:])
 
 	/* Now, we have all the data, let's start storing it */
-	storeHuffmanTreeOfHuffmanTreeToBitMask(num_codes, code_length_bitdepth[:], bw)
+	storeHuffmanTreeOfHuffmanTreeToBitMask(num_codes, code_length_bitdepth[:], storage_ix, storage)
 
 	if num_codes == 1 {
 		code_length_bitdepth[code] = 0
 	}
 
 	/* Store the real Huffman tree now. */
-	storeHuffmanTreeToBitMask(huffman_tree_size, huffman_tree[:], huffman_tree_extra_bits[:], code_length_bitdepth[:], code_length_bitdepth_symbols[:], bw)
+	storeHuffmanTreeToBitMask(huffman_tree_size, huffman_tree[:], huffman_tree_extra_bits[:], code_length_bitdepth[:], code_length_bitdepth_symbols[:], storage_ix, storage)
 }
 
 /* Builds a Huffman tree from histogram[0:length] into depth[0:length] and
    bits[0:length] and stores the encoded tree to the bit stream. */
-func buildAndStoreHuffmanTree(histogram []uint32, histogram_length uint, alphabet_size uint, tree []huffmanTree, depth []byte, bits []uint16, bw *bitWriter) {
+func buildAndStoreHuffmanTree(histogram []uint32, histogram_length uint, alphabet_size uint, tree []huffmanTree, depth []byte, bits []uint16, storage_ix *uint, storage []byte) {
 	var count uint = 0
 	var s4 = [4]uint{0}
 	var i uint
@@ -394,8 +394,8 @@ func buildAndStoreHuffmanTree(histogram []uint32, histogram_length uint, alphabe
 	}
 
 	if count <= 1 {
-		bw.writeBits(4, 1)
-		bw.writeBits(max_bits, uint64(s4[0]))
+		writeBits(4, 1, storage_ix, storage)
+		writeBits(max_bits, uint64(s4[0]), storage_ix, storage)
 		depth[s4[0]] = 0
 		bits[s4[0]] = 0
 		return
@@ -408,9 +408,9 @@ func buildAndStoreHuffmanTree(histogram []uint32, histogram_length uint, alphabe
 	convertBitDepthsToSymbols(depth, histogram_length, bits)
 
 	if count <= 4 {
-		storeSimpleHuffmanTree(depth, s4[:], count, max_bits, bw)
+		storeSimpleHuffmanTree(depth, s4[:], count, max_bits, storage_ix, storage)
 	} else {
-		storeHuffmanTree(depth, histogram_length, tree, bw)
+		storeHuffmanTree(depth, histogram_length, tree, storage_ix, storage)
 	}
 }
 
@@ -420,7 +420,7 @@ func sortHuffmanTree1(v0 huffmanTree, v1 huffmanTree) bool {
 
 var huffmanTreePool sync.Pool
 
-func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_bits uint, depth []byte, bits []uint16, bw *bitWriter) {
+func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_bits uint, depth []byte, bits []uint16, storage_ix *uint, storage []byte) {
 	var count uint = 0
 	var symbols = [4]uint{0}
 	var length uint = 0
@@ -439,8 +439,8 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 	}
 
 	if count <= 1 {
-		bw.writeBits(4, 1)
-		bw.writeBits(max_bits, uint64(symbols[0]))
+		writeBits(4, 1, storage_ix, storage)
+		writeBits(max_bits, uint64(symbols[0]), storage_ix, storage)
 		depth[symbols[0]] = 0
 		bits[symbols[0]] = 0
 		return
@@ -544,9 +544,9 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 		var i uint
 
 		/* value of 1 indicates a simple Huffman code */
-		bw.writeBits(2, 1)
+		writeBits(2, 1, storage_ix, storage)
 
-		bw.writeBits(2, uint64(count)-1) /* NSYM - 1 */
+		writeBits(2, uint64(count)-1, storage_ix, storage) /* NSYM - 1 */
 
 		/* Sort */
 		for i = 0; i < count; i++ {
@@ -561,27 +561,33 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 		}
 
 		if count == 2 {
-			bw.writeBits(max_bits, uint64(symbols[0]))
-			bw.writeBits(max_bits, uint64(symbols[1]))
+			writeBits(max_bits, uint64(symbols[0]), storage_ix, storage)
+			writeBits(max_bits, uint64(symbols[1]), storage_ix, storage)
 		} else if count == 3 {
-			bw.writeBits(max_bits, uint64(symbols[0]))
-			bw.writeBits(max_bits, uint64(symbols[1]))
-			bw.writeBits(max_bits, uint64(symbols[2]))
+			writeBits(max_bits, uint64(symbols[0]), storage_ix, storage)
+			writeBits(max_bits, uint64(symbols[1]), storage_ix, storage)
+			writeBits(max_bits, uint64(symbols[2]), storage_ix, storage)
 		} else {
-			bw.writeBits(max_bits, uint64(symbols[0]))
-			bw.writeBits(max_bits, uint64(symbols[1]))
-			bw.writeBits(max_bits, uint64(symbols[2]))
-			bw.writeBits(max_bits, uint64(symbols[3]))
+			writeBits(max_bits, uint64(symbols[0]), storage_ix, storage)
+			writeBits(max_bits, uint64(symbols[1]), storage_ix, storage)
+			writeBits(max_bits, uint64(symbols[2]), storage_ix, storage)
+			writeBits(max_bits, uint64(symbols[3]), storage_ix, storage)
 
 			/* tree-select */
-			bw.writeSingleBit(depth[symbols[0]] == 1)
+			var tmp int
+			if depth[symbols[0]] == 1 {
+				tmp = 1
+			} else {
+				tmp = 0
+			}
+			writeBits(1, uint64(tmp), storage_ix, storage)
 		}
 	} else {
 		var previous_value byte = 8
 		var i uint
 
 		/* Complex Huffman Tree */
-		storeStaticCodeLengthCode(bw)
+		storeStaticCodeLengthCode(storage_ix, storage)
 
 		/* Actual RLE coding. */
 		for i = 0; i < length; {
@@ -594,21 +600,21 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 
 			i += reps
 			if value == 0 {
-				bw.writeBits(uint(kZeroRepsDepth[reps]), kZeroRepsBits[reps])
+				writeBits(uint(kZeroRepsDepth[reps]), kZeroRepsBits[reps], storage_ix, storage)
 			} else {
 				if previous_value != value {
-					bw.writeBits(uint(kCodeLengthDepth[value]), uint64(kCodeLengthBits[value]))
+					writeBits(uint(kCodeLengthDepth[value]), uint64(kCodeLengthBits[value]), storage_ix, storage)
 					reps--
 				}
 
 				if reps < 3 {
 					for reps != 0 {
 						reps--
-						bw.writeBits(uint(kCodeLengthDepth[value]), uint64(kCodeLengthBits[value]))
+						writeBits(uint(kCodeLengthDepth[value]), uint64(kCodeLengthBits[value]), storage_ix, storage)
 					}
 				} else {
 					reps -= 3
-					bw.writeBits(uint(kNonZeroRepsDepth[reps]), kNonZeroRepsBits[reps])
+					writeBits(uint(kNonZeroRepsDepth[reps]), kNonZeroRepsBits[reps], storage_ix, storage)
 				}
 
 				previous_value = value
@@ -733,7 +739,7 @@ const symbolBits = 9
 
 var encodeContextMap_kSymbolMask uint32 = (1 << symbolBits) - 1
 
-func encodeContextMap(context_map []uint32, context_map_size uint, num_clusters uint, tree []huffmanTree, bw *bitWriter) {
+func encodeContextMap(context_map []uint32, context_map_size uint, num_clusters uint, tree []huffmanTree, storage_ix *uint, storage []byte) {
 	var i uint
 	var rle_symbols []uint32
 	var max_run_length_prefix uint32 = 6
@@ -742,7 +748,7 @@ func encodeContextMap(context_map []uint32, context_map_size uint, num_clusters
 	var depths [maxContextMapSymbols]byte
 	var bits [maxContextMapSymbols]uint16
 
-	storeVarLenUint8(num_clusters-1, bw)
+	storeVarLenUint8(num_clusters-1, storage_ix, storage)
 
 	if num_clusters == 1 {
 		return
@@ -757,45 +763,45 @@ func encodeContextMap(context_map []uint32, context_map_size uint, num_clusters
 	}
 	{
 		var use_rle bool = (max_run_length_prefix > 0)
-		bw.writeSingleBit(use_rle)
+		writeSingleBit(use_rle, storage_ix, storage)
 		if use_rle {
-			bw.writeBits(4, uint64(max_run_length_prefix)-1)
+			writeBits(4, uint64(max_run_length_prefix)-1, storage_ix, storage)
 		}
 	}
 
-	buildAndStoreHuffmanTree(histogram[:], uint(uint32(num_clusters)+max_run_length_prefix), uint(uint32(num_clusters)+max_run_length_prefix), tree, depths[:], bits[:], bw)
+	buildAndStoreHuffmanTree(histogram[:], uint(uint32(num_clusters)+max_run_length_prefix), uint(uint32(num_clusters)+max_run_length_prefix), tree, depths[:], bits[:], storage_ix, storage)
 	for i = 0; i < num_rle_symbols; i++ {
 		var rle_symbol uint32 = rle_symbols[i] & encodeContextMap_kSymbolMask
 		var extra_bits_val uint32 = rle_symbols[i] >> symbolBits
-		bw.writeBits(uint(depths[rle_symbol]), uint64(bits[rle_symbol]))
+		writeBits(uint(depths[rle_symbol]), uint64(bits[rle_symbol]), storage_ix, storage)
 		if rle_symbol > 0 && rle_symbol <= max_run_length_prefix {
-			bw.writeBits(uint(rle_symbol), uint64(extra_bits_val))
+			writeBits(uint(rle_symbol), uint64(extra_bits_val), storage_ix, storage)
 		}
 	}
 
-	bw.writeBits(1, 1) /* use move-to-front */
+	writeBits(1, 1, storage_ix, storage) /* use move-to-front */
 	rle_symbols = nil
 }
 
 /* Stores the block switch command with index block_ix to the bit stream. */
-func storeBlockSwitch(code *blockSplitCode, block_len uint32, block_type byte, is_first_block bool, bw *bitWriter) {
+func storeBlockSwitch(code *blockSplitCode, block_len uint32, block_type byte, is_first_block bool, storage_ix *uint, storage []byte) {
 	var typecode uint = nextBlockTypeCode(&code.type_code_calculator, block_type)
 	var lencode uint
 	var len_nextra uint32
 	var len_extra uint32
 	if !is_first_block {
-		bw.writeBits(uint(code.type_depths[typecode]), uint64(code.type_bits[typecode]))
+		writeBits(uint(code.type_depths[typecode]), uint64(code.type_bits[typecode]), storage_ix, storage)
 	}
 
 	getBlockLengthPrefixCode(block_len, &lencode, &len_nextra, &len_extra)
 
-	bw.writeBits(uint(code.length_depths[lencode]), uint64(code.length_bits[lencode]))
-	bw.writeBits(uint(len_nextra), uint64(len_extra))
+	writeBits(uint(code.length_depths[lencode]), uint64(code.length_bits[lencode]), storage_ix, storage)
+	writeBits(uint(len_nextra), uint64(len_extra), storage_ix, storage)
 }
 
 /* Builds a BlockSplitCode data structure from the block split given by the
    vector of block types and block lengths and stores it to the bit stream. */
-func buildAndStoreBlockSplitCode(types []byte, lengths []uint32, num_blocks uint, num_types uint, tree []huffmanTree, code *blockSplitCode, bw *bitWriter) {
+func buildAndStoreBlockSplitCode(types []byte, lengths []uint32, num_blocks uint, num_types uint, tree []huffmanTree, code *blockSplitCode, storage_ix *uint, storage []byte) {
 	var type_histo [maxBlockTypeSymbols]uint32
 	var length_histo [numBlockLenSymbols]uint32
 	var i uint
@@ -813,17 +819,17 @@ func buildAndStoreBlockSplitCode(types []byte, lengths []uint32, num_blocks uint
 		length_histo[blockLengthPrefixCode(lengths[i])]++
 	}
 
-	storeVarLenUint8(num_types-1, bw)
+	storeVarLenUint8(num_types-1, storage_ix, storage)
 	if num_types > 1 { /* TODO: else? could StoreBlockSwitch occur? */
-		buildAndStoreHuffmanTree(type_histo[0:], num_types+2, num_types+2, tree, code.type_depths[0:], code.type_bits[0:], bw)
-		buildAndStoreHuffmanTree(length_histo[0:], numBlockLenSymbols, numBlockLenSymbols, tree, code.length_depths[0:], code.length_bits[0:], bw)
-		storeBlockSwitch(code, lengths[0], types[0], true, bw)
+		buildAndStoreHuffmanTree(type_histo[0:], num_types+2, num_types+2, tree, code.type_depths[0:], code.type_bits[0:], storage_ix, storage)
+		buildAndStoreHuffmanTree(length_histo[0:], numBlockLenSymbols, numBlockLenSymbols, tree, code.length_depths[0:], code.length_bits[0:], storage_ix, storage)
+		storeBlockSwitch(code, lengths[0], types[0], true, storage_ix, storage)
 	}
 }
 
 /* Stores a context map where the histogram type is always the block type. */
-func storeTrivialContextMap(num_types uint, context_bits uint, tree []huffmanTree, bw *bitWriter) {
-	storeVarLenUint8(num_types-1, bw)
+func storeTrivialContextMap(num_types uint, context_bits uint, tree []huffmanTree, storage_ix *uint, storage []byte) {
+	storeVarLenUint8(num_types-1, storage_ix, storage)
 	if num_types > 1 {
 		var repeat_code uint = context_bits - 1
 		var repeat_bits uint = (1 << repeat_code) - 1
@@ -837,16 +843,16 @@ func storeTrivialContextMap(num_types uint, context_bits uint, tree []huffmanTre
 		}
 
 		/* Write RLEMAX. */
-		bw.writeBits(1, 1)
+		writeBits(1, 1, storage_ix, storage)
 
-		bw.writeBits(4, uint64(repeat_code)-1)
+		writeBits(4, uint64(repeat_code)-1, storage_ix, storage)
 		histogram[repeat_code] = uint32(num_types)
 		histogram[0] = 1
 		for i = context_bits; i < alphabet_size; i++ {
 			histogram[i] = 1
 		}
 
-		buildAndStoreHuffmanTree(histogram[:], alphabet_size, alphabet_size, tree, depths[:], bits[:], bw)
+		buildAndStoreHuffmanTree(histogram[:], alphabet_size, alphabet_size, tree, depths[:], bits[:], storage_ix, storage)
 		for i = 0; i < num_types; i++ {
 			var tmp uint
 			if i == 0 {
@@ -855,13 +861,13 @@ func storeTrivialContextMap(num_types uint, context_bits uint, tree []huffmanTre
 				tmp = i + context_bits - 1
 			}
 			var code uint = tmp
-			bw.writeBits(uint(depths[code]), uint64(bits[code]))
-			bw.writeBits(uint(depths[repeat_code]), uint64(bits[repeat_code]))
-			bw.writeBits(repeat_code, uint64(repeat_bits))
+			writeBits(uint(depths[code]), uint64(bits[code]), storage_ix, storage)
+			writeBits(uint(depths[repeat_code]), uint64(bits[repeat_code]), storage_ix, storage)
+			writeBits(repeat_code, uint64(repeat_bits), storage_ix, storage)
 		}
 
 		/* Write IMTF (inverse-move-to-front) bit. */
-		bw.writeBits(1, 1)
+		writeBits(1, 1, storage_ix, storage)
 	}
 }
 
@@ -915,13 +921,13 @@ func cleanupBlockEncoder(self *blockEncoder) {
 
 /* Creates entropy codes of block lengths and block types and stores them
    to the bit stream. */
-func buildAndStoreBlockSwitchEntropyCodes(self *blockEncoder, tree []huffmanTree, bw *bitWriter) {
-	buildAndStoreBlockSplitCode(self.block_types_, self.block_lengths_, self.num_blocks_, self.num_block_types_, tree, &self.block_split_code_, bw)
+func buildAndStoreBlockSwitchEntropyCodes(self *blockEncoder, tree []huffmanTree, storage_ix *uint, storage []byte) {
+	buildAndStoreBlockSplitCode(self.block_types_, self.block_lengths_, self.num_blocks_, self.num_block_types_, tree, &self.block_split_code_, storage_ix, storage)
 }
 
 /* Stores the next symbol with the entropy code of the current block type.
    Updates the block type and block length at block boundaries. */
-func storeSymbol(self *blockEncoder, symbol uint, bw *bitWriter) {
+func storeSymbol(self *blockEncoder, symbol uint, storage_ix *uint, storage []byte) {
 	if self.block_len_ == 0 {
 		self.block_ix_++
 		var block_ix uint = self.block_ix_
@@ -929,20 +935,20 @@ func storeSymbol(self *blockEncoder, symbol uint, bw *bitWriter) {
 		var block_type byte = self.block_types_[block_ix]
 		self.block_len_ = uint(block_len)
 		self.entropy_ix_ = uint(block_type) * self.histogram_length_
-		storeBlockSwitch(&self.block_split_code_, block_len, block_type, false, bw)
+		storeBlockSwitch(&self.block_split_code_, block_len, block_type, false, storage_ix, storage)
 	}
 
 	self.block_len_--
 	{
 		var ix uint = self.entropy_ix_ + symbol
-		bw.writeBits(uint(self.depths_[ix]), uint64(self.bits_[ix]))
+		writeBits(uint(self.depths_[ix]), uint64(self.bits_[ix]), storage_ix, storage)
 	}
 }
 
 /* Stores the next symbol with the entropy code of the current block type and
    context value.
    Updates the block type and block length at block boundaries. */
-func storeSymbolWithContext(self *blockEncoder, symbol uint, context uint, context_map []uint32, bw *bitWriter, context_bits uint) {
+func storeSymbolWithContext(self *blockEncoder, symbol uint, context uint, context_map []uint32, storage_ix *uint, storage []byte, context_bits uint) {
 	if self.block_len_ == 0 {
 		self.block_ix_++
 		var block_ix uint = self.block_ix_
@@ -950,18 +956,18 @@ func storeSymbolWithContext(self *blockEncoder, symbol uint, context uint, conte
 		var block_type byte = self.block_types_[block_ix]
 		self.block_len_ = uint(block_len)
 		self.entropy_ix_ = uint(block_type) << context_bits
-		storeBlockSwitch(&self.block_split_code_, block_len, block_type, false, bw)
+		storeBlockSwitch(&self.block_split_code_, block_len, block_type, false, storage_ix, storage)
 	}
 
 	self.block_len_--
 	{
 		var histo_ix uint = uint(context_map[self.entropy_ix_+context])
 		var ix uint = histo_ix*self.histogram_length_ + symbol
-		bw.writeBits(uint(self.depths_[ix]), uint64(self.bits_[ix]))
+		writeBits(uint(self.depths_[ix]), uint64(self.bits_[ix]), storage_ix, storage)
 	}
 }
 
-func buildAndStoreEntropyCodesLiteral(self *blockEncoder, histograms []histogramLiteral, histograms_size uint, alphabet_size uint, tree []huffmanTree, bw *bitWriter) {
+func buildAndStoreEntropyCodesLiteral(self *blockEncoder, histograms []histogramLiteral, histograms_size uint, alphabet_size uint, tree []huffmanTree, storage_ix *uint, storage []byte) {
 	var table_size uint = histograms_size * self.histogram_length_
 	if cap(self.depths_) < int(table_size) {
 		self.depths_ = make([]byte, table_size)
@@ -977,12 +983,12 @@ func buildAndStoreEntropyCodesLiteral(self *blockEncoder, histograms []histogram
 		var i uint
 		for i = 0; i < histograms_size; i++ {
 			var ix uint = i * self.histogram_length_
-			buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], bw)
+			buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], storage_ix, storage)
 		}
 	}
 }
 
-func buildAndStoreEntropyCodesCommand(self *blockEncoder, histograms []histogramCommand, histograms_size uint, alphabet_size uint, tree []huffmanTree, bw *bitWriter) {
+func buildAndStoreEntropyCodesCommand(self *blockEncoder, histograms []histogramCommand, histograms_size uint, alphabet_size uint, tree []huffmanTree, storage_ix *uint, storage []byte) {
 	var table_size uint = histograms_size * self.histogram_length_
 	if cap(self.depths_) < int(table_size) {
 		self.depths_ = make([]byte, table_size)
@@ -998,12 +1004,12 @@ func buildAndStoreEntropyCodesCommand(self *blockEncoder, histograms []histogram
 		var i uint
 		for i = 0; i < histograms_size; i++ {
 			var ix uint = i * self.histogram_length_
-			buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], bw)
+			buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], storage_ix, storage)
 		}
 	}
 }
 
-func buildAndStoreEntropyCodesDistance(self *blockEncoder, histograms []histogramDistance, histograms_size uint, alphabet_size uint, tree []huffmanTree, bw *bitWriter) {
+func buildAndStoreEntropyCodesDistance(self *blockEncoder, histograms []histogramDistance, histograms_size uint, alphabet_size uint, tree []huffmanTree, storage_ix *uint, storage []byte) {
 	var table_size uint = histograms_size * self.histogram_length_
 	if cap(self.depths_) < int(table_size) {
 		self.depths_ = make([]byte, table_size)
@@ -1019,12 +1025,17 @@ func buildAndStoreEntropyCodesDistance(self *blockEncoder, histograms []histogra
 		var i uint
 		for i = 0; i < histograms_size; i++ {
 			var ix uint = i * self.histogram_length_
-			buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], bw)
+			buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], storage_ix, storage)
 		}
 	}
 }
 
-func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_byte byte, prev_byte2 byte, is_last bool, params *encoderParams, literal_context_mode int, commands []command, mb *metaBlockSplit, bw *bitWriter) {
+func jumpToByteBoundary(storage_ix *uint, storage []byte) {
+	*storage_ix = (*storage_ix + 7) &^ 7
+	storage[*storage_ix>>3] = 0
+}
+
+func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_byte byte, prev_byte2 byte, is_last bool, params *encoderParams, literal_context_mode int, commands []command, mb *metaBlockSplit, storage_ix *uint, storage []byte) {
 	var pos uint = start_pos
 	var i uint
 	var num_distance_symbols uint32 = params.dist.alphabet_size
@@ -1036,48 +1047,48 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b
 		num_effective_distance_symbols = numHistogramDistanceSymbols
 	}
 
-	storeCompressedMetaBlockHeader(is_last, length, bw)
+	storeCompressedMetaBlockHeader(is_last, length, storage_ix, storage)
 
 	tree = make([]huffmanTree, maxHuffmanTreeSize)
 	literal_enc := getBlockEncoder(numLiteralSymbols, mb.literal_split.num_types, mb.literal_split.types, mb.literal_split.lengths, mb.literal_split.num_blocks)
 	command_enc := getBlockEncoder(numCommandSymbols, mb.command_split.num_types, mb.command_split.types, mb.command_split.lengths, mb.command_split.num_blocks)
 	distance_enc := getBlockEncoder(uint(num_effective_distance_symbols), mb.distance_split.num_types, mb.distance_split.types, mb.distance_split.lengths, mb.distance_split.num_blocks)
 
-	buildAndStoreBlockSwitchEntropyCodes(literal_enc, tree, bw)
-	buildAndStoreBlockSwitchEntropyCodes(command_enc, tree, bw)
-	buildAndStoreBlockSwitchEntropyCodes(distance_enc, tree, bw)
+	buildAndStoreBlockSwitchEntropyCodes(literal_enc, tree, storage_ix, storage)
+	buildAndStoreBlockSwitchEntropyCodes(command_enc, tree, storage_ix, storage)
+	buildAndStoreBlockSwitchEntropyCodes(distance_enc, tree, storage_ix, storage)
 
-	bw.writeBits(2, uint64(dist.distance_postfix_bits))
-	bw.writeBits(4, uint64(dist.num_direct_distance_codes)>>dist.distance_postfix_bits)
+	writeBits(2, uint64(dist.distance_postfix_bits), storage_ix, storage)
+	writeBits(4, uint64(dist.num_direct_distance_codes)>>dist.distance_postfix_bits, storage_ix, storage)
 	for i = 0; i < mb.literal_split.num_types; i++ {
-		bw.writeBits(2, uint64(literal_context_mode))
+		writeBits(2, uint64(literal_context_mode), storage_ix, storage)
 	}
 
 	if mb.literal_context_map_size == 0 {
-		storeTrivialContextMap(mb.literal_histograms_size, literalContextBits, tree, bw)
+		storeTrivialContextMap(mb.literal_histograms_size, literalContextBits, tree, storage_ix, storage)
 	} else {
-		encodeContextMap(mb.literal_context_map, mb.literal_context_map_size, mb.literal_histograms_size, tree, bw)
+		encodeContextMap(mb.literal_context_map, mb.literal_context_map_size, mb.literal_histograms_size, tree, storage_ix, storage)
 	}
 
 	if mb.distance_context_map_size == 0 {
-		storeTrivialContextMap(mb.distance_histograms_size, distanceContextBits, tree, bw)
+		storeTrivialContextMap(mb.distance_histograms_size, distanceContextBits, tree, storage_ix, storage)
 	} else {
-		encodeContextMap(mb.distance_context_map, mb.distance_context_map_size, mb.distance_histograms_size, tree, bw)
+		encodeContextMap(mb.distance_context_map, mb.distance_context_map_size, mb.distance_histograms_size, tree, storage_ix, storage)
 	}
 
-	buildAndStoreEntropyCodesLiteral(literal_enc, mb.literal_histograms, mb.literal_histograms_size, numLiteralSymbols, tree, bw)
-	buildAndStoreEntropyCodesCommand(command_enc, mb.command_histograms, mb.command_histograms_size, numCommandSymbols, tree, bw)
-	buildAndStoreEntropyCodesDistance(distance_enc, mb.distance_histograms, mb.distance_histograms_size, uint(num_distance_symbols), tree, bw)
+	buildAndStoreEntropyCodesLiteral(literal_enc, mb.literal_histograms, mb.literal_histograms_size, numLiteralSymbols, tree, storage_ix, storage)
+	buildAndStoreEntropyCodesCommand(command_enc, mb.command_histograms, mb.command_histograms_size, numCommandSymbols, tree, storage_ix, storage)
+	buildAndStoreEntropyCodesDistance(distance_enc, mb.distance_histograms, mb.distance_histograms_size, uint(num_distance_symbols), tree, storage_ix, storage)
 	tree = nil
 
 	for _, cmd := range commands {
 		var cmd_code uint = uint(cmd.cmd_prefix_)
-		storeSymbol(command_enc, cmd_code, bw)
-		storeCommandExtra(&cmd, bw)
+		storeSymbol(command_enc, cmd_code, storage_ix, storage)
+		storeCommandExtra(&cmd, storage_ix, storage)
 		if mb.literal_context_map_size == 0 {
 			var j uint
 			for j = uint(cmd.insert_len_); j != 0; j-- {
-				storeSymbol(literal_enc, uint(input[pos&mask]), bw)
+				storeSymbol(literal_enc, uint(input[pos&mask]), storage_ix, storage)
 				pos++
 			}
 		} else {
@@ -1085,7 +1096,7 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b
 			for j = uint(cmd.insert_len_); j != 0; j-- {
 				var context uint = uint(getContext(prev_byte, prev_byte2, literal_context_lut))
 				var literal byte = input[pos&mask]
-				storeSymbolWithContext(literal_enc, uint(literal), context, mb.literal_context_map, bw, literalContextBits)
+				storeSymbolWithContext(literal_enc, uint(literal), context, mb.literal_context_map, storage_ix, storage, literalContextBits)
 				prev_byte2 = prev_byte
 				prev_byte = literal
 				pos++
@@ -1101,13 +1112,13 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b
 				var distnumextra uint32 = uint32(cmd.dist_prefix_) >> 10
 				var distextra uint64 = uint64(cmd.dist_extra_)
 				if mb.distance_context_map_size == 0 {
-					storeSymbol(distance_enc, dist_code, bw)
+					storeSymbol(distance_enc, dist_code, storage_ix, storage)
 				} else {
 					var context uint = uint(commandDistanceContext(&cmd))
-					storeSymbolWithContext(distance_enc, dist_code, context, mb.distance_context_map, bw, distanceContextBits)
+					storeSymbolWithContext(distance_enc, dist_code, context, mb.distance_context_map, storage_ix, storage, distanceContextBits)
 				}
 
-				bw.writeBits(uint(distnumextra), distextra)
+				writeBits(uint(distnumextra), distextra, storage_ix, storage)
 			}
 		}
 	}
@@ -1116,7 +1127,7 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b
 	cleanupBlockEncoder(command_enc)
 	cleanupBlockEncoder(literal_enc)
 	if is_last {
-		bw.jumpToByteBoundary()
+		jumpToByteBoundary(storage_ix, storage)
 	}
 }
 
@@ -1137,16 +1148,16 @@ func buildHistograms(input []byte, start_pos uint, mask uint, commands []command
 	}
 }
 
-func storeDataWithHuffmanCodes(input []byte, start_pos uint, mask uint, commands []command, lit_depth []byte, lit_bits []uint16, cmd_depth []byte, cmd_bits []uint16, dist_depth []byte, dist_bits []uint16, bw *bitWriter) {
+func storeDataWithHuffmanCodes(input []byte, start_pos uint, mask uint, commands []command, lit_depth []byte, lit_bits []uint16, cmd_depth []byte, cmd_bits []uint16, dist_depth []byte, dist_bits []uint16, storage_ix *uint, storage []byte) {
 	var pos uint = start_pos
 	for _, cmd := range commands {
 		var cmd_code uint = uint(cmd.cmd_prefix_)
 		var j uint
-		bw.writeBits(uint(cmd_depth[cmd_code]), uint64(cmd_bits[cmd_code]))
-		storeCommandExtra(&cmd, bw)
+		writeBits(uint(cmd_depth[cmd_code]), uint64(cmd_bits[cmd_code]), storage_ix, storage)
+		storeCommandExtra(&cmd, storage_ix, storage)
 		for j = uint(cmd.insert_len_); j != 0; j-- {
 			var literal byte = input[pos&mask]
-			bw.writeBits(uint(lit_depth[literal]), uint64(lit_bits[literal]))
+			writeBits(uint(lit_depth[literal]), uint64(lit_bits[literal]), storage_ix, storage)
 			pos++
 		}
 
@@ -1155,13 +1166,13 @@ func storeDataWithHuffmanCodes(input []byte, start_pos uint, mask uint, commands
 			var dist_code uint = uint(cmd.dist_prefix_) & 0x3FF
 			var distnumextra uint32 = uint32(cmd.dist_prefix_) >> 10
 			var distextra uint32 = cmd.dist_extra_
-			bw.writeBits(uint(dist_depth[dist_code]), uint64(dist_bits[dist_code]))
-			bw.writeBits(uint(distnumextra), uint64(distextra))
+			writeBits(uint(dist_depth[dist_code]), uint64(dist_bits[dist_code]), storage_ix, storage)
+			writeBits(uint(distnumextra), uint64(distextra), storage_ix, storage)
 		}
 	}
 }
 
-func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, bw *bitWriter) {
+func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, storage_ix *uint, storage []byte) {
 	var lit_histo histogramLiteral
 	var cmd_histo histogramCommand
 	var dist_histo histogramDistance
@@ -1174,7 +1185,7 @@ func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint,
 	var tree []huffmanTree
 	var num_distance_symbols uint32 = params.dist.alphabet_size
 
-	storeCompressedMetaBlockHeader(is_last, length, bw)
+	storeCompressedMetaBlockHeader(is_last, length, storage_ix, storage)
 
 	histogramClearLiteral(&lit_histo)
 	histogramClearCommand(&cmd_histo)
@@ -1182,26 +1193,26 @@ func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint,
 
 	buildHistograms(input, start_pos, mask, commands, &lit_histo, &cmd_histo, &dist_histo)
 
-	bw.writeBits(13, 0)
+	writeBits(13, 0, storage_ix, storage)
 
 	tree = make([]huffmanTree, maxHuffmanTreeSize)
-	buildAndStoreHuffmanTree(lit_histo.data_[:], numLiteralSymbols, numLiteralSymbols, tree, lit_depth[:], lit_bits[:], bw)
-	buildAndStoreHuffmanTree(cmd_histo.data_[:], numCommandSymbols, numCommandSymbols, tree, cmd_depth[:], cmd_bits[:], bw)
-	buildAndStoreHuffmanTree(dist_histo.data_[:], maxSimpleDistanceAlphabetSize, uint(num_distance_symbols), tree, dist_depth[:], dist_bits[:], bw)
+	buildAndStoreHuffmanTree(lit_histo.data_[:], numLiteralSymbols, numLiteralSymbols, tree, lit_depth[:], lit_bits[:], storage_ix, storage)
+	buildAndStoreHuffmanTree(cmd_histo.data_[:], numCommandSymbols, numCommandSymbols, tree, cmd_depth[:], cmd_bits[:], storage_ix, storage)
+	buildAndStoreHuffmanTree(dist_histo.data_[:], maxSimpleDistanceAlphabetSize, uint(num_distance_symbols), tree, dist_depth[:], dist_bits[:], storage_ix, storage)
 	tree = nil
-	storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], bw)
+	storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], storage_ix, storage)
 	if is_last {
-		bw.jumpToByteBoundary()
+		jumpToByteBoundary(storage_ix, storage)
 	}
 }
 
-func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, bw *bitWriter) {
+func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, storage_ix *uint, storage []byte) {
 	var num_distance_symbols uint32 = params.dist.alphabet_size
 	var distance_alphabet_bits uint32 = log2FloorNonZero(uint(num_distance_symbols-1)) + 1
 
-	storeCompressedMetaBlockHeader(is_last, length, bw)
+	storeCompressedMetaBlockHeader(is_last, length, storage_ix, storage)
 
-	bw.writeBits(13, 0)
+	writeBits(13, 0, storage_ix, storage)
 
 	if len(commands) <= 128 {
 		var histogram = [numLiteralSymbols]uint32{0}
@@ -1221,11 +1232,11 @@ func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is
 		}
 
 		buildAndStoreHuffmanTreeFast(histogram[:], num_literals, /* max_bits = */
-			8, lit_depth[:], lit_bits[:], bw)
+			8, lit_depth[:], lit_bits[:], storage_ix, storage)
 
-		storeStaticCommandHuffmanTree(bw)
-		storeStaticDistanceHuffmanTree(bw)
-		storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], kStaticCommandCodeDepth[:], kStaticCommandCodeBits[:], kStaticDistanceCodeDepth[:], kStaticDistanceCodeBits[:], bw)
+		storeStaticCommandHuffmanTree(storage_ix, storage)
+		storeStaticDistanceHuffmanTree(storage_ix, storage)
+		storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], kStaticCommandCodeDepth[:], kStaticCommandCodeBits[:], kStaticDistanceCodeDepth[:], kStaticDistanceCodeBits[:], storage_ix, storage)
 	} else {
 		var lit_histo histogramLiteral
 		var cmd_histo histogramCommand
@@ -1241,43 +1252,49 @@ func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is
 		histogramClearDistance(&dist_histo)
 		buildHistograms(input, start_pos, mask, commands, &lit_histo, &cmd_histo, &dist_histo)
 		buildAndStoreHuffmanTreeFast(lit_histo.data_[:], lit_histo.total_count_, /* max_bits = */
-			8, lit_depth[:], lit_bits[:], bw)
+			8, lit_depth[:], lit_bits[:], storage_ix, storage)
 
 		buildAndStoreHuffmanTreeFast(cmd_histo.data_[:], cmd_histo.total_count_, /* max_bits = */
-			10, cmd_depth[:], cmd_bits[:], bw)
+			10, cmd_depth[:], cmd_bits[:], storage_ix, storage)
 
 		buildAndStoreHuffmanTreeFast(dist_histo.data_[:], dist_histo.total_count_, /* max_bits = */
-			uint(distance_alphabet_bits), dist_depth[:], dist_bits[:], bw)
+			uint(distance_alphabet_bits), dist_depth[:], dist_bits[:], storage_ix, storage)
 
-		storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], bw)
+		storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], storage_ix, storage)
 	}
 
 	if is_last {
-		bw.jumpToByteBoundary()
+		jumpToByteBoundary(storage_ix, storage)
 	}
 }
 
 /* This is for storing uncompressed blocks (simple raw storage of
    bytes-as-bytes). */
-func storeUncompressedMetaBlock(is_final_block bool, input []byte, position uint, mask uint, len uint, bw *bitWriter) {
+func storeUncompressedMetaBlock(is_final_block bool, input []byte, position uint, mask uint, len uint, storage_ix *uint, storage []byte) {
 	var masked_pos uint = position & mask
-	storeUncompressedMetaBlockHeader(uint(len), bw)
-	bw.jumpToByteBoundary()
+	storeUncompressedMetaBlockHeader(uint(len), storage_ix, storage)
+	jumpToByteBoundary(storage_ix, storage)
 
 	if masked_pos+len > mask+1 {
 		var len1 uint = mask + 1 - masked_pos
-		bw.writeBytes(input[masked_pos:][:len1])
+		copy(storage[*storage_ix>>3:], input[masked_pos:][:len1])
+		*storage_ix += len1 << 3
 		len -= len1
 		masked_pos = 0
 	}
 
-	bw.writeBytes(input[masked_pos:][:len])
+	copy(storage[*storage_ix>>3:], input[masked_pos:][:len])
+	*storage_ix += uint(len << 3)
+
+	/* We need to clear the next 4 bytes to continue to be
+	   compatible with BrotliWriteBits. */
+	writeBitsPrepareStorage(*storage_ix, storage)
 
 	/* Since the uncompressed block itself may not be the final block, add an
 	   empty one after this. */
 	if is_final_block {
-		bw.writeBits(1, 1) /* islast */
-		bw.writeBits(1, 1) /* isempty */
-		bw.jumpToByteBoundary()
+		writeBits(1, 1, storage_ix, storage) /* islast */
+		writeBits(1, 1, storage_ix, storage) /* isempty */
+		jumpToByteBoundary(storage_ix, storage)
 	}
 }
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/cluster_command.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/cluster_command.go
index 7449751b..45b569bb 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/cluster_command.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/cluster_command.go
@@ -1,7 +1,5 @@
 package brotli
 
-import "math"
-
 /* Copyright 2013 Google Inc. All Rights Reserved.
 
    Distributed under MIT license.
@@ -164,163 +162,3 @@ func histogramBitCostDistanceCommand(histogram *histogramCommand, candidate *his
 		return populationCostCommand(&tmp) - candidate.bit_cost_
 	}
 }
-
-/* Find the best 'out' histogram for each of the 'in' histograms.
-   When called, clusters[0..num_clusters) contains the unique values from
-   symbols[0..in_size), but this property is not preserved in this function.
-   Note: we assume that out[]->bit_cost_ is already up-to-date. */
-func histogramRemapCommand(in []histogramCommand, in_size uint, clusters []uint32, num_clusters uint, out []histogramCommand, symbols []uint32) {
-	var i uint
-	for i = 0; i < in_size; i++ {
-		var best_out uint32
-		if i == 0 {
-			best_out = symbols[0]
-		} else {
-			best_out = symbols[i-1]
-		}
-		var best_bits float64 = histogramBitCostDistanceCommand(&in[i], &out[best_out])
-		var j uint
-		for j = 0; j < num_clusters; j++ {
-			var cur_bits float64 = histogramBitCostDistanceCommand(&in[i], &out[clusters[j]])
-			if cur_bits < best_bits {
-				best_bits = cur_bits
-				best_out = clusters[j]
-			}
-		}
-
-		symbols[i] = best_out
-	}
-
-	/* Recompute each out based on raw and symbols. */
-	for i = 0; i < num_clusters; i++ {
-		histogramClearCommand(&out[clusters[i]])
-	}
-
-	for i = 0; i < in_size; i++ {
-		histogramAddHistogramCommand(&out[symbols[i]], &in[i])
-	}
-}
-
-/* Reorders elements of the out[0..length) array and changes values in
-   symbols[0..length) array in the following way:
-     * when called, symbols[] contains indexes into out[], and has N unique
-       values (possibly N < length)
-     * on return, symbols'[i] = f(symbols[i]) and
-                  out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
-       where f is a bijection between the range of symbols[] and [0..N), and
-       the first occurrences of values in symbols'[i] come in consecutive
-       increasing order.
-   Returns N, the number of unique values in symbols[]. */
-
-var histogramReindexCommand_kInvalidIndex uint32 = math.MaxUint32
-
-func histogramReindexCommand(out []histogramCommand, symbols []uint32, length uint) uint {
-	var new_index []uint32 = make([]uint32, length)
-	var next_index uint32
-	var tmp []histogramCommand
-	var i uint
-	for i = 0; i < length; i++ {
-		new_index[i] = histogramReindexCommand_kInvalidIndex
-	}
-
-	next_index = 0
-	for i = 0; i < length; i++ {
-		if new_index[symbols[i]] == histogramReindexCommand_kInvalidIndex {
-			new_index[symbols[i]] = next_index
-			next_index++
-		}
-	}
-
-	/* TODO: by using idea of "cycle-sort" we can avoid allocation of
-	   tmp and reduce the number of copying by the factor of 2. */
-	tmp = make([]histogramCommand, next_index)
-
-	next_index = 0
-	for i = 0; i < length; i++ {
-		if new_index[symbols[i]] == next_index {
-			tmp[next_index] = out[symbols[i]]
-			next_index++
-		}
-
-		symbols[i] = new_index[symbols[i]]
-	}
-
-	new_index = nil
-	for i = 0; uint32(i) < next_index; i++ {
-		out[i] = tmp[i]
-	}
-
-	tmp = nil
-	return uint(next_index)
-}
-
-func clusterHistogramsCommand(in []histogramCommand, in_size uint, max_histograms uint, out []histogramCommand, out_size *uint, histogram_symbols []uint32) {
-	var cluster_size []uint32 = make([]uint32, in_size)
-	var clusters []uint32 = make([]uint32, in_size)
-	var num_clusters uint = 0
-	var max_input_histograms uint = 64
-	var pairs_capacity uint = max_input_histograms * max_input_histograms / 2
-	var pairs []histogramPair = make([]histogramPair, (pairs_capacity + 1))
-	var i uint
-
-	/* For the first pass of clustering, we allow all pairs. */
-	for i = 0; i < in_size; i++ {
-		cluster_size[i] = 1
-	}
-
-	for i = 0; i < in_size; i++ {
-		out[i] = in[i]
-		out[i].bit_cost_ = populationCostCommand(&in[i])
-		histogram_symbols[i] = uint32(i)
-	}
-
-	for i = 0; i < in_size; i += max_input_histograms {
-		var num_to_combine uint = brotli_min_size_t(in_size-i, max_input_histograms)
-		var num_new_clusters uint
-		var j uint
-		for j = 0; j < num_to_combine; j++ {
-			clusters[num_clusters+j] = uint32(i + j)
-		}
-
-		num_new_clusters = histogramCombineCommand(out, cluster_size, histogram_symbols[i:], clusters[num_clusters:], pairs, num_to_combine, num_to_combine, max_histograms, pairs_capacity)
-		num_clusters += num_new_clusters
-	}
-	{
-		/* For the second pass, we limit the total number of histogram pairs.
-		   After this limit is reached, we only keep searching for the best pair. */
-		var max_num_pairs uint = brotli_min_size_t(64*num_clusters, (num_clusters/2)*num_clusters)
-		if pairs_capacity < (max_num_pairs + 1) {
-			var _new_size uint
-			if pairs_capacity == 0 {
-				_new_size = max_num_pairs + 1
-			} else {
-				_new_size = pairs_capacity
-			}
-			var new_array []histogramPair
-			for _new_size < (max_num_pairs + 1) {
-				_new_size *= 2
-			}
-			new_array = make([]histogramPair, _new_size)
-			if pairs_capacity != 0 {
-				copy(new_array, pairs[:pairs_capacity])
-			}
-
-			pairs = new_array
-			pairs_capacity = _new_size
-		}
-
-		/* Collapse similar histograms. */
-		num_clusters = histogramCombineCommand(out, cluster_size, histogram_symbols, clusters, pairs, num_clusters, in_size, max_histograms, max_num_pairs)
-	}
-
-	pairs = nil
-	cluster_size = nil
-
-	/* Find the optimal map from original histograms to the final ones. */
-	histogramRemapCommand(in, in_size, clusters, num_clusters, out, histogram_symbols)
-
-	clusters = nil
-
-	/* Convert the context map to a canonical form. */
-	*out_size = histogramReindexCommand(out, histogram_symbols, in_size)
-}
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/compress_fragment.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/compress_fragment.go
index dbf0c43b..c9bd0577 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/compress_fragment.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/compress_fragment.go
@@ -45,7 +45,7 @@ func isMatch5(p1 []byte, p2 []byte) bool {
    and thus have to assign a non-zero depth for each literal.
    Returns estimated compression ratio millibytes/char for encoding given input
    with generated code. */
-func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte, bits []uint16, bw *bitWriter) uint {
+func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte, bits []uint16, storage_ix *uint, storage []byte) uint {
 	var histogram = [256]uint32{0}
 	var histogram_total uint
 	var i uint
@@ -82,7 +82,7 @@ func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte
 	}
 
 	buildAndStoreHuffmanTreeFast(histogram[:], histogram_total, /* max_bits = */
-		8, depths, bits, bw)
+		8, depths, bits, storage_ix, storage)
 	{
 		var literal_ratio uint = 0
 		for i = 0; i < 256; i++ {
@@ -98,7 +98,7 @@ func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte
 
 /* Builds a command and distance prefix code (each 64 symbols) into "depth" and
    "bits" based on "histogram" and stores it into the bit stream. */
-func buildAndStoreCommandPrefixCode1(histogram []uint32, depth []byte, bits []uint16, bw *bitWriter) {
+func buildAndStoreCommandPrefixCode1(histogram []uint32, depth []byte, bits []uint16, storage_ix *uint, storage []byte) {
 	var tree [129]huffmanTree
 	var cmd_depth = [numCommandSymbols]byte{0}
 	/* Tree size for building a tree over 64 symbols is 2 * 64 + 1. */
@@ -145,141 +145,141 @@ func buildAndStoreCommandPrefixCode1(histogram []uint32, depth []byte, bits []ui
 			cmd_depth[448+8*i] = depth[56+i]
 		}
 
-		storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], bw)
+		storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], storage_ix, storage)
 	}
 
-	storeHuffmanTree(depth[64:], 64, tree[:], bw)
+	storeHuffmanTree(depth[64:], 64, tree[:], storage_ix, storage)
 }
 
 /* REQUIRES: insertlen < 6210 */
-func emitInsertLen1(insertlen uint, depth []byte, bits []uint16, histo []uint32, bw *bitWriter) {
+func emitInsertLen1(insertlen uint, depth []byte, bits []uint16, histo []uint32, storage_ix *uint, storage []byte) {
 	if insertlen < 6 {
 		var code uint = insertlen + 40
-		bw.writeBits(uint(depth[code]), uint64(bits[code]))
+		writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
 		histo[code]++
 	} else if insertlen < 130 {
 		var tail uint = insertlen - 2
 		var nbits uint32 = log2FloorNonZero(tail) - 1
 		var prefix uint = tail >> nbits
 		var inscode uint = uint((nbits << 1) + uint32(prefix) + 42)
-		bw.writeBits(uint(depth[inscode]), uint64(bits[inscode]))
-		bw.writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits))
+		writeBits(uint(depth[inscode]), uint64(bits[inscode]), storage_ix, storage)
+		writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits), storage_ix, storage)
 		histo[inscode]++
 	} else if insertlen < 2114 {
 		var tail uint = insertlen - 66
 		var nbits uint32 = log2FloorNonZero(tail)
 		var code uint = uint(nbits + 50)
-		bw.writeBits(uint(depth[code]), uint64(bits[code]))
-		bw.writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits))
+		writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
+		writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits), storage_ix, storage)
 		histo[code]++
 	} else {
-		bw.writeBits(uint(depth[61]), uint64(bits[61]))
-		bw.writeBits(12, uint64(insertlen)-2114)
+		writeBits(uint(depth[61]), uint64(bits[61]), storage_ix, storage)
+		writeBits(12, uint64(insertlen)-2114, storage_ix, storage)
 		histo[61]++
 	}
 }
 
-func emitLongInsertLen(insertlen uint, depth []byte, bits []uint16, histo []uint32, bw *bitWriter) {
+func emitLongInsertLen(insertlen uint, depth []byte, bits []uint16, histo []uint32, storage_ix *uint, storage []byte) {
 	if insertlen < 22594 {
-		bw.writeBits(uint(depth[62]), uint64(bits[62]))
-		bw.writeBits(14, uint64(insertlen)-6210)
+		writeBits(uint(depth[62]), uint64(bits[62]), storage_ix, storage)
+		writeBits(14, uint64(insertlen)-6210, storage_ix, storage)
 		histo[62]++
 	} else {
-		bw.writeBits(uint(depth[63]), uint64(bits[63]))
-		bw.writeBits(24, uint64(insertlen)-22594)
+		writeBits(uint(depth[63]), uint64(bits[63]), storage_ix, storage)
+		writeBits(24, uint64(insertlen)-22594, storage_ix, storage)
 		histo[63]++
 	}
 }
 
-func emitCopyLen1(copylen uint, depth []byte, bits []uint16, histo []uint32, bw *bitWriter) {
+func emitCopyLen1(copylen uint, depth []byte, bits []uint16, histo []uint32, storage_ix *uint, storage []byte) {
 	if copylen < 10 {
-		bw.writeBits(uint(depth[copylen+14]), uint64(bits[copylen+14]))
+		writeBits(uint(depth[copylen+14]), uint64(bits[copylen+14]), storage_ix, storage)
 		histo[copylen+14]++
 	} else if copylen < 134 {
 		var tail uint = copylen - 6
 		var nbits uint32 = log2FloorNonZero(tail) - 1
 		var prefix uint = tail >> nbits
 		var code uint = uint((nbits << 1) + uint32(prefix) + 20)
-		bw.writeBits(uint(depth[code]), uint64(bits[code]))
-		bw.writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits))
+		writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
+		writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits), storage_ix, storage)
 		histo[code]++
 	} else if copylen < 2118 {
 		var tail uint = copylen - 70
 		var nbits uint32 = log2FloorNonZero(tail)
 		var code uint = uint(nbits + 28)
-		bw.writeBits(uint(depth[code]), uint64(bits[code]))
-		bw.writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits))
+		writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
+		writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits), storage_ix, storage)
 		histo[code]++
 	} else {
-		bw.writeBits(uint(depth[39]), uint64(bits[39]))
-		bw.writeBits(24, uint64(copylen)-2118)
+		writeBits(uint(depth[39]), uint64(bits[39]), storage_ix, storage)
+		writeBits(24, uint64(copylen)-2118, storage_ix, storage)
 		histo[39]++
 	}
 }
 
-func emitCopyLenLastDistance1(copylen uint, depth []byte, bits []uint16, histo []uint32, bw *bitWriter) {
+func emitCopyLenLastDistance1(copylen uint, depth []byte, bits []uint16, histo []uint32, storage_ix *uint, storage []byte) {
 	if copylen < 12 {
-		bw.writeBits(uint(depth[copylen-4]), uint64(bits[copylen-4]))
+		writeBits(uint(depth[copylen-4]), uint64(bits[copylen-4]), storage_ix, storage)
 		histo[copylen-4]++
 	} else if copylen < 72 {
 		var tail uint = copylen - 8
 		var nbits uint32 = log2FloorNonZero(tail) - 1
 		var prefix uint = tail >> nbits
 		var code uint = uint((nbits << 1) + uint32(prefix) + 4)
-		bw.writeBits(uint(depth[code]), uint64(bits[code]))
-		bw.writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits))
+		writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
+		writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<<nbits), storage_ix, storage)
 		histo[code]++
 	} else if copylen < 136 {
 		var tail uint = copylen - 8
 		var code uint = (tail >> 5) + 30
-		bw.writeBits(uint(depth[code]), uint64(bits[code]))
-		bw.writeBits(5, uint64(tail)&31)
-		bw.writeBits(uint(depth[64]), uint64(bits[64]))
+		writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
+		writeBits(5, uint64(tail)&31, storage_ix, storage)
+		writeBits(uint(depth[64]), uint64(bits[64]), storage_ix, storage)
 		histo[code]++
 		histo[64]++
 	} else if copylen < 2120 {
 		var tail uint = copylen - 72
 		var nbits uint32 = log2FloorNonZero(tail)
 		var code uint = uint(nbits + 28)
-		bw.writeBits(uint(depth[code]), uint64(bits[code]))
-		bw.writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits))
-		bw.writeBits(uint(depth[64]), uint64(bits[64]))
+		writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage)
+		writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<<nbits), storage_ix, storage)
+		writeBits(uint(depth[64]), uint64(bits[64]), storage_ix, storage)
 		histo[code]++
 		histo[64]++
 	} else {
-		bw.writeBits(uint(depth[39]), uint64(bits[39]))
-		bw.writeBits(24, uint64(copylen)-2120)
-		bw.writeBits(uint(depth[64]), uint64(bits[64]))
+		writeBits(uint(depth[39]), uint64(bits[39]), storage_ix, storage)
+		writeBits(24, uint64(copylen)-2120, storage_ix, storage)
+		writeBits(uint(depth[64]), uint64(bits[64]), storage_ix, storage)
 		histo[39]++
 		histo[64]++
 	}
 }
 
-func emitDistance1(distance uint, depth []byte, bits []uint16, histo []uint32, bw *bitWriter) {
+func emitDistance1(distance uint, depth []byte, bits []uint16, histo []uint32, storage_ix *uint, storage []byte) {
 	var d uint = distance + 3
 	var nbits uint32 = log2FloorNonZero(d) - 1
 	var prefix uint = (d >> nbits) & 1
 	var offset uint = (2 + prefix) << nbits
 	var distcode uint = uint(2*(nbits-1) + uint32(prefix) + 80)
-	bw.writeBits(uint(depth[distcode]), uint64(bits[distcode]))
-	bw.writeBits(uint(nbits), uint64(d)-uint64(offset))
+	writeBits(uint(depth[distcode]), uint64(bits[distcode]), storage_ix, storage)
+	writeBits(uint(nbits), uint64(d)-uint64(offset), storage_ix, storage)
 	histo[distcode]++
 }
 
-func emitLiterals(input []byte, len uint, depth []byte, bits []uint16, bw *bitWriter) {
+func emitLiterals(input []byte, len uint, depth []byte, bits []uint16, storage_ix *uint, storage []byte) {
 	var j uint
 	for j = 0; j < len; j++ {
 		var lit byte = input[j]
-		bw.writeBits(uint(depth[lit]), uint64(bits[lit]))
+		writeBits(uint(depth[lit]), uint64(bits[lit]), storage_ix, storage)
 	}
 }
 
 /* REQUIRES: len <= 1 << 24. */
-func storeMetaBlockHeader1(len uint, is_uncompressed bool, bw *bitWriter) {
+func storeMetaBlockHeader1(len uint, is_uncompressed bool, storage_ix *uint, storage []byte) {
 	var nibbles uint = 6
 
 	/* ISLAST */
-	bw.writeBits(1, 0)
+	writeBits(1, 0, storage_ix, storage)
 
 	if len <= 1<<16 {
 		nibbles = 4
@@ -287,11 +287,34 @@ func storeMetaBlockHeader1(len uint, is_uncompressed bool, bw *bitWriter) {
 		nibbles = 5
 	}
 
-	bw.writeBits(2, uint64(nibbles)-4)
-	bw.writeBits(nibbles*4, uint64(len)-1)
+	writeBits(2, uint64(nibbles)-4, storage_ix, storage)
+	writeBits(nibbles*4, uint64(len)-1, storage_ix, storage)
 
 	/* ISUNCOMPRESSED */
-	bw.writeSingleBit(is_uncompressed)
+	writeSingleBit(is_uncompressed, storage_ix, storage)
+}
+
+func updateBits(n_bits uint, bits uint32, pos uint, array []byte) {
+	for n_bits > 0 {
+		var byte_pos uint = pos >> 3
+		var n_unchanged_bits uint = pos & 7
+		var n_changed_bits uint = brotli_min_size_t(n_bits, 8-n_unchanged_bits)
+		var total_bits uint = n_unchanged_bits + n_changed_bits
+		var mask uint32 = (^((1 << total_bits) - 1)) | ((1 << n_unchanged_bits) - 1)
+		var unchanged_bits uint32 = uint32(array[byte_pos]) & mask
+		var changed_bits uint32 = bits & ((1 << n_changed_bits) - 1)
+		array[byte_pos] = byte(changed_bits<<n_unchanged_bits | unchanged_bits)
+		n_bits -= n_changed_bits
+		bits >>= n_changed_bits
+		pos += n_changed_bits
+	}
+}
+
+func rewindBitPosition1(new_storage_ix uint, storage_ix *uint, storage []byte) {
+	var bitpos uint = new_storage_ix & 7
+	var mask uint = (1 << bitpos) - 1
+	storage[new_storage_ix>>3] &= byte(mask)
+	*storage_ix = new_storage_ix
 }
 
 var shouldMergeBlock_kSampleRate uint = 43
@@ -322,26 +345,151 @@ func shouldUseUncompressedMode(metablock_start []byte, next_emit []byte, insertl
 	}
 }
 
-func emitUncompressedMetaBlock1(data []byte, storage_ix_start uint, bw *bitWriter) {
-	bw.rewind(storage_ix_start)
-	storeMetaBlockHeader1(uint(len(data)), true, bw)
-	bw.jumpToByteBoundary()
-	bw.writeBytes(data)
+func emitUncompressedMetaBlock1(begin []byte, end []byte, storage_ix_start uint, storage_ix *uint, storage []byte) {
+	var len uint = uint(-cap(end) + cap(begin))
+	rewindBitPosition1(storage_ix_start, storage_ix, storage)
+	storeMetaBlockHeader1(uint(len), true, storage_ix, storage)
+	*storage_ix = (*storage_ix + 7) &^ 7
+	copy(storage[*storage_ix>>3:], begin[:len])
+	*storage_ix += uint(len << 3)
+	storage[*storage_ix>>3] = 0
 }
 
 var kCmdHistoSeed = [128]uint32{
-	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 0, 0, 0, 0,
+	0,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	0,
+	0,
+	0,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	0,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	1,
+	0,
+	0,
+	0,
+	0,
 }
 
 var compressFragmentFastImpl_kFirstBlockSize uint = 3 << 15
 var compressFragmentFastImpl_kMergeBlockSize uint = 1 << 16
 
-func compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table []int, table_bits uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, bw *bitWriter) {
+func compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table []int, table_bits uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, storage_ix *uint, storage []byte) {
 	var cmd_histo [128]uint32
 	var ip_end int
 	var next_emit int = 0
@@ -352,7 +500,7 @@ func compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table []
 	var metablock_start int = input
 	var block_size uint = brotli_min_size_t(input_size, compressFragmentFastImpl_kFirstBlockSize)
 	var total_block_size uint = block_size
-	var mlen_storage_ix uint = bw.getPos() + 3
+	var mlen_storage_ix uint = *storage_ix + 3
 	var lit_depth [256]byte
 	var lit_bits [256]uint16
 	var literal_ratio uint
@@ -369,21 +517,21 @@ func compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table []
 
 	/* Save the bit position of the MLEN field of the meta-block header, so that
 	   we can update it later if we decide to extend this meta-block. */
-	storeMetaBlockHeader1(block_size, false, bw)
+	storeMetaBlockHeader1(block_size, false, storage_ix, storage)
 
 	/* No block splits, no contexts. */
-	bw.writeBits(13, 0)
+	writeBits(13, 0, storage_ix, storage)
 
-	literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], bw)
+	literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], storage_ix, storage)
 	{
 		/* Store the pre-compressed command and distance prefix codes. */
 		var i uint
 		for i = 0; i+7 < *cmd_code_numbits; i += 8 {
-			bw.writeBits(8, uint64(cmd_code[i>>3]))
+			writeBits(8, uint64(cmd_code[i>>3]), storage_ix, storage)
 		}
 	}
 
-	bw.writeBits(*cmd_code_numbits&7, uint64(cmd_code[*cmd_code_numbits>>3]))
+	writeBits(*cmd_code_numbits&7, uint64(cmd_code[*cmd_code_numbits>>3]), storage_ix, storage)
 
 	/* Initialize the command and distance histograms. We will gather
 	   statistics of command and distance codes during the processing
@@ -456,7 +604,7 @@ emit_commands:
 				assert(candidate < ip)
 
 				table[hash] = int(ip - base_ip)
-				if !(!isMatch5(in[ip:], in[candidate:])) {
+				if isMatch5(in[ip:], in[candidate:]) {
 					break
 				}
 			}
@@ -482,27 +630,27 @@ emit_commands:
 				var insert uint = uint(base - next_emit)
 				ip += int(matched)
 				if insert < 6210 {
-					emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], bw)
+					emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
 				} else if shouldUseUncompressedMode(in[metablock_start:], in[next_emit:], insert, literal_ratio) {
-					emitUncompressedMetaBlock1(in[metablock_start:base], mlen_storage_ix-3, bw)
+					emitUncompressedMetaBlock1(in[metablock_start:], in[base:], mlen_storage_ix-3, storage_ix, storage)
 					input_size -= uint(base - input)
 					input = base
 					next_emit = input
 					goto next_block
 				} else {
-					emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], bw)
+					emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
 				}
 
-				emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], bw)
+				emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], storage_ix, storage)
 				if distance == last_distance {
-					bw.writeBits(uint(cmd_depth[64]), uint64(cmd_bits[64]))
+					writeBits(uint(cmd_depth[64]), uint64(cmd_bits[64]), storage_ix, storage)
 					cmd_histo[64]++
 				} else {
-					emitDistance1(uint(distance), cmd_depth, cmd_bits, cmd_histo[:], bw)
+					emitDistance1(uint(distance), cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
 					last_distance = distance
 				}
 
-				emitCopyLenLastDistance1(matched, cmd_depth, cmd_bits, cmd_histo[:], bw)
+				emitCopyLenLastDistance1(matched, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
 
 				next_emit = ip
 				if ip >= ip_limit {
@@ -538,8 +686,8 @@ emit_commands:
 				}
 				ip += int(matched)
 				last_distance = int(base - candidate) /* > 0 */
-				emitCopyLen1(matched, cmd_depth, cmd_bits, cmd_histo[:], bw)
-				emitDistance1(uint(last_distance), cmd_depth, cmd_bits, cmd_histo[:], bw)
+				emitCopyLen1(matched, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
+				emitDistance1(uint(last_distance), cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
 
 				next_emit = ip
 				if ip >= ip_limit {
@@ -585,7 +733,7 @@ emit_remainder:
 		   nibbles. */
 		total_block_size += block_size
 
-		bw.updateBits(20, uint32(total_block_size-1), mlen_storage_ix)
+		updateBits(20, uint32(total_block_size-1), mlen_storage_ix, storage)
 		goto emit_commands
 	}
 
@@ -593,13 +741,13 @@ emit_remainder:
 	if next_emit < ip_end {
 		var insert uint = uint(ip_end - next_emit)
 		if insert < 6210 {
-			emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], bw)
-			emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], bw)
+			emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
+			emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], storage_ix, storage)
 		} else if shouldUseUncompressedMode(in[metablock_start:], in[next_emit:], insert, literal_ratio) {
-			emitUncompressedMetaBlock1(in[metablock_start:ip_end], mlen_storage_ix-3, bw)
+			emitUncompressedMetaBlock1(in[metablock_start:], in[ip_end:], mlen_storage_ix-3, storage_ix, storage)
 		} else {
-			emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], bw)
-			emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], bw)
+			emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage)
+			emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], storage_ix, storage)
 		}
 	}
 
@@ -615,29 +763,30 @@ next_block:
 
 		/* Save the bit position of the MLEN field of the meta-block header, so that
 		   we can update it later if we decide to extend this meta-block. */
-		mlen_storage_ix = bw.getPos() + 3
+		mlen_storage_ix = *storage_ix + 3
 
-		storeMetaBlockHeader1(block_size, false, bw)
+		storeMetaBlockHeader1(block_size, false, storage_ix, storage)
 
 		/* No block splits, no contexts. */
-		bw.writeBits(13, 0)
+		writeBits(13, 0, storage_ix, storage)
 
-		literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], bw)
-		buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, bw)
+		literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], storage_ix, storage)
+		buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, storage_ix, storage)
 		goto emit_commands
 	}
 
 	if !is_last {
 		/* If this is not the last block, update the command and distance prefix
 		   codes for the next block and store the compressed forms. */
-		var bw bitWriter
-		bw.dst = cmd_code
-		buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, &bw)
-		*cmd_code_numbits = bw.getPos()
+		cmd_code[0] = 0
+
+		*cmd_code_numbits = 0
+		buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, cmd_code_numbits, cmd_code)
 	}
 }
 
-/* Compresses "input" string to bw as one or more complete meta-blocks.
+/* Compresses "input" string to the "*storage" buffer as one or more complete
+   meta-blocks, and updates the "*storage_ix" bit position.
 
    If "is_last" is 1, emits an additional empty last meta-block.
 
@@ -658,28 +807,28 @@ next_block:
    REQUIRES: "table_size" is an odd (9, 11, 13, 15) power of two
    OUTPUT: maximal copy distance <= |input_size|
    OUTPUT: maximal copy distance <= BROTLI_MAX_BACKWARD_LIMIT(18) */
-func compressFragmentFast(input []byte, input_size uint, is_last bool, table []int, table_size uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, bw *bitWriter) {
-	var initial_storage_ix uint = bw.getPos()
+func compressFragmentFast(input []byte, input_size uint, is_last bool, table []int, table_size uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, storage_ix *uint, storage []byte) {
+	var initial_storage_ix uint = *storage_ix
 	var table_bits uint = uint(log2FloorNonZero(table_size))
 
 	if input_size == 0 {
 		assert(is_last)
-		bw.writeBits(1, 1) /* islast */
-		bw.writeBits(1, 1) /* isempty */
-		bw.jumpToByteBoundary()
+		writeBits(1, 1, storage_ix, storage) /* islast */
+		writeBits(1, 1, storage_ix, storage) /* isempty */
+		*storage_ix = (*storage_ix + 7) &^ 7
 		return
 	}
 
-	compressFragmentFastImpl(input, input_size, is_last, table, table_bits, cmd_depth, cmd_bits, cmd_code_numbits, cmd_code, bw)
+	compressFragmentFastImpl(input, input_size, is_last, table, table_bits, cmd_depth, cmd_bits, cmd_code_numbits, cmd_code, storage_ix, storage)
 
 	/* If output is larger than single uncompressed block, rewrite it. */
-	if bw.getPos()-initial_storage_ix > 31+(input_size<<3) {
-		emitUncompressedMetaBlock1(input[:input_size], initial_storage_ix, bw)
+	if *storage_ix-initial_storage_ix > 31+(input_size<<3) {
+		emitUncompressedMetaBlock1(input, input[input_size:], initial_storage_ix, storage_ix, storage)
 	}
 
 	if is_last {
-		bw.writeBits(1, 1) /* islast */
-		bw.writeBits(1, 1) /* isempty */
-		bw.jumpToByteBoundary()
+		writeBits(1, 1, storage_ix, storage) /* islast */
+		writeBits(1, 1, storage_ix, storage) /* isempty */
+		*storage_ix = (*storage_ix + 7) &^ 7
 	}
 }
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go
index 2473aca3..172dc7f4 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go
@@ -41,7 +41,7 @@ func isMatch1(p1 []byte, p2 []byte, length uint) bool {
 
 /* Builds a command and distance prefix code (each 64 symbols) into "depth" and
    "bits" based on "histogram" and stores it into the bit stream. */
-func buildAndStoreCommandPrefixCode(histogram []uint32, depth []byte, bits []uint16, bw *bitWriter) {
+func buildAndStoreCommandPrefixCode(histogram []uint32, depth []byte, bits []uint16, storage_ix *uint, storage []byte) {
 	var tree [129]huffmanTree
 	var cmd_depth = [numCommandSymbols]byte{0}
 	/* Tree size for building a tree over 64 symbols is 2 * 64 + 1. */
@@ -87,10 +87,10 @@ func buildAndStoreCommandPrefixCode(histogram []uint32, depth []byte, bits []uin
 			cmd_depth[448+8*i] = depth[16+i]
 		}
 
-		storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], bw)
+		storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], storage_ix, storage)
 	}
 
-	storeHuffmanTree(depth[64:], 64, tree[:], bw)
+	storeHuffmanTree(depth[64:], 64, tree[:], storage_ix, storage)
 }
 
 func emitInsertLen(insertlen uint32, commands *[]uint32) {
@@ -197,11 +197,11 @@ func emitDistance(distance uint32, commands *[]uint32) {
 }
 
 /* REQUIRES: len <= 1 << 24. */
-func storeMetaBlockHeader(len uint, is_uncompressed bool, bw *bitWriter) {
+func storeMetaBlockHeader(len uint, is_uncompressed bool, storage_ix *uint, storage []byte) {
 	var nibbles uint = 6
 
 	/* ISLAST */
-	bw.writeBits(1, 0)
+	writeBits(1, 0, storage_ix, storage)
 
 	if len <= 1<<16 {
 		nibbles = 4
@@ -209,11 +209,11 @@ func storeMetaBlockHeader(len uint, is_uncompressed bool, bw *bitWriter) {
 		nibbles = 5
 	}
 
-	bw.writeBits(2, uint64(nibbles)-4)
-	bw.writeBits(nibbles*4, uint64(len)-1)
+	writeBits(2, uint64(nibbles)-4, storage_ix, storage)
+	writeBits(nibbles*4, uint64(len)-1, storage_ix, storage)
 
 	/* ISUNCOMPRESSED */
-	bw.writeSingleBit(is_uncompressed)
+	writeSingleBit(is_uncompressed, storage_ix, storage)
 }
 
 func createCommands(input []byte, block_size uint, input_size uint, base_ip_ptr []byte, table []int, table_bits uint, min_match uint, literals *[]byte, commands *[]uint32) {
@@ -440,20 +440,163 @@ emit_remainder:
 }
 
 var storeCommands_kNumExtraBits = [128]uint32{
-	0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, 12, 14, 24,
-	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4,
-	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, 24,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
-	9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16,
-	17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	1,
+	1,
+	2,
+	2,
+	3,
+	3,
+	4,
+	4,
+	5,
+	5,
+	6,
+	7,
+	8,
+	9,
+	10,
+	12,
+	14,
+	24,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	1,
+	1,
+	2,
+	2,
+	3,
+	3,
+	4,
+	4,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	1,
+	1,
+	2,
+	2,
+	3,
+	3,
+	4,
+	4,
+	5,
+	5,
+	6,
+	7,
+	8,
+	9,
+	10,
+	24,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	1,
+	1,
+	2,
+	2,
+	3,
+	3,
+	4,
+	4,
+	5,
+	5,
+	6,
+	6,
+	7,
+	7,
+	8,
+	8,
+	9,
+	9,
+	10,
+	10,
+	11,
+	11,
+	12,
+	12,
+	13,
+	13,
+	14,
+	14,
+	15,
+	15,
+	16,
+	16,
+	17,
+	17,
+	18,
+	18,
+	19,
+	19,
+	20,
+	20,
+	21,
+	21,
+	22,
+	22,
+	23,
+	23,
+	24,
+	24,
 }
 var storeCommands_kInsertOffset = [24]uint32{
-	0, 1, 2, 3, 4, 5, 6, 8, 10, 14, 18, 26, 34, 50, 66, 98, 130, 194, 322, 578,
-	1090, 2114, 6210, 22594,
+	0,
+	1,
+	2,
+	3,
+	4,
+	5,
+	6,
+	8,
+	10,
+	14,
+	18,
+	26,
+	34,
+	50,
+	66,
+	98,
+	130,
+	194,
+	322,
+	578,
+	1090,
+	2114,
+	6210,
+	22594,
 }
 
-func storeCommands(literals []byte, num_literals uint, commands []uint32, num_commands uint, bw *bitWriter) {
+func storeCommands(literals []byte, num_literals uint, commands []uint32, num_commands uint, storage_ix *uint, storage []byte) {
 	var lit_depths [256]byte
 	var lit_bits [256]uint16
 	var lit_histo = [256]uint32{0}
@@ -466,7 +609,7 @@ func storeCommands(literals []byte, num_literals uint, commands []uint32, num_co
 	}
 
 	buildAndStoreHuffmanTreeFast(lit_histo[:], num_literals, /* max_bits = */
-		8, lit_depths[:], lit_bits[:], bw)
+		8, lit_depths[:], lit_bits[:], storage_ix, storage)
 
 	for i = 0; i < num_commands; i++ {
 		var code uint32 = commands[i] & 0xFF
@@ -478,21 +621,21 @@ func storeCommands(literals []byte, num_literals uint, commands []uint32, num_co
 	cmd_histo[2] += 1
 	cmd_histo[64] += 1
 	cmd_histo[84] += 1
-	buildAndStoreCommandPrefixCode(cmd_histo[:], cmd_depths[:], cmd_bits[:], bw)
+	buildAndStoreCommandPrefixCode(cmd_histo[:], cmd_depths[:], cmd_bits[:], storage_ix, storage)
 
 	for i = 0; i < num_commands; i++ {
 		var cmd uint32 = commands[i]
 		var code uint32 = cmd & 0xFF
 		var extra uint32 = cmd >> 8
 		assert(code < 128)
-		bw.writeBits(uint(cmd_depths[code]), uint64(cmd_bits[code]))
-		bw.writeBits(uint(storeCommands_kNumExtraBits[code]), uint64(extra))
+		writeBits(uint(cmd_depths[code]), uint64(cmd_bits[code]), storage_ix, storage)
+		writeBits(uint(storeCommands_kNumExtraBits[code]), uint64(extra), storage_ix, storage)
 		if code < 24 {
 			var insert uint32 = storeCommands_kInsertOffset[code] + extra
 			var j uint32
 			for j = 0; j < insert; j++ {
 				var lit byte = literals[0]
-				bw.writeBits(uint(lit_depths[lit]), uint64(lit_bits[lit]))
+				writeBits(uint(lit_depths[lit]), uint64(lit_bits[lit]), storage_ix, storage)
 				literals = literals[1:]
 			}
 		}
@@ -520,13 +663,22 @@ func shouldCompress(input []byte, input_size uint, num_literals uint) bool {
 	}
 }
 
-func emitUncompressedMetaBlock(input []byte, input_size uint, bw *bitWriter) {
-	storeMetaBlockHeader(input_size, true, bw)
-	bw.jumpToByteBoundary()
-	bw.writeBytes(input[:input_size])
+func rewindBitPosition(new_storage_ix uint, storage_ix *uint, storage []byte) {
+	var bitpos uint = new_storage_ix & 7
+	var mask uint = (1 << bitpos) - 1
+	storage[new_storage_ix>>3] &= byte(mask)
+	*storage_ix = new_storage_ix
 }
 
-func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_bits uint, min_match uint, bw *bitWriter) {
+func emitUncompressedMetaBlock(input []byte, input_size uint, storage_ix *uint, storage []byte) {
+	storeMetaBlockHeader(input_size, true, storage_ix, storage)
+	*storage_ix = (*storage_ix + 7) &^ 7
+	copy(storage[*storage_ix>>3:], input[:input_size])
+	*storage_ix += input_size << 3
+	storage[*storage_ix>>3] = 0
+}
+
+func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_bits uint, min_match uint, storage_ix *uint, storage []byte) {
 	/* Save the start of the first block for position and distance computations.
 	 */
 	var base_ip []byte = input
@@ -540,17 +692,17 @@ func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, co
 		num_literals = uint(-cap(literals) + cap(literal_buf))
 		if shouldCompress(input, block_size, num_literals) {
 			var num_commands uint = uint(-cap(commands) + cap(command_buf))
-			storeMetaBlockHeader(block_size, false, bw)
+			storeMetaBlockHeader(block_size, false, storage_ix, storage)
 
 			/* No block splits, no contexts. */
-			bw.writeBits(13, 0)
+			writeBits(13, 0, storage_ix, storage)
 
-			storeCommands(literal_buf, num_literals, command_buf, num_commands, bw)
+			storeCommands(literal_buf, num_literals, command_buf, num_commands, storage_ix, storage)
 		} else {
 			/* Since we did not find many backward references and the entropy of
 			   the data is close to 8 bits, we can simply emit an uncompressed block.
 			   This makes compression speed of uncompressible data about 3x faster. */
-			emitUncompressedMetaBlock(input, block_size, bw)
+			emitUncompressedMetaBlock(input, block_size, storage_ix, storage)
 		}
 
 		input = input[block_size:]
@@ -558,7 +710,8 @@ func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, co
 	}
 }
 
-/* Compresses "input" string to bw as one or more complete meta-blocks.
+/* Compresses "input" string to the "*storage" buffer as one or more complete
+   meta-blocks, and updates the "*storage_ix" bit position.
 
    If "is_last" is 1, emits an additional empty last meta-block.
 
@@ -570,8 +723,8 @@ func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, co
    REQUIRES: "table_size" is a power of two
    OUTPUT: maximal copy distance <= |input_size|
    OUTPUT: maximal copy distance <= BROTLI_MAX_BACKWARD_LIMIT(18) */
-func compressFragmentTwoPass(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_size uint, bw *bitWriter) {
-	var initial_storage_ix uint = bw.getPos()
+func compressFragmentTwoPass(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_size uint, storage_ix *uint, storage []byte) {
+	var initial_storage_ix uint = *storage_ix
 	var table_bits uint = uint(log2FloorNonZero(table_size))
 	var min_match uint
 	if table_bits <= 15 {
@@ -579,17 +732,17 @@ func compressFragmentTwoPass(input []byte, input_size uint, is_last bool, comman
 	} else {
 		min_match = 6
 	}
-	compressFragmentTwoPassImpl(input, input_size, is_last, command_buf, literal_buf, table, table_bits, min_match, bw)
+	compressFragmentTwoPassImpl(input, input_size, is_last, command_buf, literal_buf, table, table_bits, min_match, storage_ix, storage)
 
 	/* If output is larger than single uncompressed block, rewrite it. */
-	if bw.getPos()-initial_storage_ix > 31+(input_size<<3) {
-		bw.rewind(initial_storage_ix)
-		emitUncompressedMetaBlock(input, input_size, bw)
+	if *storage_ix-initial_storage_ix > 31+(input_size<<3) {
+		rewindBitPosition(initial_storage_ix, storage_ix, storage)
+		emitUncompressedMetaBlock(input, input_size, storage_ix, storage)
 	}
 
 	if is_last {
-		bw.writeBits(1, 1) /* islast */
-		bw.writeBits(1, 1) /* isempty */
-		bw.jumpToByteBoundary()
+		writeBits(1, 1, storage_ix, storage) /* islast */
+		writeBits(1, 1, storage_ix, storage) /* isempty */
+		*storage_ix = (*storage_ix + 7) &^ 7
 	}
 }
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/decode.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/decode.go
index d2f39a05..6a73b88a 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/decode.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/decode.go
@@ -50,21 +50,6 @@ const (
 	decoderErrorUnreachable                 = -31
 )
 
-/**
- * The value of the last error code, negative integer.
- *
- * All other error code values are in the range from ::lastErrorCode
- * to @c -1. There are also 4 other possible non-error codes @c 0 .. @c 3 in
- * ::BrotliDecoderErrorCode enumeration.
- */
-const lastErrorCode = decoderErrorUnreachable
-
-/** Options to be used with ::BrotliDecoderSetParameter. */
-const (
-	decoderParamDisableRingBufferReallocation = 0
-	decoderParamLargeWindow                   = 1
-)
-
 const huffmanTableBits = 8
 
 const huffmanTableMask = 0xFF
@@ -81,28 +66,6 @@ var kCodeLengthPrefixLength = [16]byte{2, 2, 2, 3, 2, 2, 2, 4, 2, 2, 2, 3, 2, 2,
 
 var kCodeLengthPrefixValue = [16]byte{0, 4, 3, 2, 0, 4, 3, 1, 0, 4, 3, 2, 0, 4, 3, 5}
 
-func decoderSetParameter(state *Reader, p int, value uint32) bool {
-	if state.state != stateUninited {
-		return false
-	}
-	switch p {
-	case decoderParamDisableRingBufferReallocation:
-		if !(value == 0) {
-			state.canny_ringbuffer_allocation = 0
-		} else {
-			state.canny_ringbuffer_allocation = 1
-		}
-		return true
-
-	case decoderParamLargeWindow:
-		state.large_window = (!(value == 0))
-		return true
-
-	default:
-		return false
-	}
-}
-
 /* Saves error code and converts it to BrotliDecoderResult. */
 func saveErrorCode(s *Reader, e int) int {
 	s.error_code = int(e)
@@ -1125,10 +1088,8 @@ func decodeContextMap(context_map_size uint32, num_htrees *uint32, context_map_a
    Reads 3..54 bits. */
 func decodeBlockTypeAndLength(safe int, s *Reader, tree_type int) bool {
 	var max_block_type uint32 = s.num_block_types[tree_type]
-	var type_tree []huffmanCode
-	type_tree = s.block_type_trees[tree_type*huffmanMaxSize258:]
-	var len_tree []huffmanCode
-	len_tree = s.block_len_trees[tree_type*huffmanMaxSize26:]
+	type_tree := s.block_type_trees[tree_type*huffmanMaxSize258:]
+	len_tree := s.block_len_trees[tree_type*huffmanMaxSize26:]
 	var br *bitReader = &s.br
 	var ringbuffer []uint32 = s.block_type_rb[tree_type*2:]
 	var block_type uint32
@@ -1280,8 +1241,7 @@ func unwrittenBytes(s *Reader, wrap bool) uint {
    Returns BROTLI_DECODER_NEEDS_MORE_OUTPUT only if there is more output to push
    and either ring-buffer is as big as window size, or |force| is true. */
 func writeRingBuffer(s *Reader, available_out *uint, next_out *[]byte, total_out *uint, force bool) int {
-	var start []byte
-	start = s.ringbuffer[s.partial_pos_out&uint(s.ringbuffer_mask):]
+	start := s.ringbuffer[s.partial_pos_out&uint(s.ringbuffer_mask):]
 	var to_write uint = unwrittenBytes(s, true)
 	var num_written uint = *available_out
 	if num_written > to_write {
@@ -1412,8 +1372,7 @@ func copyUncompressedBlockToOutput(available_out *uint, next_out *[]byte, total_
 
 		case stateUncompressedWrite:
 			{
-				var result int
-				result = writeRingBuffer(s, available_out, next_out, total_out, false)
+				result := writeRingBuffer(s, available_out, next_out, total_out, false)
 				if result != decoderSuccess {
 					return result
 				}
@@ -1931,8 +1890,7 @@ CommandPostDecodeLiterals:
 			}
 
 			if transform_idx < int(trans.num_transforms) {
-				var word []byte
-				word = words.data[offset:]
+				word := words.data[offset:]
 				var len int = i
 				if transform_idx == int(trans.cutOffTransforms[0]) {
 					copy(s.ringbuffer[pos:], word[:uint(len)])
@@ -1954,10 +1912,8 @@ CommandPostDecodeLiterals:
 		}
 	} else {
 		var src_start int = (pos - s.distance_code) & s.ringbuffer_mask
-		var copy_dst []byte
-		copy_dst = s.ringbuffer[pos:]
-		var copy_src []byte
-		copy_src = s.ringbuffer[src_start:]
+		copy_dst := s.ringbuffer[pos:]
+		copy_src := s.ringbuffer[src_start:]
 		var dst_end int = pos + i
 		var src_end int = src_start + i
 
@@ -2494,8 +2450,6 @@ func decoderDecompressStream(s *Reader, available_in *uint, next_in *[]byte, ava
 				} else {
 					s.state = stateCommandBegin
 				}
-
-				break
 			} else if s.state == stateCommandPostWrite2 {
 				s.state = stateCommandPostWrapCopy /* BROTLI_STATE_COMMAND_INNER_WRITE */
 			} else {
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/encode.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/encode.go
index 3abaf571..8e25a4ec 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/encode.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/encode.go
@@ -87,9 +87,11 @@ type Writer struct {
 	last_processed_pos_ uint64
 	dist_cache_         [numDistanceShortCodes]int
 	saved_dist_cache_   [4]int
+	last_bytes_         uint16
+	last_bytes_bits_    byte
 	prev_byte_          byte
 	prev_byte2_         byte
-	bw                  bitWriter
+	storage             []byte
 	small_table_        [1 << 10]int
 	large_table_        []int
 	large_table_size_   uint
@@ -139,6 +141,14 @@ func wrapPosition(position uint64) uint32 {
 	return result
 }
 
+func (s *Writer) getStorage(size int) []byte {
+	if len(s.storage) < size {
+		s.storage = make([]byte, size)
+	}
+
+	return s.storage
+}
+
 func hashTableSize(max_table_size uint, input_size uint) uint {
 	var htsize uint = 256
 	for htsize < max_table_size && htsize < input_size {
@@ -184,18 +194,23 @@ func getHashTable(s *Writer, quality int, input_size uint, table_size *uint) []i
 	return table
 }
 
-func encodeWindowBits(lgwin int, large_window bool, bw *bitWriter) {
+func encodeWindowBits(lgwin int, large_window bool, last_bytes *uint16, last_bytes_bits *byte) {
 	if large_window {
-		bw.writeBits(14, uint64((lgwin&0x3F)<<8|0x11))
+		*last_bytes = uint16((lgwin&0x3F)<<8 | 0x11)
+		*last_bytes_bits = 14
 	} else {
 		if lgwin == 16 {
-			bw.writeBits(1, 0)
+			*last_bytes = 0
+			*last_bytes_bits = 1
 		} else if lgwin == 17 {
-			bw.writeBits(7, 1)
+			*last_bytes = 1
+			*last_bytes_bits = 7
 		} else if lgwin > 17 {
-			bw.writeBits(4, uint64((lgwin-17)<<1|0x01))
+			*last_bytes = uint16((lgwin-17)<<1 | 0x01)
+			*last_bytes_bits = 4
 		} else {
-			bw.writeBits(7, uint64((lgwin-8)<<4|0x01))
+			*last_bytes = uint16((lgwin-8)<<4 | 0x01)
+			*last_bytes_bits = 7
 		}
 	}
 }
@@ -417,15 +432,18 @@ func chooseContextMode(params *encoderParams, data []byte, pos uint, mask uint,
 	return contextUTF8
 }
 
-func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes uint, is_last bool, literal_context_mode int, params *encoderParams, prev_byte byte, prev_byte2 byte, num_literals uint, commands []command, saved_dist_cache []int, dist_cache []int, bw *bitWriter) {
+func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes uint, is_last bool, literal_context_mode int, params *encoderParams, prev_byte byte, prev_byte2 byte, num_literals uint, commands []command, saved_dist_cache []int, dist_cache []int, storage_ix *uint, storage []byte) {
 	var wrapped_last_flush_pos uint32 = wrapPosition(last_flush_pos)
+	var last_bytes uint16
+	var last_bytes_bits byte
 	var literal_context_lut contextLUT = getContextLUT(literal_context_mode)
 	var block_params encoderParams = *params
 
 	if bytes == 0 {
 		/* Write the ISLAST and ISEMPTY bits. */
-		bw.writeBits(2, 3)
-		bw.jumpToByteBoundary()
+		writeBits(2, 3, storage_ix, storage)
+
+		*storage_ix = (*storage_ix + 7) &^ 7
 		return
 	}
 
@@ -434,15 +452,17 @@ func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes
 		   CreateBackwardReferences is now unused. */
 		copy(dist_cache, saved_dist_cache[:4])
 
-		storeUncompressedMetaBlock(is_last, data, uint(wrapped_last_flush_pos), mask, bytes, bw)
+		storeUncompressedMetaBlock(is_last, data, uint(wrapped_last_flush_pos), mask, bytes, storage_ix, storage)
 		return
 	}
 
-	savedPos := bw.getPos()
+	assert(*storage_ix <= 14)
+	last_bytes = uint16(storage[1])<<8 | uint16(storage[0])
+	last_bytes_bits = byte(*storage_ix)
 	if params.quality <= maxQualityForStaticEntropyCodes {
-		storeMetaBlockFast(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, bw)
+		storeMetaBlockFast(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, storage_ix, storage)
 	} else if params.quality < minQualityForBlockSplit {
-		storeMetaBlockTrivial(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, bw)
+		storeMetaBlockTrivial(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, storage_ix, storage)
 	} else {
 		mb := getMetaBlockSplit()
 		if params.quality < minQualityForHqBlockSplitting {
@@ -469,15 +489,18 @@ func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes
 			optimizeHistograms(num_effective_dist_codes, mb)
 		}
 
-		storeMetaBlock(data, uint(wrapped_last_flush_pos), bytes, mask, prev_byte, prev_byte2, is_last, &block_params, literal_context_mode, commands, mb, bw)
+		storeMetaBlock(data, uint(wrapped_last_flush_pos), bytes, mask, prev_byte, prev_byte2, is_last, &block_params, literal_context_mode, commands, mb, storage_ix, storage)
 		freeMetaBlockSplit(mb)
 	}
 
-	if bytes+4 < bw.getPos()>>3 {
+	if bytes+4 < *storage_ix>>3 {
 		/* Restore the distance cache and last byte. */
 		copy(dist_cache, saved_dist_cache[:4])
-		bw.rewind(savedPos)
-		storeUncompressedMetaBlock(is_last, data, uint(wrapped_last_flush_pos), mask, bytes, bw)
+
+		storage[0] = byte(last_bytes)
+		storage[1] = byte(last_bytes >> 8)
+		*storage_ix = uint(last_bytes_bits)
+		storeUncompressedMetaBlock(is_last, data, uint(wrapped_last_flush_pos), mask, bytes, storage_ix, storage)
 	}
 }
 
@@ -510,10 +533,8 @@ func ensureInitialized(s *Writer) bool {
 		return true
 	}
 
-	s.bw.bits = 0
-	s.bw.nbits = 0
-	s.bw.dst = s.bw.dst[:0]
-
+	s.last_bytes_bits_ = 0
+	s.last_bytes_ = 0
 	s.remaining_metadata_bytes_ = math.MaxUint32
 
 	sanitizeParams(&s.params)
@@ -529,7 +550,7 @@ func ensureInitialized(s *Writer) bool {
 			lgwin = brotli_max_int(lgwin, 18)
 		}
 
-		encodeWindowBits(lgwin, s.params.large_window, &s.bw)
+		encodeWindowBits(lgwin, s.params.large_window, &s.last_bytes_, &s.last_bytes_bits_)
 	}
 
 	if s.params.quality == fastOnePassCompressionQuality {
@@ -761,6 +782,8 @@ func encodeData(s *Writer, is_last bool, force_flush bool) bool {
 	}
 
 	if s.params.quality == fastOnePassCompressionQuality || s.params.quality == fastTwoPassCompressionQuality {
+		var storage []byte
+		var storage_ix uint = uint(s.last_bytes_bits_)
 		var table_size uint
 		var table []int
 
@@ -770,16 +793,20 @@ func encodeData(s *Writer, is_last bool, force_flush bool) bool {
 			return true
 		}
 
+		storage = s.getStorage(int(2*bytes + 503))
+		storage[0] = byte(s.last_bytes_)
+		storage[1] = byte(s.last_bytes_ >> 8)
 		table = getHashTable(s, s.params.quality, uint(bytes), &table_size)
 		if s.params.quality == fastOnePassCompressionQuality {
-			compressFragmentFast(data[wrapped_last_processed_pos&mask:], uint(bytes), is_last, table, table_size, s.cmd_depths_[:], s.cmd_bits_[:], &s.cmd_code_numbits_, s.cmd_code_[:], &s.bw)
+			compressFragmentFast(data[wrapped_last_processed_pos&mask:], uint(bytes), is_last, table, table_size, s.cmd_depths_[:], s.cmd_bits_[:], &s.cmd_code_numbits_, s.cmd_code_[:], &storage_ix, storage)
 		} else {
-			compressFragmentTwoPass(data[wrapped_last_processed_pos&mask:], uint(bytes), is_last, s.command_buf_, s.literal_buf_, table, table_size, &s.bw)
+			compressFragmentTwoPass(data[wrapped_last_processed_pos&mask:], uint(bytes), is_last, s.command_buf_, s.literal_buf_, table, table_size, &storage_ix, storage)
 		}
 
+		s.last_bytes_ = uint16(storage[storage_ix>>3])
+		s.last_bytes_bits_ = byte(storage_ix & 7)
 		updateLastProcessedPos(s)
-		s.writeOutput(s.bw.dst)
-		s.bw.dst = s.bw.dst[:0]
+		s.writeOutput(storage[:storage_ix>>3])
 		return true
 	}
 	{
@@ -856,7 +883,13 @@ func encodeData(s *Writer, is_last bool, force_flush bool) bool {
 	assert(s.input_pos_-s.last_flush_pos_ <= 1<<24)
 	{
 		var metablock_size uint32 = uint32(s.input_pos_ - s.last_flush_pos_)
-		writeMetaBlockInternal(data, uint(mask), s.last_flush_pos_, uint(metablock_size), is_last, literal_context_mode, &s.params, s.prev_byte_, s.prev_byte2_, s.num_literals_, s.commands, s.saved_dist_cache_[:], s.dist_cache_[:], &s.bw)
+		var storage []byte = s.getStorage(int(2*metablock_size + 503))
+		var storage_ix uint = uint(s.last_bytes_bits_)
+		storage[0] = byte(s.last_bytes_)
+		storage[1] = byte(s.last_bytes_ >> 8)
+		writeMetaBlockInternal(data, uint(mask), s.last_flush_pos_, uint(metablock_size), is_last, literal_context_mode, &s.params, s.prev_byte_, s.prev_byte2_, s.num_literals_, s.commands, s.saved_dist_cache_[:], s.dist_cache_[:], &storage_ix, storage)
+		s.last_bytes_ = uint16(storage[storage_ix>>3])
+		s.last_bytes_bits_ = byte(storage_ix & 7)
 		s.last_flush_pos_ = s.input_pos_
 		if updateLastProcessedPos(s) {
 			hasherReset(s.hasher_)
@@ -877,22 +910,27 @@ func encodeData(s *Writer, is_last bool, force_flush bool) bool {
 		   emitting an uncompressed block. */
 		copy(s.saved_dist_cache_[:], s.dist_cache_[:])
 
-		s.writeOutput(s.bw.dst)
-		s.bw.dst = s.bw.dst[:0]
+		s.writeOutput(storage[:storage_ix>>3])
 		return true
 	}
 }
 
-/* Dumps remaining output bits and metadata header to s.bw.
+/* Dumps remaining output bits and metadata header to |header|.
+   Returns number of produced bytes.
+   REQUIRED: |header| should be 8-byte aligned and at least 16 bytes long.
    REQUIRED: |block_size| <= (1 << 24). */
-func writeMetadataHeader(s *Writer, block_size uint) {
-	bw := &s.bw
-
-	bw.writeBits(1, 0)
-	bw.writeBits(2, 3)
-	bw.writeBits(1, 0)
+func writeMetadataHeader(s *Writer, block_size uint, header []byte) uint {
+	storage_ix := uint(s.last_bytes_bits_)
+	header[0] = byte(s.last_bytes_)
+	header[1] = byte(s.last_bytes_ >> 8)
+	s.last_bytes_ = 0
+	s.last_bytes_bits_ = 0
+
+	writeBits(1, 0, &storage_ix, header)
+	writeBits(2, 3, &storage_ix, header)
+	writeBits(1, 0, &storage_ix, header)
 	if block_size == 0 {
-		bw.writeBits(2, 0)
+		writeBits(2, 0, &storage_ix, header)
 	} else {
 		var nbits uint32
 		if block_size == 1 {
@@ -901,19 +939,34 @@ func writeMetadataHeader(s *Writer, block_size uint) {
 			nbits = log2FloorNonZero(uint(uint32(block_size)-1)) + 1
 		}
 		var nbytes uint32 = (nbits + 7) / 8
-		bw.writeBits(2, uint64(nbytes))
-		bw.writeBits(uint(8*nbytes), uint64(block_size)-1)
+		writeBits(2, uint64(nbytes), &storage_ix, header)
+		writeBits(uint(8*nbytes), uint64(block_size)-1, &storage_ix, header)
 	}
 
-	bw.jumpToByteBoundary()
+	return (storage_ix + 7) >> 3
 }
 
 func injectBytePaddingBlock(s *Writer) {
+	var seal uint32 = uint32(s.last_bytes_)
+	var seal_bits uint = uint(s.last_bytes_bits_)
+	s.last_bytes_ = 0
+	s.last_bytes_bits_ = 0
+
 	/* is_last = 0, data_nibbles = 11, reserved = 0, meta_nibbles = 00 */
-	s.bw.writeBits(6, 0x6)
-	s.bw.jumpToByteBoundary()
-	s.writeOutput(s.bw.dst)
-	s.bw.dst = s.bw.dst[:0]
+	seal |= 0x6 << seal_bits
+
+	seal_bits += 6
+
+	destination := s.tiny_buf_.u8[:]
+
+	destination[0] = byte(seal)
+	if seal_bits > 8 {
+		destination[1] = byte(seal >> 8)
+	}
+	if seal_bits > 16 {
+		destination[2] = byte(seal >> 16)
+	}
+	s.writeOutput(destination[:(seal_bits+7)>>3])
 }
 
 func checkFlushComplete(s *Writer) {
@@ -945,7 +998,7 @@ func encoderCompressStreamFast(s *Writer, op int, available_in *uint, next_in *[
 	}
 
 	for {
-		if s.stream_state_ == streamFlushRequested && s.bw.nbits&7 != 0 {
+		if s.stream_state_ == streamFlushRequested && s.last_bytes_bits_ != 0 {
 			injectBytePaddingBlock(s)
 			continue
 		}
@@ -957,6 +1010,9 @@ func encoderCompressStreamFast(s *Writer, op int, available_in *uint, next_in *[
 			var block_size uint = brotli_min_size_t(block_size_limit, *available_in)
 			var is_last bool = (*available_in == block_size) && (op == int(operationFinish))
 			var force_flush bool = (*available_in == block_size) && (op == int(operationFlush))
+			var max_out_size uint = 2*block_size + 503
+			var storage []byte = nil
+			var storage_ix uint = uint(s.last_bytes_bits_)
 			var table_size uint
 			var table []int
 
@@ -965,18 +1021,25 @@ func encoderCompressStreamFast(s *Writer, op int, available_in *uint, next_in *[
 				continue
 			}
 
+			storage = s.getStorage(int(max_out_size))
+
+			storage[0] = byte(s.last_bytes_)
+			storage[1] = byte(s.last_bytes_ >> 8)
 			table = getHashTable(s, s.params.quality, block_size, &table_size)
 
 			if s.params.quality == fastOnePassCompressionQuality {
-				compressFragmentFast(*next_in, block_size, is_last, table, table_size, s.cmd_depths_[:], s.cmd_bits_[:], &s.cmd_code_numbits_, s.cmd_code_[:], &s.bw)
+				compressFragmentFast(*next_in, block_size, is_last, table, table_size, s.cmd_depths_[:], s.cmd_bits_[:], &s.cmd_code_numbits_, s.cmd_code_[:], &storage_ix, storage)
 			} else {
-				compressFragmentTwoPass(*next_in, block_size, is_last, command_buf, literal_buf, table, table_size, &s.bw)
+				compressFragmentTwoPass(*next_in, block_size, is_last, command_buf, literal_buf, table, table_size, &storage_ix, storage)
 			}
 
 			*next_in = (*next_in)[block_size:]
 			*available_in -= block_size
-			s.writeOutput(s.bw.dst)
-			s.bw.dst = s.bw.dst[:0]
+			var out_bytes uint = storage_ix >> 3
+			s.writeOutput(storage[:out_bytes])
+
+			s.last_bytes_ = uint16(storage[storage_ix>>3])
+			s.last_bytes_bits_ = byte(storage_ix & 7)
 
 			if force_flush {
 				s.stream_state_ = streamFlushRequested
@@ -1010,7 +1073,7 @@ func processMetadata(s *Writer, available_in *uint, next_in *[]byte) bool {
 	}
 
 	for {
-		if s.stream_state_ == streamFlushRequested && s.bw.nbits&7 != 0 {
+		if s.stream_state_ == streamFlushRequested && s.last_bytes_bits_ != 0 {
 			injectBytePaddingBlock(s)
 			continue
 		}
@@ -1024,9 +1087,8 @@ func processMetadata(s *Writer, available_in *uint, next_in *[]byte) bool {
 		}
 
 		if s.stream_state_ == streamMetadataHead {
-			writeMetadataHeader(s, uint(s.remaining_metadata_bytes_))
-			s.writeOutput(s.bw.dst)
-			s.bw.dst = s.bw.dst[:0]
+			n := writeMetadataHeader(s, uint(s.remaining_metadata_bytes_), s.tiny_buf_.u8[:])
+			s.writeOutput(s.tiny_buf_.u8[:n])
 			s.stream_state_ = streamMetadataBody
 			continue
 		} else {
@@ -1112,7 +1174,7 @@ func encoderCompressStream(s *Writer, op int, available_in *uint, next_in *[]byt
 			continue
 		}
 
-		if s.stream_state_ == streamFlushRequested && s.bw.nbits&7 != 0 {
+		if s.stream_state_ == streamFlushRequested && s.last_bytes_bits_ != 0 {
 			injectBytePaddingBlock(s)
 			continue
 		}
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/entropy_encode_static.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/entropy_encode_static.go
index 2543f8f0..5ddf3fcb 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/entropy_encode_static.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/entropy_encode_static.go
@@ -778,9 +778,8 @@ var kStaticDistanceCodeDepth = [64]byte{
 
 var kCodeLengthBits = [18]uint32{0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 15, 31, 0, 11, 7}
 
-func storeStaticCodeLengthCode(bw *bitWriter) {
-	bw.writeBits(32, 0x55555554)
-	bw.writeBits(8, 0xFF)
+func storeStaticCodeLengthCode(storage_ix *uint, storage []byte) {
+	writeBits(40, 0x0000FF55555554, storage_ix, storage)
 }
 
 var kZeroRepsBits = [numCommandSymbols]uint64{
@@ -4318,10 +4317,9 @@ var kStaticCommandCodeBits = [numCommandSymbols]uint16{
 	2047,
 }
 
-func storeStaticCommandHuffmanTree(bw *bitWriter) {
-	bw.writeBits(32, 0x16307003)
-	bw.writeBits(24, 0x926244)
-	bw.writeBits(3, 0x00000000)
+func storeStaticCommandHuffmanTree(storage_ix *uint, storage []byte) {
+	writeBits(56, 0x92624416307003, storage_ix, storage)
+	writeBits(3, 0x00000000, storage_ix, storage)
 }
 
 var kStaticDistanceCodeBits = [64]uint16{
@@ -4391,6 +4389,6 @@ var kStaticDistanceCodeBits = [64]uint16{
 	63,
 }
 
-func storeStaticDistanceHuffmanTree(bw *bitWriter) {
-	bw.writeBits(28, 0x0369DC03)
+func storeStaticDistanceHuffmanTree(storage_ix *uint, storage []byte) {
+	writeBits(28, 0x0369DC03, storage_ix, storage)
 }
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/fast_log.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/fast_log.go
index bbae3009..9d6607f7 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/fast_log.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/fast_log.go
@@ -1,6 +1,9 @@
 package brotli
 
-import "math"
+import (
+	"math"
+	"math/bits"
+)
 
 /* Copyright 2013 Google Inc. All Rights Reserved.
 
@@ -11,16 +14,7 @@ import "math"
 /* Utilities for fast computation of logarithms. */
 
 func log2FloorNonZero(n uint) uint32 {
-	/* TODO: generalize and move to platform.h */
-	var result uint32 = 0
-	for {
-		n >>= 1
-		if n == 0 {
-			break
-		}
-		result++
-	}
-	return result
+	return uint32(bits.Len(n)) - 1
 }
 
 /* A lookup table for small values of log2(int) to be used in entropy
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/go.mod b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/go.mod
index 8e609842..1c94232c 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/go.mod
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/go.mod
@@ -1,3 +1,5 @@
 module github.com/andybalholm/brotli
 
 go 1.12
+
+retract v1.0.1 // occasional panics and data corruption
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash.go
index 003b433e..00f812e8 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash.go
@@ -29,8 +29,6 @@ type hasherHandle interface {
 	Store(data []byte, mask uint, ix uint)
 }
 
-type score_t uint
-
 const kCutoffTransformsCount uint32 = 10
 
 /*   0,  12,   27,    23,    42,    63,    56,    48,    59,    64 */
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go
index 3364c44b..306e46d3 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go
@@ -110,8 +110,7 @@ func (h *hashForgetfulChain) Prepare(one_shot bool, input_size uint, data []byte
 func (h *hashForgetfulChain) Store(data []byte, mask uint, ix uint) {
 	var key uint = h.HashBytes(data[ix&mask:])
 	var bank uint = key & (h.numBanks - 1)
-	var idx uint
-	idx = uint(h.free_slot_idx[bank]) & ((1 << h.bankBits) - 1)
+	idx := uint(h.free_slot_idx[bank]) & ((1 << h.bankBits) - 1)
 	h.free_slot_idx[bank]++
 	var delta uint = ix - uint(h.addr[key])
 	h.tiny_hash[uint16(ix)] = byte(key)
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash_rolling.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash_rolling.go
index ad655a0a..6630fc07 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash_rolling.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/hash_rolling.go
@@ -48,7 +48,6 @@ type hashRolling struct {
 	state         uint32
 	table         []uint32
 	next_ix       uint
-	chunk_len     uint32
 	factor        uint32
 	factor_remove uint32
 }
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/http.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/http.go
index af58670f..1e981963 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/http.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/http.go
@@ -180,8 +180,8 @@ func init() {
 		var t octetType
 		isCtl := c <= 31 || c == 127
 		isChar := 0 <= c && c <= 127
-		isSeparator := strings.IndexRune(" \t\"(),/:;<=>?@[]\\{}", rune(c)) >= 0
-		if strings.IndexRune(" \t\r\n", rune(c)) >= 0 {
+		isSeparator := strings.ContainsRune(" \t\"(),/:;<=>?@[]\\{}", rune(c))
+		if strings.ContainsRune(" \t\r\n", rune(c)) {
 			t |= isSpace
 		}
 		if isChar && !isCtl && !isSeparator {
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/reader.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/reader.go
index 5c795e6e..cdc67645 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/reader.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/reader.go
@@ -33,7 +33,9 @@ func NewReader(src io.Reader) *Reader {
 func (r *Reader) Reset(src io.Reader) error {
 	decoderStateInit(r)
 	r.src = src
-	r.buf = make([]byte, readBufSize)
+	if r.buf == nil {
+		r.buf = make([]byte, readBufSize)
+	}
 	return nil
 }
 
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/static_dict.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/static_dict.go
index 8e7492d7..bc05566d 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/static_dict.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/static_dict.go
@@ -77,8 +77,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le
 		var offset uint = uint(dict.buckets[hash(data)])
 		var end bool = offset == 0
 		for !end {
-			var w dictWord
-			w = dict.dict_words[offset]
+			w := dict.dict_words[offset]
 			offset++
 			var l uint = uint(w.len) & 0x1F
 			var n uint = uint(1) << dict.words.size_bits_by_length[l]
@@ -431,8 +430,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le
 		var offset uint = uint(dict.buckets[hash(data[1:])])
 		var end bool = offset == 0
 		for !end {
-			var w dictWord
-			w = dict.dict_words[offset]
+			w := dict.dict_words[offset]
 			offset++
 			var l uint = uint(w.len) & 0x1F
 			var n uint = uint(1) << dict.words.size_bits_by_length[l]
@@ -596,8 +594,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le
 			var offset uint = uint(dict.buckets[hash(data[2:])])
 			var end bool = offset == 0
 			for !end {
-				var w dictWord
-				w = dict.dict_words[offset]
+				w := dict.dict_words[offset]
 				offset++
 				var l uint = uint(w.len) & 0x1F
 				var n uint = uint(1) << dict.words.size_bits_by_length[l]
@@ -629,8 +626,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le
 			var offset uint = uint(dict.buckets[hash(data[5:])])
 			var end bool = offset == 0
 			for !end {
-				var w dictWord
-				w = dict.dict_words[offset]
+				w := dict.dict_words[offset]
 				offset++
 				var l uint = uint(w.len) & 0x1F
 				var n uint = uint(1) << dict.words.size_bits_by_length[l]
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/utf8_util.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/utf8_util.go
index f86de3d2..3244247e 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/utf8_util.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/utf8_util.go
@@ -58,8 +58,7 @@ func isMostlyUTF8(data []byte, pos uint, mask uint, length uint, min_fraction fl
 	var i uint = 0
 	for i < length {
 		var symbol int
-		var current_data []byte
-		current_data = data[(pos+i)&mask:]
+		current_data := data[(pos+i)&mask:]
 		var bytes_read uint = parseAsUTF8(&symbol, current_data, length-i)
 		i += bytes_read
 		if symbol < 0x110000 {
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/write_bits.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/write_bits.go
index 2d216d7c..87299011 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/write_bits.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/write_bits.go
@@ -1,5 +1,7 @@
 package brotli
 
+import "encoding/binary"
+
 /* Copyright 2010 Google Inc. All Rights Reserved.
 
    Distributed under MIT license.
@@ -8,87 +10,43 @@ package brotli
 
 /* Write bits into a byte array. */
 
-type bitWriter struct {
-	dst []byte
-
-	// Data waiting to be written is the low nbits of bits.
-	bits  uint64
-	nbits uint
-}
+/* This function writes bits into bytes in increasing addresses, and within
+   a byte least-significant-bit first.
 
-func (w *bitWriter) writeBits(nb uint, b uint64) {
-	w.bits |= b << w.nbits
-	w.nbits += nb
-	if w.nbits >= 32 {
-		bits := w.bits
-		w.bits >>= 32
-		w.nbits -= 32
-		w.dst = append(w.dst,
-			byte(bits),
-			byte(bits>>8),
-			byte(bits>>16),
-			byte(bits>>24),
-		)
-	}
-}
+   The function can write up to 56 bits in one go with WriteBits
+   Example: let's assume that 3 bits (Rs below) have been written already:
 
-func (w *bitWriter) writeSingleBit(bit bool) {
-	if bit {
-		w.writeBits(1, 1)
-	} else {
-		w.writeBits(1, 0)
-	}
-}
+   BYTE-0     BYTE+1       BYTE+2
 
-func (w *bitWriter) jumpToByteBoundary() {
-	dst := w.dst
-	for w.nbits != 0 {
-		dst = append(dst, byte(w.bits))
-		w.bits >>= 8
-		if w.nbits > 8 { // Avoid underflow
-			w.nbits -= 8
-		} else {
-			w.nbits = 0
-		}
-	}
-	w.bits = 0
-	w.dst = dst
-}
+   0000 0RRR    0000 0000    0000 0000
 
-func (w *bitWriter) writeBytes(b []byte) {
-	if w.nbits&7 != 0 {
-		panic("writeBytes with unfinished bits")
-	}
-	for w.nbits != 0 {
-		w.dst = append(w.dst, byte(w.bits))
-		w.bits >>= 8
-		w.nbits -= 8
-	}
-	w.dst = append(w.dst, b...)
-}
+   Now, we could write 5 or fewer bits in MSB by just shifting by 3
+   and OR'ing to BYTE-0.
 
-func (w *bitWriter) getPos() uint {
-	return uint(len(w.dst)<<3) + w.nbits
+   For n bits, we take the last 5 bits, OR that with high bits in BYTE-0,
+   and locate the rest in BYTE+1, BYTE+2, etc. */
+func writeBits(n_bits uint, bits uint64, pos *uint, array []byte) {
+	/* This branch of the code can write up to 56 bits at a time,
+	   7 bits are lost by being perhaps already in *p and at least
+	   1 bit is needed to initialize the bit-stream ahead (i.e. if 7
+	   bits are in *p and we write 57 bits, then the next write will
+	   access a byte that was never initialized). */
+	p := array[*pos>>3:]
+	v := uint64(p[0])
+	v |= bits << (*pos & 7)
+	binary.LittleEndian.PutUint64(p, v)
+	*pos += n_bits
 }
 
-func (w *bitWriter) rewind(p uint) {
-	w.bits = uint64(w.dst[p>>3] & byte((1<<(p&7))-1))
-	w.nbits = p & 7
-	w.dst = w.dst[:p>>3]
+func writeSingleBit(bit bool, pos *uint, array []byte) {
+	if bit {
+		writeBits(1, 1, pos, array)
+	} else {
+		writeBits(1, 0, pos, array)
+	}
 }
 
-func (w *bitWriter) updateBits(n_bits uint, bits uint32, pos uint) {
-	for n_bits > 0 {
-		var byte_pos uint = pos >> 3
-		var n_unchanged_bits uint = pos & 7
-		var n_changed_bits uint = brotli_min_size_t(n_bits, 8-n_unchanged_bits)
-		var total_bits uint = n_unchanged_bits + n_changed_bits
-		var mask uint32 = (^((1 << total_bits) - 1)) | ((1 << n_unchanged_bits) - 1)
-		var unchanged_bits uint32 = uint32(w.dst[byte_pos]) & mask
-		var changed_bits uint32 = bits & ((1 << n_changed_bits) - 1)
-		w.dst[byte_pos] = byte(changed_bits<<n_unchanged_bits | unchanged_bits)
-		n_bits -= n_changed_bits
-		bits >>= n_changed_bits
-		pos += n_changed_bits
-	}
+func writeBitsPrepareStorage(pos uint, array []byte) {
+	assert(pos&7 == 0)
+	array[pos>>3] = 0
 }
diff --git a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/writer.go b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/writer.go
index 63676b46..39feaef5 100644
--- a/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/writer.go
+++ b/http-benchmark/fasthttp/vendor/github.com/andybalholm/brotli/writer.go
@@ -61,6 +61,7 @@ func (w *Writer) Reset(dst io.Writer) {
 		w.params.lgwin = uint(w.options.LGWin)
 	}
 	w.dst = dst
+	w.err = nil
 }
 
 func (w *Writer) writeChunk(p []byte, op int) (n int, err error) {
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/LICENSE b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/LICENSE
index 1eb75ef6..87d55747 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/LICENSE
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/LICENSE
@@ -26,3 +26,279 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+------------------
+
+Files: gzhttp/*
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2016-2017 The New York Times Company
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+------------------
+
+Files: s2/cmd/internal/readahead/*
+
+The MIT License (MIT)
+
+Copyright (c) 2015 Klaus Post
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+---------------------
+Files: snappy/*
+Files: internal/snapref/*
+
+Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-----------------
+
+Files: s2/cmd/internal/filepathx/*
+
+Copyright 2016 The filepathx Authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/deflate.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/deflate.go
index 25dbe3e1..bffa2f33 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/deflate.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -6,6 +6,7 @@
 package flate
 
 import (
+	"encoding/binary"
 	"fmt"
 	"io"
 	"math"
@@ -37,15 +38,17 @@ const (
 	maxMatchLength   = 258 // The longest match for the compressor
 	minOffsetSize    = 1   // The shortest offset that makes any sense
 
-	// The maximum number of tokens we put into a single flat block, just too
-	// stop things from getting too large.
-	maxFlateBlockTokens = 1 << 14
+	// The maximum number of tokens we will encode at a time.
+	// Smaller sizes usually create less optimal blocks.
+	// Bigger can make context switching slow.
+	// We use this for levels 7-9, so we make it big.
+	maxFlateBlockTokens = 1 << 15
 	maxStoreBlockSize   = 65535
 	hashBits            = 17 // After 17 performance degrades
 	hashSize            = 1 << hashBits
 	hashMask            = (1 << hashBits) - 1
 	hashShift           = (hashBits + minMatchLength - 1) / minMatchLength
-	maxHashOffset       = 1 << 24
+	maxHashOffset       = 1 << 28
 
 	skipNever = math.MaxInt32
 
@@ -70,9 +73,9 @@ var levels = []compressionLevel{
 	{0, 0, 0, 0, 0, 6},
 	// Levels 7-9 use increasingly more lazy matching
 	// and increasingly stringent conditions for "good enough".
-	{8, 8, 24, 16, skipNever, 7},
-	{10, 16, 24, 64, skipNever, 8},
-	{32, 258, 258, 4096, skipNever, 9},
+	{8, 12, 16, 24, skipNever, 7},
+	{16, 30, 40, 64, skipNever, 8},
+	{32, 258, 258, 1024, skipNever, 9},
 }
 
 // advancedState contains state for the advanced levels, with bigger hash tables, etc.
@@ -93,8 +96,9 @@ type advancedState struct {
 	hashOffset int
 
 	// input window: unprocessed data is window[index:windowEnd]
-	index     int
-	hashMatch [maxMatchLength + minMatchLength]uint32
+	index          int
+	estBitsPerByte int
+	hashMatch      [maxMatchLength + minMatchLength]uint32
 
 	hash uint32
 	ii   uint16 // position of last match, intended to overflow to reset.
@@ -103,6 +107,7 @@ type advancedState struct {
 type compressor struct {
 	compressionLevel
 
+	h *huffmanEncoder
 	w *huffmanBitWriter
 
 	// compression algorithm
@@ -170,7 +175,8 @@ func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error {
 			window = d.window[d.blockStart:index]
 		}
 		d.blockStart = index
-		d.w.writeBlock(tok, eof, window)
+		//d.w.writeBlock(tok, eof, window)
+		d.w.writeBlockDynamic(tok, eof, window, d.sync)
 		return d.w.err
 	}
 	return nil
@@ -263,7 +269,7 @@ func (d *compressor) fillWindow(b []byte) {
 // Try to find a match starting at index whose length is greater than prevSize.
 // We only look at chainCount possibilities before giving up.
 // pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead
-func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
+func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) {
 	minMatchLook := maxMatchLength
 	if lookahead < minMatchLook {
 		minMatchLook = lookahead
@@ -279,36 +285,75 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
 
 	// If we've got a match that's good enough, only look in 1/4 the chain.
 	tries := d.chain
-	length = prevLength
-	if length >= d.good {
-		tries >>= 2
-	}
+	length = minMatchLength - 1
 
 	wEnd := win[pos+length]
 	wPos := win[pos:]
 	minIndex := pos - windowSize
+	if minIndex < 0 {
+		minIndex = 0
+	}
+	offset = 0
+
+	cGain := 0
+	if d.chain < 100 {
+		for i := prevHead; tries > 0; tries-- {
+			if wEnd == win[i+length] {
+				n := matchLen(win[i:i+minMatchLook], wPos)
+				if n > length {
+					length = n
+					offset = pos - i
+					ok = true
+					if n >= nice {
+						// The match is good enough that we don't try to find a better one.
+						break
+					}
+					wEnd = win[pos+n]
+				}
+			}
+			if i <= minIndex {
+				// hashPrev[i & windowMask] has already been overwritten, so stop now.
+				break
+			}
+			i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
+			if i < minIndex {
+				break
+			}
+		}
+		return
+	}
 
+	// Some like it higher (CSV), some like it lower (JSON)
+	const baseCost = 6
+	// Base is 4 bytes with an additional cost.
+	// Matches must be better than this.
 	for i := prevHead; tries > 0; tries-- {
 		if wEnd == win[i+length] {
 			n := matchLen(win[i:i+minMatchLook], wPos)
-
-			if n > length && (n > minMatchLength || pos-i <= 4096) {
-				length = n
-				offset = pos - i
-				ok = true
-				if n >= nice {
-					// The match is good enough that we don't try to find a better one.
-					break
+			if n > length {
+				// Estimate the gain: raw bit length of the matched bytes minus offset/length extra bits and baseCost.
+				newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]])
+
+				//fmt.Println(n, "gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]))
+				if newGain > cGain {
+					length = n
+					offset = pos - i
+					cGain = newGain
+					ok = true
+					if n >= nice {
+						// The match is good enough that we don't try to find a better one.
+						break
+					}
+					wEnd = win[pos+n]
 				}
-				wEnd = win[pos+n]
 			}
 		}
-		if i == minIndex {
+		if i <= minIndex {
 			// hashPrev[i & windowMask] has already been overwritten, so stop now.
 			break
 		}
 		i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
-		if i < minIndex || i < 0 {
+		if i < minIndex {
 			break
 		}
 	}
@@ -327,8 +372,7 @@ func (d *compressor) writeStoredBlock(buf []byte) error {
 // of the supplied slice.
 // The caller must ensure that len(b) >= 4.
 func hash4(b []byte) uint32 {
-	b = b[:4]
-	return hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits)
+	return hash4u(binary.LittleEndian.Uint32(b), hashBits)
 }
 
 // bulkHash4 will compute hashes using the same
@@ -337,11 +381,12 @@ func bulkHash4(b []byte, dst []uint32) {
 	if len(b) < 4 {
 		return
 	}
-	hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
+	hb := binary.LittleEndian.Uint32(b)
+
 	dst[0] = hash4u(hb, hashBits)
 	end := len(b) - 4 + 1
 	for i := 1; i < end; i++ {
-		hb = (hb << 8) | uint32(b[i+3])
+		hb = (hb >> 8) | uint32(b[i+3])<<24
 		dst[i] = hash4u(hb, hashBits)
 	}
 }
@@ -374,10 +419,21 @@ func (d *compressor) deflateLazy() {
 	if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync {
 		return
 	}
+	if d.windowEnd != s.index && d.chain > 100 {
+		// Get literal huffman coder.
+		if d.h == nil {
+			d.h = newHuffmanEncoder(maxFlateBlockTokens)
+		}
+		var tmp [256]uint16
+		for _, v := range d.window[s.index:d.windowEnd] {
+			tmp[v]++
+		}
+		d.h.generate(tmp[:], 15)
+	}
 
 	s.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
 	if s.index < s.maxInsertIndex {
-		s.hash = hash4(d.window[s.index : s.index+minMatchLength])
+		s.hash = hash4(d.window[s.index:])
 	}
 
 	for {
@@ -410,7 +466,7 @@ func (d *compressor) deflateLazy() {
 		}
 		if s.index < s.maxInsertIndex {
 			// Update the hash
-			s.hash = hash4(d.window[s.index : s.index+minMatchLength])
+			s.hash = hash4(d.window[s.index:])
 			ch := s.hashHead[s.hash&hashMask]
 			s.chainHead = int(ch)
 			s.hashPrev[s.index&windowMask] = ch
@@ -426,12 +482,37 @@ func (d *compressor) deflateLazy() {
 		}
 
 		if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
-			if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok {
+			if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok {
 				s.length = newLength
 				s.offset = newOffset
 			}
 		}
+
 		if prevLength >= minMatchLength && s.length <= prevLength {
+			// Check for better match at end...
+			//
+			// checkOff must be >=2 since we otherwise risk checking s.index
+			// Offset of 2 seems to yield best results.
+			const checkOff = 2
+			prevIndex := s.index - 1
+			if prevIndex+prevLength+checkOff < s.maxInsertIndex {
+				end := lookahead
+				if lookahead > maxMatchLength {
+					end = maxMatchLength
+				}
+				end += prevIndex
+				idx := prevIndex + prevLength - (4 - checkOff)
+				h := hash4(d.window[idx:])
+				ch2 := int(s.hashHead[h&hashMask]) - s.hashOffset - prevLength + (4 - checkOff)
+				if ch2 > minIndex {
+					length := matchLen(d.window[prevIndex:end], d.window[ch2:])
+					// It seems like a pure length metric is best.
+					if length > prevLength {
+						prevLength = length
+						prevOffset = prevIndex - ch2
+					}
+				}
+			}
 			// There was a match at the previous step, and the current match is
 			// not better. Output the previous match.
 			d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
@@ -440,8 +521,7 @@ func (d *compressor) deflateLazy() {
 			// index and index-1 are already inserted. If there is not enough
 			// lookahead, the last two strings are not inserted into the hash
 			// table.
-			var newIndex int
-			newIndex = s.index + prevLength - 1
+			newIndex := s.index + prevLength - 1
 			// Calculate missing hashes
 			end := newIndex
 			if end > s.maxInsertIndex {
@@ -480,6 +560,7 @@ func (d *compressor) deflateLazy() {
 				}
 				d.tokens.Reset()
 			}
+			s.ii = 0
 		} else {
 			// Reset, if we got a match this run.
 			if s.length >= minMatchLength {
@@ -499,13 +580,12 @@ func (d *compressor) deflateLazy() {
 
 				// If we have a long run of no matches, skip additional bytes
 				// Resets when s.ii overflows after 64KB.
-				if s.ii > 31 {
-					n := int(s.ii >> 5)
+				if n := int(s.ii) - d.chain; n > 0 {
+					n = 1 + int(n>>6)
 					for j := 0; j < n; j++ {
 						if s.index >= d.windowEnd-1 {
 							break
 						}
-
 						d.tokens.AddLiteral(d.window[s.index-1])
 						if d.tokens.n == maxFlateBlockTokens {
 							if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
@@ -513,6 +593,14 @@ func (d *compressor) deflateLazy() {
 							}
 							d.tokens.Reset()
 						}
+						// Index the skipped position in the hash table (hashHead/hashPrev).
+						if s.index < s.maxInsertIndex {
+							h := hash4(d.window[s.index:])
+							ch := s.hashHead[h]
+							s.chainHead = int(ch)
+							s.hashPrev[s.index&windowMask] = ch
+							s.hashHead[h] = uint32(s.index + s.hashOffset)
+						}
 						s.index++
 					}
 					// Flush last byte
@@ -612,7 +700,9 @@ func (d *compressor) write(b []byte) (n int, err error) {
 	}
 	n = len(b)
 	for len(b) > 0 {
-		d.step(d)
+		if d.windowEnd == len(d.window) || d.sync {
+			d.step(d)
+		}
 		b = b[d.fill(d, b):]
 		if d.err != nil {
 			return 0, d.err
@@ -645,21 +735,21 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).store
 	case level == ConstantCompression:
-		d.w.logNewTablePenalty = 4
-		d.window = make([]byte, maxStoreBlockSize)
+		d.w.logNewTablePenalty = 10
+		d.window = make([]byte, 32<<10)
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).storeHuff
 	case level == DefaultCompression:
 		level = 5
 		fallthrough
 	case level >= 1 && level <= 6:
-		d.w.logNewTablePenalty = 6
+		d.w.logNewTablePenalty = 7
 		d.fast = newFastEnc(level)
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).storeFast
 	case 7 <= level && level <= 9:
-		d.w.logNewTablePenalty = 10
+		d.w.logNewTablePenalty = 8
 		d.state = &advancedState{}
 		d.compressionLevel = levels[level]
 		d.initDeflate()
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/fast_encoder.go
index 4a73e1bd..d55ea2a7 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/fast_encoder.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@@ -6,6 +6,7 @@
 package flate
 
 import (
+	"encoding/binary"
 	"fmt"
 	"math/bits"
 )
@@ -44,7 +45,7 @@ const (
 
 	bTableBits   = 17                                               // Bits used in the big tables
 	bTableSize   = 1 << bTableBits                                  // Size of the table
-	allocHistory = maxStoreBlockSize * 10                           // Size to preallocate for history.
+	allocHistory = maxStoreBlockSize * 5                            // Size to preallocate for history.
 	bufferReset  = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this.
 )
 
@@ -65,26 +66,15 @@ func load32(b []byte, i int) uint32 {
 }
 
 func load64(b []byte, i int) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }
 
 func load3232(b []byte, i int32) uint32 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:4]
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+	return binary.LittleEndian.Uint32(b[i:])
 }
 
 func load6432(b []byte, i int32) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }
 
 func hash(u uint32) uint32 {
@@ -189,7 +179,7 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
 // matchlenLong will return the match length between offsets and t in src.
 // It is assumed that s > t, that t >=0 and s < len(src).
 func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
-	if debugDecode {
+	if debugDeflate {
 		if t >= s {
 			panic(fmt.Sprint("t >=s:", t, s))
 		}
@@ -223,31 +213,20 @@ func (e *fastGen) Reset() {
 // matchLen returns the maximum length.
 // 'a' must be the shortest of the two.
 func matchLen(a, b []byte) int {
-	b = b[:len(a)]
 	var checked int
-	if len(a) > 4 {
-		// Try 4 bytes first
-		if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
-			return bits.TrailingZeros32(diff) >> 3
-		}
-		// Switch to 8 byte matching.
-		checked = 4
-		a = a[4:]
-		b = b[4:]
-		for len(a) >= 8 {
-			b = b[:len(a)]
-			if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
-				return checked + (bits.TrailingZeros64(diff) >> 3)
-			}
-			checked += 8
-			a = a[8:]
-			b = b[8:]
+
+	for len(a) >= 8 {
+		if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
+			return checked + (bits.TrailingZeros64(diff) >> 3)
 		}
+		checked += 8
+		a = a[8:]
+		b = b[8:]
 	}
 	b = b[:len(a)]
 	for i := range a {
 		if a[i] != b[i] {
-			return int(i) + checked
+			return i + checked
 		}
 	}
 	return len(a) + checked
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
index 208d6671..25f6d110 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -5,7 +5,10 @@
 package flate
 
 import (
+	"encoding/binary"
+	"fmt"
 	"io"
+	"math"
 )
 
 const (
@@ -22,11 +25,15 @@ const (
 	codegenCodeCount = 19
 	badCode          = 255
 
+	// maxPredefinedTokens is the maximum number of tokens
+	// where we check if fixed size is smaller.
+	maxPredefinedTokens = 250
+
 	// bufferFlushSize indicates the buffer size
 	// after which bytes are flushed to the writer.
 	// Should preferably be a multiple of 6, since
 	// we accumulate 6 bytes between writes to the buffer.
-	bufferFlushSize = 240
+	bufferFlushSize = 246
 
 	// bufferSize is the actual output byte buffer size.
 	// It must have additional headroom for a flush
@@ -34,8 +41,11 @@ const (
 	bufferSize = bufferFlushSize + 8
 )
 
+// Minimum length code that emits bits.
+const lengthExtraBitsMinCode = 8
+
 // The number of extra bits needed by length code X - LENGTH_CODES_START.
-var lengthExtraBits = [32]int8{
+var lengthExtraBits = [32]uint8{
 	/* 257 */ 0, 0, 0,
 	/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
 	/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
@@ -49,28 +59,41 @@ var lengthBase = [32]uint8{
 	64, 80, 96, 112, 128, 160, 192, 224, 255,
 }
 
+// Minimum offset code that emits bits.
+const offsetExtraBitsMinCode = 4
+
 // offset code word extra bits.
-var offsetExtraBits = [64]int8{
+var offsetExtraBits = [32]int8{
 	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
 	4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
 	9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
 	/* extended window */
-	14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
+	14, 14,
 }
 
-var offsetBase = [64]uint32{
-	/* normal deflate */
-	0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
-	0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
-	0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
-	0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
-	0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
-	0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
+var offsetCombined = [32]uint32{}
 
-	/* extended window */
-	0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
-	0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
-	0x100000, 0x180000, 0x200000, 0x300000,
+func init() {
+	var offsetBase = [32]uint32{
+		/* normal deflate */
+		0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
+		0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
+		0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
+		0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
+		0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
+		0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
+
+		/* extended window */
+		0x008000, 0x00c000,
+	}
+
+	for i := range offsetCombined[:] {
+		// Don't use extended window values...
+		if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 {
+			continue
+		}
+		offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8)
+	}
 }
 
 // The odd order in which the codegen code sizes are written.
@@ -85,17 +108,18 @@ type huffmanBitWriter struct {
 	// Data waiting to be written is bytes[0:nbytes]
 	// and then the low nbits of bits.
 	bits            uint64
-	nbits           uint16
+	nbits           uint8
 	nbytes          uint8
+	lastHuffMan     bool
 	literalEncoding *huffmanEncoder
+	tmpLitEncoding  *huffmanEncoder
 	offsetEncoding  *huffmanEncoder
 	codegenEncoding *huffmanEncoder
 	err             error
 	lastHeader      int
 	// Set between 0 (reused block can be up to 2x the size)
 	logNewTablePenalty uint
-	lastHuffMan        bool
-	bytes              [256]byte
+	bytes              [256 + 8]byte
 	literalFreq        [lengthCodesStart + 32]uint16
 	offsetFreq         [32]uint16
 	codegenFreq        [codegenCodeCount]uint16
@@ -127,6 +151,7 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
 	return &huffmanBitWriter{
 		writer:          w,
 		literalEncoding: newHuffmanEncoder(literalCount),
+		tmpLitEncoding:  newHuffmanEncoder(literalCount),
 		codegenEncoding: newHuffmanEncoder(codegenCodeCount),
 		offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
 	}
@@ -139,37 +164,33 @@ func (w *huffmanBitWriter) reset(writer io.Writer) {
 	w.lastHuffMan = false
 }
 
-func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) {
-	offsets, lits = true, true
+func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
 	a := t.offHist[:offsetCodeCount]
-	b := w.offsetFreq[:len(a)]
-	for i := range a {
-		if b[i] == 0 && a[i] != 0 {
-			offsets = false
-			break
+	b := w.offsetEncoding.codes
+	b = b[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
 
 	a = t.extraHist[:literalCount-256]
-	b = w.literalFreq[256:literalCount]
+	b = w.literalEncoding.codes[256:literalCount]
 	b = b[:len(a)]
-	for i := range a {
-		if b[i] == 0 && a[i] != 0 {
-			lits = false
-			break
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
-	if lits {
-		a = t.litHist[:]
-		b = w.literalFreq[:len(a)]
-		for i := range a {
-			if b[i] == 0 && a[i] != 0 {
-				lits = false
-				break
-			}
+
+	a = t.litHist[:256]
+	b = w.literalEncoding.codes[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
-	return
+	return true
 }
 
 func (w *huffmanBitWriter) flush() {
@@ -205,8 +226,8 @@ func (w *huffmanBitWriter) write(b []byte) {
 	_, w.err = w.writer.Write(b)
 }
 
-func (w *huffmanBitWriter) writeBits(b int32, nb uint16) {
-	w.bits |= uint64(b) << (w.nbits & reg16SizeMask64)
+func (w *huffmanBitWriter) writeBits(b int32, nb uint8) {
+	w.bits |= uint64(b) << (w.nbits & 63)
 	w.nbits += nb
 	if w.nbits >= 48 {
 		w.writeOutBits()
@@ -407,7 +428,7 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
 
 func (w *huffmanBitWriter) writeCode(c hcode) {
 	// The function does not get inlined if we "& 63" the shift.
-	w.bits |= uint64(c.code) << w.nbits
+	w.bits |= uint64(c.code) << (w.nbits & 63)
 	w.nbits += c.len
 	if w.nbits >= 48 {
 		w.writeOutBits()
@@ -420,13 +441,11 @@ func (w *huffmanBitWriter) writeOutBits() {
 	w.bits >>= 48
 	w.nbits -= 48
 	n := w.nbytes
-	w.bytes[n] = byte(bits)
-	w.bytes[n+1] = byte(bits >> 8)
-	w.bytes[n+2] = byte(bits >> 16)
-	w.bytes[n+3] = byte(bits >> 24)
-	w.bytes[n+4] = byte(bits >> 32)
-	w.bytes[n+5] = byte(bits >> 40)
+
+	// PutUint64 stores 8 bytes but only 6 are kept (n += 6); over-writing is faster.
+	binary.LittleEndian.PutUint64(w.bytes[n:], bits)
 	n += 6
+
 	if n >= bufferFlushSize {
 		if w.err != nil {
 			n = 0
@@ -435,6 +454,7 @@ func (w *huffmanBitWriter) writeOutBits() {
 		w.write(w.bytes[:n])
 		n = 0
 	}
+
 	w.nbytes = n
 }
 
@@ -551,7 +571,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 		w.lastHeader = 0
 	}
 	numLiterals, numOffsets := w.indexTokens(tokens, false)
-	w.generate(tokens)
+	w.generate()
 	var extraBits int
 	storedSize, storable := w.storedSize(input)
 	if storable {
@@ -562,7 +582,10 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 	// Fixed Huffman baseline.
 	var literalEncoding = fixedLiteralEncoding
 	var offsetEncoding = fixedOffsetEncoding
-	var size = w.fixedSize(extraBits)
+	var size = math.MaxInt32
+	if tokens.n < maxPredefinedTokens {
+		size = w.fixedSize(extraBits)
+	}
 
 	// Dynamic Huffman?
 	var numCodegens int
@@ -580,7 +603,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 	}
 
 	// Stored bytes?
-	if storable && storedSize < size {
+	if storable && storedSize <= size {
 		w.writeStoredHeader(len(input), eof)
 		w.writeBytes(input)
 		return
@@ -619,22 +642,39 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		w.lastHeader = 0
 		w.lastHuffMan = false
 	}
-	if !sync {
-		tokens.Fill()
+
+	// fillReuse enables filling of empty values.
+	// This will make encodings always reusable without testing.
+	// However, this does not appear to be beneficial in most cases.
+	const fillReuse = false
+
+	// Check if we can reuse...
+	if !fillReuse && w.lastHeader > 0 && !w.canReuse(tokens) {
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
 	}
+
 	numLiterals, numOffsets := w.indexTokens(tokens, !sync)
+	extraBits := 0
+	ssize, storable := w.storedSize(input)
+
+	const usePrefs = true
+	if storable || w.lastHeader > 0 {
+		extraBits = w.extraBitSize()
+	}
 
 	var size int
+
 	// Check if we should reuse.
 	if w.lastHeader > 0 {
 		// Estimate size for using a new table.
 		// Use the previous header size as the best estimate.
 		newSize := w.lastHeader + tokens.EstimatedBits()
-		newSize += newSize >> w.logNewTablePenalty
+		newSize += int(w.literalEncoding.codes[endBlockMarker].len) + newSize>>w.logNewTablePenalty
 
 		// The estimated size is calculated as an optimal table.
 		// We add a penalty to make it more realistic and re-use a bit more.
-		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + w.extraBitSize()
+		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits
 
 		// Check if a new table is better.
 		if newSize < reuseSize {
@@ -645,35 +685,83 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		} else {
 			size = reuseSize
 		}
+
+		if tokens.n < maxPredefinedTokens {
+			if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size {
+				// Check if we get a reasonable size decrease.
+				if storable && ssize <= size {
+					w.writeStoredHeader(len(input), eof)
+					w.writeBytes(input)
+					return
+				}
+				w.writeFixedHeader(eof)
+				if !sync {
+					tokens.AddEOB()
+				}
+				w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+				return
+			}
+		}
 		// Check if we get a reasonable size decrease.
-		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		if storable && ssize <= size {
 			w.writeStoredHeader(len(input), eof)
 			w.writeBytes(input)
-			w.lastHeader = 0
 			return
 		}
 	}
 
 	// We want a new block/table
 	if w.lastHeader == 0 {
-		w.generate(tokens)
+		if fillReuse && !sync {
+			w.fillTokens()
+			numLiterals, numOffsets = maxNumLit, maxNumDist
+		} else {
+			w.literalFreq[endBlockMarker] = 1
+		}
+
+		w.generate()
 		// Generate codegen and codegenFrequencies, which indicates how to encode
 		// the literalEncoding and the offsetEncoding.
 		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
 		w.codegenEncoding.generate(w.codegenFreq[:], 7)
+
 		var numCodegens int
-		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize())
-		// Store bytes, if we don't get a reasonable improvement.
-		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		if fillReuse && !sync {
+			// Reindex for accurate size...
+			w.indexTokens(tokens, true)
+		}
+		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
+
+		// Use the predefined (fixed) encoding if the dynamic table is not a reasonable improvement.
+		if tokens.n < maxPredefinedTokens {
+			if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size {
+				// Store bytes, if we don't get an improvement.
+				if storable && ssize <= preSize {
+					w.writeStoredHeader(len(input), eof)
+					w.writeBytes(input)
+					return
+				}
+				w.writeFixedHeader(eof)
+				if !sync {
+					tokens.AddEOB()
+				}
+				w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+				return
+			}
+		}
+
+		if storable && ssize <= size {
+			// Store bytes, if we don't get an improvement.
 			w.writeStoredHeader(len(input), eof)
 			w.writeBytes(input)
-			w.lastHeader = 0
 			return
 		}
 
 		// Write Huffman table.
 		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
-		w.lastHeader, _ = w.headerSize()
+		if !sync {
+			w.lastHeader, _ = w.headerSize()
+		}
 		w.lastHuffMan = false
 	}
 
@@ -684,6 +772,19 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 	w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes)
 }
 
+func (w *huffmanBitWriter) fillTokens() {
+	for i, v := range w.literalFreq[:literalCount] {
+		if v == 0 {
+			w.literalFreq[i] = 1
+		}
+	}
+	for i, v := range w.offsetFreq[:offsetCodeCount] {
+		if v == 0 {
+			w.offsetFreq[i] = 1
+		}
+	}
+}
+
 // indexTokens indexes a slice of tokens, and updates
 // literalFreq and offsetFreq, and generates literalEncoding
 // and offsetEncoding.
@@ -718,7 +819,7 @@ func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, num
 	return
 }
 
-func (w *huffmanBitWriter) generate(t *tokens) {
+func (w *huffmanBitWriter) generate() {
 	w.literalEncoding.generate(w.literalFreq[:literalCount], 15)
 	w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15)
 }
@@ -745,52 +846,135 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 	offs := oeCodes[:32]
 	lengths := leCodes[lengthCodesStart:]
 	lengths = lengths[:32]
+
+	// Go 1.16 LOVES having these on stack.
+	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
+
 	for _, t := range tokens {
-		if t < matchType {
-			w.writeCode(lits[t.literal()])
+		if t < 256 {
+			//w.writeCode(lits[t.literal()])
+			c := lits[t]
+			bits |= uint64(c.code) << (nbits & 63)
+			nbits += c.len
+			if nbits >= 48 {
+				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
+			}
 			continue
 		}
 
 		// Write the length
 		length := t.length()
-		lengthCode := lengthCode(length)
+		lengthCode := lengthCode(length) & 31
 		if false {
-			w.writeCode(lengths[lengthCode&31])
+			w.writeCode(lengths[lengthCode])
 		} else {
 			// inlined
-			c := lengths[lengthCode&31]
-			w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64)
-			w.nbits += c.len
-			if w.nbits >= 48 {
-				w.writeOutBits()
+			c := lengths[lengthCode]
+			bits |= uint64(c.code) << (nbits & 63)
+			nbits += c.len
+			if nbits >= 48 {
+				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
 			}
 		}
 
-		extraLengthBits := uint16(lengthExtraBits[lengthCode&31])
-		if extraLengthBits > 0 {
-			extraLength := int32(length - lengthBase[lengthCode&31])
-			w.writeBits(extraLength, extraLengthBits)
+		if lengthCode >= lengthExtraBitsMinCode {
+			extraLengthBits := lengthExtraBits[lengthCode]
+			//w.writeBits(extraLength, extraLengthBits)
+			extraLength := int32(length - lengthBase[lengthCode])
+			bits |= uint64(extraLength) << (nbits & 63)
+			nbits += extraLengthBits
+			if nbits >= 48 {
+				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
+			}
 		}
 		// Write the offset
 		offset := t.offset()
-		offsetCode := offsetCode(offset)
+		offsetCode := (offset >> 16) & 31
 		if false {
-			w.writeCode(offs[offsetCode&31])
+			w.writeCode(offs[offsetCode])
 		} else {
 			// inlined
-			c := offs[offsetCode&31]
-			w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64)
-			w.nbits += c.len
-			if w.nbits >= 48 {
-				w.writeOutBits()
+			c := offs[offsetCode]
+			bits |= uint64(c.code) << (nbits & 63)
+			nbits += c.len
+			if nbits >= 48 {
+				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
 			}
 		}
-		extraOffsetBits := uint16(offsetExtraBits[offsetCode&63])
-		if extraOffsetBits > 0 {
-			extraOffset := int32(offset - offsetBase[offsetCode&63])
-			w.writeBits(extraOffset, extraOffsetBits)
+
+		if offsetCode >= offsetExtraBitsMinCode {
+			offsetComb := offsetCombined[offsetCode]
+			//w.writeBits(extraOffset, extraOffsetBits)
+			bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63)
+			nbits += uint8(offsetComb)
+			if nbits >= 48 {
+				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
+			}
 		}
 	}
+	// Restore...
+	w.bits, w.nbits, w.nbytes = bits, nbits, nbytes
+
 	if deferEOB {
 		w.writeCode(leCodes[endBlockMarker])
 	}
@@ -825,43 +1009,85 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 		}
 	}
 
+	// Fill is rarely better...
+	const fill = false
+	const numLiterals = endBlockMarker + 1
+	const numOffsets = 1
+
 	// Add everything as literals
 	// We have to estimate the header size.
 	// Assume header is around 70 bytes:
 	// https://stackoverflow.com/a/25454430
 	const guessHeaderSizeBits = 70 * 8
-	estBits, estExtra := histogramSize(input, w.literalFreq[:], !eof && !sync)
-	estBits += w.lastHeader + 15
+	histogram(input, w.literalFreq[:numLiterals], fill)
+	ssize, storable := w.storedSize(input)
+	if storable && len(input) > 1024 {
+		// Quick check for incompressible content.
+		abs := float64(0)
+		avg := float64(len(input)) / 256
+		max := float64(len(input) * 2)
+		for _, v := range w.literalFreq[:256] {
+			diff := float64(v) - avg
+			abs += diff * diff
+			if abs > max {
+				break
+			}
+		}
+		if abs < max {
+			if debugDeflate {
+				fmt.Println("stored", abs, "<", max)
+			}
+			// No chance we can compress this...
+			w.writeStoredHeader(len(input), eof)
+			w.writeBytes(input)
+			return
+		}
+	}
+	w.literalFreq[endBlockMarker] = 1
+	w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15)
+	if fill {
+		// Clear fill...
+		for i := range w.literalFreq[:numLiterals] {
+			w.literalFreq[i] = 0
+		}
+		histogram(input, w.literalFreq[:numLiterals], false)
+	}
+	estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals])
+	estBits += w.lastHeader
 	if w.lastHeader == 0 {
 		estBits += guessHeaderSizeBits
 	}
 	estBits += estBits >> w.logNewTablePenalty
 
 	// Store bytes, if we don't get a reasonable improvement.
-	ssize, storable := w.storedSize(input)
-	if storable && ssize < estBits {
+	if storable && ssize <= estBits {
+		if debugDeflate {
+			fmt.Println("stored,", ssize, "<=", estBits)
+		}
 		w.writeStoredHeader(len(input), eof)
 		w.writeBytes(input)
 		return
 	}
 
 	if w.lastHeader > 0 {
-		reuseSize := w.literalEncoding.bitLength(w.literalFreq[:256])
-		estBits += estExtra
+		reuseSize := w.literalEncoding.canReuseBits(w.literalFreq[:256])
 
 		if estBits < reuseSize {
+			if debugDeflate {
+				fmt.Println("NOT reusing, reuse:", reuseSize/8, "> new:", estBits/8, "header est:", w.lastHeader/8, "bytes")
+			}
 			// We owe an EOB
 			w.writeCode(w.literalEncoding.codes[endBlockMarker])
 			w.lastHeader = 0
+		} else if debugDeflate {
+			fmt.Println("reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8)
 		}
 	}
 
-	const numLiterals = endBlockMarker + 1
-	const numOffsets = 1
+	count := 0
 	if w.lastHeader == 0 {
-		w.literalFreq[endBlockMarker] = 1
-		w.literalEncoding.generate(w.literalFreq[:numLiterals], 15)
-
+		// Use the temp encoding, so swap.
+		w.literalEncoding, w.tmpLitEncoding = w.tmpLitEncoding, w.literalEncoding
 		// Generate codegen and codegenFrequencies, which indicates how to encode
 		// the literalEncoding and the offsetEncoding.
 		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
@@ -872,39 +1098,93 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
 		w.lastHuffMan = true
 		w.lastHeader, _ = w.headerSize()
+		if debugDeflate {
+			count += w.lastHeader
+			fmt.Println("header:", count/8)
+		}
+	}
+
+	encoding := w.literalEncoding.codes[:256]
+	// Go 1.16 LOVES having these on stack. At least 1.5x the speed.
+	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
+
+	if debugDeflate {
+		count -= int(nbytes)*8 + int(nbits)
+	}
+	// Unroll, write 3 codes/loop.
+	// Fastest number of unrolls.
+	for len(input) > 3 {
+		// We must have at least 48 bits free.
+		if nbits >= 8 {
+			n := nbits >> 3
+			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+			bits >>= (n * 8) & 63
+			nbits -= n * 8
+			nbytes += n
+		}
+		if nbytes >= bufferFlushSize {
+			if w.err != nil {
+				nbytes = 0
+				return
+			}
+			if debugDeflate {
+				count += int(nbytes) * 8
+			}
+			_, w.err = w.writer.Write(w.bytes[:nbytes])
+			nbytes = 0
+		}
+		a, b := encoding[input[0]], encoding[input[1]]
+		bits |= uint64(a.code) << (nbits & 63)
+		bits |= uint64(b.code) << ((nbits + a.len) & 63)
+		c := encoding[input[2]]
+		nbits += b.len + a.len
+		bits |= uint64(c.code) << (nbits & 63)
+		nbits += c.len
+		input = input[3:]
 	}
 
-	encoding := w.literalEncoding.codes[:257]
+	// Remaining...
 	for _, t := range input {
-		// Bitwriting inlined, ~30% speedup
-		c := encoding[t]
-		w.bits |= uint64(c.code) << ((w.nbits) & reg16SizeMask64)
-		w.nbits += c.len
-		if w.nbits >= 48 {
-			bits := w.bits
-			w.bits >>= 48
-			w.nbits -= 48
-			n := w.nbytes
-			w.bytes[n] = byte(bits)
-			w.bytes[n+1] = byte(bits >> 8)
-			w.bytes[n+2] = byte(bits >> 16)
-			w.bytes[n+3] = byte(bits >> 24)
-			w.bytes[n+4] = byte(bits >> 32)
-			w.bytes[n+5] = byte(bits >> 40)
-			n += 6
-			if n >= bufferFlushSize {
+		if nbits >= 48 {
+			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+			//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+			bits >>= 48
+			nbits -= 48
+			nbytes += 6
+			if nbytes >= bufferFlushSize {
 				if w.err != nil {
-					n = 0
+					nbytes = 0
 					return
 				}
-				w.write(w.bytes[:n])
-				n = 0
+				if debugDeflate {
+					count += int(nbytes) * 8
+				}
+				_, w.err = w.writer.Write(w.bytes[:nbytes])
+				nbytes = 0
 			}
-			w.nbytes = n
+		}
+		// Bitwriting inlined, ~30% speedup
+		c := encoding[t]
+		bits |= uint64(c.code) << (nbits & 63)
+		nbits += c.len
+		if debugDeflate {
+			count += int(c.len)
 		}
 	}
+	// Restore...
+	w.bits, w.nbits, w.nbytes = bits, nbits, nbytes
+
+	if debugDeflate {
+		nb := count + int(nbytes)*8 + int(nbits)
+		fmt.Println("wrote", nb, "bits,", nb/8, "bytes.")
+	}
+	// Flush if needed to have space.
+	if w.nbits >= 48 {
+		w.writeOutBits()
+	}
+
 	if eof || sync {
-		w.writeCode(encoding[endBlockMarker])
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
 		w.lastHeader = 0
 		w.lastHuffMan = false
 	}
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/huffman_code.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/huffman_code.go
index 4c39a301..9ab497c2 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/huffman_code.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/huffman_code.go
@@ -17,13 +17,18 @@ const (
 
 // hcode is a huffman code with a bit code and bit length.
 type hcode struct {
-	code, len uint16
+	code uint16
+	len  uint8
 }
 
 type huffmanEncoder struct {
-	codes     []hcode
-	freqcache []literalNode
-	bitCount  [17]int32
+	codes    []hcode
+	bitCount [17]int32
+
+	// Allocate a reusable buffer with the longest possible frequency table.
+	// Possible lengths are codegenCodeCount, offsetCodeCount and literalCount.
+	// The largest of these is literalCount, so we allocate for that case.
+	freqcache [literalCount + 1]literalNode
 }
 
 type literalNode struct {
@@ -52,7 +57,7 @@ type levelInfo struct {
 }
 
 // set sets the code and length of an hcode.
-func (h *hcode) set(code uint16, length uint16) {
+func (h *hcode) set(code uint16, length uint8) {
 	h.len = length
 	h.code = code
 }
@@ -76,7 +81,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
 	var ch uint16
 	for ch = 0; ch < literalCount; ch++ {
 		var bits uint16
-		var size uint16
+		var size uint8
 		switch {
 		case ch < 144:
 			// size 8, 000110000  .. 10111111
@@ -95,7 +100,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
 			bits = ch + 192 - 280
 			size = 8
 		}
-		codes[ch] = hcode{code: reverseBits(bits, byte(size)), len: size}
+		codes[ch] = hcode{code: reverseBits(bits, size), len: size}
 	}
 	return h
 }
@@ -122,6 +127,29 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int {
 	return total
 }
 
+func (h *huffmanEncoder) bitLengthRaw(b []byte) int {
+	var total int
+	for _, f := range b {
+		total += int(h.codes[f].len)
+	}
+	return total
+}
+
+// canReuseBits returns the number of bits or math.MaxInt32 if the encoder cannot be reused.
+func (h *huffmanEncoder) canReuseBits(freq []uint16) int {
+	var total int
+	for i, f := range freq {
+		if f != 0 {
+			code := h.codes[i]
+			if code.len == 0 {
+				return math.MaxInt32
+			}
+			total += int(f) * int(code.len)
+		}
+	}
+	return total
+}
+
 // Return the number of literals assigned to each bit size in the Huffman encoding
 //
 // This method is only called when list.length >= 3
@@ -160,14 +188,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 	// of the level j ancestor.
 	var leafCounts [maxBitsLimit][maxBitsLimit]int32
 
+	// Descending to only have 1 bounds check.
+	l2f := int32(list[2].freq)
+	l1f := int32(list[1].freq)
+	l0f := int32(list[0].freq) + int32(list[1].freq)
+
 	for level := int32(1); level <= maxBits; level++ {
 		// For every level, the first two items are the first two characters.
 		// We initialize the levels as if we had already figured this out.
 		levels[level] = levelInfo{
 			level:        level,
-			lastFreq:     int32(list[1].freq),
-			nextCharFreq: int32(list[2].freq),
-			nextPairFreq: int32(list[0].freq) + int32(list[1].freq),
+			lastFreq:     l1f,
+			nextCharFreq: l2f,
+			nextPairFreq: l0f,
 		}
 		leafCounts[level][level] = 2
 		if level == 1 {
@@ -178,8 +211,8 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 	// We need a total of 2*n - 2 items at top level and have already generated 2.
 	levels[maxBits].needed = 2*n - 4
 
-	level := maxBits
-	for {
+	level := uint32(maxBits)
+	for level < 16 {
 		l := &levels[level]
 		if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 {
 			// We've run out of both leafs and pairs.
@@ -211,7 +244,13 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 			// more values in the level below
 			l.lastFreq = l.nextPairFreq
 			// Take leaf counts from the lower level, except counts[level] remains the same.
-			copy(leafCounts[level][:level], leafCounts[level-1][:level])
+			if true {
+				save := leafCounts[level][level]
+				leafCounts[level] = leafCounts[level-1]
+				leafCounts[level][level] = save
+			} else {
+				copy(leafCounts[level][:level], leafCounts[level-1][:level])
+			}
 			levels[l.level-1].needed = 2
 		}
 
@@ -269,7 +308,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 
 		sortByLiteral(chunk)
 		for _, node := range chunk {
-			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)}
+			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint8(n)}
 			code++
 		}
 		list = list[0 : len(list)-int(bits)]
@@ -281,13 +320,8 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 // freq  An array of frequencies, in which frequency[i] gives the frequency of literal i.
 // maxBits  The maximum number of bits to use for any literal.
 func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
-	if h.freqcache == nil {
-		// Allocate a reusable buffer with the longest possible frequency table.
-		// Possible lengths are codegenCodeCount, offsetCodeCount and literalCount.
-		// The largest of these is literalCount, so we allocate for that case.
-		h.freqcache = make([]literalNode, literalCount+1)
-	}
 	list := h.freqcache[:len(freq)+1]
+	codes := h.codes[:len(freq)]
 	// Number of non-zero literals
 	count := 0
 	// Set list to be the set of all non-zero literals and their frequencies
@@ -296,11 +330,10 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 			list[count] = literalNode{uint16(i), f}
 			count++
 		} else {
-			list[count] = literalNode{}
-			h.codes[i].len = 0
+			codes[i].len = 0
 		}
 	}
-	list[len(freq)] = literalNode{}
+	list[count] = literalNode{}
 
 	list = list[:count]
 	if count <= 2 {
@@ -320,44 +353,32 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 	h.assignEncodingAndSize(bitCount, list)
 }
 
+// atLeastOne clamps the result between 1 and 15.
 func atLeastOne(v float32) float32 {
 	if v < 1 {
 		return 1
 	}
+	if v > 15 {
+		return 15
+	}
 	return v
 }
 
-// histogramSize accumulates a histogram of b in h.
-// An estimated size in bits is returned.
 // Unassigned values are assigned '1' in the histogram.
-// len(h) must be >= 256, and h's elements must be all zeroes.
-func histogramSize(b []byte, h []uint16, fill bool) (int, int) {
+func fillHist(b []uint16) {
+	for i, v := range b {
+		if v == 0 {
+			b[i] = 1
+		}
+	}
+}
+
+func histogram(b []byte, h []uint16, fill bool) {
 	h = h[:256]
 	for _, t := range b {
 		h[t]++
 	}
-	invTotal := 1.0 / float32(len(b))
-	shannon := float32(0.0)
-	var extra float32
 	if fill {
-		oneBits := atLeastOne(-mFastLog2(invTotal))
-		for i, v := range h[:] {
-			if v > 0 {
-				n := float32(v)
-				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
-			} else {
-				h[i] = 1
-				extra += oneBits
-			}
-		}
-	} else {
-		for _, v := range h[:] {
-			if v > 0 {
-				n := float32(v)
-				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
-			}
-		}
+		fillHist(h)
 	}
-
-	return int(shannon + 0.99), int(extra + 0.99)
 }
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/inflate.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/inflate.go
index 16bc5140..414c0bea 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/inflate.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/inflate.go
@@ -9,10 +9,10 @@ package flate
 
 import (
 	"bufio"
+	"compress/flate"
 	"fmt"
 	"io"
 	"math/bits"
-	"strconv"
 	"sync"
 )
 
@@ -36,16 +36,19 @@ type lengthExtra struct {
 
 var decCodeToLen = [32]lengthExtra{{length: 0x0, extra: 0x0}, {length: 0x1, extra: 0x0}, {length: 0x2, extra: 0x0}, {length: 0x3, extra: 0x0}, {length: 0x4, extra: 0x0}, {length: 0x5, extra: 0x0}, {length: 0x6, extra: 0x0}, {length: 0x7, extra: 0x0}, {length: 0x8, extra: 0x1}, {length: 0xa, extra: 0x1}, {length: 0xc, extra: 0x1}, {length: 0xe, extra: 0x1}, {length: 0x10, extra: 0x2}, {length: 0x14, extra: 0x2}, {length: 0x18, extra: 0x2}, {length: 0x1c, extra: 0x2}, {length: 0x20, extra: 0x3}, {length: 0x28, extra: 0x3}, {length: 0x30, extra: 0x3}, {length: 0x38, extra: 0x3}, {length: 0x40, extra: 0x4}, {length: 0x50, extra: 0x4}, {length: 0x60, extra: 0x4}, {length: 0x70, extra: 0x4}, {length: 0x80, extra: 0x5}, {length: 0xa0, extra: 0x5}, {length: 0xc0, extra: 0x5}, {length: 0xe0, extra: 0x5}, {length: 0xff, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}}
 
+var bitMask32 = [32]uint32{
+	0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF,
+	0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF,
+	0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF,
+	0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF,
+} // masks for 0 up to 31 bits
+
 // Initialize the fixedHuffmanDecoder only once upon first use.
 var fixedOnce sync.Once
 var fixedHuffmanDecoder huffmanDecoder
 
 // A CorruptInputError reports the presence of corrupt input at a given offset.
-type CorruptInputError int64
-
-func (e CorruptInputError) Error() string {
-	return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10)
-}
+type CorruptInputError = flate.CorruptInputError
 
 // An InternalError reports an error in the flate code itself.
 type InternalError string
@@ -55,26 +58,12 @@ func (e InternalError) Error() string { return "flate: internal error: " + strin
 // A ReadError reports an error encountered while reading input.
 //
 // Deprecated: No longer returned.
-type ReadError struct {
-	Offset int64 // byte offset where error occurred
-	Err    error // error returned by underlying Read
-}
-
-func (e *ReadError) Error() string {
-	return "flate: read error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
-}
+type ReadError = flate.ReadError
 
 // A WriteError reports an error encountered while writing output.
 //
 // Deprecated: No longer returned.
-type WriteError struct {
-	Offset int64 // byte offset where error occurred
-	Err    error // error returned by underlying Write
-}
-
-func (e *WriteError) Error() string {
-	return "flate: write error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
-}
+type WriteError = flate.WriteError
 
 // Resetter resets a ReadCloser returned by NewReader or NewReaderDict to
 // to switch to a new underlying Reader. This permits reusing a ReadCloser
@@ -346,11 +335,17 @@ func (f *decompressor) nextBlock() {
 	switch typ {
 	case 0:
 		f.dataBlock()
+		if debugDecode {
+			fmt.Println("stored block")
+		}
 	case 1:
 		// compressed, fixed Huffman tables
 		f.hl = &fixedHuffmanDecoder
 		f.hd = nil
 		f.huffmanBlockDecoder()()
+		if debugDecode {
+			fmt.Println("predefinied huffman block")
+		}
 	case 2:
 		// compressed, dynamic Huffman tables
 		if f.err = f.readHuffman(); f.err != nil {
@@ -359,6 +354,9 @@ func (f *decompressor) nextBlock() {
 		f.hl = &f.h1
 		f.hd = &f.h2
 		f.huffmanBlockDecoder()()
+		if debugDecode {
+			fmt.Println("dynamic huffman block")
+		}
 	default:
 		// 3 is reserved.
 		if debugDecode {
@@ -568,221 +566,6 @@ func (f *decompressor) readHuffman() error {
 	return nil
 }
 
-// Decode a single Huffman block from f.
-// hl and hd are the Huffman states for the lit/length values
-// and the distance values, respectively. If hd == nil, using the
-// fixed distance encoding associated with fixed Huffman blocks.
-func (f *decompressor) huffmanBlockGeneric() {
-	const (
-		stateInit = iota // Zero value must be stateInit
-		stateDict
-	)
-
-	switch f.stepState {
-	case stateInit:
-		goto readLiteral
-	case stateDict:
-		goto copyHistory
-	}
-
-readLiteral:
-	// Read literal and/or (length, distance) according to RFC section 3.2.3.
-	{
-		var v int
-		{
-			// Inlined v, err := f.huffSym(f.hl)
-			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
-			// with single element, huffSym must error on these two edge cases. In both
-			// cases, the chunks slice will be 0 for the invalid sequence, leading it
-			// satisfy the n == 0 check below.
-			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
-			for {
-				for nb < n {
-					c, err := f.r.ReadByte()
-					if err != nil {
-						f.b = b
-						f.nb = nb
-						f.err = noEOF(err)
-						return
-					}
-					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
-				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
-				n = uint(chunk & huffmanCountMask)
-				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
-					n = uint(chunk & huffmanCountMask)
-				}
-				if n <= nb {
-					if n == 0 {
-						f.b = b
-						f.nb = nb
-						if debugDecode {
-							fmt.Println("huffsym: n==0")
-						}
-						f.err = CorruptInputError(f.roffset)
-						return
-					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
-					v = int(chunk >> huffmanValueShift)
-					break
-				}
-			}
-		}
-
-		var n uint // number of bits extra
-		var length int
-		var err error
-		switch {
-		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).huffmanBlockGeneric
-				f.stepState = stateInit
-				return
-			}
-			goto readLiteral
-		case v == 256:
-			f.finishBlock()
-			return
-		// otherwise, reference to older data
-		case v < 265:
-			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
-		case v < maxNumLit:
-			length = 258
-			n = 0
-		default:
-			if debugDecode {
-				fmt.Println(v, ">= maxNumLit")
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-		if n > 0 {
-			for f.nb < n {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits n>0:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
-		}
-
-		var dist uint32
-		if f.hd == nil {
-			for f.nb < 5 {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<5:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
-		} else {
-			sym, err := f.huffSym(f.hd)
-			if err != nil {
-				if debugDecode {
-					fmt.Println("huffsym:", err)
-				}
-				f.err = err
-				return
-			}
-			dist = uint32(sym)
-		}
-
-		switch {
-		case dist < 4:
-			dist++
-		case dist < maxNumDist:
-			nb := uint(dist-2) >> 1
-			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<nb:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
-			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
-		default:
-			if debugDecode {
-				fmt.Println("dist too big:", dist, maxNumDist)
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
-			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		f.copyLen, f.copyDist = length, int(dist)
-		goto copyHistory
-	}
-
-copyHistory:
-	// Perform a backwards copy according to RFC section 3.2.3.
-	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
-		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
-		}
-		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work
-			f.stepState = stateDict
-			return
-		}
-		goto readLiteral
-	}
-}
-
 // Copy a single uncompressed data block from input to output.
 func (f *decompressor) dataBlock() {
 	// Uncompressed.
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/inflate_gen.go
index cc6db279..8d632cea 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/inflate_gen.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/inflate_gen.go
@@ -21,6 +21,11 @@ func (f *decompressor) huffmanBytesBuffer() {
 	)
 	fr := f.r.(*bytes.Buffer)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb := f.nb, f.b
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -39,41 +44,35 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -88,10 +87,12 @@ readLiteral:
 				f.toRead = f.dict.readFlush()
 				f.step = (*decompressor).huffmanBytesBuffer
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -101,9 +102,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -111,25 +113,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -137,12 +141,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -152,38 +156,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -197,9 +198,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -207,14 +209,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -224,6 +228,7 @@ readLiteral:
 
 		// No check on length; encoding can be prescient.
 		if dist > uint32(f.dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
 			}
@@ -248,10 +253,12 @@ copyHistory:
 			f.toRead = f.dict.readFlush()
 			f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -265,6 +272,11 @@ func (f *decompressor) huffmanBytesReader() {
 	)
 	fr := f.r.(*bytes.Reader)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use fnb and fb,
+	// inline call to moreBits and reassign fb,fnb back to f on return.
+	fnb, fb := f.nb, f.b
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -283,41 +295,35 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -332,10 +338,12 @@ readLiteral:
 				f.toRead = f.dict.readFlush()
 				f.step = (*decompressor).huffmanBytesReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -345,9 +353,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -355,25 +364,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -381,12 +392,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -396,38 +407,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -441,9 +449,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -451,14 +460,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -468,6 +479,7 @@ readLiteral:
 
 		// No check on length; encoding can be prescient.
 		if dist > uint32(f.dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
 			}
@@ -492,10 +504,12 @@ copyHistory:
 			f.toRead = f.dict.readFlush()
 			f.step = (*decompressor).huffmanBytesReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -509,6 +523,11 @@ func (f *decompressor) huffmanBufioReader() {
 	)
 	fr := f.r.(*bufio.Reader)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use fnb and fb,
+	// inline call to moreBits and reassign fb,fnb back to f on return.
+	fnb, fb := f.nb, f.b
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -527,41 +546,35 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -576,10 +589,12 @@ readLiteral:
 				f.toRead = f.dict.readFlush()
 				f.step = (*decompressor).huffmanBufioReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -589,9 +604,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -599,25 +615,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -625,12 +643,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -640,38 +658,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -685,9 +700,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -695,14 +711,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -712,6 +730,7 @@ readLiteral:
 
 		// No check on length; encoding can be prescient.
 		if dist > uint32(f.dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
 			}
@@ -736,10 +755,12 @@ copyHistory:
 			f.toRead = f.dict.readFlush()
 			f.step = (*decompressor).huffmanBufioReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -753,6 +774,11 @@ func (f *decompressor) huffmanStringsReader() {
 	)
 	fr := f.r.(*strings.Reader)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use fnb and fb,
+	// inline call to moreBits and reassign fb,fnb back to f on return.
+	fnb, fb := f.nb, f.b
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -771,41 +797,286 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var length int
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanStringsReader
+				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.b, f.nb = fb, fnb
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+		case v < maxNumLit:
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
+		}
+
+		var dist uint32
+		if f.hd == nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
+		} else {
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(f.dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanGenericReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use fnb and fb,
+	// inline call to moreBits and reassign fb,fnb back to f on return.
+	fnb, fb := f.nb, f.b
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -818,12 +1089,14 @@ readLiteral:
 			f.dict.writeByte(byte(v))
 			if f.dict.availWrite() == 0 {
 				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).huffmanStringsReader
+				f.step = (*decompressor).huffmanGenericReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -833,9 +1106,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -843,25 +1117,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -869,12 +1145,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -884,38 +1160,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -929,9 +1202,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -939,14 +1213,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -956,6 +1232,7 @@ readLiteral:
 
 		// No check on length; encoding can be prescient.
 		if dist > uint32(f.dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
 			}
@@ -978,12 +1255,14 @@ copyHistory:
 
 		if f.dict.availWrite() == 0 || f.copyLen > 0 {
 			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+			f.step = (*decompressor).huffmanGenericReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 func (f *decompressor) huffmanBlockDecoder() func() {
@@ -996,7 +1275,9 @@ func (f *decompressor) huffmanBlockDecoder() func() {
 		return f.huffmanBufioReader
 	case *strings.Reader:
 		return f.huffmanStringsReader
+	case Reader:
+		return f.huffmanGenericReader
 	default:
-		return f.huffmanBlockGeneric
+		return f.huffmanGenericReader
 	}
 }
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level1.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level1.go
index 1e5eea39..0f14f8d6 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level1.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level1.go
@@ -1,6 +1,10 @@
 package flate
 
-import "fmt"
+import (
+	"encoding/binary"
+	"fmt"
+	"math/bits"
+)
 
 // fastGen maintains the table for matches,
 // and the previous byte block for level 2.
@@ -116,7 +120,32 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 
 			// Extend the 4-byte match as long as possible.
 			t := candidate.offset - e.cur
-			l := e.matchlenLong(s+4, t+4, src) + 4
+			var l = int32(4)
+			if false {
+				l = e.matchlenLong(s+4, t+4, src) + 4
+			} else {
+				// inlined:
+				a := src[s+4:]
+				b := src[t+4:]
+				for len(a) >= 8 {
+					if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
+						l += int32(bits.TrailingZeros64(diff) >> 3)
+						break
+					}
+					l += 8
+					a = a[8:]
+					b = b[8:]
+				}
+				if len(a) < 8 {
+					b = b[:len(a)]
+					for i := range a {
+						if a[i] != b[i] {
+							break
+						}
+						l++
+					}
+				}
+			}
 
 			// Extend backwards
 			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
@@ -125,11 +154,43 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			// Save the match found
-			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			if false {
+				dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			} else {
+				// Inlined...
+				xoffset := uint32(s - t - baseMatchOffset)
+				xlength := l
+				oc := offsetCode(xoffset)
+				xoffset |= oc << 16
+				for xlength > 0 {
+					xl := xlength
+					if xl > 258 {
+						if xl > 258+baseMatchLength {
+							xl = 258
+						} else {
+							xl = 258 - baseMatchLength
+						}
+					}
+					xlength -= xl
+					xl -= baseMatchLength
+					dst.extraHist[lengthCodes1[uint8(xl)]]++
+					dst.offHist[oc]++
+					dst.tokens[dst.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
+					dst.n++
+				}
+			}
 			s += l
 			nextEmit = s
 			if nextS >= s {
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level2.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level2.go
index 5b986a19..8603fbd5 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level2.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level2.go
@@ -134,7 +134,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
@@ -155,7 +163,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 
 			// Store every second hash in-between, but offset by 1.
 			for i := s - l + 2; i < s-5; i += 7 {
-				x := load6432(src, int32(i))
+				x := load6432(src, i)
 				nextHash := hash4u(uint32(x), bTableBits)
 				e.table[nextHash] = tableEntry{offset: e.cur + i}
 				// Skip one
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level3.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level3.go
index c22b4244..039639f8 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level3.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level3.go
@@ -5,7 +5,7 @@ import "fmt"
 // fastEncL3
 type fastEncL3 struct {
 	fastGen
-	table [tableSize]tableEntryPrev
+	table [1 << 16]tableEntryPrev
 }
 
 // Encode uses a similar algorithm to level 2, will check up to two candidates.
@@ -13,6 +13,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 	const (
 		inputMargin            = 8 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		tableBits              = 16
+		tableSize              = 1 << tableBits
 	)
 
 	if debugDeflate && e.cur < 0 {
@@ -73,7 +75,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 		nextS := s
 		var candidate tableEntry
 		for {
-			nextHash := hash(cv)
+			nextHash := hash4u(cv, tableBits)
 			s = nextS
 			nextS = s + 1 + (s-nextEmit)>>skipLog
 			if nextS > sLimit {
@@ -141,7 +143,15 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
@@ -156,7 +166,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				// Index first pair after match end.
 				if int(t+4) < len(src) && t > 0 {
 					cv := load3232(src, t)
-					nextHash := hash(cv)
+					nextHash := hash4u(cv, tableBits)
 					e.table[nextHash] = tableEntryPrev{
 						Prev: e.table[nextHash].Cur,
 						Cur:  tableEntry{offset: e.cur + t},
@@ -165,30 +175,31 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				goto emitRemainder
 			}
 
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-3 to s.
-			x := load6432(src, s-3)
-			prevHash := hash(uint32(x))
-			e.table[prevHash] = tableEntryPrev{
-				Prev: e.table[prevHash].Cur,
-				Cur:  tableEntry{offset: e.cur + s - 3},
+			// Store every 5th hash in-between.
+			for i := s - l + 2; i < s-5; i += 5 {
+				nextHash := hash4u(load3232(src, i), tableBits)
+				e.table[nextHash] = tableEntryPrev{
+					Prev: e.table[nextHash].Cur,
+					Cur:  tableEntry{offset: e.cur + i}}
 			}
-			x >>= 8
-			prevHash = hash(uint32(x))
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 to s.
+			x := load6432(src, s-2)
+			prevHash := hash4u(uint32(x), tableBits)
 
 			e.table[prevHash] = tableEntryPrev{
 				Prev: e.table[prevHash].Cur,
 				Cur:  tableEntry{offset: e.cur + s - 2},
 			}
 			x >>= 8
-			prevHash = hash(uint32(x))
+			prevHash = hash4u(uint32(x), tableBits)
 
 			e.table[prevHash] = tableEntryPrev{
 				Prev: e.table[prevHash].Cur,
 				Cur:  tableEntry{offset: e.cur + s - 1},
 			}
 			x >>= 8
-			currHash := hash(uint32(x))
+			currHash := hash4u(uint32(x), tableBits)
 			candidates := e.table[currHash]
 			cv = uint32(x)
 			e.table[currHash] = tableEntryPrev{
@@ -200,15 +211,15 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 			candidate = candidates.Cur
 			minOffset := e.cur + s - (maxMatchOffset - 4)
 
-			if candidate.offset > minOffset && cv != load3232(src, candidate.offset-e.cur) {
-				// We only check if value mismatches.
-				// Offset will always be invalid in other cases.
+			if candidate.offset > minOffset {
+				if cv == load3232(src, candidate.offset-e.cur) {
+					// Found a match...
+					continue
+				}
 				candidate = candidates.Prev
 				if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) {
-					offset := s - (candidate.offset - e.cur)
-					if offset <= maxMatchOffset {
-						continue
-					}
+					// Match at prev...
+					continue
 				}
 			}
 			cv = uint32(x >> 8)
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level4.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level4.go
index e62f0c02..1cbffa1a 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level4.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level4.go
@@ -135,7 +135,15 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if debugDeflate {
 			if t >= s {
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level5.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level5.go
index d513f1ff..4b97576b 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level5.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level5.go
@@ -182,12 +182,27 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
 		// them as literal bytes.
 
-		// Extend the 4-byte match as long as possible.
 		if l == 0 {
+			// Extend the 4-byte match as long as possible.
 			l = e.matchlenLong(s+4, t+4, src) + 4
 		} else if l == maxMatchLength {
 			l += e.matchlenLong(s+l, t+l, src)
 		}
+
+		// Try to locate a better match by checking the end of best match...
+		if sAt := s + l; l < 30 && sAt < sLimit {
+			eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset
+			// Test current
+			t2 := eLong - e.cur - l
+			off := s - t2
+			if t2 >= 0 && off < maxMatchOffset && off > 0 {
+				if l2 := e.matchlenLong(s, t2, src); l2 > l {
+					t = t2
+					l = l2
+				}
+			}
+		}
+
 		// Extend backwards
 		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
 			s--
@@ -195,7 +210,15 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if debugDeflate {
 			if t >= s {
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level6.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level6.go
index a52c80ea..62888edf 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level6.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/level6.go
@@ -211,6 +211,31 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			l += e.matchlenLong(s+l, t+l, src)
 		}
 
+		// Try to locate a better match by checking the end-of-match...
+		if sAt := s + l; sAt < sLimit {
+			eLong := &e.bTable[hash7(load6432(src, sAt), tableBits)]
+			// Test current
+			t2 := eLong.Cur.offset - e.cur - l
+			off := s - t2
+			if off < maxMatchOffset {
+				if off > 0 && t2 >= 0 {
+					if l2 := e.matchlenLong(s, t2, src); l2 > l {
+						t = t2
+						l = l2
+					}
+				}
+				// Test next:
+				t2 = eLong.Prev.offset - e.cur - l
+				off := s - t2
+				if off > 0 && off < maxMatchOffset && t2 >= 0 {
+					if l2 := e.matchlenLong(s, t2, src); l2 > l {
+						t = t2
+						l = l2
+					}
+				}
+			}
+		}
+
 		// Extend backwards
 		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
 			s--
@@ -218,7 +243,15 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if false {
 			if t >= s {
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/regmask_other.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/regmask_other.go
index f477a5d6..1b7a2cbd 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/regmask_other.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/regmask_other.go
@@ -1,4 +1,5 @@
-//+build !amd64
+//go:build !amd64
+// +build !amd64
 
 package flate
 
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/stateless.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/stateless.go
index 53e89912..544162a4 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/stateless.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/stateless.go
@@ -249,7 +249,15 @@ func statelessEnc(dst *tokens, src []byte, startAt int16) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			// Save the match found
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/token.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/token.go
index f9abf606..d818790c 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/token.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/flate/token.go
@@ -13,14 +13,16 @@ import (
 )
 
 const (
-	// 2 bits:   type   0 = literal  1=EOF  2=Match   3=Unused
-	// 8 bits:   xlength = length - MIN_MATCH_LENGTH
-	// 22 bits   xoffset = offset - MIN_OFFSET_SIZE, or literal
-	lengthShift = 22
-	offsetMask  = 1<<lengthShift - 1
-	typeMask    = 3 << 30
-	literalType = 0 << 30
-	matchType   = 1 << 30
+	// bits 0-16  	xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits
+	// bits 16-22	offsetcode - 5 bits
+	// bits 22-30   xlength = length - MIN_MATCH_LENGTH - 8 bits
+	// bits 30-32   type   0 = literal  1=EOF  2=Match   3=Unused - 2 bits
+	lengthShift         = 22
+	offsetMask          = 1<<lengthShift - 1
+	typeMask            = 3 << 30
+	literalType         = 0 << 30
+	matchType           = 1 << 30
+	matchOffsetOnlyMask = 0xffff
 )
 
 // The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
@@ -126,11 +128,11 @@ var offsetCodes14 = [256]uint32{
 type token uint32
 
 type tokens struct {
-	nLits     int
 	extraHist [32]uint16  // codes 256->maxnumlit
 	offHist   [32]uint16  // offset codes
 	litHist   [256]uint16 // codes 0->255
-	n         uint16      // Must be able to contain maxStoreBlockSize
+	nFilled   int
+	n         uint16 // Must be able to contain maxStoreBlockSize
 	tokens    [maxStoreBlockSize + 1]token
 }
 
@@ -139,7 +141,7 @@ func (t *tokens) Reset() {
 		return
 	}
 	t.n = 0
-	t.nLits = 0
+	t.nFilled = 0
 	for i := range t.litHist[:] {
 		t.litHist[i] = 0
 	}
@@ -158,12 +160,12 @@ func (t *tokens) Fill() {
 	for i, v := range t.litHist[:] {
 		if v == 0 {
 			t.litHist[i] = 1
-			t.nLits++
+			t.nFilled++
 		}
 	}
 	for i, v := range t.extraHist[:literalCount-256] {
 		if v == 0 {
-			t.nLits++
+			t.nFilled++
 			t.extraHist[i] = 1
 		}
 	}
@@ -187,26 +189,23 @@ func (t *tokens) indexTokens(in []token) {
 			t.AddLiteral(tok.literal())
 			continue
 		}
-		t.AddMatch(uint32(tok.length()), tok.offset())
+		t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask)
 	}
 }
 
 // emitLiteral writes a literal chunk and returns the number of bytes written.
 func emitLiteral(dst *tokens, lit []byte) {
-	ol := int(dst.n)
-	for i, v := range lit {
-		dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
+	for _, v := range lit {
+		dst.tokens[dst.n] = token(v)
 		dst.litHist[v]++
+		dst.n++
 	}
-	dst.n += uint16(len(lit))
-	dst.nLits += len(lit)
 }
 
 func (t *tokens) AddLiteral(lit byte) {
 	t.tokens[t.n] = token(lit)
 	t.litHist[lit]++
 	t.n++
-	t.nLits++
 }
 
 // from https://stackoverflow.com/a/28730362
@@ -227,12 +226,13 @@ func (t *tokens) EstimatedBits() int {
 	shannon := float32(0)
 	bits := int(0)
 	nMatches := 0
-	if t.nLits > 0 {
-		invTotal := 1.0 / float32(t.nLits)
+	total := int(t.n) + t.nFilled
+	if total > 0 {
+		invTotal := 1.0 / float32(total)
 		for _, v := range t.litHist[:] {
 			if v > 0 {
 				n := float32(v)
-				shannon += -mFastLog2(n*invTotal) * n
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
 			}
 		}
 		// Just add 15 for EOB
@@ -240,7 +240,7 @@ func (t *tokens) EstimatedBits() int {
 		for i, v := range t.extraHist[1 : literalCount-256] {
 			if v > 0 {
 				n := float32(v)
-				shannon += -mFastLog2(n*invTotal) * n
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
 				bits += int(lengthExtraBits[i&31]) * int(v)
 				nMatches += int(v)
 			}
@@ -251,7 +251,7 @@ func (t *tokens) EstimatedBits() int {
 		for i, v := range t.offHist[:offsetCodeCount] {
 			if v > 0 {
 				n := float32(v)
-				shannon += -mFastLog2(n*invTotal) * n
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
 				bits += int(offsetExtraBits[i&31]) * int(v)
 			}
 		}
@@ -270,11 +270,12 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
 			panic(fmt.Errorf("invalid offset: %v", xoffset))
 		}
 	}
-	t.nLits++
-	lengthCode := lengthCodes1[uint8(xlength)] & 31
+	oCode := offsetCode(xoffset)
+	xoffset |= oCode << 16
+
+	t.extraHist[lengthCodes1[uint8(xlength)]]++
+	t.offHist[oCode&31]++
 	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
-	t.extraHist[lengthCode]++
-	t.offHist[offsetCode(xoffset)&31]++
 	t.n++
 }
 
@@ -286,20 +287,23 @@ func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
 			panic(fmt.Errorf("invalid offset: %v", xoffset))
 		}
 	}
-	oc := offsetCode(xoffset) & 31
+	oc := offsetCode(xoffset)
+	xoffset |= oc << 16
 	for xlength > 0 {
 		xl := xlength
 		if xl > 258 {
 			// We need to have at least baseMatchLength left over for next loop.
-			xl = 258 - baseMatchLength
+			if xl > 258+baseMatchLength {
+				xl = 258
+			} else {
+				xl = 258 - baseMatchLength
+			}
 		}
 		xlength -= xl
-		xl -= 3
-		t.nLits++
-		lengthCode := lengthCodes1[uint8(xl)] & 31
+		xl -= baseMatchLength
+		t.extraHist[lengthCodes1[uint8(xl)]]++
+		t.offHist[oc&31]++
 		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
-		t.extraHist[lengthCode]++
-		t.offHist[oc]++
 		t.n++
 	}
 }
@@ -354,8 +358,8 @@ func (t token) offset() uint32 { return uint32(t) & offsetMask }
 
 func (t token) length() uint8 { return uint8(t >> lengthShift) }
 
-// The code is never more than 8 bits, but is returned as uint32 for convenience.
-func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) }
+// Convert length to code.
+func lengthCode(len uint8) uint8 { return lengthCodes[len] }
 
 // Returns the offset code corresponding to a specific offset
 func offsetCode(off uint32) uint32 {
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/gzip/gunzip.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/gzip/gunzip.go
index 568b5d4f..4d701891 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/gzip/gunzip.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/gzip/gunzip.go
@@ -8,8 +8,8 @@ package gzip
 
 import (
 	"bufio"
+	"compress/gzip"
 	"encoding/binary"
-	"errors"
 	"hash/crc32"
 	"io"
 	"time"
@@ -30,9 +30,9 @@ const (
 
 var (
 	// ErrChecksum is returned when reading GZIP data that has an invalid checksum.
-	ErrChecksum = errors.New("gzip: invalid checksum")
+	ErrChecksum = gzip.ErrChecksum
 	// ErrHeader is returned when reading GZIP data that has an invalid header.
-	ErrHeader = errors.New("gzip: invalid header")
+	ErrHeader = gzip.ErrHeader
 )
 
 var le = binary.LittleEndian
@@ -75,6 +75,7 @@ type Header struct {
 type Reader struct {
 	Header       // valid after NewReader or Reader.Reset
 	r            flate.Reader
+	br           *bufio.Reader
 	decompressor io.ReadCloser
 	digest       uint32 // CRC-32, IEEE polynomial (section 8)
 	size         uint32 // Uncompressed size (section 2.3.1)
@@ -109,7 +110,13 @@ func (z *Reader) Reset(r io.Reader) error {
 	if rr, ok := r.(flate.Reader); ok {
 		z.r = rr
 	} else {
-		z.r = bufio.NewReader(r)
+		// Reuse if we can.
+		if z.br != nil {
+			z.br.Reset(r)
+		} else {
+			z.br = bufio.NewReader(r)
+		}
+		z.r = z.br
 	}
 	z.Header, z.err = z.readHeader()
 	return z.err
diff --git a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/zlib/reader.go b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/zlib/reader.go
index d9091e83..f127d477 100644
--- a/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/zlib/reader.go
+++ b/http-benchmark/fasthttp/vendor/github.com/klauspost/compress/zlib/reader.go
@@ -25,7 +25,7 @@ package zlib
 
 import (
 	"bufio"
-	"errors"
+	"compress/zlib"
 	"hash"
 	"hash/adler32"
 	"io"
@@ -37,11 +37,11 @@ const zlibDeflate = 8
 
 var (
 	// ErrChecksum is returned when reading ZLIB data that has an invalid checksum.
-	ErrChecksum = errors.New("zlib: invalid checksum")
+	ErrChecksum = zlib.ErrChecksum
 	// ErrDictionary is returned when reading ZLIB data that has an invalid dictionary.
-	ErrDictionary = errors.New("zlib: invalid dictionary")
+	ErrDictionary = zlib.ErrDictionary
 	// ErrHeader is returned when reading ZLIB data that has an invalid header.
-	ErrHeader = errors.New("zlib: invalid header")
+	ErrHeader = zlib.ErrHeader
 )
 
 type reader struct {
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/.gitignore b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/.gitignore
index 4d110c67..df53ec19 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/.gitignore
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/.gitignore
@@ -4,3 +4,4 @@ tags
 *.fasthttp.br
 .idea
 .DS_Store
+vendor/
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/.travis.yml b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/.travis.yml
deleted file mode 100644
index 44a0fcaa..00000000
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/.travis.yml
+++ /dev/null
@@ -1,73 +0,0 @@
-language: go
-arch:
-  - amd64
-  - ppc64le
-
-dist: bionic
-
-os:
-  - linux
-  - osx
-  - windows
-go:
-  - 1.15.x
-  - 1.14.x
-  - 1.13.x
-  - 1.12.x
-
-script:
-  - go test -v ./...
-
-env:
-  global:
-    secure: "v/F0oI9zE9mcpEp4AVdHzSSHbe5ZFtH6B0i/BiUXKdQRQ10+JMPDOFRJQti7yxjMwltyd/QSFmR50Fl108sQYpo4xdlEXMHp2Y6OAN6crrp6PuHbLYgDWu3df/cH7/BqDyIq1uX8KZEeQssnygYN8hN4tpJCUg+NIb40Lm57Zsodt8DVjjyDWQQFDL7soNyAwGwQIqEyJsn+NUieXWEB1Qnt0xUtPIReuLlrwXR8wC1nLEjG9yz4ftDHHQdhVbO2b+xGWyaJ7QB5ixztaQP8Jnny6kSW9j6zEhJVuzdZ6d3xz23ibCbzSXBHdIUEI9u6ifQj8BYXr8fFS0FB3++IxgAYSs3ybZ+qEwuAxSBBm6YNW+3FrfDknVwTQscjKqnXPisjUqaRC9b31hke0tXzBq1488hE+wxMXeDM4LwWT5IMEO2gz0WGQXxmdVit72DIjCZxJkf1TvZZ0YH7Y//6wJTYYP9xulsy4gqu8CuFdWiF3fiGc3p5DTIS75nJ/Yy76Sa1pRPASKCujfLxtHE6Mt0XKvSolIXklYIzBkjN6vn80N6JIrqtqlimBGPW/Ec6+dwbmRe2AcOKRl4y7pZsGYhJhqdue1mucUYO/e2QeBZJGkqqG+zF5AW0v8x29BHvMwViAonc8o9eelkJ8khYzc/Qeq05pZnR/N/Pqfc+68k="
-
-before_install:
-  - go get -t -v ./...
-
-jobs:
-  include:
-    - stage: cross compilation
-      os:
-        - linux
-      go:
-        - 1.15.x
-      script:
-        - GOOS=linux go build
-        - GOOS=darwin go build
-        - GOOS=freebsd go build
-        - GOOS=windows go build
-        - GOARCH=386 go build
-    - stage: cross compilation
-      os:
-        - osx
-      go:
-        - 1.15.x
-      script:
-        - GOOS=linux go build
-        - GOOS=darwin go build
-        - GOOS=freebsd go build
-        - GOOS=windows go build
-    - stage: race detector
-      os:
-        - linux
-      go:
-        - 1.15.x
-      script:
-        - go test -race -v ./...
-    - stage: race detector
-      os:
-        - osx
-      go:
-        - 1.15.x
-      script:
-        - go test -race -v ./...
-    - stage: race detector
-      arch: ppc64le
-      os:
-        - linux
-      go:
-        - 1.15.x
-      script:
-        - go test -race -v ./...
-
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/README.md b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/README.md
index 6d93d3fe..c11d9cef 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/README.md
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/README.md
@@ -1,9 +1,15 @@
-# fasthttp [![Build Status](https://travis-ci.org/valyala/fasthttp.svg?branch=master)](https://travis-ci.org/valyala/fasthttp?branch=master) [![GoDoc](https://godoc.org/github.com/valyala/fasthttp?status.svg)](http://godoc.org/github.com/valyala/fasthttp) [![Go Report](https://goreportcard.com/badge/github.com/valyala/fasthttp)](https://goreportcard.com/report/github.com/valyala/fasthttp) [![Sourcegraph](https://sourcegraph.com/github.com/valyala/fasthttp/-/badge.svg)](https://sourcegraph.com/github.com/valyala/fasthttp?badge)
+# fasthttp [![GoDoc](https://godoc.org/github.com/valyala/fasthttp?status.svg)](http://godoc.org/github.com/valyala/fasthttp) [![Go Report](https://goreportcard.com/badge/github.com/valyala/fasthttp)](https://goreportcard.com/report/github.com/valyala/fasthttp)
 
 ![FastHTTP – Fastest and reliable HTTP implementation in Go](https://github.com/fasthttp/docs-assets/raw/master/banner@0.5.png)
 
 Fast HTTP implementation for Go.
 
+# fasthttp might not be for you!
+fasthttp was designed for some high performance edge cases. **Unless** your server/client needs to handle **thousands of small to medium requests per second** and needs a consistent low millisecond response time, fasthttp might not be for you. **For most cases `net/http` is much better** as it's easier to use and can handle more cases. For most cases you won't even notice the performance difference.
+
+
+## General info and links
+
 Currently fasthttp is successfully used by [VertaMedia](https://vertamedia.com/)
 in a production serving up to 200K rps from more than 1.5M concurrent keep-alive
 connections per physical server.
@@ -34,7 +40,7 @@ connections per physical server.
 
 [FAQ](#faq)
 
-# HTTP server performance comparison with [net/http](https://golang.org/pkg/net/http/)
+## HTTP server performance comparison with [net/http](https://golang.org/pkg/net/http/)
 
 In short, fasthttp server is up to 10 times faster than net/http.
 Below are benchmark results.
@@ -95,7 +101,7 @@ BenchmarkServerGet10ReqPerConn10KClients-4              	30000000	       346 ns/
 BenchmarkServerGet100ReqPerConn10KClients-4             	50000000	       282 ns/op	       0 B/op	       0 allocs/op
 ```
 
-# HTTP client comparison with net/http
+## HTTP client comparison with net/http
 
 In short, fasthttp client is up to 10 times faster than net/http.
 Below are benchmark results.
@@ -157,14 +163,14 @@ BenchmarkClientGetEndToEnd1000Inmemory-4                	10000000	      1316 ns/
 ```
 
 
-# Install
+## Install
 
 ```
 go get -u github.com/valyala/fasthttp
 ```
 
 
-# Switching from net/http to fasthttp
+## Switching from net/http to fasthttp
 
 Unfortunately, fasthttp doesn't provide API identical to net/http.
 See the [FAQ](#faq) for details.
@@ -393,17 +399,17 @@ instead of [html/template](https://golang.org/pkg/html/template/).
 [expvarhandler](https://godoc.org/github.com/valyala/fasthttp/expvarhandler).
 
 
-# Performance optimization tips for multi-core systems
+## Performance optimization tips for multi-core systems
 
 * Use [reuseport](https://godoc.org/github.com/valyala/fasthttp/reuseport) listener.
 * Run a separate server instance per CPU core with GOMAXPROCS=1.
 * Pin each server instance to a separate CPU core using [taskset](http://linux.die.net/man/1/taskset).
 * Ensure the interrupts of multiqueue network card are evenly distributed between CPU cores.
   See [this article](https://blog.cloudflare.com/how-to-achieve-low-latency/) for details.
-* Use Go 1.13 as it provides some considerable performance improvements.
+* Use the latest version of Go as each version contains performance improvements.
 
 
-# Fasthttp best practices
+## Fasthttp best practices
 
 * Do not allocate objects and `[]byte` buffers - just reuse them as much
   as possible. Fasthttp API design encourages this.
@@ -424,7 +430,7 @@ instead of [html/template](https://golang.org/pkg/html/template/).
   [html/template](https://golang.org/pkg/html/template/) in your webserver.
 
 
-# Tricks with `[]byte` buffers
+## Tricks with `[]byte` buffers
 
 The following tricks are used by fasthttp. Use them in your code too.
 
@@ -479,12 +485,34 @@ statusCode, body, err := fasthttp.Get(nil, "http://google.com/")
 uintBuf := fasthttp.AppendUint(nil, 1234)
 ```
 
-# Related projects
+* String and `[]byte` buffers may be converted without memory allocations
+```go
+func b2s(b []byte) string {
+    return *(*string)(unsafe.Pointer(&b))
+}
+
+func s2b(s string) (b []byte) {
+    bh := (*reflect.SliceHeader)(unsafe.Pointer(&b))
+    sh := (*reflect.StringHeader)(unsafe.Pointer(&s))
+    bh.Data = sh.Data
+    bh.Cap = sh.Len
+    bh.Len = sh.Len
+    return b
+}
+```
+
+### Warning:
+This is an **unsafe** approach: the resulting string and `[]byte` buffer share the same bytes.
+
+**Please make sure not to modify the bytes in the `[]byte` buffer if the string still survives!**
+
+## Related projects
 
   * [fasthttp](https://github.com/fasthttp) - various useful
     helpers for projects based on fasthttp.
   * [fasthttp-routing](https://github.com/qiangxue/fasthttp-routing) - fast and
     powerful routing package for fasthttp servers.
+  * [http2](https://github.com/dgrr/http2) - HTTP/2 implementation for fasthttp.
   * [router](https://github.com/fasthttp/router) - a high
     performance fasthttp request router that scales well.
   * [fastws](https://github.com/fasthttp/fastws) - Bloatless WebSocket package made for fasthttp
@@ -494,6 +522,8 @@ uintBuf := fasthttp.AppendUint(nil, 1234)
     go middleware web framework which is based on fasthttp.
   * [websocket](https://github.com/fasthttp/websocket) - Gorilla-based
     websocket implementation for fasthttp.
+  * [websocket](https://github.com/dgrr/websocket) - Event-based high-performance WebSocket library for zero-allocation
+    websocket servers and clients.
   * [fasthttpsession](https://github.com/phachon/fasthttpsession) - a fast and powerful session package for fasthttp servers.
   * [atreugo](https://github.com/savsgio/atreugo) - High performance and extensible micro web framework with zero memory allocations in hot paths.
   * [kratgo](https://github.com/savsgio/kratgo) - Simple, lightweight and ultra-fast HTTP Cache to speed up your websites.
@@ -502,7 +532,7 @@ uintBuf := fasthttp.AppendUint(nil, 1234)
   * [Gearbox](https://github.com/gogearbox/gearbox) - :gear: gearbox is a web framework written in Go with a focus on high performance and memory optimization
 
 
-# FAQ
+## FAQ
 
 * *Why creating yet another http package instead of optimizing net/http?*
 
@@ -539,9 +569,10 @@ uintBuf := fasthttp.AppendUint(nil, 1234)
   * net/http supports [HTTP/2.0 starting from go1.6](https://http2.golang.org/).
   * net/http API is stable, while fasthttp API constantly evolves.
   * net/http handles more HTTP corner cases.
+  * net/http can stream both request and response bodies
+  * net/http can handle bigger bodies as it doesn't read the whole body into memory
   * net/http should contain less bugs, since it is used and tested by much
     wider audience.
-  * net/http works on Go older than 1.5.
 
 * *Why fasthttp API prefers returning `[]byte` instead of `string`?*
 
@@ -552,10 +583,7 @@ uintBuf := fasthttp.AppendUint(nil, 1234)
 
 * *Which GO versions are supported by fasthttp?*
 
-  Go1.5+. Older versions won't be supported, since their standard package
-  [miss useful functions](https://github.com/valyala/fasthttp/issues/5).
-
-  **NOTE**: Go 1.9.7 is the oldest tested version. We recommend you to update as soon as you can. As of 1.11.3 we will drop 1.9.x support.
+  Go 1.15.x and newer. Older versions won't be supported.
 
 * *Please provide real benchmark data and server information*
 
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/args.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/args.go
index 07600f9d..a8e43941 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/args.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/args.go
@@ -110,7 +110,8 @@ func (a *Args) String() string {
 
 // QueryString returns query string for the args.
 //
-// The returned value is valid until the next call to Args methods.
+// The returned value is valid until the Args is reused or released (ReleaseArgs).
+// Do not store references to the returned value. Make copies instead.
 func (a *Args) QueryString() []byte {
 	a.buf = a.AppendBytes(a.buf[:0])
 	return a.buf
@@ -241,14 +242,16 @@ func (a *Args) SetBytesKNoValue(key []byte) {
 
 // Peek returns query arg value for the given key.
 //
-// Returned value is valid until the next Args call.
+// The returned value is valid until the Args is reused or released (ReleaseArgs).
+// Do not store references to the returned value. Make copies instead.
 func (a *Args) Peek(key string) []byte {
 	return peekArgStr(a.args, key)
 }
 
 // PeekBytes returns query arg value for the given key.
 //
-// Returned value is valid until the next Args call.
+// The returned value is valid until the Args is reused or released (ReleaseArgs).
+// Do not store references to the returned value. Make copies instead.
 func (a *Args) PeekBytes(key []byte) []byte {
 	return peekArgBytes(a.args, key)
 }
@@ -358,6 +361,13 @@ func visitArgs(args []argsKV, f func(k, v []byte)) {
 	}
 }
 
+func visitArgsKey(args []argsKV, f func(k []byte)) {
+	for i, n := 0, len(args); i < n; i++ {
+		kv := &args[i]
+		f(kv.key)
+	}
+}
+
 func copyArgs(dst, src []argsKV) []argsKV {
 	if cap(dst) < len(src) {
 		tmp := make([]argsKV, len(src))
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv.go
index 813376e9..bf582afd 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv.go
@@ -98,7 +98,7 @@ func ParseIPv4(dst net.IP, ipStr []byte) (net.IP, error) {
 		}
 		v, err := ParseUint(b[:n])
 		if err != nil {
-			return dst, fmt.Errorf("cannot parse ipStr %q: %s", ipStr, err)
+			return dst, fmt.Errorf("cannot parse ipStr %q: %w", ipStr, err)
 		}
 		if v > 255 {
 			return dst, fmt.Errorf("cannot parse ipStr %q: ip part cannot exceed 255: parsed %d", ipStr, v)
@@ -108,7 +108,7 @@ func ParseIPv4(dst net.IP, ipStr []byte) (net.IP, error) {
 	}
 	v, err := ParseUint(b)
 	if err != nil {
-		return dst, fmt.Errorf("cannot parse ipStr %q: %s", ipStr, err)
+		return dst, fmt.Errorf("cannot parse ipStr %q: %w", ipStr, err)
 	}
 	if v > 255 {
 		return dst, fmt.Errorf("cannot parse ipStr %q: ip part cannot exceed 255: parsed %d", ipStr, v)
@@ -345,8 +345,8 @@ func s2b(s string) (b []byte) {
 	/* #nosec G103 */
 	sh := (*reflect.StringHeader)(unsafe.Pointer(&s))
 	bh.Data = sh.Data
-	bh.Len = sh.Len
 	bh.Cap = sh.Len
+	bh.Len = sh.Len
 	return b
 }
 
@@ -380,7 +380,7 @@ func appendQuotedPath(dst, src []byte) []byte {
 
 	for _, c := range src {
 		if quotedPathShouldEscapeTable[int(c)] != 0 {
-			dst = append(dst, '%', upperhex[c>>4], upperhex[c&15])
+			dst = append(dst, '%', upperhex[c>>4], upperhex[c&0xf])
 		} else {
 			dst = append(dst, c)
 		}
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv_32.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv_32.go
index 1b3e5574..6a6fec23 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv_32.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv_32.go
@@ -1,3 +1,4 @@
+//go:build !amd64 && !arm64 && !ppc64 && !ppc64le
 // +build !amd64,!arm64,!ppc64,!ppc64le
 
 package fasthttp
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv_64.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv_64.go
index dc866947..1300d5ae 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv_64.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/bytesconv_64.go
@@ -1,3 +1,4 @@
+//go:build amd64 || arm64 || ppc64 || ppc64le
 // +build amd64 arm64 ppc64 ppc64le
 
 package fasthttp
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/client.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/client.go
index f135ec37..b36ca408 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/client.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/client.go
@@ -1,8 +1,9 @@
+// go:build !windows || !race
+
 package fasthttp
 
 import (
 	"bufio"
-	"bytes"
 	"crypto/tls"
 	"errors"
 	"fmt"
@@ -176,6 +177,8 @@ var defaultClient Client
 // Copying Client by value is prohibited. Create new instance instead.
 //
 // It is safe calling Client methods from concurrently running goroutines.
+//
+// The fields of a Client should not be changed while it is in use.
 type Client struct {
 	noCopy noCopy //nolint:unused,structcheck
 
@@ -294,9 +297,12 @@ type Client struct {
 	// By default will use isIdempotent function
 	RetryIf RetryIfFunc
 
-	mLock sync.Mutex
-	m     map[string]*HostClient
-	ms    map[string]*HostClient
+	// ConfigureClient configures the fasthttp.HostClient.
+	ConfigureClient func(hc *HostClient) error
+
+	mLock      sync.Mutex
+	m          map[string]*HostClient
+	ms         map[string]*HostClient
 	readerPool sync.Pool
 	writerPool sync.Pool
 }
@@ -460,11 +466,10 @@ func (c *Client) Do(req *Request, resp *Response) error {
 	host := uri.Host()
 
 	isTLS := false
-	scheme := uri.Scheme()
-	if bytes.Equal(scheme, strHTTPS) {
+	if uri.isHttps() {
 		isTLS = true
-	} else if !bytes.Equal(scheme, strHTTP) {
-		return fmt.Errorf("unsupported protocol %q. http and https are supported", scheme)
+	} else if !uri.isHttp() {
+		return fmt.Errorf("unsupported protocol %q. http and https are supported", uri.Scheme())
 	}
 
 	startCleaner := false
@@ -508,11 +513,22 @@ func (c *Client) Do(req *Request, resp *Response) error {
 			clientReaderPool:              &c.readerPool,
 			clientWriterPool:              &c.writerPool,
 		}
+
+		if c.ConfigureClient != nil {
+			if err := c.ConfigureClient(hc); err != nil {
+				return err
+			}
+		}
+
 		m[string(host)] = hc
 		if len(m) == 1 {
 			startCleaner = true
 		}
 	}
+
+	atomic.AddInt32(&hc.pendingClientRequests, 1)
+	defer atomic.AddInt32(&hc.pendingClientRequests, -1)
+
 	c.mLock.Unlock()
 
 	if startCleaner {
@@ -540,16 +556,21 @@ func (c *Client) CloseIdleConnections() {
 func (c *Client) mCleaner(m map[string]*HostClient) {
 	mustStop := false
 
+	sleep := c.MaxIdleConnDuration
+	if sleep < time.Second {
+		sleep = time.Second
+	} else if sleep > 10*time.Second {
+		sleep = 10 * time.Second
+	}
+
 	for {
 		c.mLock.Lock()
 		for k, v := range m {
 			v.connsLock.Lock()
-			shouldRemove := v.connsCount == 0
-			v.connsLock.Unlock()
-
-			if shouldRemove {
+			if v.connsCount == 0 && atomic.LoadInt32(&v.pendingClientRequests) == 0 {
 				delete(m, k)
 			}
+			v.connsLock.Unlock()
 		}
 		if len(m) == 0 {
 			mustStop = true
@@ -559,7 +580,7 @@ func (c *Client) mCleaner(m map[string]*HostClient) {
 		if mustStop {
 			break
 		}
-		time.Sleep(10 * time.Second)
+		time.Sleep(sleep)
 	}
 }
 
@@ -594,6 +615,9 @@ type DialFunc func(addr string) (net.Conn, error)
 // Request argument passed to RetryIfFunc, if there are any request errors.
 type RetryIfFunc func(request *Request) bool
 
+// TransportFunc wraps every request/response.
+type TransportFunc func(*Request, *Response) error
+
 // HostClient balances http requests among hosts listed in Addr.
 //
 // HostClient may be used for balancing load among multiple upstream hosts.
@@ -745,6 +769,9 @@ type HostClient struct {
 	// By default will use isIdempotent function
 	RetryIf RetryIfFunc
 
+	// Transport defines a transport-like mechanism that wraps every request/response.
+	Transport TransportFunc
+
 	clientName  atomic.Value
 	lastUseTime uint32
 
@@ -768,6 +795,10 @@ type HostClient struct {
 
 	pendingRequests int32
 
+	// pendingClientRequests counts the number of requests that a Client is currently running using this HostClient.
+	// It will be incremented earlier than pendingRequests and will be used by Client to see if the HostClient is still in use.
+	pendingClientRequests int32
+
 	connsCleanerRun bool
 }
 
@@ -935,7 +966,7 @@ var clientURLResponseChPool sync.Pool
 
 func clientPostURL(dst []byte, url string, postArgs *Args, c clientDoer) (statusCode int, body []byte, err error) {
 	req := AcquireRequest()
-	req.Header.SetMethodBytes(strPost)
+	req.Header.SetMethod(MethodPost)
 	req.Header.SetContentTypeBytes(strPostArgsContentType)
 	if postArgs != nil {
 		if _, err := postArgs.WriteTo(req.BodyWriter()); err != nil {
@@ -1346,7 +1377,7 @@ func (c *HostClient) doNonNilReqResp(req *Request, resp *Response) (bool, error)
 	req.secureErrorLogMessage = c.SecureErrorLogMessage
 	req.Header.secureErrorLogMessage = c.SecureErrorLogMessage
 
-	if c.IsTLS != bytes.Equal(req.uri.Scheme(), strHTTPS) {
+	if c.IsTLS != req.URI().isHttps() {
 		return false, ErrHostClientRedirectToDifferentScheme
 	}
 
@@ -1360,8 +1391,16 @@ func (c *HostClient) doNonNilReqResp(req *Request, resp *Response) (bool, error)
 	resp.Reset()
 	resp.SkipBody = customSkipBody
 
-	if c.DisablePathNormalizing {
-		req.URI().DisablePathNormalizing = true
+	req.URI().DisablePathNormalizing = c.DisablePathNormalizing
+
+	userAgentOld := req.Header.UserAgent()
+	if len(userAgentOld) == 0 {
+		req.Header.userAgent = append(req.Header.userAgent[:0], c.getClientName()...)
+	}
+
+	if c.Transport != nil {
+		err := c.Transport(req, resp)
+		return err == nil, err
 	}
 
 	cc, err := c.acquireConn(req.timeout, req.ConnectionClose())
@@ -1388,10 +1427,6 @@ func (c *HostClient) doNonNilReqResp(req *Request, resp *Response) (bool, error)
 		resetConnection = true
 	}
 
-	userAgentOld := req.Header.UserAgent()
-	if len(userAgentOld) == 0 {
-		req.Header.userAgent = append(req.Header.userAgent[:0], c.getClientName()...)
-	}
 	bw := c.acquireWriter(conn)
 	err = req.Write(bw)
 
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/cookie.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/cookie.go
index 9e9bd871..69f73285 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/cookie.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/cookie.go
@@ -85,12 +85,12 @@ type Cookie struct {
 // CopyTo copies src cookie to c.
 func (c *Cookie) CopyTo(src *Cookie) {
 	c.Reset()
-	c.key = append(c.key[:0], src.key...)
-	c.value = append(c.value[:0], src.value...)
+	c.key = append(c.key, src.key...)
+	c.value = append(c.value, src.value...)
 	c.expire = src.expire
 	c.maxAge = src.maxAge
-	c.domain = append(c.domain[:0], src.domain...)
-	c.path = append(c.path[:0], src.path...)
+	c.domain = append(c.domain, src.domain...)
+	c.path = append(c.path, src.path...)
 	c.httpOnly = src.httpOnly
 	c.secure = src.secure
 	c.sameSite = src.sameSite
@@ -149,7 +149,8 @@ func (c *Cookie) SetPathBytes(path []byte) {
 
 // Domain returns cookie domain.
 //
-// The returned domain is valid until the next Cookie modification method call.
+// The returned value is valid until the Cookie is reused or released (ReleaseCookie).
+// Do not store references to the returned value. Make copies instead.
 func (c *Cookie) Domain() []byte {
 	return c.domain
 }
@@ -201,7 +202,8 @@ func (c *Cookie) SetExpire(expire time.Time) {
 
 // Value returns cookie value.
 //
-// The returned value is valid until the next Cookie modification method call.
+// The returned value is valid until the Cookie is reused or released (ReleaseCookie).
+// Do not store references to the returned value. Make copies instead.
 func (c *Cookie) Value() []byte {
 	return c.value
 }
@@ -218,7 +220,8 @@ func (c *Cookie) SetValueBytes(value []byte) {
 
 // Key returns cookie name.
 //
-// The returned value is valid until the next Cookie modification method call.
+// The returned value is valid until the Cookie is reused or released (ReleaseCookie).
+// Do not store references to the returned value. Make copies instead.
 func (c *Cookie) Key() []byte {
 	return c.key
 }
@@ -306,7 +309,8 @@ func (c *Cookie) AppendBytes(dst []byte) []byte {
 
 // Cookie returns cookie representation.
 //
-// The returned value is valid until the next call to Cookie methods.
+// The returned value is valid until the Cookie is reused or released (ReleaseCookie).
+// Do not store references to the returned value. Make copies instead.
 func (c *Cookie) Cookie() []byte {
 	c.buf = c.AppendBytes(c.buf[:0])
 	return c.buf
@@ -345,8 +349,8 @@ func (c *Cookie) ParseBytes(src []byte) error {
 		return errNoCookies
 	}
 
-	c.key = append(c.key[:0], kv.key...)
-	c.value = append(c.value[:0], kv.value...)
+	c.key = append(c.key, kv.key...)
+	c.value = append(c.value, kv.value...)
 
 	for s.next(kv) {
 		if len(kv.key) != 0 {
@@ -378,29 +382,31 @@ func (c *Cookie) ParseBytes(src []byte) error {
 
 			case 'd': // "domain"
 				if caseInsensitiveCompare(strCookieDomain, kv.key) {
-					c.domain = append(c.domain[:0], kv.value...)
+					c.domain = append(c.domain, kv.value...)
 				}
 
 			case 'p': // "path"
 				if caseInsensitiveCompare(strCookiePath, kv.key) {
-					c.path = append(c.path[:0], kv.value...)
+					c.path = append(c.path, kv.value...)
 				}
 
 			case 's': // "samesite"
 				if caseInsensitiveCompare(strCookieSameSite, kv.key) {
-					// Case insensitive switch on first char
-					switch kv.value[0] | 0x20 {
-					case 'l': // "lax"
-						if caseInsensitiveCompare(strCookieSameSiteLax, kv.value) {
-							c.sameSite = CookieSameSiteLaxMode
-						}
-					case 's': // "strict"
-						if caseInsensitiveCompare(strCookieSameSiteStrict, kv.value) {
-							c.sameSite = CookieSameSiteStrictMode
-						}
-					case 'n': // "none"
-						if caseInsensitiveCompare(strCookieSameSiteNone, kv.value) {
-							c.sameSite = CookieSameSiteNoneMode
+					if len(kv.value) > 0 {
+						// Case insensitive switch on first char
+						switch kv.value[0] | 0x20 {
+						case 'l': // "lax"
+							if caseInsensitiveCompare(strCookieSameSiteLax, kv.value) {
+								c.sameSite = CookieSameSiteLaxMode
+							}
+						case 's': // "strict"
+							if caseInsensitiveCompare(strCookieSameSiteStrict, kv.value) {
+								c.sameSite = CookieSameSiteStrictMode
+							}
+						case 'n': // "none"
+							if caseInsensitiveCompare(strCookieSameSiteNone, kv.value) {
+								c.sameSite = CookieSameSiteNoneMode
+							}
 						}
 					}
 				}
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/fs.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/fs.go
index f8d4add9..72c832aa 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/fs.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/fs.go
@@ -30,6 +30,10 @@ import (
 // with good compression ratio.
 //
 // See also RequestCtx.SendFileBytes.
+//
+// WARNING: do not pass any user supplied paths to this function!
+// WARNING: if path is based on user input users will be able to request
+// any file on your filesystem! Use fasthttp.FS with a sane Root instead.
 func ServeFileBytesUncompressed(ctx *RequestCtx, path []byte) {
 	ServeFileUncompressed(ctx, b2s(path))
 }
@@ -43,6 +47,10 @@ func ServeFileBytesUncompressed(ctx *RequestCtx, path []byte) {
 // with good compression ratio.
 //
 // See also RequestCtx.SendFile.
+//
+// WARNING: do not pass any user supplied paths to this function!
+// WARNING: if path is based on user input users will be able to request
+// any file on your filesystem! Use fasthttp.FS with a sane Root instead.
 func ServeFileUncompressed(ctx *RequestCtx, path string) {
 	ctx.Request.Header.DelBytes(strAcceptEncoding)
 	ServeFile(ctx, path)
@@ -62,6 +70,10 @@ func ServeFileUncompressed(ctx *RequestCtx, path string) {
 // file contents.
 //
 // See also RequestCtx.SendFileBytes.
+//
+// WARNING: do not pass any user supplied paths to this function!
+// WARNING: if path is based on user input users will be able to request
+// any file on your filesystem! Use fasthttp.FS with a sane Root instead.
 func ServeFileBytes(ctx *RequestCtx, path []byte) {
 	ServeFile(ctx, b2s(path))
 }
@@ -79,6 +91,10 @@ func ServeFileBytes(ctx *RequestCtx, path []byte) {
 // Use ServeFileUncompressed is you don't need serving compressed file contents.
 //
 // See also RequestCtx.SendFile.
+//
+// WARNING: do not pass any user supplied paths to this function!
+// WARNING: if path is based on user input users will be able to request
+// any file on your filesystem! Use fasthttp.FS with a sane Root instead.
 func ServeFile(ctx *RequestCtx, path string) {
 	rootFSOnce.Do(func() {
 		rootFSHandler = rootFS.NewRequestHandler()
@@ -524,7 +540,7 @@ func (ff *fsFile) bigFileReader() (io.Reader, error) {
 
 	f, err := os.Open(ff.f.Name())
 	if err != nil {
-		return nil, fmt.Errorf("cannot open already opened file: %s", err)
+		return nil, fmt.Errorf("cannot open already opened file: %w", err)
 	}
 	return &bigFileReader{
 		f:  f,
@@ -981,7 +997,7 @@ func (h *fsHandler) openIndexFile(ctx *RequestCtx, dirPath string, mustCompress
 			return ff, nil
 		}
 		if !os.IsNotExist(err) {
-			return nil, fmt.Errorf("cannot open file %q: %s", indexFilePath, err)
+			return nil, fmt.Errorf("cannot open file %q: %w", indexFilePath, err)
 		}
 	}
 
@@ -1100,7 +1116,7 @@ func (h *fsHandler) compressAndOpenFSFile(filePath string, fileEncoding string)
 	fileInfo, err := f.Stat()
 	if err != nil {
 		f.Close()
-		return nil, fmt.Errorf("cannot obtain info for file %q: %s", filePath, err)
+		return nil, fmt.Errorf("cannot obtain info for file %q: %w", filePath, err)
 	}
 
 	if fileInfo.IsDir() {
@@ -1146,7 +1162,7 @@ func (h *fsHandler) compressFileNolock(f *os.File, fileInfo os.FileInfo, filePat
 	if err != nil {
 		f.Close()
 		if !os.IsPermission(err) {
-			return nil, fmt.Errorf("cannot create temporary file %q: %s", tmpFilePath, err)
+			return nil, fmt.Errorf("cannot create temporary file %q: %w", tmpFilePath, err)
 		}
 		return nil, errNoCreatePermission
 	}
@@ -1168,14 +1184,14 @@ func (h *fsHandler) compressFileNolock(f *os.File, fileInfo os.FileInfo, filePat
 	zf.Close()
 	f.Close()
 	if err != nil {
-		return nil, fmt.Errorf("error when compressing file %q to %q: %s", filePath, tmpFilePath, err)
+		return nil, fmt.Errorf("error when compressing file %q to %q: %w", filePath, tmpFilePath, err)
 	}
 	if err = os.Chtimes(tmpFilePath, time.Now(), fileInfo.ModTime()); err != nil {
 		return nil, fmt.Errorf("cannot change modification time to %s for tmp file %q: %s",
 			fileInfo.ModTime(), tmpFilePath, err)
 	}
 	if err = os.Rename(tmpFilePath, compressedFilePath); err != nil {
-		return nil, fmt.Errorf("cannot move compressed file from %q to %q: %s", tmpFilePath, compressedFilePath, err)
+		return nil, fmt.Errorf("cannot move compressed file from %q to %q: %w", tmpFilePath, compressedFilePath, err)
 	}
 	return h.newCompressedFSFile(compressedFilePath, fileEncoding)
 }
@@ -1183,12 +1199,12 @@ func (h *fsHandler) compressFileNolock(f *os.File, fileInfo os.FileInfo, filePat
 func (h *fsHandler) newCompressedFSFile(filePath string, fileEncoding string) (*fsFile, error) {
 	f, err := os.Open(filePath)
 	if err != nil {
-		return nil, fmt.Errorf("cannot open compressed file %q: %s", filePath, err)
+		return nil, fmt.Errorf("cannot open compressed file %q: %w", filePath, err)
 	}
 	fileInfo, err := f.Stat()
 	if err != nil {
 		f.Close()
-		return nil, fmt.Errorf("cannot obtain info for compressed file %q: %s", filePath, err)
+		return nil, fmt.Errorf("cannot obtain info for compressed file %q: %w", filePath, err)
 	}
 	return h.newFSFile(f, fileInfo, true, fileEncoding)
 }
@@ -1210,7 +1226,7 @@ func (h *fsHandler) openFSFile(filePath string, mustCompress bool, fileEncoding
 	fileInfo, err := f.Stat()
 	if err != nil {
 		f.Close()
-		return nil, fmt.Errorf("cannot obtain info for file %q: %s", filePath, err)
+		return nil, fmt.Errorf("cannot obtain info for file %q: %w", filePath, err)
 	}
 
 	if fileInfo.IsDir() {
@@ -1226,7 +1242,7 @@ func (h *fsHandler) openFSFile(filePath string, mustCompress bool, fileEncoding
 		fileInfoOriginal, err := os.Stat(filePathOriginal)
 		if err != nil {
 			f.Close()
-			return nil, fmt.Errorf("cannot obtain info for original file %q: %s", filePathOriginal, err)
+			return nil, fmt.Errorf("cannot obtain info for original file %q: %w", filePathOriginal, err)
 		}
 
 		// Only re-create the compressed file if there was more than a second between the mod times.
@@ -1257,7 +1273,7 @@ func (h *fsHandler) newFSFile(f *os.File, fileInfo os.FileInfo, compressed bool,
 	if len(contentType) == 0 {
 		data, err := readFileHeader(f, compressed, fileEncoding)
 		if err != nil {
-			return nil, fmt.Errorf("cannot read header of the file %q: %s", f.Name(), err)
+			return nil, fmt.Errorf("cannot read header of the file %q: %w", f.Name(), err)
 		}
 		contentType = http.DetectContentType(data)
 	}
@@ -1370,18 +1386,10 @@ func fsModTime(t time.Time) time.Time {
 	return t.In(time.UTC).Truncate(time.Second)
 }
 
-var (
-	filesLockMap     = make(map[string]*sync.Mutex)
-	filesLockMapLock sync.Mutex
-)
+var filesLockMap sync.Map
 
 func getFileLock(absPath string) *sync.Mutex {
-	filesLockMapLock.Lock()
-	flock := filesLockMap[absPath]
-	if flock == nil {
-		flock = &sync.Mutex{}
-		filesLockMap[absPath] = flock
-	}
-	filesLockMapLock.Unlock()
-	return flock
+	v, _ := filesLockMap.LoadOrStore(absPath, &sync.Mutex{})
+	filelock := v.(*sync.Mutex)
+	return filelock
 }
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/go.mod b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/go.mod
index b34e79e8..f5cfe1bb 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/go.mod
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/go.mod
@@ -1,14 +1,13 @@
 module github.com/valyala/fasthttp
 
-go 1.11
+go 1.12
 
 require (
-	github.com/andybalholm/brotli v1.0.1
-	github.com/klauspost/compress v1.11.8
+	github.com/andybalholm/brotli v1.0.4
+	github.com/klauspost/compress v1.15.0
 	github.com/valyala/bytebufferpool v1.0.0
-	github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a
-	golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83
-	golang.org/x/net v0.0.0-20210226101413-39120d07d75e
-	golang.org/x/sys v0.0.0-20210225134936-a50acf3fe073
-	golang.org/x/text v0.3.5 // indirect
+	github.com/valyala/tcplisten v1.0.0
+	golang.org/x/crypto v0.0.0-20220214200702-86341886e292
+	golang.org/x/net v0.0.0-20220225172249-27dd8689420f
+	golang.org/x/sys v0.0.0-20220227234510-4e6760a101f9
 )
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/go.sum b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/go.sum
index 601e5b29..8595e94e 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/go.sum
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/go.sum
@@ -1,40 +1,26 @@
-github.com/andybalholm/brotli v1.0.0 h1:7UCwP93aiSfvWpapti8g88vVVGp2qqtGyePsSuDafo4=
-github.com/andybalholm/brotli v1.0.0/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y=
-github.com/andybalholm/brotli v1.0.1 h1:KqhlKozYbRtJvsPrrEeXcO+N2l6NYT5A2QAFmSULpEc=
-github.com/andybalholm/brotli v1.0.1/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y=
-github.com/klauspost/compress v1.10.7 h1:7rix8v8GpI3ZBb0nSozFRgbtXKv+hOe+qfEpZqybrAg=
-github.com/klauspost/compress v1.10.7/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
-github.com/klauspost/compress v1.11.8 h1:difgzQsp5mdAz9v8lm3P/I+EpDKMU/6uTMw1y1FObuo=
-github.com/klauspost/compress v1.11.8/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
+github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
+github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
+github.com/klauspost/compress v1.15.0 h1:xqfchp4whNFxn5A4XFyyYtitiWI8Hy5EW59jEwcyL6U=
+github.com/klauspost/compress v1.15.0/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
 github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
 github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
-github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a h1:0R4NLDRDZX6JcmhJgXi5E4b8Wg84ihbmUKp/GvSPEzc=
-github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio=
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M=
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
-golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9 h1:psW17arqaxU48Z5kZ0CQnkZWQJsqcURM6tKiBApRjXI=
-golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83 h1:/ZScEX8SfEmUGRHs0gxpqteO5nfNW6axyZbBdw9A12g=
-golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I=
-golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/net v0.0.0-20201016165138-7b1cca2348c0 h1:5kGOVHlq0euqwzgTC9Vu15p6fV1Wi0ArVi8da2urnVg=
-golang.org/x/net v0.0.0-20201016165138-7b1cca2348c0/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
-golang.org/x/net v0.0.0-20210226101413-39120d07d75e h1:jIQURUJ9mlLvYwTBtRHm9h58rYhSonLvRvgAnP8Nr7I=
-golang.org/x/net v0.0.0-20210226101413-39120d07d75e/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
-golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA=
-golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVSA8=
+github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
+golang.org/x/crypto v0.0.0-20220214200702-86341886e292 h1:f+lwQ+GtmgoY+A2YaQxlSOnDjXcQ7ZRLWOHbC6HtRqE=
+golang.org/x/crypto v0.0.0-20220214200702-86341886e292/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
+golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20220225172249-27dd8689420f h1:oA4XRj0qtSt8Yo1Zms0CUlsT3KG69V2UGQWPBxujDmc=
+golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210225134936-a50acf3fe073 h1:8qxJSnu+7dRq6upnbntrmriWByIakBuct5OM/MdQC1M=
-golang.org/x/sys v0.0.0-20210225134936-a50acf3fe073/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220227234510-4e6760a101f9 h1:nhht2DYV/Sn3qOayu8lM+cU1ii9sTLUeBQwQQfUHtrs=
+golang.org/x/sys v0.0.0-20220227234510-4e6760a101f9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
-golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ=
-golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/header.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/header.go
index 3a552d9a..7b9df083 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/header.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/header.go
@@ -32,16 +32,19 @@ type ResponseHeader struct {
 	noDefaultContentType bool
 	noDefaultDate        bool
 
-	statusCode         int
-	contentLength      int
-	contentLengthBytes []byte
-	secureErrorLogMessage     bool
+	statusCode            int
+	statusMessage         []byte
+	protocol              []byte
+	contentLength         int
+	contentLengthBytes    []byte
+	secureErrorLogMessage bool
 
 	contentType []byte
 	server      []byte
 
-	h     []argsKV
-	bufKV argsKV
+	h       []argsKV
+	trailer []argsKV
+	bufKV   argsKV
 
 	cookies []argsKV
 }
@@ -56,17 +59,18 @@ type ResponseHeader struct {
 type RequestHeader struct {
 	noCopy noCopy //nolint:unused,structcheck
 
-	disableNormalizing bool
-	noHTTP11           bool
-	connectionClose    bool
+	disableNormalizing   bool
+	noHTTP11             bool
+	connectionClose      bool
+	noDefaultContentType bool
 
 	// These two fields have been moved close to other bool fields
 	// for reducing RequestHeader object size.
 	cookiesCollected bool
 
-	contentLength      int
-	contentLengthBytes []byte
-	secureErrorLogMessage     bool
+	contentLength         int
+	contentLengthBytes    []byte
+	secureErrorLogMessage bool
 
 	method      []byte
 	requestURI  []byte
@@ -75,8 +79,9 @@ type RequestHeader struct {
 	contentType []byte
 	userAgent   []byte
 
-	h     []argsKV
-	bufKV argsKV
+	h       []argsKV
+	trailer []argsKV
+	bufKV   argsKV
 
 	cookies []argsKV
 
@@ -136,6 +141,29 @@ func (h *ResponseHeader) SetStatusCode(statusCode int) {
 	h.statusCode = statusCode
 }
 
+// StatusMessage returns response status message.
+func (h *ResponseHeader) StatusMessage() []byte {
+	return h.statusMessage
+}
+
+// SetStatusMessage sets response status message bytes.
+func (h *ResponseHeader) SetStatusMessage(statusMessage []byte) {
+	h.statusMessage = append(h.statusMessage[:0], statusMessage...)
+}
+
+// Protocol returns response protocol bytes.
+func (h *ResponseHeader) Protocol() []byte {
+	if len(h.protocol) > 0 {
+		return h.protocol
+	}
+	return strHTTP11
+}
+
+// SetProtocol sets response protocol bytes.
+func (h *ResponseHeader) SetProtocol(protocol []byte) {
+	h.protocol = append(h.protocol[:0], protocol...)
+}
+
 // SetLastModified sets 'Last-Modified' header to the given value.
 func (h *ResponseHeader) SetLastModified(t time.Time) {
 	h.bufKV.value = AppendHTTPDate(h.bufKV.value[:0], t)
@@ -271,7 +299,11 @@ func (h *RequestHeader) SetContentLength(contentLength int) {
 func (h *ResponseHeader) isCompressibleContentType() bool {
 	contentType := h.ContentType()
 	return bytes.HasPrefix(contentType, strTextSlash) ||
-		bytes.HasPrefix(contentType, strApplicationSlash)
+		bytes.HasPrefix(contentType, strApplicationSlash) ||
+		bytes.HasPrefix(contentType, strImageSVG) ||
+		bytes.HasPrefix(contentType, strImageIcon) ||
+		bytes.HasPrefix(contentType, strFontSlash) ||
+		bytes.HasPrefix(contentType, strMultipartSlash)
 }
 
 // ContentType returns Content-Type header value.
@@ -353,6 +385,117 @@ func (h *RequestHeader) SetMultipartFormBoundaryBytes(boundary []byte) {
 	h.SetContentTypeBytes(h.bufKV.value)
 }
 
+// SetTrailer sets header Trailer value for chunked response
+// to indicate which headers will be sent after the body.
+//
+// Use Set to set the trailer header later.
+//
+// Trailers are only supported with chunked transfer.
+// Trailers allow the sender to include additional headers at the end of chunked messages.
+//
+// The following trailers are forbidden:
+// 1. necessary for message framing (e.g., Transfer-Encoding and Content-Length),
+// 2. routing (e.g., Host),
+// 3. request modifiers (e.g., controls and conditionals in Section 5 of [RFC7231]),
+// 4. authentication (e.g., see [RFC7235] and [RFC6265]),
+// 5. response control data (e.g., see Section 7.1 of [RFC7231]),
+// 6. determining how to process the payload (e.g., Content-Encoding, Content-Type, Content-Range, and Trailer)
+//
+// Return ErrBadTrailer if contain any forbidden trailers.
+func (h *ResponseHeader) SetTrailer(trailer string) error {
+	return h.SetTrailerBytes(s2b(trailer))
+}
+
+// SetTrailerBytes sets Trailer header value for chunked response
+// to indicate which headers will be sent after the body.
+//
+// Use Set to set the trailer header later.
+//
+// Trailers are only supported with chunked transfer.
+// Trailers allow the sender to include additional headers at the end of chunked messages.
+//
+// The following trailers are forbidden:
+// 1. necessary for message framing (e.g., Transfer-Encoding and Content-Length),
+// 2. routing (e.g., Host),
+// 3. request modifiers (e.g., controls and conditionals in Section 5 of [RFC7231]),
+// 4. authentication (e.g., see [RFC7235] and [RFC6265]),
+// 5. response control data (e.g., see Section 7.1 of [RFC7231]),
+// 6. determining how to process the payload (e.g., Content-Encoding, Content-Type, Content-Range, and Trailer)
+//
+// Return ErrBadTrailer if contain any forbidden trailers.
+func (h *ResponseHeader) SetTrailerBytes(trailer []byte) error {
+	h.trailer = h.trailer[:0]
+	return h.AddTrailerBytes(trailer)
+}
+
+// AddTrailer add Trailer header value for chunked response
+// to indicate which headers will be sent after the body.
+//
+// Use Set to set the trailer header later.
+//
+// Trailers are only supported with chunked transfer.
+// Trailers allow the sender to include additional headers at the end of chunked messages.
+//
+// The following trailers are forbidden:
+// 1. necessary for message framing (e.g., Transfer-Encoding and Content-Length),
+// 2. routing (e.g., Host),
+// 3. request modifiers (e.g., controls and conditionals in Section 5 of [RFC7231]),
+// 4. authentication (e.g., see [RFC7235] and [RFC6265]),
+// 5. response control data (e.g., see Section 7.1 of [RFC7231]),
+// 6. determining how to process the payload (e.g., Content-Encoding, Content-Type, Content-Range, and Trailer)
+//
+// Return ErrBadTrailer if contain any forbidden trailers.
+func (h *ResponseHeader) AddTrailer(trailer string) error {
+	return h.AddTrailerBytes(s2b(trailer))
+}
+
+var ErrBadTrailer = errors.New("contain forbidden trailer")
+
+// AddTrailerBytes add Trailer header value for chunked response
+// to indicate which headers will be sent after the body.
+//
+// Use Set to set the trailer header later.
+//
+// Trailers are only supported with chunked transfer.
+// Trailers allow the sender to include additional headers at the end of chunked messages.
+//
+// The following trailers are forbidden:
+// 1. necessary for message framing (e.g., Transfer-Encoding and Content-Length),
+// 2. routing (e.g., Host),
+// 3. request modifiers (e.g., controls and conditionals in Section 5 of [RFC7231]),
+// 4. authentication (e.g., see [RFC7235] and [RFC6265]),
+// 5. response control data (e.g., see Section 7.1 of [RFC7231]),
+// 6. determining how to process the payload (e.g., Content-Encoding, Content-Type, Content-Range, and Trailer)
+//
+// Return ErrBadTrailer if contain any forbidden trailers.
+func (h *ResponseHeader) AddTrailerBytes(trailer []byte) error {
+	var err error
+	for i := -1; i+1 < len(trailer); {
+		trailer = trailer[i+1:]
+		i = bytes.IndexByte(trailer, ',')
+		if i < 0 {
+			i = len(trailer)
+		}
+		key := trailer[:i]
+		for len(key) > 0 && key[0] == ' ' {
+			key = key[1:]
+		}
+		for len(key) > 0 && key[len(key)-1] == ' ' {
+			key = key[:len(key)-1]
+		}
+		// Forbidden by RFC 7230, section 4.1.2
+		if isBadTrailer(key) {
+			err = ErrBadTrailer
+			continue
+		}
+		h.bufKV.key = append(h.bufKV.key[:0], key...)
+		normalizeHeaderKey(h.bufKV.key, h.disableNormalizing)
+		h.trailer = appendArgBytes(h.trailer, h.bufKV.key, nil, argsNoValue)
+	}
+
+	return err
+}
+
 // MultipartFormBoundary returns boundary part
 // from 'multipart/form-data; boundary=...' Content-Type.
 func (h *RequestHeader) MultipartFormBoundary() []byte {
@@ -443,7 +586,7 @@ func (h *RequestHeader) SetRefererBytes(referer []byte) {
 // Method returns HTTP request method.
 func (h *RequestHeader) Method() []byte {
 	if len(h.method) == 0 {
-		return strGet
+		return []byte(MethodGet)
 	}
 	return h.method
 }
@@ -501,49 +644,158 @@ func (h *RequestHeader) SetRequestURIBytes(requestURI []byte) {
 	h.requestURI = append(h.requestURI[:0], requestURI...)
 }
 
+// SetTrailer sets Trailer header value for chunked request
+// to indicate which headers will be sent after the body.
+//
+// Use Set to set the trailer header later.
+//
+// Trailers are only supported with chunked transfer.
+// Trailers allow the sender to include additional headers at the end of chunked messages.
+//
+// The following trailers are forbidden:
+// 1. necessary for message framing (e.g., Transfer-Encoding and Content-Length),
+// 2. routing (e.g., Host),
+// 3. request modifiers (e.g., controls and conditionals in Section 5 of [RFC7231]),
+// 4. authentication (e.g., see [RFC7235] and [RFC6265]),
+// 5. response control data (e.g., see Section 7.1 of [RFC7231]),
+// 6. determining how to process the payload (e.g., Content-Encoding, Content-Type, Content-Range, and Trailer)
+//
+// Return ErrBadTrailer if contain any forbidden trailers.
+func (h *RequestHeader) SetTrailer(trailer string) error {
+	return h.SetTrailerBytes(s2b(trailer))
+}
+
+// SetTrailerBytes sets Trailer header value for chunked request
+// to indicate which headers will be sent after the body.
+//
+// Use Set to set the trailer header later.
+//
+// Trailers are only supported with chunked transfer.
+// Trailers allow the sender to include additional headers at the end of chunked messages.
+//
+// The following trailers are forbidden:
+// 1. necessary for message framing (e.g., Transfer-Encoding and Content-Length),
+// 2. routing (e.g., Host),
+// 3. request modifiers (e.g., controls and conditionals in Section 5 of [RFC7231]),
+// 4. authentication (e.g., see [RFC7235] and [RFC6265]),
+// 5. response control data (e.g., see Section 7.1 of [RFC7231]),
+// 6. determining how to process the payload (e.g., Content-Encoding, Content-Type, Content-Range, and Trailer)
+//
+// Return ErrBadTrailer if contain any forbidden trailers.
+func (h *RequestHeader) SetTrailerBytes(trailer []byte) error {
+	h.trailer = h.trailer[:0]
+	return h.AddTrailerBytes(trailer)
+}
+
+// AddTrailer add Trailer header value for chunked request
+// to indicate which headers will be sent after the body.
+//
+// Use Set to set the trailer header later.
+//
+// Trailers are only supported with chunked transfer.
+// Trailers allow the sender to include additional headers at the end of chunked messages.
+//
+// The following trailers are forbidden:
+// 1. necessary for message framing (e.g., Transfer-Encoding and Content-Length),
+// 2. routing (e.g., Host),
+// 3. request modifiers (e.g., controls and conditionals in Section 5 of [RFC7231]),
+// 4. authentication (e.g., see [RFC7235] and [RFC6265]),
+// 5. response control data (e.g., see Section 7.1 of [RFC7231]),
+// 6. determining how to process the payload (e.g., Content-Encoding, Content-Type, Content-Range, and Trailer)
+//
+// Return ErrBadTrailer if contain any forbidden trailers.
+func (h *RequestHeader) AddTrailer(trailer string) error {
+	return h.AddTrailerBytes(s2b(trailer))
+}
+
+// AddTrailerBytes add Trailer header value for chunked request
+// to indicate which headers will be sent after the body.
+//
+// Use Set to set the trailer header later.
+//
+// Trailers are only supported with chunked transfer.
+// Trailers allow the sender to include additional headers at the end of chunked messages.
+//
+// The following trailers are forbidden:
+// 1. necessary for message framing (e.g., Transfer-Encoding and Content-Length),
+// 2. routing (e.g., Host),
+// 3. request modifiers (e.g., controls and conditionals in Section 5 of [RFC7231]),
+// 4. authentication (e.g., see [RFC7235] and [RFC6265]),
+// 5. response control data (e.g., see Section 7.1 of [RFC7231]),
+// 6. determining how to process the payload (e.g., Content-Encoding, Content-Type, Content-Range, and Trailer)
+//
+// Return ErrBadTrailer if contain any forbidden trailers.
+func (h *RequestHeader) AddTrailerBytes(trailer []byte) error {
+	var err error
+	for i := -1; i+1 < len(trailer); {
+		trailer = trailer[i+1:]
+		i = bytes.IndexByte(trailer, ',')
+		if i < 0 {
+			i = len(trailer)
+		}
+		key := trailer[:i]
+		for len(key) > 0 && key[0] == ' ' {
+			key = key[1:]
+		}
+		for len(key) > 0 && key[len(key)-1] == ' ' {
+			key = key[:len(key)-1]
+		}
+		// Forbidden by RFC 7230, section 4.1.2
+		if isBadTrailer(key) {
+			err = ErrBadTrailer
+			continue
+		}
+		h.bufKV.key = append(h.bufKV.key[:0], key...)
+		normalizeHeaderKey(h.bufKV.key, h.disableNormalizing)
+		h.trailer = appendArgBytes(h.trailer, h.bufKV.key, nil, argsNoValue)
+	}
+
+	return err
+}
+
 // IsGet returns true if request method is GET.
 func (h *RequestHeader) IsGet() bool {
-	return bytes.Equal(h.Method(), strGet)
+	return string(h.Method()) == MethodGet
 }
 
 // IsPost returns true if request method is POST.
 func (h *RequestHeader) IsPost() bool {
-	return bytes.Equal(h.Method(), strPost)
+	return string(h.Method()) == MethodPost
 }
 
 // IsPut returns true if request method is PUT.
 func (h *RequestHeader) IsPut() bool {
-	return bytes.Equal(h.Method(), strPut)
+	return string(h.Method()) == MethodPut
 }
 
 // IsHead returns true if request method is HEAD.
 func (h *RequestHeader) IsHead() bool {
-	return bytes.Equal(h.Method(), strHead)
+	return string(h.Method()) == MethodHead
 }
 
 // IsDelete returns true if request method is DELETE.
 func (h *RequestHeader) IsDelete() bool {
-	return bytes.Equal(h.Method(), strDelete)
+	return string(h.Method()) == MethodDelete
 }
 
 // IsConnect returns true if request method is CONNECT.
 func (h *RequestHeader) IsConnect() bool {
-	return bytes.Equal(h.Method(), strConnect)
+	return string(h.Method()) == MethodConnect
 }
 
 // IsOptions returns true if request method is OPTIONS.
 func (h *RequestHeader) IsOptions() bool {
-	return bytes.Equal(h.Method(), strOptions)
+	return string(h.Method()) == MethodOptions
 }
 
 // IsTrace returns true if request method is TRACE.
 func (h *RequestHeader) IsTrace() bool {
-	return bytes.Equal(h.Method(), strTrace)
+	return string(h.Method()) == MethodTrace
 }
 
 // IsPatch returns true if request method is PATCH.
 func (h *RequestHeader) IsPatch() bool {
-	return bytes.Equal(h.Method(), strPatch)
+	return string(h.Method()) == MethodPatch
 }
 
 // IsHTTP11 returns true if the request is HTTP/1.1.
@@ -679,6 +931,8 @@ func (h *ResponseHeader) resetSkipNormalize() {
 	h.connectionClose = false
 
 	h.statusCode = 0
+	h.statusMessage = h.statusMessage[:0]
+	h.protocol = h.protocol[:0]
 	h.contentLength = 0
 	h.contentLengthBytes = h.contentLengthBytes[:0]
 
@@ -687,11 +941,18 @@ func (h *ResponseHeader) resetSkipNormalize() {
 
 	h.h = h.h[:0]
 	h.cookies = h.cookies[:0]
+	h.trailer = h.trailer[:0]
+}
+
+// SetNoDefaultContentType allows you to control if a default Content-Type header will be set (false) or not (true).
+func (h *RequestHeader) SetNoDefaultContentType(noDefaultContentType bool) {
+	h.noDefaultContentType = noDefaultContentType
 }
 
 // Reset clears request header.
 func (h *RequestHeader) Reset() {
 	h.disableNormalizing = false
+	h.SetNoDefaultContentType(false)
 	h.resetSkipNormalize()
 }
 
@@ -708,6 +969,7 @@ func (h *RequestHeader) resetSkipNormalize() {
 	h.host = h.host[:0]
 	h.contentType = h.contentType[:0]
 	h.userAgent = h.userAgent[:0]
+	h.trailer = h.trailer[:0]
 
 	h.h = h.h[:0]
 	h.cookies = h.cookies[:0]
@@ -727,12 +989,15 @@ func (h *ResponseHeader) CopyTo(dst *ResponseHeader) {
 	dst.noDefaultDate = h.noDefaultDate
 
 	dst.statusCode = h.statusCode
+	dst.statusMessage = append(dst.statusMessage, h.statusMessage...)
+	dst.protocol = append(dst.protocol, h.protocol...)
 	dst.contentLength = h.contentLength
-	dst.contentLengthBytes = append(dst.contentLengthBytes[:0], h.contentLengthBytes...)
-	dst.contentType = append(dst.contentType[:0], h.contentType...)
-	dst.server = append(dst.server[:0], h.server...)
+	dst.contentLengthBytes = append(dst.contentLengthBytes, h.contentLengthBytes...)
+	dst.contentType = append(dst.contentType, h.contentType...)
+	dst.server = append(dst.server, h.server...)
 	dst.h = copyArgs(dst.h, h.h)
 	dst.cookies = copyArgs(dst.cookies, h.cookies)
+	dst.trailer = copyArgs(dst.trailer, h.trailer)
 }
 
 // CopyTo copies all the headers to dst.
@@ -744,17 +1009,18 @@ func (h *RequestHeader) CopyTo(dst *RequestHeader) {
 	dst.connectionClose = h.connectionClose
 
 	dst.contentLength = h.contentLength
-	dst.contentLengthBytes = append(dst.contentLengthBytes[:0], h.contentLengthBytes...)
-	dst.method = append(dst.method[:0], h.method...)
-	dst.proto = append(dst.proto[:0], h.proto...)
-	dst.requestURI = append(dst.requestURI[:0], h.requestURI...)
-	dst.host = append(dst.host[:0], h.host...)
-	dst.contentType = append(dst.contentType[:0], h.contentType...)
-	dst.userAgent = append(dst.userAgent[:0], h.userAgent...)
+	dst.contentLengthBytes = append(dst.contentLengthBytes, h.contentLengthBytes...)
+	dst.method = append(dst.method, h.method...)
+	dst.proto = append(dst.proto, h.proto...)
+	dst.requestURI = append(dst.requestURI, h.requestURI...)
+	dst.host = append(dst.host, h.host...)
+	dst.contentType = append(dst.contentType, h.contentType...)
+	dst.userAgent = append(dst.userAgent, h.userAgent...)
+	dst.trailer = append(dst.trailer, h.trailer...)
 	dst.h = copyArgs(dst.h, h.h)
 	dst.cookies = copyArgs(dst.cookies, h.cookies)
 	dst.cookiesCollected = h.cookiesCollected
-	dst.rawHeaders = append(dst.rawHeaders[:0], h.rawHeaders...)
+	dst.rawHeaders = append(dst.rawHeaders, h.rawHeaders...)
 }
 
 // VisitAll calls f for each header.
@@ -778,12 +1044,29 @@ func (h *ResponseHeader) VisitAll(f func(key, value []byte)) {
 			f(strSetCookie, v)
 		})
 	}
+	if len(h.trailer) > 0 {
+		f(strTrailer, appendArgsKeyBytes(nil, h.trailer, strCommaSpace))
+	}
 	visitArgs(h.h, f)
 	if h.ConnectionClose() {
 		f(strConnection, strClose)
 	}
 }
 
+// VisitAllTrailer calls f for each response Trailer.
+//
+// f must not retain references to value after returning.
+func (h *ResponseHeader) VisitAllTrailer(f func(value []byte)) {
+	visitArgsKey(h.trailer, f)
+}
+
+// VisitAllTrailer calls f for each request Trailer.
+//
+// f must not retain references to value after returning.
+func (h *RequestHeader) VisitAllTrailer(f func(value []byte)) {
+	visitArgsKey(h.trailer, f)
+}
+
 // VisitAllCookie calls f for each response cookie.
 //
 // Cookie name is passed in key and the whole Set-Cookie header value
@@ -825,6 +1108,9 @@ func (h *RequestHeader) VisitAll(f func(key, value []byte)) {
 	if len(userAgent) > 0 {
 		f(strUserAgent, userAgent)
 	}
+	if len(h.trailer) > 0 {
+		f(strTrailer, appendArgsKeyBytes(nil, h.trailer, strCommaSpace))
+	}
 
 	h.collectCookies()
 	if len(h.cookies) > 0 {
@@ -881,6 +1167,8 @@ func (h *ResponseHeader) del(key []byte) {
 		h.contentLengthBytes = h.contentLengthBytes[:0]
 	case HeaderConnection:
 		h.connectionClose = false
+	case HeaderTrailer:
+		h.trailer = h.trailer[:0]
 	}
 	h.h = delAllArgsBytes(h.h, key)
 }
@@ -913,45 +1201,190 @@ func (h *RequestHeader) del(key []byte) {
 		h.contentLengthBytes = h.contentLengthBytes[:0]
 	case HeaderConnection:
 		h.connectionClose = false
+	case HeaderTrailer:
+		h.trailer = h.trailer[:0]
 	}
 	h.h = delAllArgsBytes(h.h, key)
 }
 
+// setSpecialHeader handles special headers and return true when a header is processed.
+func (h *ResponseHeader) setSpecialHeader(key, value []byte) bool {
+	if len(key) == 0 {
+		return false
+	}
+
+	switch key[0] | 0x20 {
+	case 'c':
+		if caseInsensitiveCompare(strContentType, key) {
+			h.SetContentTypeBytes(value)
+			return true
+		} else if caseInsensitiveCompare(strContentLength, key) {
+			if contentLength, err := parseContentLength(value); err == nil {
+				h.contentLength = contentLength
+				h.contentLengthBytes = append(h.contentLengthBytes[:0], value...)
+			}
+			return true
+		} else if caseInsensitiveCompare(strConnection, key) {
+			if bytes.Equal(strClose, value) {
+				h.SetConnectionClose()
+			} else {
+				h.ResetConnectionClose()
+				h.h = setArgBytes(h.h, key, value, argsHasValue)
+			}
+			return true
+		}
+	case 's':
+		if caseInsensitiveCompare(strServer, key) {
+			h.SetServerBytes(value)
+			return true
+		} else if caseInsensitiveCompare(strSetCookie, key) {
+			var kv *argsKV
+			h.cookies, kv = allocArg(h.cookies)
+			kv.key = getCookieKey(kv.key, value)
+			kv.value = append(kv.value[:0], value...)
+			return true
+		}
+	case 't':
+		if caseInsensitiveCompare(strTransferEncoding, key) {
+			// Transfer-Encoding is managed automatically.
+			return true
+		} else if caseInsensitiveCompare(strTrailer, key) {
+			_ = h.SetTrailerBytes(value)
+			return true
+		}
+	case 'd':
+		if caseInsensitiveCompare(strDate, key) {
+			// Date is managed automatically.
+			return true
+		}
+	}
+
+	return false
+}
+
+// setSpecialHeader handles special headers and return true when a header is processed.
+func (h *RequestHeader) setSpecialHeader(key, value []byte) bool {
+	if len(key) == 0 {
+		return false
+	}
+
+	switch key[0] | 0x20 {
+	case 'c':
+		if caseInsensitiveCompare(strContentType, key) {
+			h.SetContentTypeBytes(value)
+			return true
+		} else if caseInsensitiveCompare(strContentLength, key) {
+			if contentLength, err := parseContentLength(value); err == nil {
+				h.contentLength = contentLength
+				h.contentLengthBytes = append(h.contentLengthBytes[:0], value...)
+			}
+			return true
+		} else if caseInsensitiveCompare(strConnection, key) {
+			if bytes.Equal(strClose, value) {
+				h.SetConnectionClose()
+			} else {
+				h.ResetConnectionClose()
+				h.h = setArgBytes(h.h, key, value, argsHasValue)
+			}
+			return true
+		} else if caseInsensitiveCompare(strCookie, key) {
+			h.collectCookies()
+			h.cookies = parseRequestCookies(h.cookies, value)
+			return true
+		}
+	case 't':
+		if caseInsensitiveCompare(strTransferEncoding, key) {
+			// Transfer-Encoding is managed automatically.
+			return true
+		} else if caseInsensitiveCompare(strTrailer, key) {
+			_ = h.SetTrailerBytes(value)
+			return true
+		}
+	case 'h':
+		if caseInsensitiveCompare(strHost, key) {
+			h.SetHostBytes(value)
+			return true
+		}
+	case 'u':
+		if caseInsensitiveCompare(strUserAgent, key) {
+			h.SetUserAgentBytes(value)
+			return true
+		}
+	}
+
+	return false
+}
+
 // Add adds the given 'key: value' header.
 //
 // Multiple headers with the same key may be added with this function.
 // Use Set for setting a single header for the given key.
+//
+// the Content-Type, Content-Length, Connection, Server, Set-Cookie,
+// Transfer-Encoding and Date headers can only be set once and will
+// overwrite the previous value.
+//
+// If the header is set as a Trailer (forbidden trailers will not be set, see AddTrailer for more details),
+// it will be sent after the chunked response body.
 func (h *ResponseHeader) Add(key, value string) {
-	k := getHeaderKeyBytes(&h.bufKV, key, h.disableNormalizing)
-	h.h = appendArg(h.h, b2s(k), value, argsHasValue)
+	h.AddBytesKV(s2b(key), s2b(value))
 }
 
 // AddBytesK adds the given 'key: value' header.
 //
 // Multiple headers with the same key may be added with this function.
 // Use SetBytesK for setting a single header for the given key.
+//
+// the Content-Type, Content-Length, Connection, Server, Set-Cookie,
+// Transfer-Encoding and Date headers can only be set once and will
+// overwrite the previous value.
+//
+// If the header is set as a Trailer (forbidden trailers will not be set, see AddTrailer for more details),
+// it will be sent after the chunked response body.
 func (h *ResponseHeader) AddBytesK(key []byte, value string) {
-	h.Add(b2s(key), value)
+	h.AddBytesKV(key, s2b(value))
 }
 
 // AddBytesV adds the given 'key: value' header.
 //
 // Multiple headers with the same key may be added with this function.
 // Use SetBytesV for setting a single header for the given key.
+//
+// the Content-Type, Content-Length, Connection, Server, Set-Cookie,
+// Transfer-Encoding and Date headers can only be set once and will
+// overwrite the previous value.
+//
+// If the header is set as a Trailer (forbidden trailers will not be set, see AddTrailer for more details),
+// it will be sent after the chunked response body.
 func (h *ResponseHeader) AddBytesV(key string, value []byte) {
-	h.Add(key, b2s(value))
+	h.AddBytesKV(s2b(key), value)
 }
 
 // AddBytesKV adds the given 'key: value' header.
 //
 // Multiple headers with the same key may be added with this function.
 // Use SetBytesKV for setting a single header for the given key.
+//
+// the Content-Type, Content-Length, Connection, Server, Set-Cookie,
+// Transfer-Encoding and Date headers can only be set once and will
+// overwrite the previous value.
+//
+// If the header is set as a Trailer (forbidden trailers will not be set, see AddTrailer for more details),
+// it will be sent after the chunked response body.
 func (h *ResponseHeader) AddBytesKV(key, value []byte) {
-	h.Add(b2s(key), b2s(value))
+	if h.setSpecialHeader(key, value) {
+		return
+	}
+
+	k := getHeaderKeyBytes(&h.bufKV, b2s(key), h.disableNormalizing)
+	h.h = appendArgBytes(h.h, k, value, argsHasValue)
 }
 
 // Set sets the given 'key: value' header.
 //
+// If the header is set as a Trailer (forbidden trailers will not be set, see SetTrailer for more details),
+// it will be sent after the chunked response body.
+//
 // Use Add for setting multiple header values under the same key.
 func (h *ResponseHeader) Set(key, value string) {
 	initHeaderKV(&h.bufKV, key, value, h.disableNormalizing)
@@ -960,6 +1393,9 @@ func (h *ResponseHeader) Set(key, value string) {
 
 // SetBytesK sets the given 'key: value' header.
 //
+// If the header is set as a Trailer (forbidden trailers will not be set, see SetTrailer for more details),
+// it will be sent after the chunked response body.
+//
 // Use AddBytesK for setting multiple header values under the same key.
 func (h *ResponseHeader) SetBytesK(key []byte, value string) {
 	h.bufKV.value = append(h.bufKV.value[:0], value...)
@@ -968,6 +1404,9 @@ func (h *ResponseHeader) SetBytesK(key []byte, value string) {
 
 // SetBytesV sets the given 'key: value' header.
 //
+// If the header is set as a Trailer (forbidden trailers will not be set, see SetTrailer for more details),
+// it will be sent after the chunked response body.
+//
 // Use AddBytesV for setting multiple header values under the same key.
 func (h *ResponseHeader) SetBytesV(key string, value []byte) {
 	k := getHeaderKeyBytes(&h.bufKV, key, h.disableNormalizing)
@@ -976,6 +1415,9 @@ func (h *ResponseHeader) SetBytesV(key string, value []byte) {
 
 // SetBytesKV sets the given 'key: value' header.
 //
+// If the header is set as a Trailer (forbidden trailers will not be set, see SetTrailer for more details),
+// it will be sent after the chunked response body.
+//
 // Use AddBytesKV for setting multiple header values under the same key.
 func (h *ResponseHeader) SetBytesKV(key, value []byte) {
 	h.bufKV.key = append(h.bufKV.key[:0], key...)
@@ -985,36 +1427,15 @@ func (h *ResponseHeader) SetBytesKV(key, value []byte) {
 
 // SetCanonical sets the given 'key: value' header assuming that
 // key is in canonical form.
+//
+// If the header is set as a Trailer (forbidden trailers will not be set, see SetTrailer for more details),
+// it will be sent after the chunked response body.
 func (h *ResponseHeader) SetCanonical(key, value []byte) {
-	switch string(key) {
-	case HeaderContentType:
-		h.SetContentTypeBytes(value)
-	case HeaderServer:
-		h.SetServerBytes(value)
-	case HeaderSetCookie:
-		var kv *argsKV
-		h.cookies, kv = allocArg(h.cookies)
-		kv.key = getCookieKey(kv.key, value)
-		kv.value = append(kv.value[:0], value...)
-	case HeaderContentLength:
-		if contentLength, err := parseContentLength(value); err == nil {
-			h.contentLength = contentLength
-			h.contentLengthBytes = append(h.contentLengthBytes[:0], value...)
-		}
-	case HeaderConnection:
-		if bytes.Equal(strClose, value) {
-			h.SetConnectionClose()
-		} else {
-			h.ResetConnectionClose()
-			h.h = setArgBytes(h.h, key, value, argsHasValue)
-		}
-	case HeaderTransferEncoding:
-		// Transfer-Encoding is managed automatically.
-	case HeaderDate:
-		// Date is managed automatically.
-	default:
-		h.h = setArgBytes(h.h, key, value, argsHasValue)
+	if h.setSpecialHeader(key, value) {
+		return
 	}
+
+	h.h = setArgBytes(h.h, key, value, argsHasValue)
 }
 
 // SetCookie sets the given response cookie.
@@ -1122,37 +1543,60 @@ func (h *RequestHeader) DelAllCookies() {
 //
 // Multiple headers with the same key may be added with this function.
 // Use Set for setting a single header for the given key.
+//
+// If the header is set as a Trailer (forbidden trailers will not be set, see AddTrailer for more details),
+// it will be sent after the chunked request body.
 func (h *RequestHeader) Add(key, value string) {
-	k := getHeaderKeyBytes(&h.bufKV, key, h.disableNormalizing)
-	h.h = appendArg(h.h, b2s(k), value, argsHasValue)
+	h.AddBytesKV(s2b(key), s2b(value))
 }
 
 // AddBytesK adds the given 'key: value' header.
 //
 // Multiple headers with the same key may be added with this function.
 // Use SetBytesK for setting a single header for the given key.
+//
+// If the header is set as a Trailer (forbidden trailers will not be set, see AddTrailer for more details),
+// it will be sent after the chunked request body.
 func (h *RequestHeader) AddBytesK(key []byte, value string) {
-	h.Add(b2s(key), value)
+	h.AddBytesKV(key, s2b(value))
 }
 
 // AddBytesV adds the given 'key: value' header.
 //
 // Multiple headers with the same key may be added with this function.
 // Use SetBytesV for setting a single header for the given key.
+//
+// If the header is set as a Trailer (forbidden trailers will not be set, see AddTrailer for more details),
+// it will be sent after the chunked request body.
 func (h *RequestHeader) AddBytesV(key string, value []byte) {
-	h.Add(key, b2s(value))
+	h.AddBytesKV(s2b(key), value)
 }
 
 // AddBytesKV adds the given 'key: value' header.
 //
 // Multiple headers with the same key may be added with this function.
 // Use SetBytesKV for setting a single header for the given key.
+//
+// the Content-Type, Content-Length, Connection, Cookie,
+// Transfer-Encoding, Host and User-Agent headers can only be set once
+// and will overwrite the previous value.
+//
+// If the header is set as a Trailer (forbidden trailers will not be set, see AddTrailer for more details),
+// it will be sent after the chunked request body.
 func (h *RequestHeader) AddBytesKV(key, value []byte) {
-	h.Add(b2s(key), b2s(value))
+	if h.setSpecialHeader(key, value) {
+		return
+	}
+
+	k := getHeaderKeyBytes(&h.bufKV, b2s(key), h.disableNormalizing)
+	h.h = appendArgBytes(h.h, k, value, argsHasValue)
 }
 
 // Set sets the given 'key: value' header.
 //
+// If the header is set as a Trailer (forbidden trailers will not be set, see SetTrailer for more details),
+// it will be sent after the chunked request body.
+//
 // Use Add for setting multiple header values under the same key.
 func (h *RequestHeader) Set(key, value string) {
 	initHeaderKV(&h.bufKV, key, value, h.disableNormalizing)
@@ -1161,6 +1605,9 @@ func (h *RequestHeader) Set(key, value string) {
 
 // SetBytesK sets the given 'key: value' header.
 //
+// If the header is set as a Trailer (forbidden trailers will not be set, see SetTrailer for more details),
+// it will be sent after the chunked request body.
+//
 // Use AddBytesK for setting multiple header values under the same key.
 func (h *RequestHeader) SetBytesK(key []byte, value string) {
 	h.bufKV.value = append(h.bufKV.value[:0], value...)
@@ -1169,6 +1616,9 @@ func (h *RequestHeader) SetBytesK(key []byte, value string) {
 
 // SetBytesV sets the given 'key: value' header.
 //
+// If the header is set as a Trailer (forbidden trailers will not be set, see SetTrailer for more details),
+// it will be sent after the chunked request body.
+//
 // Use AddBytesV for setting multiple header values under the same key.
 func (h *RequestHeader) SetBytesV(key string, value []byte) {
 	k := getHeaderKeyBytes(&h.bufKV, key, h.disableNormalizing)
@@ -1177,6 +1627,9 @@ func (h *RequestHeader) SetBytesV(key string, value []byte) {
 
 // SetBytesKV sets the given 'key: value' header.
 //
+// If the header is set as a Trailer (forbidden trailers will not be set, see SetTrailer for more details),
+// it will be sent after the chunked request body.
+//
 // Use AddBytesKV for setting multiple header values under the same key.
 func (h *RequestHeader) SetBytesKV(key, value []byte) {
 	h.bufKV.key = append(h.bufKV.key[:0], key...)
@@ -1186,40 +1639,22 @@ func (h *RequestHeader) SetBytesKV(key, value []byte) {
 
 // SetCanonical sets the given 'key: value' header assuming that
 // key is in canonical form.
+//
+// If the header is set as a Trailer (forbidden trailers will not be set, see SetTrailer for more details),
+// it will be sent after the chunked request body.
 func (h *RequestHeader) SetCanonical(key, value []byte) {
-	switch string(key) {
-	case HeaderHost:
-		h.SetHostBytes(value)
-	case HeaderContentType:
-		h.SetContentTypeBytes(value)
-	case HeaderUserAgent:
-		h.SetUserAgentBytes(value)
-	case HeaderCookie:
-		h.collectCookies()
-		h.cookies = parseRequestCookies(h.cookies, value)
-	case HeaderContentLength:
-		if contentLength, err := parseContentLength(value); err == nil {
-			h.contentLength = contentLength
-			h.contentLengthBytes = append(h.contentLengthBytes[:0], value...)
-		}
-	case HeaderConnection:
-		if bytes.Equal(strClose, value) {
-			h.SetConnectionClose()
-		} else {
-			h.ResetConnectionClose()
-			h.h = setArgBytes(h.h, key, value, argsHasValue)
-		}
-	case HeaderTransferEncoding:
-		// Transfer-Encoding is managed automatically.
-	default:
-		h.h = setArgBytes(h.h, key, value, argsHasValue)
+	if h.setSpecialHeader(key, value) {
+		return
 	}
+
+	h.h = setArgBytes(h.h, key, value, argsHasValue)
 }
 
 // Peek returns header value for the given key.
 //
-// Returned value is valid until the next call to ResponseHeader.
-// Do not store references to returned value. Make copies instead.
+// The returned value is valid until the response is released,
+// either through ReleaseResponse or your request handler returning.
+// Do not store references to the returned value. Make copies instead.
 func (h *ResponseHeader) Peek(key string) []byte {
 	k := getHeaderKeyBytes(&h.bufKV, key, h.disableNormalizing)
 	return h.peek(k)
@@ -1227,7 +1662,8 @@ func (h *ResponseHeader) Peek(key string) []byte {
 
 // PeekBytes returns header value for the given key.
 //
-// Returned value is valid until the next call to ResponseHeader.
+// The returned value is valid until the response is released,
+// either through ReleaseResponse or your request handler returning.
 // Do not store references to returned value. Make copies instead.
 func (h *ResponseHeader) PeekBytes(key []byte) []byte {
 	h.bufKV.key = append(h.bufKV.key[:0], key...)
@@ -1237,7 +1673,8 @@ func (h *ResponseHeader) PeekBytes(key []byte) []byte {
 
 // Peek returns header value for the given key.
 //
-// Returned value is valid until the next call to RequestHeader.
+// The returned value is valid until the request is released,
+// either through ReleaseRequest or your request handler returning.
 // Do not store references to returned value. Make copies instead.
 func (h *RequestHeader) Peek(key string) []byte {
 	k := getHeaderKeyBytes(&h.bufKV, key, h.disableNormalizing)
@@ -1246,7 +1683,8 @@ func (h *RequestHeader) Peek(key string) []byte {
 
 // PeekBytes returns header value for the given key.
 //
-// Returned value is valid until the next call to RequestHeader.
+// The returned value is valid until the request is released,
+// either through ReleaseRequest or your request handler returning.
 // Do not store references to returned value. Make copies instead.
 func (h *RequestHeader) PeekBytes(key []byte) []byte {
 	h.bufKV.key = append(h.bufKV.key[:0], key...)
@@ -1269,6 +1707,8 @@ func (h *ResponseHeader) peek(key []byte) []byte {
 		return h.contentLengthBytes
 	case HeaderSetCookie:
 		return appendResponseCookieBytes(nil, h.cookies)
+	case HeaderTrailer:
+		return appendArgsKeyBytes(nil, h.trailer, strCommaSpace)
 	default:
 		return peekArgBytes(h.h, key)
 	}
@@ -1294,6 +1734,8 @@ func (h *RequestHeader) peek(key []byte) []byte {
 			return appendRequestCookieBytes(nil, h.cookies)
 		}
 		return peekArgBytes(h.h, key)
+	case HeaderTrailer:
+		return appendArgsKeyBytes(nil, h.trailer, strCommaSpace)
 	default:
 		return peekArgBytes(h.h, key)
 	}
@@ -1362,11 +1804,11 @@ func (h *ResponseHeader) tryRead(r *bufio.Reader, n int) error {
 				}
 			}
 			return &ErrSmallBuffer{
-				error: fmt.Errorf("error when reading response headers: %s", errSmallBuffer),
+				error: fmt.Errorf("error when reading response headers: %w", errSmallBuffer),
 			}
 		}
 
-		return fmt.Errorf("error when reading response headers: %s", err)
+		return fmt.Errorf("error when reading response headers: %w", err)
 	}
 	b = mustPeekBuffered(r)
 	headersLen, errParse := h.parse(b)
@@ -1377,6 +1819,61 @@ func (h *ResponseHeader) tryRead(r *bufio.Reader, n int) error {
 	return nil
 }
 
+// ReadTrailer reads response trailer header from r.
+//
+// io.EOF is returned if r is closed before reading the first byte.
+func (h *ResponseHeader) ReadTrailer(r *bufio.Reader) error {
+	n := 1
+	for {
+		err := h.tryReadTrailer(r, n)
+		if err == nil {
+			return nil
+		}
+		if err != errNeedMore {
+			return err
+		}
+		n = r.Buffered() + 1
+	}
+}
+
+func (h *ResponseHeader) tryReadTrailer(r *bufio.Reader, n int) error {
+	b, err := r.Peek(n)
+	if len(b) == 0 {
+		// Return ErrTimeout on any timeout.
+		if x, ok := err.(interface{ Timeout() bool }); ok && x.Timeout() {
+			return ErrTimeout
+		}
+
+		if n == 1 || err == io.EOF {
+			return io.EOF
+		}
+
+		// This is for go 1.6 bug. See https://github.com/golang/go/issues/14121 .
+		if err == bufio.ErrBufferFull {
+			if h.secureErrorLogMessage {
+				return &ErrSmallBuffer{
+					error: fmt.Errorf("error when reading response trailer"),
+				}
+			}
+			return &ErrSmallBuffer{
+				error: fmt.Errorf("error when reading response trailer: %w", errSmallBuffer),
+			}
+		}
+
+		return fmt.Errorf("error when reading response trailer: %w", err)
+	}
+	b = mustPeekBuffered(r)
+	headersLen, errParse := h.parseTrailer(b)
+	if errParse != nil {
+		if err == io.EOF {
+			return err
+		}
+		return headerError("response", err, errParse, b, h.secureErrorLogMessage)
+	}
+	mustDiscard(r, headersLen)
+	return nil
+}
+
 func headerError(typ string, err, errParse error, b []byte, secureErrorLogMessage bool) error {
 	if errParse != errNeedMore {
 		return headerErrorMsg(typ, errParse, b, secureErrorLogMessage)
@@ -1401,22 +1898,29 @@ func headerError(typ string, err, errParse error, b []byte, secureErrorLogMessag
 
 func headerErrorMsg(typ string, err error, b []byte, secureErrorLogMessage bool) error {
 	if secureErrorLogMessage {
-		return fmt.Errorf("error when reading %s headers: %s. Buffer size=%d", typ, err, len(b))
+		return fmt.Errorf("error when reading %s headers: %w. Buffer size=%d", typ, err, len(b))
 	}
-	return fmt.Errorf("error when reading %s headers: %s. Buffer size=%d, contents: %s", typ, err, len(b), bufferSnippet(b))
+	return fmt.Errorf("error when reading %s headers: %w. Buffer size=%d, contents: %s", typ, err, len(b), bufferSnippet(b))
 }
 
 // Read reads request header from r.
 //
 // io.EOF is returned if r is closed before reading the first header byte.
 func (h *RequestHeader) Read(r *bufio.Reader) error {
+	return h.readLoop(r, true)
+}
+
+// readLoop reads request header from r, optionally looping until it has enough data.
+//
+// io.EOF is returned if r is closed before reading the first header byte.
+func (h *RequestHeader) readLoop(r *bufio.Reader, waitForMore bool) error {
 	n := 1
 	for {
 		err := h.tryRead(r, n)
 		if err == nil {
 			return nil
 		}
-		if err != errNeedMore {
+		if !waitForMore || err != errNeedMore {
 			h.resetSkipNormalize()
 			return err
 		}
@@ -1424,6 +1928,61 @@ func (h *RequestHeader) Read(r *bufio.Reader) error {
 	}
 }
 
+// ReadTrailer reads request trailer header from r.
+//
+// io.EOF is returned if r is closed before reading the first byte.
+func (h *RequestHeader) ReadTrailer(r *bufio.Reader) error {
+	n := 1
+	for {
+		err := h.tryReadTrailer(r, n)
+		if err == nil {
+			return nil
+		}
+		if err != errNeedMore {
+			return err
+		}
+		n = r.Buffered() + 1
+	}
+}
+
+func (h *RequestHeader) tryReadTrailer(r *bufio.Reader, n int) error {
+	b, err := r.Peek(n)
+	if len(b) == 0 {
+		// Return ErrTimeout on any timeout.
+		if x, ok := err.(interface{ Timeout() bool }); ok && x.Timeout() {
+			return ErrTimeout
+		}
+
+		if n == 1 || err == io.EOF {
+			return io.EOF
+		}
+
+		// This is for go 1.6 bug. See https://github.com/golang/go/issues/14121 .
+		if err == bufio.ErrBufferFull {
+			if h.secureErrorLogMessage {
+				return &ErrSmallBuffer{
+					error: fmt.Errorf("error when reading request trailer"),
+				}
+			}
+			return &ErrSmallBuffer{
+				error: fmt.Errorf("error when reading request trailer: %w", errSmallBuffer),
+			}
+		}
+
+		return fmt.Errorf("error when reading request trailer: %w", err)
+	}
+	b = mustPeekBuffered(r)
+	headersLen, errParse := h.parseTrailer(b)
+	if errParse != nil {
+		if err == io.EOF {
+			return err
+		}
+		return headerError("request", err, errParse, b, h.secureErrorLogMessage)
+	}
+	mustDiscard(r, headersLen)
+	return nil
+}
+
 func (h *RequestHeader) tryRead(r *bufio.Reader, n int) error {
 	h.resetSkipNormalize()
 	b, err := r.Peek(n)
@@ -1439,7 +1998,7 @@ func (h *RequestHeader) tryRead(r *bufio.Reader, n int) error {
 		// This is for go 1.6 bug. See https://github.com/golang/go/issues/14121 .
 		if err == bufio.ErrBufferFull {
 			return &ErrSmallBuffer{
-				error: fmt.Errorf("error when reading request headers: %s", errSmallBuffer),
+				error: fmt.Errorf("error when reading request headers: %w (n=%d, r.Buffered()=%d)", errSmallBuffer, n, r.Buffered()),
 			}
 		}
 
@@ -1449,7 +2008,7 @@ func (h *RequestHeader) tryRead(r *bufio.Reader, n int) error {
 			return ErrNothingRead{err}
 		}
 
-		return fmt.Errorf("error when reading request headers: %s", err)
+		return fmt.Errorf("error when reading request headers: %w", err)
 	}
 	b = mustPeekBuffered(r)
 	headersLen, errParse := h.parse(b)
@@ -1520,25 +2079,58 @@ func (h *ResponseHeader) WriteTo(w io.Writer) (int64, error) {
 
 // Header returns response header representation.
 //
-// The returned value is valid until the next call to ResponseHeader methods.
+// Headers that are set as Trailer are not included. Use TrailerHeader for trailers.
+//
+// The returned value is valid until the response is released,
+// either through ReleaseResponse or your request handler returning.
+// Do not store references to returned value. Make copies instead.
 func (h *ResponseHeader) Header() []byte {
 	h.bufKV.value = h.AppendBytes(h.bufKV.value[:0])
 	return h.bufKV.value
 }
 
+// writeTrailer writes response trailer to w.
+func (h *ResponseHeader) writeTrailer(w *bufio.Writer) error {
+	_, err := w.Write(h.TrailerHeader())
+	return err
+}
+
+// TrailerHeader returns response trailer header representation.
+//
+// Trailers will only be received with chunked transfer.
+//
+// The returned value is valid until the response is released,
+// either through ReleaseResponse or your request handler returning.
+// Do not store references to returned value. Make copies instead.
+func (h *ResponseHeader) TrailerHeader() []byte {
+	h.bufKV.value = h.bufKV.value[:0]
+	for _, t := range h.trailer {
+		value := h.peek(t.key)
+		h.bufKV.value = appendHeaderLine(h.bufKV.value, t.key, value)
+	}
+	h.bufKV.value = append(h.bufKV.value, strCRLF...)
+	return h.bufKV.value
+}
+
 // String returns response header representation.
 func (h *ResponseHeader) String() string {
 	return string(h.Header())
 }
 
-// AppendBytes appends response header representation to dst and returns
+// appendStatusLine appends the response status line to dst and returns
 // the extended dst.
-func (h *ResponseHeader) AppendBytes(dst []byte) []byte {
+func (h *ResponseHeader) appendStatusLine(dst []byte) []byte {
 	statusCode := h.StatusCode()
 	if statusCode < 0 {
 		statusCode = StatusOK
 	}
-	dst = append(dst, statusLine(statusCode)...)
+	return formatStatusLine(dst, h.Protocol(), statusCode, h.StatusMessage())
+}
+
+// AppendBytes appends response header representation to dst and returns
+// the extended dst.
+func (h *ResponseHeader) AppendBytes(dst []byte) []byte {
+	dst = h.appendStatusLine(dst[:0])
 
 	server := h.Server()
 	if len(server) != 0 {
@@ -1566,11 +2158,24 @@ func (h *ResponseHeader) AppendBytes(dst []byte) []byte {
 
 	for i, n := 0, len(h.h); i < n; i++ {
 		kv := &h.h[i]
-		if h.noDefaultDate || !bytes.Equal(kv.key, strDate) {
+
+		// Exclude trailer from header
+		exclude := false
+		for _, t := range h.trailer {
+			if bytes.Equal(kv.key, t.key) {
+				exclude = true
+				break
+			}
+		}
+		if !exclude && (h.noDefaultDate || !bytes.Equal(kv.key, strDate)) {
 			dst = appendHeaderLine(dst, kv.key, kv.value)
 		}
 	}
 
+	if len(h.trailer) > 0 {
+		dst = appendHeaderLine(dst, strTrailer, appendArgsKeyBytes(nil, h.trailer, strCommaSpace))
+	}
+
 	n := len(h.cookies)
 	if n > 0 {
 		for i := 0; i < n; i++ {
@@ -1602,12 +2207,39 @@ func (h *RequestHeader) WriteTo(w io.Writer) (int64, error) {
 
 // Header returns request header representation.
 //
-// The returned representation is valid until the next call to RequestHeader methods.
+// Headers that are set as Trailer are not included. Use TrailerHeader for trailers.
+//
+// The returned value is valid until the request is released,
+// either through ReleaseRequest or your request handler returning.
+// Do not store references to returned value. Make copies instead.
 func (h *RequestHeader) Header() []byte {
 	h.bufKV.value = h.AppendBytes(h.bufKV.value[:0])
 	return h.bufKV.value
 }
 
+// writeTrailer writes request trailer to w.
+func (h *RequestHeader) writeTrailer(w *bufio.Writer) error {
+	_, err := w.Write(h.TrailerHeader())
+	return err
+}
+
+// TrailerHeader returns request trailer header representation.
+//
+// Trailers will only be received with chunked transfer.
+//
+// The returned value is valid until the request is released,
+// either through ReleaseRequest or your request handler returning.
+// Do not store references to returned value. Make copies instead.
+func (h *RequestHeader) TrailerHeader() []byte {
+	h.bufKV.value = h.bufKV.value[:0]
+	for _, t := range h.trailer {
+		value := h.peek(t.key)
+		h.bufKV.value = appendHeaderLine(h.bufKV.value, t.key, value)
+	}
+	h.bufKV.value = append(h.bufKV.value, strCRLF...)
+	return h.bufKV.value
+}
+
 // RawHeaders returns raw header key/value bytes.
 //
 // Depending on server configuration, header keys may be normalized to
@@ -1648,8 +2280,8 @@ func (h *RequestHeader) AppendBytes(dst []byte) []byte {
 	}
 
 	contentType := h.ContentType()
-	if len(contentType) == 0 && !h.ignoreBody() {
-		contentType = strPostArgsContentType
+	if !h.noDefaultContentType && len(contentType) == 0 && !h.ignoreBody() {
+		contentType = strDefaultContentType
 	}
 	if len(contentType) > 0 {
 		dst = appendHeaderLine(dst, strContentType, contentType)
@@ -1660,7 +2292,21 @@ func (h *RequestHeader) AppendBytes(dst []byte) []byte {
 
 	for i, n := 0, len(h.h); i < n; i++ {
 		kv := &h.h[i]
-		dst = appendHeaderLine(dst, kv.key, kv.value)
+		// Exclude trailer from header
+		exclude := false
+		for _, t := range h.trailer {
+			if bytes.Equal(kv.key, t.key) {
+				exclude = true
+				break
+			}
+		}
+		if !exclude {
+			dst = appendHeaderLine(dst, kv.key, kv.value)
+		}
+	}
+
+	if len(h.trailer) > 0 {
+		dst = appendHeaderLine(dst, strTrailer, appendArgsKeyBytes(nil, h.trailer, strCommaSpace))
 	}
 
 	// there is no need in h.collectCookies() here, since if cookies aren't collected yet,
@@ -1699,6 +2345,43 @@ func (h *ResponseHeader) parse(buf []byte) (int, error) {
 	return m + n, nil
 }
 
+// parseTrailer scans trailer lines from buf into h.h, skipping empty keys; it returns s.hLen on success.
+func (h *ResponseHeader) parseTrailer(buf []byte) (int, error) {
+	// Skip any 0 length chunk.
+	if buf[0] == '0' {
+		skip := len(strCRLF) + 1
+		if len(buf) < skip {
+			return 0, io.EOF
+		}
+		buf = buf[skip:]
+	}
+	var s headerScanner
+	s.b = buf
+	s.disableNormalizing = h.disableNormalizing
+	var err error
+	for s.next() {
+		if len(s.key) > 0 {
+			if bytes.IndexByte(s.key, ' ') != -1 || bytes.IndexByte(s.key, '\t') != -1 {
+				err = fmt.Errorf("invalid trailer key %q", s.key)
+				continue
+			}
+			// Forbidden by RFC 7230, section 4.1.2
+			if isBadTrailer(s.key) {
+				err = fmt.Errorf("forbidden trailer key %q", s.key)
+				continue
+			}
+			h.h = appendArgBytes(h.h, s.key, s.value, argsHasValue)
+		}
+	}
+	if s.err != nil {
+		return 0, s.err
+	}
+	if err != nil {
+		return 0, err
+	}
+	return s.hLen, nil
+}
+
 func (h *RequestHeader) ignoreBody() bool {
 	return h.IsGet() || h.IsHead()
 }
@@ -1721,6 +2404,87 @@ func (h *RequestHeader) parse(buf []byte) (int, error) {
 	return m + n, nil
 }
 
+func (h *RequestHeader) parseTrailer(buf []byte) (int, error) {
+	// Skip any 0 length chunk.
+	if buf[0] == '0' {
+		skip := len(strCRLF) + 1
+		if len(buf) < skip {
+			return 0, io.EOF
+		}
+		buf = buf[skip:]
+	}
+
+	var s headerScanner
+	s.b = buf
+	s.disableNormalizing = h.disableNormalizing
+	var err error
+	for s.next() {
+		if len(s.key) > 0 {
+			if bytes.IndexByte(s.key, ' ') != -1 || bytes.IndexByte(s.key, '\t') != -1 {
+				err = fmt.Errorf("invalid trailer key %q", s.key)
+				continue
+			}
+			// Forbidden by RFC 7230, section 4.1.2
+			if isBadTrailer(s.key) {
+				err = fmt.Errorf("forbidden trailer key %q", s.key)
+				continue
+			}
+			h.h = appendArgBytes(h.h, s.key, s.value, argsHasValue)
+		}
+	}
+	if s.err != nil {
+		return 0, s.err
+	}
+	if err != nil {
+		return 0, err
+	}
+	return s.hLen, nil
+}
+
+// isBadTrailer reports whether key is empty or names a header forbidden in trailers by RFC 7230, section 4.1.2.
+func isBadTrailer(key []byte) bool {
+	if len(key) == 0 {
+		return true
+	}
+	switch key[0] | 0x20 {
+	case 'a':
+		return caseInsensitiveCompare(key, strAuthorization)
+	case 'c':
+		if len(key) >= len(HeaderContentType) && caseInsensitiveCompare(key[:8], strContentType[:8]) {
+			// >= so that Content-Type itself qualifies; compare only what follows the 'Content-' prefix
+			return caseInsensitiveCompare(key[8:], strContentEncoding[8:]) ||
+				caseInsensitiveCompare(key[8:], strContentLength[8:]) ||
+				caseInsensitiveCompare(key[8:], strContentType[8:]) ||
+				caseInsensitiveCompare(key[8:], strContentRange[8:])
+		}
+		return caseInsensitiveCompare(key, strConnection)
+	case 'e':
+		return caseInsensitiveCompare(key, strExpect)
+	case 'h':
+		return caseInsensitiveCompare(key, strHost)
+	case 'k':
+		return caseInsensitiveCompare(key, strKeepAlive)
+	case 'm':
+		return caseInsensitiveCompare(key, strMaxForwards)
+	case 'p':
+		if len(key) >= len(HeaderProxyConnection) && caseInsensitiveCompare(key[:6], strProxyConnection[:6]) {
+			// >= so that Proxy-Connection itself qualifies; compare only what follows the 'Proxy-' prefix
+			return caseInsensitiveCompare(key[6:], strProxyConnection[6:]) ||
+				caseInsensitiveCompare(key[6:], strProxyAuthenticate[6:]) ||
+				caseInsensitiveCompare(key[6:], strProxyAuthorization[6:])
+		}
+	case 'r':
+		return caseInsensitiveCompare(key, strRange)
+	case 't':
+		return caseInsensitiveCompare(key, strTE) ||
+			caseInsensitiveCompare(key, strTrailer) ||
+			caseInsensitiveCompare(key, strTransferEncoding)
+	case 'w':
+		return caseInsensitiveCompare(key, strWWWAuthenticate)
+	}
+	return false
+}
+
 func (h *ResponseHeader) parseFirstLine(buf []byte) (int, error) {
 	bNext := buf
 	var b []byte
@@ -1746,9 +2510,9 @@ func (h *ResponseHeader) parseFirstLine(buf []byte) (int, error) {
 	h.statusCode, n, err = parseUintBuf(b)
 	if err != nil {
 		if h.secureErrorLogMessage {
-			return 0, fmt.Errorf("cannot parse response status code: %s", err)
+			return 0, fmt.Errorf("cannot parse response status code: %w", err)
 		}
-		return 0, fmt.Errorf("cannot parse response status code: %s. Response %q", err, buf)
+		return 0, fmt.Errorf("cannot parse response status code: %w. Response %q", err, buf)
 	}
 	if len(b) > n && b[n] != ' ' {
 		if h.secureErrorLogMessage {
@@ -1756,6 +2520,9 @@ func (h *ResponseHeader) parseFirstLine(buf []byte) (int, error) {
 		}
 		return 0, fmt.Errorf("unexpected char at the end of status code. Response %q", buf)
 	}
+	if len(b) > n+1 {
+		h.SetStatusMessage(b[n+1:])
+	}
 
 	return len(buf) - len(bNext), nil
 }
@@ -1804,37 +2571,6 @@ func (h *RequestHeader) parseFirstLine(buf []byte) (int, error) {
 	return len(buf) - len(bNext), nil
 }
 
-func peekRawHeader(buf, key []byte) []byte {
-	n := bytes.Index(buf, key)
-	if n < 0 {
-		return nil
-	}
-	if n > 0 && buf[n-1] != nChar {
-		return nil
-	}
-	n += len(key)
-	if n >= len(buf) {
-		return nil
-	}
-	if buf[n] != ':' {
-		return nil
-	}
-	n++
-	if buf[n] != ' ' {
-		return nil
-	}
-	n++
-	buf = buf[n:]
-	n = bytes.IndexByte(buf, nChar)
-	if n < 0 {
-		return nil
-	}
-	if n > 0 && buf[n-1] == rChar {
-		n--
-	}
-	return buf[:n]
-}
-
 func readRawHeaders(dst, buf []byte) ([]byte, int, error) {
 	n := bytes.IndexByte(buf, nChar)
 	if n < 0 {
@@ -1918,6 +2654,10 @@ func (h *ResponseHeader) parseHeaders(buf []byte) (int, error) {
 					}
 					continue
 				}
+				if caseInsensitiveCompare(s.key, strTrailer) {
+					err = h.SetTrailerBytes(s.value)
+					continue
+				}
 			}
 			h.h = appendArgBytes(h.h, s.key, s.value, argsHasValue)
 		}
@@ -1940,7 +2680,7 @@ func (h *ResponseHeader) parseHeaders(buf []byte) (int, error) {
 		h.connectionClose = !hasHeaderValue(v, strKeepAlive)
 	}
 
-	return len(buf) - len(s.b), nil
+	return len(buf) - len(s.b), err
 }
 
 func (h *RequestHeader) parseHeaders(buf []byte) (int, error) {
@@ -2006,6 +2746,14 @@ func (h *RequestHeader) parseHeaders(buf []byte) (int, error) {
 					}
 					continue
 				}
+				if caseInsensitiveCompare(s.key, strTrailer) {
+					if nerr := h.SetTrailerBytes(s.value); nerr != nil {
+						if err == nil {
+							err = nerr
+						}
+					}
+					continue
+				}
 			}
 		}
 		h.h = appendArgBytes(h.h, s.key, s.value, argsHasValue)
@@ -2049,13 +2797,15 @@ func (h *RequestHeader) collectCookies() {
 	h.cookiesCollected = true
 }
 
+var errNonNumericChars = errors.New("non-numeric chars found")
+
 func parseContentLength(b []byte) (int, error) {
 	v, n, err := parseUintBuf(b)
 	if err != nil {
-		return -1, err
+		return -1, fmt.Errorf("cannot parse Content-Length: %w", err)
 	}
 	if n != len(b) {
-		return -1, fmt.Errorf("non-numeric chars at the end of Content-Length")
+		return -1, fmt.Errorf("cannot parse Content-Length: %w", errNonNumericChars)
 	}
 	return v, nil
 }
@@ -2389,6 +3139,17 @@ func AppendNormalizedHeaderKeyBytes(dst, key []byte) []byte {
 	return AppendNormalizedHeaderKey(dst, b2s(key))
 }
 
+func appendArgsKeyBytes(dst []byte, args []argsKV, sep []byte) []byte {
+	for i, n := 0, len(args); i < n; i++ {
+		kv := &args[i]
+		dst = append(dst, kv.key...)
+		if i+1 < n {
+			dst = append(dst, sep...)
+		}
+	}
+	return dst
+}
+
 var (
 	errNeedMore    = errors.New("need more data: cannot find trailing lf")
 	errInvalidName = errors.New("invalid header name")
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/headers.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/headers.go
index 378dfec8..676a0da1 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/headers.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/headers.go
@@ -36,8 +36,9 @@ const (
 	HeaderVary              = "Vary"
 
 	// Connection management
-	HeaderConnection = "Connection"
-	HeaderKeepAlive  = "Keep-Alive"
+	HeaderConnection      = "Connection"
+	HeaderKeepAlive       = "Keep-Alive"
+	HeaderProxyConnection = "Proxy-Connection"
 
 	// Content negotiation
 	HeaderAccept         = "Accept"
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/http.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/http.go
index a8efbe85..47431cdc 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/http.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/http.go
@@ -17,6 +17,18 @@ import (
 	"github.com/valyala/bytebufferpool"
 )
 
+var (
+	requestBodyPoolSizeLimit  = -1
+	responseBodyPoolSizeLimit = -1
+)
+
+// SetBodySizePoolLimit sets the max body size for bodies to be returned to the pool.
+// If the body size is larger it will be released instead of put back into the pool for reuse.
+func SetBodySizePoolLimit(reqBodyLimit, respBodyLimit int) {
+	requestBodyPoolSizeLimit = reqBodyLimit
+	responseBodyPoolSizeLimit = respBodyLimit
+}
+
 // Request represents HTTP request.
 //
 // It is forbidden copying Request instances. Create new instances
@@ -41,7 +53,7 @@ type Request struct {
 
 	multipartForm         *multipart.Form
 	multipartFormBoundary string
-	secureErrorLogMessage        bool
+	secureErrorLogMessage bool
 
 	// Group bool members in order to reduce Request object size.
 	parsedURI      bool
@@ -53,9 +65,12 @@ type Request struct {
 	// Client/HostClient shouldn't use this field but should depend on the uri.scheme instead.
 	isTLS bool
 
-	// Request timeout. Usually set by DoDealine or DoTimeout
+	// Request timeout. Usually set by DoDeadline or DoTimeout
 	// if <= 0, means not set
 	timeout time.Duration
+
+	// Use Host header (request.Header.SetHost) instead of the host from SetRequestURI, SetHost, or URI().SetHost
+	UseHostHeader bool
 }
 
 // Response represents HTTP response.
@@ -88,7 +103,7 @@ type Response struct {
 	// Use it for writing HEAD responses.
 	SkipBody bool
 
-	keepBodyBuffer bool
+	keepBodyBuffer        bool
 	secureErrorLogMessage bool
 
 	// Remote TCPAddr from concurrently net.Conn
@@ -320,7 +335,9 @@ func (resp *Response) LocalAddr() net.Addr {
 
 // Body returns response body.
 //
-// The returned body is valid until the response modification.
+// The returned value is valid until the response is released,
+// either through ReleaseResponse or your request handler returning.
+// Do not store references to returned value. Make copies instead.
 func (resp *Response) Body() []byte {
 	if resp.bodyStream != nil {
 		bodyBuf := resp.bodyBuffer()
@@ -638,7 +655,9 @@ func (req *Request) SwapBody(body []byte) []byte {
 
 // Body returns request body.
 //
-// The returned body is valid until the request modification.
+// The returned value is valid until the request is released,
+// either through ReleaseRequest or your request handler returning.
+// Do not store references to returned value. Make copies instead.
 func (req *Request) Body() []byte {
 	if req.bodyRaw != nil {
 		return req.bodyRaw
@@ -725,6 +744,8 @@ func (req *Request) copyToSkipBody(dst *Request) {
 	dst.parsedPostArgs = req.parsedPostArgs
 	dst.isTLS = req.isTLS
 
+	dst.UseHostHeader = req.UseHostHeader
+
 	// do not copy multipartForm - it will be automatically
 	// re-created on the first call to MultipartForm.
 }
@@ -770,6 +791,20 @@ func (req *Request) URI() *URI {
 	return &req.uri
 }
 
+// SetURI initializes request URI.
+// Use this method if a single URI may be reused across multiple requests.
+// Otherwise, you can just use SetRequestURI() and it will be parsed as new URI.
+// The URI is copied and can be safely modified later.
+func (req *Request) SetURI(newUri *URI) {
+	if newUri != nil {
+		newUri.CopyTo(&req.uri)
+		req.parsedURI = true
+		return
+	}
+	req.uri.Reset()
+	req.parsedURI = false
+}
+
 func (req *Request) parseURI() error {
 	if req.parsedURI {
 		return nil
@@ -826,7 +861,7 @@ func (req *Request) MultipartForm() (*multipart.Form, error) {
 		if bytes.Equal(ce, strGzip) {
 			// Do not care about memory usage here.
 			if bodyStream, err = gzip.NewReader(bodyStream); err != nil {
-				return nil, fmt.Errorf("cannot gunzip request body: %s", err)
+				return nil, fmt.Errorf("cannot gunzip request body: %w", err)
 			}
 		} else if len(ce) > 0 {
 			return nil, fmt.Errorf("unsupported Content-Encoding: %q", ce)
@@ -835,14 +870,14 @@ func (req *Request) MultipartForm() (*multipart.Form, error) {
 		mr := multipart.NewReader(bodyStream, req.multipartFormBoundary)
 		req.multipartForm, err = mr.ReadForm(8 * 1024)
 		if err != nil {
-			return nil, fmt.Errorf("cannot read multipart/form-data body: %s", err)
+			return nil, fmt.Errorf("cannot read multipart/form-data body: %w", err)
 		}
 	} else {
 		body := req.bodyBytes()
 		if bytes.Equal(ce, strGzip) {
 			// Do not care about memory usage here.
 			if body, err = AppendGunzipBytes(nil, body); err != nil {
-				return nil, fmt.Errorf("cannot gunzip request body: %s", err)
+				return nil, fmt.Errorf("cannot gunzip request body: %w", err)
 			}
 		} else if len(ce) > 0 {
 			return nil, fmt.Errorf("unsupported Content-Encoding: %q", ce)
@@ -876,14 +911,14 @@ func WriteMultipartForm(w io.Writer, f *multipart.Form, boundary string) error {
 
 	mw := multipart.NewWriter(w)
 	if err := mw.SetBoundary(boundary); err != nil {
-		return fmt.Errorf("cannot use form boundary %q: %s", boundary, err)
+		return fmt.Errorf("cannot use form boundary %q: %w", boundary, err)
 	}
 
 	// marshal values
 	for k, vv := range f.Value {
 		for _, v := range vv {
 			if err := mw.WriteField(k, v); err != nil {
-				return fmt.Errorf("cannot write form field %q value %q: %s", k, v, err)
+				return fmt.Errorf("cannot write form field %q value %q: %w", k, v, err)
 			}
 		}
 	}
@@ -893,23 +928,23 @@ func WriteMultipartForm(w io.Writer, f *multipart.Form, boundary string) error {
 		for _, fv := range fvv {
 			vw, err := mw.CreatePart(fv.Header)
 			if err != nil {
-				return fmt.Errorf("cannot create form file %q (%q): %s", k, fv.Filename, err)
+				return fmt.Errorf("cannot create form file %q (%q): %w", k, fv.Filename, err)
 			}
 			fh, err := fv.Open()
 			if err != nil {
 				return fmt.Errorf("cannot open form file %q (%q): %s", k, fv.Filename, err)
 			}
 			if _, err = copyZeroAlloc(vw, fh); err != nil {
-				return fmt.Errorf("error when copying form file %q (%q): %s", k, fv.Filename, err)
+				return fmt.Errorf("error when copying form file %q (%q): %w", k, fv.Filename, err)
 			}
 			if err = fh.Close(); err != nil {
-				return fmt.Errorf("cannot close form file %q (%q): %s", k, fv.Filename, err)
+				return fmt.Errorf("cannot close form file %q (%q): %w", k, fv.Filename, err)
 			}
 		}
 	}
 
 	if err := mw.Close(); err != nil {
-		return fmt.Errorf("error when closing multipart form writer: %s", err)
+		return fmt.Errorf("error when closing multipart form writer: %w", err)
 	}
 
 	return nil
@@ -927,16 +962,20 @@ func readMultipartForm(r io.Reader, boundary string, size, maxInMemoryFileSize i
 	mr := multipart.NewReader(lr, boundary)
 	f, err := mr.ReadForm(int64(maxInMemoryFileSize))
 	if err != nil {
-		return nil, fmt.Errorf("cannot read multipart/form-data body: %s", err)
+		return nil, fmt.Errorf("cannot read multipart/form-data body: %w", err)
 	}
 	return f, nil
 }
 
 // Reset clears request contents.
 func (req *Request) Reset() {
+	if requestBodyPoolSizeLimit >= 0 && req.body != nil {
+		req.ReleaseBody(requestBodyPoolSizeLimit)
+	}
 	req.Header.Reset()
 	req.resetSkipHeader()
 	req.timeout = 0
+	req.UseHostHeader = false
 }
 
 func (req *Request) resetSkipHeader() {
@@ -962,6 +1001,9 @@ func (req *Request) RemoveMultipartFormFiles() {
 
 // Reset clears response contents.
 func (resp *Response) Reset() {
+	if responseBodyPoolSizeLimit >= 0 && resp.body != nil {
+		resp.ReleaseBody(responseBodyPoolSizeLimit)
+	}
 	resp.Header.Reset()
 	resp.resetSkipHeader()
 	resp.SkipBody = false
@@ -1111,22 +1153,53 @@ func (req *Request) ContinueReadBody(r *bufio.Reader, maxBodySize int, preParseM
 		// the end of body is determined by connection close.
 		// So just ignore request body for requests without
 		// 'Content-Length' and 'Transfer-Encoding' headers.
-		req.Header.SetContentLength(0)
+		// refer to https://tools.ietf.org/html/rfc7230#section-3.3.2
+		if !req.Header.ignoreBody() {
+			req.Header.SetContentLength(0)
+		}
 		return nil
 	}
 
+	if err = req.ReadBody(r, contentLength, maxBodySize); err != nil {
+		return err
+	}
+
+	if req.Header.ContentLength() == -1 {
+		err = req.Header.ReadTrailer(r)
+		if err != nil && err != io.EOF {
+			return err
+		}
+	}
+	return nil
+}
+
+// ReadBody reads request body from the given r, limiting the body size.
+//
+// If maxBodySize > 0 and the body size exceeds maxBodySize,
+// then ErrBodyTooLarge is returned.
+func (req *Request) ReadBody(r *bufio.Reader, contentLength int, maxBodySize int) (err error) {
 	bodyBuf := req.bodyBuffer()
 	bodyBuf.Reset()
-	bodyBuf.B, err = readBody(r, contentLength, maxBodySize, bodyBuf.B)
+
+	if contentLength >= 0 {
+		bodyBuf.B, err = readBody(r, contentLength, maxBodySize, bodyBuf.B)
+
+	} else if contentLength == -1 {
+		bodyBuf.B, err = readBodyChunked(r, maxBodySize, bodyBuf.B)
+
+	} else {
+		bodyBuf.B, err = readBodyIdentity(r, maxBodySize, bodyBuf.B)
+		req.Header.SetContentLength(len(bodyBuf.B))
+	}
+
 	if err != nil {
 		req.Reset()
 		return err
 	}
-	req.Header.SetContentLength(len(bodyBuf.B))
 	return nil
 }
 
-// ContinueReadBody reads request body if request header contains
+// ContinueReadBodyStream reads request body if request header contains
 // 'Expect: 100-continue'.
 //
 // The caller must send StatusContinue response before calling this method.
@@ -1168,12 +1241,12 @@ func (req *Request) ContinueReadBodyStream(r *bufio.Reader, maxBodySize int, pre
 		if err == ErrBodyTooLarge {
 			req.Header.SetContentLength(contentLength)
 			req.body = bodyBuf
-			req.bodyStream = acquireRequestStream(bodyBuf, r, contentLength)
+			req.bodyStream = acquireRequestStream(bodyBuf, r, &req.Header)
 			return nil
 		}
 		if err == errChunkedStream {
 			req.body = bodyBuf
-			req.bodyStream = acquireRequestStream(bodyBuf, r, -1)
+			req.bodyStream = acquireRequestStream(bodyBuf, r, &req.Header)
 			return nil
 		}
 		req.Reset()
@@ -1181,8 +1254,8 @@ func (req *Request) ContinueReadBodyStream(r *bufio.Reader, maxBodySize int, pre
 	}
 
 	req.body = bodyBuf
-	req.bodyStream = acquireRequestStream(bodyBuf, r, contentLength)
-	req.Header.SetContentLength(len(bodyBuf.B))
+	req.bodyStream = acquireRequestStream(bodyBuf, r, &req.Header)
+	req.Header.SetContentLength(contentLength)
 	return nil
 }
 
@@ -1193,7 +1266,10 @@ func (resp *Response) Read(r *bufio.Reader) error {
 	return resp.ReadLimitBody(r, 0)
 }
 
-// ReadLimitBody reads response from the given r, limiting the body size.
+// ReadLimitBody reads response headers from the given r,
+// then reads the body using the ReadBody function and limiting the body size.
+//
+// If resp.SkipBody is true then it skips reading the response body.
 //
 // If maxBodySize > 0 and the body size exceeds maxBodySize,
 // then ErrBodyTooLarge is returned.
@@ -1213,17 +1289,43 @@ func (resp *Response) ReadLimitBody(r *bufio.Reader, maxBodySize int) error {
 	}
 
 	if !resp.mustSkipBody() {
-		bodyBuf := resp.bodyBuffer()
-		bodyBuf.Reset()
-		bodyBuf.B, err = readBody(r, resp.Header.ContentLength(), maxBodySize, bodyBuf.B)
+		err = resp.ReadBody(r, maxBodySize)
 		if err != nil {
 			return err
 		}
-		resp.Header.SetContentLength(len(bodyBuf.B))
+	}
+
+	if resp.Header.ContentLength() == -1 {
+		err = resp.Header.ReadTrailer(r)
+		if err != nil && err != io.EOF {
+			return err
+		}
 	}
 	return nil
 }
 
+// ReadBody reads response body from the given r, limiting the body size.
+//
+// If maxBodySize > 0 and the body size exceeds maxBodySize,
+// then ErrBodyTooLarge is returned.
+func (resp *Response) ReadBody(r *bufio.Reader, maxBodySize int) (err error) {
+	bodyBuf := resp.bodyBuffer()
+	bodyBuf.Reset()
+
+	contentLength := resp.Header.ContentLength()
+	if contentLength >= 0 {
+		bodyBuf.B, err = readBody(r, contentLength, maxBodySize, bodyBuf.B)
+
+	} else if contentLength == -1 {
+		bodyBuf.B, err = readBodyChunked(r, maxBodySize, bodyBuf.B)
+
+	} else {
+		bodyBuf.B, err = readBodyIdentity(r, maxBodySize, bodyBuf.B)
+		resp.Header.SetContentLength(len(bodyBuf.B))
+	}
+	return err
+}
+
 func (resp *Response) mustSkipBody() bool {
 	return resp.SkipBody || resp.Header.mustSkipContentLength()
 }
@@ -1316,10 +1418,15 @@ func (req *Request) Write(w *bufio.Writer) error {
 	if len(req.Header.Host()) == 0 || req.parsedURI {
 		uri := req.URI()
 		host := uri.Host()
-		if len(host) == 0 {
-			return errRequestHostRequired
+		if len(req.Header.Host()) == 0 {
+			if len(host) == 0 {
+				return errRequestHostRequired
+			} else {
+				req.Header.SetHostBytes(host)
+			}
+		} else if !req.UseHostHeader {
+			req.Header.SetHostBytes(host)
 		}
-		req.Header.SetHostBytes(host)
 		req.Header.SetRequestURIBytes(uri.RequestURI())
 
 		if len(uri.username) > 0 {
@@ -1351,7 +1458,7 @@ func (req *Request) Write(w *bufio.Writer) error {
 	if req.onlyMultipartForm() {
 		body, err = marshalMultipartForm(req.multipartForm, req.multipartFormBoundary)
 		if err != nil {
-			return fmt.Errorf("error when marshaling multipart form: %s", err)
+			return fmt.Errorf("error when marshaling multipart form: %w", err)
 		}
 		req.Header.SetMultipartFormBoundary(req.multipartFormBoundary)
 	}
@@ -1682,9 +1789,13 @@ func (req *Request) writeBodyStream(w *bufio.Writer) error {
 		}
 	} else {
 		req.Header.SetContentLength(-1)
-		if err = req.Header.Write(w); err == nil {
+		err = req.Header.Write(w)
+		if err == nil {
 			err = writeBodyChunked(w, req.bodyStream)
 		}
+		if err == nil {
+			err = req.Header.writeTrailer(w)
+		}
 	}
 	err1 := req.closeBodyStream()
 	if err == nil {
@@ -1738,6 +1849,9 @@ func (resp *Response) writeBodyStream(w *bufio.Writer, sendBody bool) (err error
 			if err == nil && sendBody {
 				err = writeBodyChunked(w, resp.bodyStream)
 			}
+			if err == nil {
+				err = resp.Header.writeTrailer(w)
+			}
 		}
 	}
 	err1 := resp.closeBodyStream()
@@ -1853,19 +1967,8 @@ func writeBodyFixedSize(w *bufio.Writer, r io.Reader, size int64) error {
 		}
 	}
 
-	// Unwrap a single limited reader for triggering sendfile path
-	// in net.TCPConn.ReadFrom.
-	lr, ok := r.(*io.LimitedReader)
-	if ok {
-		r = lr.R
-	}
-
 	n, err := copyZeroAlloc(w, r)
 
-	if ok {
-		lr.N -= n
-	}
-
 	if n != size && err == nil {
 		err = fmt.Errorf("copied %d bytes from body stream instead of %d bytes", n, size)
 	}
@@ -1897,12 +2000,13 @@ func writeChunk(w *bufio.Writer, b []byte) error {
 	if _, err := w.Write(b); err != nil {
 		return err
 	}
-	_, err := w.Write(strCRLF)
-	err1 := w.Flush()
-	if err == nil {
-		err = err1
+	// If this is the end chunk, the CRLF is written after the trailer.
+	if n > 0 {
+		if _, err := w.Write(strCRLF); err != nil {
+			return err
+		}
 	}
-	return err
+	return w.Flush()
 }
 
 // ErrBodyTooLarge is returned if either request or response body exceeds
@@ -1910,17 +2014,10 @@ func writeChunk(w *bufio.Writer, b []byte) error {
 var ErrBodyTooLarge = errors.New("body size exceeds the given limit")
 
 func readBody(r *bufio.Reader, contentLength int, maxBodySize int, dst []byte) ([]byte, error) {
-	dst = dst[:0]
-	if contentLength >= 0 {
-		if maxBodySize > 0 && contentLength > maxBodySize {
-			return dst, ErrBodyTooLarge
-		}
-		return appendBodyFixedSize(r, dst, contentLength)
+	if maxBodySize > 0 && contentLength > maxBodySize {
+		return dst, ErrBodyTooLarge
 	}
-	if contentLength == -1 {
-		return readBodyChunked(r, maxBodySize, dst)
-	}
-	return readBodyIdentity(r, maxBodySize, dst)
+	return appendBodyFixedSize(r, dst, contentLength)
 }
 
 var errChunkedStream = errors.New("chunked stream")
@@ -2037,6 +2134,9 @@ func readBodyChunked(r *bufio.Reader, maxBodySize int, dst []byte) ([]byte, erro
 		if err != nil {
 			return dst, err
 		}
+		if chunkSize == 0 {
+			return dst, err
+		}
 		if maxBodySize > 0 && len(dst)+chunkSize > maxBodySize {
 			return dst, ErrBodyTooLarge
 		}
@@ -2050,9 +2150,6 @@ func readBodyChunked(r *bufio.Reader, maxBodySize int, dst []byte) ([]byte, erro
 			}
 		}
 		dst = dst[:len(dst)-strCRLFLen]
-		if chunkSize == 0 {
-			return dst, nil
-		}
 	}
 }
 
@@ -2068,29 +2165,40 @@ func parseChunkSize(r *bufio.Reader) (int, error) {
 				error: fmt.Errorf("cannot read '\r' char at the end of chunk size: %s", err),
 			}
 		}
-		// Skip any trailing whitespace after chunk size.
-		if c == ' ' {
+		// Skip chunk extension after chunk size.
+		// Add support later if anyone needs it.
+		if c != '\r' {
 			continue
 		}
-		if c != '\r' {
+		if err := r.UnreadByte(); err != nil {
 			return -1, ErrBrokenChunk{
-				error: fmt.Errorf("unexpected char %q at the end of chunk size. Expected %q", c, '\r'),
+				error: fmt.Errorf("cannot unread '\r' char at the end of chunk size: %w", err),
 			}
 		}
 		break
 	}
-	c, err := r.ReadByte()
+	err = readCrLf(r)
 	if err != nil {
-		return -1, ErrBrokenChunk{
-			error: fmt.Errorf("cannot read '\n' char at the end of chunk size: %s", err),
-		}
+		return -1, err
 	}
-	if c != '\n' {
-		return -1, ErrBrokenChunk{
-			error: fmt.Errorf("unexpected char %q at the end of chunk size. Expected %q", c, '\n'),
+	return n, nil
+}
+
+func readCrLf(r *bufio.Reader) error {
+	for _, exp := range []byte{'\r', '\n'} {
+		c, err := r.ReadByte()
+		if err != nil {
+			return ErrBrokenChunk{
+				error: fmt.Errorf("cannot read %q char at the end of chunk size: %w", exp, err),
+			}
+		}
+		if c != exp {
+			return ErrBrokenChunk{
+				error: fmt.Errorf("unexpected char %q at the end of chunk size. Expected %q", c, exp),
+			}
 		}
 	}
-	return n, nil
+	return nil
 }
 
 func round2(n int) int {
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/server.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/server.go
index 67f47d86..3b585422 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/server.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/server.go
@@ -196,14 +196,6 @@ type Server struct {
 	// It works with ListenAndServe as well.
 	Concurrency int
 
-	// Whether to disable keep-alive connections.
-	//
-	// The server will close all the incoming connections after sending
-	// the first response to client if this option is set to true.
-	//
-	// By default keep-alive connections are enabled.
-	DisableKeepalive bool
-
 	// Per-connection buffer size for requests' reading.
 	// This also limits the maximum header size.
 	//
@@ -256,12 +248,9 @@ type Server struct {
 	// Deprecated: Use IdleTimeout instead.
 	MaxKeepaliveDuration time.Duration
 
-	// Whether to enable tcp keep-alive connections.
-	//
-	// Whether the operating system should send tcp keep-alive messages on the tcp connection.
-	//
-	// By default tcp keep-alive connections are disabled.
-	TCPKeepalive bool
+	// MaxIdleWorkerDuration is the maximum idle time of a single worker in the underlying
+	// worker pool of the Server. Idle workers beyond this time will be cleared.
+	MaxIdleWorkerDuration time.Duration
 
 	// Period between tcp keep-alive messages.
 	//
@@ -275,6 +264,21 @@ type Server struct {
 	// Request body size is limited by DefaultMaxRequestBodySize by default.
 	MaxRequestBodySize int
 
+	// Whether to disable keep-alive connections.
+	//
+	// The server will close all the incoming connections after sending
+	// the first response to client if this option is set to true.
+	//
+	// By default keep-alive connections are enabled.
+	DisableKeepalive bool
+
+	// Whether to enable tcp keep-alive connections.
+	//
+	// Whether the operating system should send tcp keep-alive messages on the tcp connection.
+	//
+	// By default tcp keep-alive connections are disabled.
+	TCPKeepalive bool
+
 	// Aggressively reduces memory usage at the cost of higher CPU usage
 	// if set to true.
 	//
@@ -340,7 +344,7 @@ type Server struct {
 
 	// SleepWhenConcurrencyLimitsExceeded is a duration to be slept of if
 	// the concurrency limit in exceeded (default [when is 0]: don't sleep
-	// and accept new connections immidiatelly).
+	// and accept new connections immediately).
 	SleepWhenConcurrencyLimitsExceeded time.Duration
 
 	// NoDefaultServerHeader, when set to true, causes the default Server header
@@ -366,16 +370,6 @@ type Server struct {
 	// set to true, the Content-Type will not be present.
 	NoDefaultContentType bool
 
-	// ConnState specifies an optional callback function that is
-	// called when a client connection changes state. See the
-	// ConnState type and associated constants for details.
-	ConnState func(net.Conn, ConnState)
-
-	// Logger, which is used by RequestCtx.Logger().
-	//
-	// By default standard logger from log package is used.
-	Logger Logger
-
 	// KeepHijackedConns is an opt-in disable of connection
 	// close by fasthttp after connections' HijackHandler returns.
 	// This allows to save goroutines, e.g. when fasthttp used to upgrade
@@ -391,7 +385,27 @@ type Server struct {
 	// larger then the current limit.
 	StreamRequestBody bool
 
-	tlsConfig  *tls.Config
+	// ConnState specifies an optional callback function that is
+	// called when a client connection changes state. See the
+	// ConnState type and associated constants for details.
+	ConnState func(net.Conn, ConnState)
+
+	// Logger, which is used by RequestCtx.Logger().
+	//
+	// By default standard logger from log package is used.
+	Logger Logger
+
+	// TLSConfig optionally provides a TLS configuration for use
+	// by ServeTLS, ServeTLSEmbed, ListenAndServeTLS, ListenAndServeTLSEmbed,
+	// AppendCert, AppendCertEmbed and NextProto.
+	//
+	// Note that this value is cloned by ServeTLS, ServeTLSEmbed, ListenAndServeTLS
+	// and ListenAndServeTLSEmbed, so it's not possible to modify the configuration
+	// with methods like tls.Config.SetSessionTicketKeys.
+	// To use SetSessionTicketKeys, use Server.Serve with a TLS Listener
+	// instead.
+	TLSConfig *tls.Config
+
 	nextProtos map[string]ServeHandler
 
 	concurrency      uint32
@@ -404,9 +418,12 @@ type Server struct {
 	writerPool     sync.Pool
 	hijackConnPool sync.Pool
 
-	// We need to know our listeners so we can close them in Shutdown().
+	// We need to know our listeners and idle connections so we can close them in Shutdown().
 	ln []net.Listener
 
+	idleConns   map[net.Conn]struct{}
+	idleConnsMu sync.Mutex
+
 	mu   sync.Mutex
 	open int32
 	stop int32
@@ -572,6 +589,7 @@ type RequestCtx struct {
 	connID         uint64
 	connRequestNum uint64
 	connTime       time.Time
+	remoteAddr     net.Addr
 
 	time time.Time
 
@@ -692,6 +710,21 @@ func (ctx *RequestCtx) VisitUserValues(visitor func([]byte, interface{})) {
 	}
 }
 
+// ResetUserValues resets the user values stored in the request context.
+func (ctx *RequestCtx) ResetUserValues() {
+	ctx.userValues.Reset()
+}
+
+// RemoveUserValue removes the given key and the value under it in ctx.
+func (ctx *RequestCtx) RemoveUserValue(key string) {
+	ctx.userValues.Remove(key)
+}
+
+// RemoveUserValueBytes removes the given key and the value under it in ctx.
+func (ctx *RequestCtx) RemoveUserValueBytes(key []byte) {
+	ctx.userValues.RemoveBytes(key)
+}
+
 type connTLSer interface {
 	Handshake() error
 	ConnectionState() tls.ConnectionState
@@ -709,6 +742,13 @@ func (ctx *RequestCtx) IsTLS() bool {
 	//
 	//     // other custom fields here
 	// }
+
+	// perIPConn wraps the net.Conn in the Conn field
+	if pic, ok := ctx.c.(*perIPConn); ok {
+		_, ok := pic.Conn.(connTLSer)
+		return ok
+	}
+
 	_, ok := ctx.c.(connTLSer)
 	return ok
 }
@@ -737,12 +777,49 @@ func (ctx *RequestCtx) Conn() net.Conn {
 	return ctx.c
 }
 
+func (ctx *RequestCtx) reset() {
+	ctx.userValues.Reset()
+	ctx.Request.Reset()
+	ctx.Response.Reset()
+	ctx.fbr.reset()
+
+	ctx.connID = 0
+	ctx.connRequestNum = 0
+	ctx.connTime = zeroTime
+	ctx.remoteAddr = nil
+	ctx.time = zeroTime
+	ctx.c = nil
+
+	// Don't reset ctx.s!
+	// We have a pool per server so the next time this ctx is used it
+	// will be assigned the same value again.
+	// ctx might still be in use for context.Done() and context.Err()
+	// which are safe to use as they only use ctx.s and no other value.
+
+	if ctx.timeoutResponse != nil {
+		ctx.timeoutResponse.Reset()
+	}
+
+	if ctx.timeoutTimer != nil {
+		stopTimer(ctx.timeoutTimer)
+	}
+
+	ctx.hijackHandler = nil
+	ctx.hijackNoResponse = false
+}
+
 type firstByteReader struct {
 	c        net.Conn
 	ch       byte
 	byteRead bool
 }
 
+func (r *firstByteReader) reset() {
+	r.c = nil
+	r.ch = 0
+	r.byteRead = false
+}
+
 func (r *firstByteReader) Read(b []byte) (int, error) {
 	if len(b) == 0 {
 		return 0, nil
@@ -846,40 +923,42 @@ func (ctx *RequestCtx) SetContentTypeBytes(contentType []byte) {
 
 // RequestURI returns RequestURI.
 //
-// This uri is valid until returning from RequestHandler.
+// The returned bytes are valid until your request handler returns.
 func (ctx *RequestCtx) RequestURI() []byte {
 	return ctx.Request.Header.RequestURI()
 }
 
 // URI returns requested uri.
 //
-// The uri is valid until returning from RequestHandler.
+// This uri is valid until your request handler returns.
 func (ctx *RequestCtx) URI() *URI {
 	return ctx.Request.URI()
 }
 
 // Referer returns request referer.
 //
-// The referer is valid until returning from RequestHandler.
+// The returned bytes are valid until your request handler returns.
 func (ctx *RequestCtx) Referer() []byte {
 	return ctx.Request.Header.Referer()
 }
 
 // UserAgent returns User-Agent header value from the request.
+//
+// The returned bytes are valid until your request handler returns.
 func (ctx *RequestCtx) UserAgent() []byte {
 	return ctx.Request.Header.UserAgent()
 }
 
 // Path returns requested path.
 //
-// The path is valid until returning from RequestHandler.
+// The returned bytes are valid until your request handler returns.
 func (ctx *RequestCtx) Path() []byte {
 	return ctx.URI().Path()
 }
 
 // Host returns requested host.
 //
-// The host is valid until returning from RequestHandler.
+// The returned bytes are valid until your request handler returns.
 func (ctx *RequestCtx) Host() []byte {
 	return ctx.URI().Host()
 }
@@ -888,9 +967,9 @@ func (ctx *RequestCtx) Host() []byte {
 //
 // It doesn't return POST'ed arguments - use PostArgs() for this.
 //
-// Returned arguments are valid until returning from RequestHandler.
-//
 // See also PostArgs, FormValue and FormFile.
+//
+// These args are valid until your request handler returns.
 func (ctx *RequestCtx) QueryArgs() *Args {
 	return ctx.URI().QueryArgs()
 }
@@ -899,9 +978,9 @@ func (ctx *RequestCtx) QueryArgs() *Args {
 //
 // It doesn't return query arguments from RequestURI - use QueryArgs for this.
 //
-// Returned arguments are valid until returning from RequestHandler.
-//
 // See also QueryArgs, FormValue and FormFile.
+//
+// These args are valid until your request handler returns.
 func (ctx *RequestCtx) PostArgs() *Args {
 	return ctx.Request.PostArgs()
 }
@@ -917,7 +996,7 @@ func (ctx *RequestCtx) PostArgs() *Args {
 //
 // Use SaveMultipartFile function for permanently saving uploaded file.
 //
-// The returned form is valid until returning from RequestHandler.
+// The returned form is valid until your request handler returns.
 //
 // See also FormFile and FormValue.
 func (ctx *RequestCtx) MultipartForm() (*multipart.Form, error) {
@@ -931,7 +1010,7 @@ func (ctx *RequestCtx) MultipartForm() (*multipart.Form, error) {
 //
 // Use SaveMultipartFile function for permanently saving uploaded file.
 //
-// The returned file header is valid until returning from RequestHandler.
+// The returned file header is valid until your request handler returns.
 func (ctx *RequestCtx) FormFile(key string) (*multipart.FileHeader, error) {
 	mf, err := ctx.MultipartForm()
 	if err != nil {
@@ -1015,7 +1094,7 @@ func SaveMultipartFile(fh *multipart.FileHeader, path string) (err error) {
 //   * MultipartForm for obtaining values from multipart form.
 //   * FormFile for obtaining uploaded files.
 //
-// The returned value is valid until returning from RequestHandler.
+// The returned value is valid until your request handler returns.
 func (ctx *RequestCtx) FormValue(key string) []byte {
 	v := ctx.QueryArgs().Peek(key)
 	if len(v) > 0 {
@@ -1077,7 +1156,7 @@ func (ctx *RequestCtx) IsPatch() bool {
 
 // Method return request method.
 //
-// Returned value is valid until returning from RequestHandler.
+// Returned value is valid until your request handler returns.
 func (ctx *RequestCtx) Method() []byte {
 	return ctx.Request.Header.Method()
 }
@@ -1091,6 +1170,9 @@ func (ctx *RequestCtx) IsHead() bool {
 //
 // Always returns non-nil result.
 func (ctx *RequestCtx) RemoteAddr() net.Addr {
+	if ctx.remoteAddr != nil {
+		return ctx.remoteAddr
+	}
 	if ctx.c == nil {
 		return zeroTCPAddr
 	}
@@ -1101,6 +1183,14 @@ func (ctx *RequestCtx) RemoteAddr() net.Addr {
 	return addr
 }
 
+// SetRemoteAddr sets remote address to the given value.
+//
+// Set nil value to restore default behaviour for using
+// connection remote address.
+func (ctx *RequestCtx) SetRemoteAddr(remoteAddr net.Addr) {
+	ctx.remoteAddr = remoteAddr
+}
+
 // LocalAddr returns server address for the given request.
 //
 // Always returns non-nil result.
@@ -1253,6 +1343,10 @@ func (ctx *RequestCtx) ResetBody() {
 // SendFile logs all the errors via ctx.Logger.
 //
 // See also ServeFile, FSHandler and FS.
+//
+// WARNING: do not pass any user supplied paths to this function!
+// WARNING: if path is based on user input users will be able to request
+// any file on your filesystem! Use fasthttp.FS with a sane Root instead.
 func (ctx *RequestCtx) SendFile(path string) {
 	ServeFile(ctx, path)
 }
@@ -1264,6 +1358,10 @@ func (ctx *RequestCtx) SendFile(path string) {
 // SendFileBytes logs all the errors via ctx.Logger.
 //
 // See also ServeFileBytes, FSHandler and FS.
+//
+// WARNING: do not pass any user supplied paths to this function!
+// WARNING: if path is based on user input users will be able to request
+// any file on your filesystem! Use fasthttp.FS with a sane Root instead.
 func (ctx *RequestCtx) SendFileBytes(path []byte) {
 	ServeFileBytes(ctx, path)
 }
@@ -1312,7 +1410,7 @@ func (ctx *RequestCtx) WriteString(s string) (int, error) {
 
 // PostBody returns POST request body.
 //
-// The returned value is valid until RequestHandler return.
+// The returned bytes are valid until your request handler returns.
 func (ctx *RequestCtx) PostBody() []byte {
 	return ctx.Request.Body()
 }
@@ -1362,7 +1460,7 @@ func (ctx *RequestCtx) IsBodyStream() bool {
 // It is safe re-using returned logger for logging multiple messages
 // for the current request.
 //
-// The returned logger is valid until returning from RequestHandler.
+// The returned logger is valid until your request handler returns.
 func (ctx *RequestCtx) Logger() Logger {
 	if ctx.logger.ctx == nil {
 		ctx.logger.ctx = ctx
@@ -1428,8 +1526,9 @@ func (s *Server) NextProto(key string, nph ServeHandler) {
 	if s.nextProtos == nil {
 		s.nextProtos = make(map[string]ServeHandler)
 	}
+
 	s.configTLS()
-	s.tlsConfig.NextProtos = append(s.tlsConfig.NextProtos, key)
+	s.TLSConfig.NextProtos = append(s.TLSConfig.NextProtos, key)
 	s.nextProtos[key] = nph
 }
 
@@ -1511,14 +1610,14 @@ func (s *Server) ListenAndServe(addr string) error {
 // The server sets the given file mode for the UNIX addr.
 func (s *Server) ListenAndServeUNIX(addr string, mode os.FileMode) error {
 	if err := os.Remove(addr); err != nil && !os.IsNotExist(err) {
-		return fmt.Errorf("unexpected error when trying to remove unix socket file %q: %s", addr, err)
+		return fmt.Errorf("unexpected error when trying to remove unix socket file %q: %w", addr, err)
 	}
 	ln, err := net.Listen("unix", addr)
 	if err != nil {
 		return err
 	}
 	if err = os.Chmod(addr, mode); err != nil {
-		return fmt.Errorf("cannot chmod %#o for %q: %s", mode, addr, err)
+		return fmt.Errorf("cannot chmod %#o for %q: %w", mode, addr, err)
 	}
 	return s.Serve(ln)
 }
@@ -1588,19 +1687,19 @@ func (s *Server) ServeTLS(ln net.Listener, certFile, keyFile string) error {
 		s.mu.Unlock()
 		return err
 	}
-	if s.tlsConfig == nil {
+	if s.TLSConfig == nil {
 		s.mu.Unlock()
 		return errNoCertOrKeyProvided
 	}
 
 	// BuildNameToCertificate has been deprecated since 1.14.
 	// But since we also support older versions we'll keep this here.
-	s.tlsConfig.BuildNameToCertificate() //nolint:staticcheck
+	s.TLSConfig.BuildNameToCertificate() //nolint:staticcheck
 
 	s.mu.Unlock()
 
 	return s.Serve(
-		tls.NewListener(ln, s.tlsConfig),
+		tls.NewListener(ln, s.TLSConfig.Clone()),
 	)
 }
 
@@ -1618,19 +1717,19 @@ func (s *Server) ServeTLSEmbed(ln net.Listener, certData, keyData []byte) error
 		s.mu.Unlock()
 		return err
 	}
-	if s.tlsConfig == nil {
+	if s.TLSConfig == nil {
 		s.mu.Unlock()
 		return errNoCertOrKeyProvided
 	}
 
 	// BuildNameToCertificate has been deprecated since 1.14.
 	// But since we also support older versions we'll keep this here.
-	s.tlsConfig.BuildNameToCertificate() //nolint:staticcheck
+	s.TLSConfig.BuildNameToCertificate() //nolint:staticcheck
 
 	s.mu.Unlock()
 
 	return s.Serve(
-		tls.NewListener(ln, s.tlsConfig),
+		tls.NewListener(ln, s.TLSConfig.Clone()),
 	)
 }
 
@@ -1645,12 +1744,12 @@ func (s *Server) AppendCert(certFile, keyFile string) error {
 
 	cert, err := tls.LoadX509KeyPair(certFile, keyFile)
 	if err != nil {
-		return fmt.Errorf("cannot load TLS key pair from certFile=%q and keyFile=%q: %s", certFile, keyFile, err)
+		return fmt.Errorf("cannot load TLS key pair from certFile=%q and keyFile=%q: %w", certFile, keyFile, err)
 	}
 
 	s.configTLS()
+	s.TLSConfig.Certificates = append(s.TLSConfig.Certificates, cert)
 
-	s.tlsConfig.Certificates = append(s.tlsConfig.Certificates, cert)
 	return nil
 }
 
@@ -1667,16 +1766,14 @@ func (s *Server) AppendCertEmbed(certData, keyData []byte) error {
 	}
 
 	s.configTLS()
+	s.TLSConfig.Certificates = append(s.TLSConfig.Certificates, cert)
 
-	s.tlsConfig.Certificates = append(s.tlsConfig.Certificates, cert)
 	return nil
 }
 
 func (s *Server) configTLS() {
-	if s.tlsConfig == nil {
-		s.tlsConfig = &tls.Config{
-			PreferServerCipherSuites: true,
-		}
+	if s.TLSConfig == nil {
+		s.TLSConfig = &tls.Config{}
 	}
 }
 
@@ -1709,11 +1806,12 @@ func (s *Server) Serve(ln net.Listener) error {
 	s.mu.Unlock()
 
 	wp := &workerPool{
-		WorkerFunc:      s.serveConn,
-		MaxWorkersCount: maxWorkersCount,
-		LogAllErrors:    s.LogAllErrors,
-		Logger:          s.logger(),
-		connState:       s.setState,
+		WorkerFunc:            s.serveConn,
+		MaxWorkersCount:       maxWorkersCount,
+		LogAllErrors:          s.LogAllErrors,
+		MaxIdleWorkerDuration: s.MaxIdleWorkerDuration,
+		Logger:                s.logger(),
+		connState:             s.setState,
 	}
 	wp.Start()
 
@@ -1768,7 +1866,7 @@ func (s *Server) Serve(ln net.Listener) error {
 // When Shutdown is called, Serve, ListenAndServe, and ListenAndServeTLS immediately return nil.
 // Make sure the program doesn't exit and waits instead for Shutdown to return.
 //
-// Shutdown does not close keepalive connections so its recommended to set ReadTimeout to something else than 0.
+// Shutdown does not close keepalive connections so it's recommended to set ReadTimeout and IdleTimeout to something other than 0.
 func (s *Server) Shutdown() error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -1790,6 +1888,8 @@ func (s *Server) Shutdown() error {
 		close(s.done)
 	}
 
+	s.closeIdleConns()
+
 	// Closing the listener will make Serve() call Stop on the worker pool.
 	// Setting .stop to 1 will make serveConn() break out of its loop.
 	// Now we just have to wait until all workers are done.
@@ -1990,6 +2090,14 @@ func (s *Server) serveConn(c net.Conn) (err error) {
 		return
 	}
 	if handler, ok := s.nextProtos[proto]; ok {
+		// Remove read or write deadlines that might have previously been set.
+		// The next handler is responsible for setting its own deadlines.
+		if s.ReadTimeout > 0 || s.WriteTimeout > 0 {
+			if err := c.SetDeadline(zeroTime); err != nil {
+				panic(fmt.Sprintf("BUG: error in SetDeadline(zeroTime): %s", err))
+			}
+		}
+
 		return handler(c)
 	}
 
@@ -2005,6 +2113,7 @@ func (s *Server) serveConn(c net.Conn) (err error) {
 		maxRequestBodySize = DefaultMaxRequestBodySize
 	}
 	writeTimeout := s.WriteTimeout
+	previousWriteTimeout := time.Duration(0)
 
 	ctx := s.acquireCtx(c)
 	ctx.connTime = connTime
@@ -2020,7 +2129,6 @@ func (s *Server) serveConn(c net.Conn) (err error) {
 		connectionClose bool
 		isHTTP11        bool
 
-		reqReset               bool
 		continueReadingRequest bool = true
 	)
 	for {
@@ -2044,7 +2152,7 @@ func (s *Server) serveConn(c net.Conn) (err error) {
 			// within the idle time.
 			if connRequestNum > 1 {
 				var b []byte
-				b, err = br.Peek(4)
+				b, err = br.Peek(1)
 				if len(b) == 0 {
 					// If reading from a keep-alive connection returns nothing it means
 					// the connection was closed (either timeout or from the other side).
@@ -2074,13 +2182,39 @@ func (s *Server) serveConn(c net.Conn) (err error) {
 				if err := c.SetReadDeadline(time.Now().Add(s.ReadTimeout)); err != nil {
 					panic(fmt.Sprintf("BUG: error in SetReadDeadline(%s): %s", s.ReadTimeout, err))
 				}
+			} else if s.IdleTimeout > 0 && connRequestNum > 1 {
+				// If this was an idle connection and the server has an IdleTimeout but
+				// no ReadTimeout then we should remove the ReadTimeout.
+				if err := c.SetReadDeadline(zeroTime); err != nil {
+					panic(fmt.Sprintf("BUG: error in SetReadDeadline(zeroTime): %s", err))
+				}
 			}
 			if s.DisableHeaderNamesNormalizing {
 				ctx.Request.Header.DisableNormalizing()
 				ctx.Response.Header.DisableNormalizing()
 			}
-			// reading Headers
-			if err = ctx.Request.Header.Read(br); err == nil {
+
+			// Reading Headers.
+			//
+			// If we have a pipelined response in the outgoing buffer,
+			// we only want to try and read the next headers once.
+			// If we have to wait for the next request we flush the
+			// outgoing buffer first so it doesn't have to wait.
+			if bw != nil && bw.Buffered() > 0 {
+				err = ctx.Request.Header.readLoop(br, false)
+				if err == errNeedMore {
+					err = bw.Flush()
+					if err != nil {
+						break
+					}
+
+					err = ctx.Request.Header.Read(br)
+				}
+			} else {
+				err = ctx.Request.Header.Read(br)
+			}
+
+			if err == nil {
 				if onHdrRecv := s.HeaderReceived; onHdrRecv != nil {
 					reqConf := onHdrRecv(&ctx.Request.Header)
 					if reqConf.ReadTimeout > 0 {
@@ -2214,19 +2348,15 @@ func (s *Server) serveConn(c net.Conn) (err error) {
 			timeoutResponse.CopyTo(&ctx.Response)
 		}
 
-		if !ctx.IsGet() && ctx.IsHead() {
+		if ctx.IsHead() {
 			ctx.Response.SkipBody = true
 		}
-		reqReset = true
-		ctx.Request.Reset()
 
 		hijackHandler = ctx.hijackHandler
 		ctx.hijackHandler = nil
 		hijackNoResponse = ctx.hijackNoResponse && hijackHandler != nil
 		ctx.hijackNoResponse = false
 
-		ctx.userValues.Reset()
-
 		if s.MaxRequestsPerConn > 0 && connRequestNum >= uint64(s.MaxRequestsPerConn) {
 			ctx.SetConnectionClose()
 		}
@@ -2235,9 +2365,15 @@ func (s *Server) serveConn(c net.Conn) (err error) {
 			if err := c.SetWriteDeadline(time.Now().Add(writeTimeout)); err != nil {
 				panic(fmt.Sprintf("BUG: error in SetWriteDeadline(%s): %s", writeTimeout, err))
 			}
+			previousWriteTimeout = writeTimeout
+		} else if previousWriteTimeout > 0 {
+			// We don't want a write timeout but we previously set one, remove it.
+			if err := c.SetWriteDeadline(zeroTime); err != nil {
+				panic(fmt.Sprintf("BUG: error in SetWriteDeadline(zeroTime): %s", err))
+			}
+			previousWriteTimeout = 0
 		}
 
-		connectionClose = connectionClose || ctx.Response.ConnectionClose()
 		connectionClose = connectionClose || ctx.Response.ConnectionClose() || (s.CloseOnShutdown && atomic.LoadInt32(&s.stop) == 1)
 		if connectionClose {
 			ctx.Response.Header.SetCanonical(strConnection, strClose)
@@ -2285,9 +2421,6 @@ func (s *Server) serveConn(c net.Conn) (err error) {
 			if br != nil {
 				hjr = br
 				br = nil
-
-				// br may point to ctx.fbr, so do not return ctx into pool below.
-				ctx = nil
 			}
 			if bw != nil {
 				err = bw.Flush()
@@ -2297,15 +2430,11 @@ func (s *Server) serveConn(c net.Conn) (err error) {
 				releaseWriter(s, bw)
 				bw = nil
 			}
-			err = c.SetReadDeadline(zeroTime)
+			err = c.SetDeadline(zeroTime)
 			if err != nil {
 				break
 			}
-			err = c.SetWriteDeadline(zeroTime)
-			if err != nil {
-				break
-			}
-			go hijackConnHandler(hjr, c, s, hijackHandler)
+			go hijackConnHandler(ctx, hjr, c, s, hijackHandler)
 			err = errHijacked
 			break
 		}
@@ -2317,6 +2446,9 @@ func (s *Server) serveConn(c net.Conn) (err error) {
 		}
 
 		s.setState(c, StateIdle)
+		ctx.userValues.Reset()
+		ctx.Request.Reset()
+		ctx.Response.Reset()
 
 		if atomic.LoadInt32(&s.stop) == 1 {
 			err = nil
@@ -2330,25 +2462,21 @@ func (s *Server) serveConn(c net.Conn) (err error) {
 	if bw != nil {
 		releaseWriter(s, bw)
 	}
-	if ctx != nil {
-		// in unexpected cases the for loop will break
-		// before request reset call. in such cases, call it before
-		// release to fix #548
-		if !reqReset {
-			ctx.Request.Reset()
-		}
+	if hijackHandler == nil {
 		s.releaseCtx(ctx)
 	}
+
 	return
 }
 
 func (s *Server) setState(nc net.Conn, state ConnState) {
+	s.trackConn(nc, state)
 	if hook := s.ConnState; hook != nil {
 		hook(nc, state)
 	}
 }
 
-func hijackConnHandler(r io.Reader, c net.Conn, s *Server, h HijackHandler) {
+func hijackConnHandler(ctx *RequestCtx, r io.Reader, c net.Conn, s *Server, h HijackHandler) {
 	hjc := s.acquireHijackConn(r, c)
 	h(hjc)
 
@@ -2359,6 +2487,7 @@ func hijackConnHandler(r io.Reader, c net.Conn, s *Server, h HijackHandler) {
 		c.Close()
 		s.releaseHijackConn(hjc)
 	}
+	s.releaseCtx(ctx)
 }
 
 func (s *Server) acquireHijackConn(r io.Reader, c net.Conn) *hijackConn {
@@ -2422,7 +2551,7 @@ func writeResponse(ctx *RequestCtx, w *bufio.Writer) error {
 		panic("BUG: cannot write timed out response")
 	}
 	err := ctx.Response.Write(w)
-	ctx.Response.Reset()
+
 	return err
 }
 
@@ -2503,17 +2632,19 @@ func releaseWriter(s *Server, w *bufio.Writer) {
 func (s *Server) acquireCtx(c net.Conn) (ctx *RequestCtx) {
 	v := s.ctxPool.Get()
 	if v == nil {
-		ctx = &RequestCtx{
-			s: s,
-		}
 		keepBodyBuffer := !s.ReduceMemoryUsage
+
+		ctx = new(RequestCtx)
 		ctx.Request.keepBodyBuffer = keepBodyBuffer
 		ctx.Response.keepBodyBuffer = keepBodyBuffer
 	} else {
 		ctx = v.(*RequestCtx)
 	}
+
+	ctx.s = s
 	ctx.c = c
-	return
+
+	return ctx
 }
 
 // Init2 prepares ctx for passing to RequestHandler.
@@ -2524,6 +2655,7 @@ func (s *Server) acquireCtx(c net.Conn) (ctx *RequestCtx) {
 // See https://github.com/valyala/httpteleport for details.
 func (ctx *RequestCtx) Init2(conn net.Conn, logger Logger, reduceMemoryUsage bool) {
 	ctx.c = conn
+	ctx.remoteAddr = nil
 	ctx.logger.logger = logger
 	ctx.connID = nextConnID()
 	ctx.s = fakeServer
@@ -2635,8 +2767,8 @@ func (s *Server) releaseCtx(ctx *RequestCtx) {
 	if ctx.timeoutResponse != nil {
 		panic("BUG: cannot release timed out RequestCtx")
 	}
-	ctx.c = nil
-	ctx.fbr.c = nil
+
+	ctx.reset()
 	s.ctxPool.Put(ctx)
 }
 
@@ -2656,7 +2788,7 @@ func (s *Server) getServerName() []byte {
 }
 
 func (s *Server) writeFastError(w io.Writer, statusCode int, msg string) {
-	w.Write(statusLine(statusCode)) //nolint:errcheck
+	w.Write(formatStatusLine(nil, strHTTP11, statusCode, s2b(StatusMessage(statusCode)))) //nolint:errcheck
 
 	server := ""
 	if !s.NoDefaultServerHeader {
@@ -2704,11 +2836,38 @@ func (s *Server) writeErrorResponse(bw *bufio.Writer, ctx *RequestCtx, serverNam
 	if bw == nil {
 		bw = acquireWriter(ctx)
 	}
+
 	writeResponse(ctx, bw) //nolint:errcheck
+	ctx.Response.Reset()
 	bw.Flush()
+
 	return bw
 }
 
+// trackConn keeps s.idleConns in sync with a connection state change: c is
+// added when state is StateIdle and removed on any other state.
+func (s *Server) trackConn(c net.Conn, state ConnState) {
+	s.idleConnsMu.Lock()
+	if state != StateIdle {
+		delete(s.idleConns, c)
+	} else {
+		if s.idleConns == nil {
+			s.idleConns = make(map[net.Conn]struct{})
+		}
+		s.idleConns[c] = struct{}{}
+	}
+	s.idleConnsMu.Unlock()
+}
+
+func (s *Server) closeIdleConns() {
+	s.idleConnsMu.Lock()
+	for c := range s.idleConns {
+		_ = c.Close()
+	}
+	s.idleConns = nil
+	s.idleConnsMu.Unlock()
+}
+
 // A ConnState represents the state of a client connection to a server.
 // It's used by the optional Server.ConnState hook.
 type ConnState int
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/status.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/status.go
index 1746c01d..c88ba11e 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/status.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/status.go
@@ -1,7 +1,7 @@
 package fasthttp
 
 import (
-	"fmt"
+	"strconv"
 )
 
 const (
@@ -80,7 +80,7 @@ const (
 )
 
 var (
-	statusLines = make([][]byte, statusMessageMax+1)
+	unknownStatusCode = "Unknown Status Code"
 
 	statusMessages = []string{
 		StatusContinue:           "Continue",
@@ -154,32 +154,24 @@ var (
 // StatusMessage returns HTTP status message for the given status code.
 func StatusMessage(statusCode int) string {
 	if statusCode < statusMessageMin || statusCode > statusMessageMax {
-		return "Unknown Status Code"
+		return unknownStatusCode
 	}
 
-	s := statusMessages[statusCode]
-	if s == "" {
-		s = "Unknown Status Code"
+	if s := statusMessages[statusCode]; s != "" {
+		return s
 	}
-	return s
+	return unknownStatusCode
 }
 
-func init() {
-	// Fill all valid status lines
-	for i := 0; i < len(statusLines); i++ {
-		statusLines[i] = []byte(fmt.Sprintf("HTTP/1.1 %d %s\r\n", i, StatusMessage(i)))
+func formatStatusLine(dst []byte, protocol []byte, statusCode int, statusText []byte) []byte {
+	dst = append(dst, protocol...)
+	dst = append(dst, ' ')
+	dst = strconv.AppendInt(dst, int64(statusCode), 10)
+	dst = append(dst, ' ')
+	if len(statusText) == 0 {
+		dst = append(dst, s2b(StatusMessage(statusCode))...)
+	} else {
+		dst = append(dst, statusText...)
 	}
-}
-
-func statusLine(statusCode int) []byte {
-	if statusCode < 0 || statusCode > statusMessageMax {
-		return invalidStatusLine(statusCode)
-	}
-
-	return statusLines[statusCode]
-}
-
-func invalidStatusLine(statusCode int) []byte {
-	statusText := StatusMessage(statusCode)
-	return []byte(fmt.Sprintf("HTTP/1.1 %d %s\r\n", statusCode, statusText))
+	return append(dst, strCRLF...)
 }
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/streaming.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/streaming.go
index 39000a26..11750a9d 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/streaming.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/streaming.go
@@ -3,7 +3,6 @@ package fasthttp
 import (
 	"bufio"
 	"bytes"
-	"fmt"
 	"io"
 	"sync"
 
@@ -11,40 +10,51 @@ import (
 )
 
 type requestStream struct {
+	header          *RequestHeader
 	prefetchedBytes *bytes.Reader
 	reader          *bufio.Reader
 	totalBytesRead  int
-	contentLength   int
+	chunkLeft       int
 }
 
 func (rs *requestStream) Read(p []byte) (int, error) {
-	if rs.contentLength == -1 {
-		p = p[:0]
-		strCRLFLen := len(strCRLF)
-		chunkSize, err := parseChunkSize(rs.reader)
-		if err != nil {
-			return len(p), err
+	var (
+		n   int
+		err error
+	)
+	if rs.header.contentLength == -1 {
+		if rs.chunkLeft == 0 {
+			chunkSize, err := parseChunkSize(rs.reader)
+			if err != nil {
+				return 0, err
+			}
+			if chunkSize == 0 {
+				err = rs.header.ReadTrailer(rs.reader)
+				if err != nil && err != io.EOF {
+					return 0, err
+				}
+				return 0, io.EOF
+			}
+			rs.chunkLeft = chunkSize
 		}
-		p, err = appendBodyFixedSize(rs.reader, p, chunkSize+strCRLFLen)
-		if err != nil {
-			return len(p), err
+		bytesToRead := len(p)
+		if rs.chunkLeft < len(p) {
+			bytesToRead = rs.chunkLeft
 		}
-		if !bytes.Equal(p[len(p)-strCRLFLen:], strCRLF) {
-			return len(p), ErrBrokenChunk{
-				error: fmt.Errorf("cannot find crlf at the end of chunk"),
-			}
+		n, err = rs.reader.Read(p[:bytesToRead])
+		rs.totalBytesRead += n
+		rs.chunkLeft -= n
+		if err == io.EOF {
+			err = io.ErrUnexpectedEOF
 		}
-		p = p[:len(p)-strCRLFLen]
-		if chunkSize == 0 {
-			return len(p), io.EOF
+		if err == nil && rs.chunkLeft == 0 {
+			err = readCrLf(rs.reader)
 		}
-		return len(p), nil
+		return n, err
 	}
-	if rs.totalBytesRead == rs.contentLength {
+	if rs.totalBytesRead == rs.header.contentLength {
 		return 0, io.EOF
 	}
-	var n int
-	var err error
 	prefetchedSize := int(rs.prefetchedBytes.Size())
 	if prefetchedSize > rs.totalBytesRead {
 		left := prefetchedSize - rs.totalBytesRead
@@ -53,12 +63,12 @@ func (rs *requestStream) Read(p []byte) (int, error) {
 		}
 		n, err := rs.prefetchedBytes.Read(p)
 		rs.totalBytesRead += n
-		if n == rs.contentLength {
+		if n == rs.header.contentLength {
 			return n, io.EOF
 		}
 		return n, err
 	} else {
-		left := rs.contentLength - rs.totalBytesRead
+		left := rs.header.contentLength - rs.totalBytesRead
 		if len(p) > left {
 			p = p[:left]
 		}
@@ -69,24 +79,24 @@ func (rs *requestStream) Read(p []byte) (int, error) {
 		}
 	}
 
-	if rs.totalBytesRead == rs.contentLength {
+	if rs.totalBytesRead == rs.header.contentLength {
 		err = io.EOF
 	}
 	return n, err
 }
 
-func acquireRequestStream(b *bytebufferpool.ByteBuffer, r *bufio.Reader, contentLength int) *requestStream {
+func acquireRequestStream(b *bytebufferpool.ByteBuffer, r *bufio.Reader, h *RequestHeader) *requestStream {
 	rs := requestStreamPool.Get().(*requestStream)
 	rs.prefetchedBytes = bytes.NewReader(b.B)
 	rs.reader = r
-	rs.contentLength = contentLength
-
+	rs.header = h
 	return rs
 }
 
 func releaseRequestStream(rs *requestStream) {
 	rs.prefetchedBytes = nil
 	rs.totalBytesRead = 0
+	rs.chunkLeft = 0
 	rs.reader = nil
 	requestStreamPool.Put(rs)
 }
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/strings.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/strings.go
index e244f849..370e3079 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/strings.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/strings.go
@@ -7,54 +7,56 @@ var (
 )
 
 var (
-	strSlash            = []byte("/")
-	strSlashSlash       = []byte("//")
-	strSlashDotDot      = []byte("/..")
-	strSlashDotSlash    = []byte("/./")
-	strSlashDotDotSlash = []byte("/../")
-	strCRLF             = []byte("\r\n")
-	strHTTP             = []byte("http")
-	strHTTPS            = []byte("https")
-	strHTTP10           = []byte("HTTP/1.0")
-	strHTTP11           = []byte("HTTP/1.1")
-	strColon            = []byte(":")
-	strColonSlashSlash  = []byte("://")
-	strColonSpace       = []byte(": ")
-	strGMT              = []byte("GMT")
+	strSlash                    = []byte("/")
+	strSlashSlash               = []byte("//")
+	strSlashDotDot              = []byte("/..")
+	strSlashDotSlash            = []byte("/./")
+	strSlashDotDotSlash         = []byte("/../")
+	strBackSlashDotDot          = []byte(`\..`)
+	strBackSlashDotBackSlash    = []byte(`\.\`)
+	strSlashDotDotBackSlash     = []byte(`/..\`)
+	strBackSlashDotDotBackSlash = []byte(`\..\`)
+	strCRLF                     = []byte("\r\n")
+	strHTTP                     = []byte("http")
+	strHTTPS                    = []byte("https")
+	strHTTP10                   = []byte("HTTP/1.0")
+	strHTTP11                   = []byte("HTTP/1.1")
+	strColon                    = []byte(":")
+	strColonSlashSlash          = []byte("://")
+	strColonSpace               = []byte(": ")
+	strCommaSpace               = []byte(", ")
+	strGMT                      = []byte("GMT")
 
 	strResponseContinue = []byte("HTTP/1.1 100 Continue\r\n\r\n")
 
-	strGet     = []byte(MethodGet)
-	strHead    = []byte(MethodHead)
-	strPost    = []byte(MethodPost)
-	strPut     = []byte(MethodPut)
-	strDelete  = []byte(MethodDelete)
-	strConnect = []byte(MethodConnect)
-	strOptions = []byte(MethodOptions)
-	strTrace   = []byte(MethodTrace)
-	strPatch   = []byte(MethodPatch)
-
-	strExpect           = []byte(HeaderExpect)
-	strConnection       = []byte(HeaderConnection)
-	strContentLength    = []byte(HeaderContentLength)
-	strContentType      = []byte(HeaderContentType)
-	strDate             = []byte(HeaderDate)
-	strHost             = []byte(HeaderHost)
-	strReferer          = []byte(HeaderReferer)
-	strServer           = []byte(HeaderServer)
-	strTransferEncoding = []byte(HeaderTransferEncoding)
-	strContentEncoding  = []byte(HeaderContentEncoding)
-	strAcceptEncoding   = []byte(HeaderAcceptEncoding)
-	strUserAgent        = []byte(HeaderUserAgent)
-	strCookie           = []byte(HeaderCookie)
-	strSetCookie        = []byte(HeaderSetCookie)
-	strLocation         = []byte(HeaderLocation)
-	strIfModifiedSince  = []byte(HeaderIfModifiedSince)
-	strLastModified     = []byte(HeaderLastModified)
-	strAcceptRanges     = []byte(HeaderAcceptRanges)
-	strRange            = []byte(HeaderRange)
-	strContentRange     = []byte(HeaderContentRange)
-	strAuthorization    = []byte(HeaderAuthorization)
+	strExpect             = []byte(HeaderExpect)
+	strConnection         = []byte(HeaderConnection)
+	strContentLength      = []byte(HeaderContentLength)
+	strContentType        = []byte(HeaderContentType)
+	strDate               = []byte(HeaderDate)
+	strHost               = []byte(HeaderHost)
+	strReferer            = []byte(HeaderReferer)
+	strServer             = []byte(HeaderServer)
+	strTransferEncoding   = []byte(HeaderTransferEncoding)
+	strContentEncoding    = []byte(HeaderContentEncoding)
+	strAcceptEncoding     = []byte(HeaderAcceptEncoding)
+	strUserAgent          = []byte(HeaderUserAgent)
+	strCookie             = []byte(HeaderCookie)
+	strSetCookie          = []byte(HeaderSetCookie)
+	strLocation           = []byte(HeaderLocation)
+	strIfModifiedSince    = []byte(HeaderIfModifiedSince)
+	strLastModified       = []byte(HeaderLastModified)
+	strAcceptRanges       = []byte(HeaderAcceptRanges)
+	strRange              = []byte(HeaderRange)
+	strContentRange       = []byte(HeaderContentRange)
+	strAuthorization      = []byte(HeaderAuthorization)
+	strTE                 = []byte(HeaderTE)
+	strTrailer            = []byte(HeaderTrailer)
+	strMaxForwards        = []byte(HeaderMaxForwards)
+	strProxyConnection    = []byte(HeaderProxyConnection)
+	strProxyAuthenticate  = []byte(HeaderProxyAuthenticate)
+	strProxyAuthorization = []byte(HeaderProxyAuthorization)
+	strWWWAuthenticate    = []byte(HeaderWWWAuthenticate)
 
 	strCookieExpires        = []byte("expires")
 	strCookieDomain         = []byte("domain")
@@ -77,10 +79,16 @@ var (
 	strIdentity            = []byte("identity")
 	str100Continue         = []byte("100-continue")
 	strPostArgsContentType = []byte("application/x-www-form-urlencoded")
+	strDefaultContentType  = []byte("application/octet-stream")
 	strMultipartFormData   = []byte("multipart/form-data")
 	strBoundary            = []byte("boundary")
 	strBytes               = []byte("bytes")
-	strTextSlash           = []byte("text/")
-	strApplicationSlash    = []byte("application/")
 	strBasicSpace          = []byte("Basic ")
+
+	strApplicationSlash = []byte("application/")
+	strImageSVG         = []byte("image/svg")
+	strImageIcon        = []byte("image/x-icon")
+	strFontSlash        = []byte("font/")
+	strMultipartSlash   = []byte("multipart/")
+	strTextSlash        = []byte("text/")
 )
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/tcpdialer.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/tcpdialer.go
index 0023cf60..294cf141 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/tcpdialer.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/tcpdialer.go
@@ -15,7 +15,7 @@ import (
 // This function has the following additional features comparing to net.Dial:
 //
 //   * It reduces load on DNS resolver by caching resolved TCP addressed
-//     for DefaultDNSCacheDuration.
+//     for DNSCacheDuration.
 //   * It dials all the resolved TCP addresses in round-robin manner until
 //     connection is established. This may be useful if certain addresses
 //     are temporarily unreachable.
@@ -42,7 +42,7 @@ func Dial(addr string) (net.Conn, error) {
 // This function has the following additional features comparing to net.Dial:
 //
 //   * It reduces load on DNS resolver by caching resolved TCP addressed
-//     for DefaultDNSCacheDuration.
+//     for DNSCacheDuration.
 //   * It dials all the resolved TCP addresses in round-robin manner until
 //     connection is established. This may be useful if certain addresses
 //     are temporarily unreachable.
@@ -67,7 +67,7 @@ func DialTimeout(addr string, timeout time.Duration) (net.Conn, error) {
 // This function has the following additional features comparing to net.Dial:
 //
 //   * It reduces load on DNS resolver by caching resolved TCP addressed
-//     for DefaultDNSCacheDuration.
+//     for DNSCacheDuration.
 //   * It dials all the resolved TCP addresses in round-robin manner until
 //     connection is established. This may be useful if certain addresses
 //     are temporarily unreachable.
@@ -96,7 +96,7 @@ func DialDualStack(addr string) (net.Conn, error) {
 // This function has the following additional features comparing to net.Dial:
 //
 //   * It reduces load on DNS resolver by caching resolved TCP addressed
-//     for DefaultDNSCacheDuration.
+//     for DNSCacheDuration.
 //   * It dials all the resolved TCP addresses in round-robin manner until
 //     connection is established. This may be useful if certain addresses
 //     are temporarily unreachable.
@@ -127,7 +127,7 @@ type Resolver interface {
 
 // TCPDialer contains options to control a group of Dial calls.
 type TCPDialer struct {
-	// Concurrency controls the maximum number of concurrent Dails
+	// Concurrency controls the maximum number of concurrent Dials
 	// that can be performed using this object.
 	// Setting this to 0 means unlimited.
 	//
@@ -153,8 +153,10 @@ type TCPDialer struct {
 	// }
 	Resolver Resolver
 
-	tcpAddrsLock sync.Mutex
-	tcpAddrsMap  map[string]*tcpAddrEntry
+	// DNSCacheDuration overrides the DNS cache duration; zero means DefaultDNSCacheDuration.
+	DNSCacheDuration time.Duration
+
+	tcpAddrsMap sync.Map
 
 	concurrencyCh chan struct{}
 
@@ -166,7 +168,7 @@ type TCPDialer struct {
 // This function has the following additional features comparing to net.Dial:
 //
 //   * It reduces load on DNS resolver by caching resolved TCP addressed
-//     for DefaultDNSCacheDuration.
+//     for DNSCacheDuration.
 //   * It dials all the resolved TCP addresses in round-robin manner until
 //     connection is established. This may be useful if certain addresses
 //     are temporarily unreachable.
@@ -193,7 +195,7 @@ func (d *TCPDialer) Dial(addr string) (net.Conn, error) {
 // This function has the following additional features comparing to net.Dial:
 //
 //   * It reduces load on DNS resolver by caching resolved TCP addressed
-//     for DefaultDNSCacheDuration.
+//     for DNSCacheDuration.
 //   * It dials all the resolved TCP addresses in round-robin manner until
 //     connection is established. This may be useful if certain addresses
 //     are temporarily unreachable.
@@ -218,7 +220,7 @@ func (d *TCPDialer) DialTimeout(addr string, timeout time.Duration) (net.Conn, e
 // This function has the following additional features comparing to net.Dial:
 //
 //   * It reduces load on DNS resolver by caching resolved TCP addressed
-//     for DefaultDNSCacheDuration.
+//     for DNSCacheDuration.
 //   * It dials all the resolved TCP addresses in round-robin manner until
 //     connection is established. This may be useful if certain addresses
 //     are temporarily unreachable.
@@ -247,7 +249,7 @@ func (d *TCPDialer) DialDualStack(addr string) (net.Conn, error) {
 // This function has the following additional features comparing to net.Dial:
 //
 //   * It reduces load on DNS resolver by caching resolved TCP addressed
-//     for DefaultDNSCacheDuration.
+//     for DNSCacheDuration.
 //   * It dials all the resolved TCP addresses in round-robin manner until
 //     connection is established. This may be useful if certain addresses
 //     are temporarily unreachable.
@@ -272,7 +274,11 @@ func (d *TCPDialer) dial(addr string, dualStack bool, timeout time.Duration) (ne
 		if d.Concurrency > 0 {
 			d.concurrencyCh = make(chan struct{}, d.Concurrency)
 		}
-		d.tcpAddrsMap = make(map[string]*tcpAddrEntry)
+
+		if d.DNSCacheDuration == 0 {
+			d.DNSCacheDuration = DefaultDNSCacheDuration
+		}
+
 		go d.tcpAddrsClean()
 	})
 
@@ -352,8 +358,8 @@ type tcpAddrEntry struct {
 	addrs    []net.TCPAddr
 	addrsIdx uint32
 
+	pending     int32
 	resolveTime time.Time
-	pending     bool
 }
 
 // DefaultDNSCacheDuration is the duration for caching resolved TCP addresses
@@ -361,39 +367,39 @@ type tcpAddrEntry struct {
 const DefaultDNSCacheDuration = time.Minute
 
 func (d *TCPDialer) tcpAddrsClean() {
-	expireDuration := 2 * DefaultDNSCacheDuration
+	expireDuration := 2 * d.DNSCacheDuration
 	for {
 		time.Sleep(time.Second)
 		t := time.Now()
-
-		d.tcpAddrsLock.Lock()
-		for k, e := range d.tcpAddrsMap {
-			if t.Sub(e.resolveTime) > expireDuration {
-				delete(d.tcpAddrsMap, k)
+		d.tcpAddrsMap.Range(func(k, v interface{}) bool {
+			if e, ok := v.(*tcpAddrEntry); ok && t.Sub(e.resolveTime) > expireDuration {
+				d.tcpAddrsMap.Delete(k)
 			}
-		}
-		d.tcpAddrsLock.Unlock()
+			return true
+		})
+
 	}
 }
 
 func (d *TCPDialer) getTCPAddrs(addr string, dualStack bool) ([]net.TCPAddr, uint32, error) {
-	d.tcpAddrsLock.Lock()
-	e := d.tcpAddrsMap[addr]
-	if e != nil && !e.pending && time.Since(e.resolveTime) > DefaultDNSCacheDuration {
-		e.pending = true
-		e = nil
+	item, exist := d.tcpAddrsMap.Load(addr)
+	e, ok := item.(*tcpAddrEntry)
+	if exist && ok && e != nil && time.Since(e.resolveTime) > d.DNSCacheDuration {
+		// Only let one goroutine re-resolve at a time.
+		if atomic.SwapInt32(&e.pending, 1) == 0 {
+			e = nil
+		}
 	}
-	d.tcpAddrsLock.Unlock()
 
 	if e == nil {
 		addrs, err := resolveTCPAddrs(addr, dualStack, d.Resolver)
 		if err != nil {
-			d.tcpAddrsLock.Lock()
-			e = d.tcpAddrsMap[addr]
-			if e != nil && e.pending {
-				e.pending = false
+			item, exist := d.tcpAddrsMap.Load(addr)
+			e, ok = item.(*tcpAddrEntry)
+			if exist && ok && e != nil {
+				// Set pending to 0 so another goroutine can retry.
+				atomic.StoreInt32(&e.pending, 0)
 			}
-			d.tcpAddrsLock.Unlock()
 			return nil, 0, err
 		}
 
@@ -401,10 +407,7 @@ func (d *TCPDialer) getTCPAddrs(addr string, dualStack bool) ([]net.TCPAddr, uin
 			addrs:       addrs,
 			resolveTime: time.Now(),
 		}
-
-		d.tcpAddrsLock.Lock()
-		d.tcpAddrsMap[addr] = e
-		d.tcpAddrsLock.Unlock()
+		d.tcpAddrsMap.Store(addr, e)
 	}
 
 	idx := atomic.AddUint32(&e.addrsIdx, 1)
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri.go
index c4681838..38a431e1 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri.go
@@ -5,6 +5,8 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"path/filepath"
+	"strconv"
 	"sync"
 )
 
@@ -69,14 +71,14 @@ type URI struct {
 // CopyTo copies uri contents to dst.
 func (u *URI) CopyTo(dst *URI) {
 	dst.Reset()
-	dst.pathOriginal = append(dst.pathOriginal[:0], u.pathOriginal...)
-	dst.scheme = append(dst.scheme[:0], u.scheme...)
-	dst.path = append(dst.path[:0], u.path...)
-	dst.queryString = append(dst.queryString[:0], u.queryString...)
-	dst.hash = append(dst.hash[:0], u.hash...)
-	dst.host = append(dst.host[:0], u.host...)
-	dst.username = append(dst.username[:0], u.username...)
-	dst.password = append(dst.password[:0], u.password...)
+	dst.pathOriginal = append(dst.pathOriginal, u.pathOriginal...)
+	dst.scheme = append(dst.scheme, u.scheme...)
+	dst.path = append(dst.path, u.path...)
+	dst.queryString = append(dst.queryString, u.queryString...)
+	dst.hash = append(dst.hash, u.hash...)
+	dst.host = append(dst.host, u.host...)
+	dst.username = append(dst.username, u.username...)
+	dst.password = append(dst.password, u.password...)
 
 	u.queryArgs.CopyTo(&dst.queryArgs)
 	dst.parsedQueryArgs = u.parsedQueryArgs
@@ -88,7 +90,7 @@ func (u *URI) CopyTo(dst *URI) {
 
 // Hash returns URI hash, i.e. qwe of http://aaa.com/foo/bar?baz=123#qwe .
 //
-// The returned value is valid until the next URI method call.
+// The returned bytes are valid until the next URI method call.
 func (u *URI) Hash() []byte {
 	return u.hash
 }
@@ -104,6 +106,8 @@ func (u *URI) SetHashBytes(hash []byte) {
 }
 
 // Username returns URI username
+//
+// The returned bytes are valid until the next URI method call.
 func (u *URI) Username() []byte {
 	return u.username
 }
@@ -119,6 +123,8 @@ func (u *URI) SetUsernameBytes(username []byte) {
 }
 
 // Password returns URI password
+//
+// The returned bytes are valid until the next URI method call.
 func (u *URI) Password() []byte {
 	return u.password
 }
@@ -136,7 +142,7 @@ func (u *URI) SetPasswordBytes(password []byte) {
 // QueryString returns URI query string,
 // i.e. baz=123 of http://aaa.com/foo/bar?baz=123#qwe .
 //
-// The returned value is valid until the next URI method call.
+// The returned bytes are valid until the next URI method call.
 func (u *URI) QueryString() []byte {
 	return u.queryString
 }
@@ -158,7 +164,7 @@ func (u *URI) SetQueryStringBytes(queryString []byte) {
 // The returned path is always urldecoded and normalized,
 // i.e. '//f%20obar/baz/../zzz' becomes '/f obar/zzz'.
 //
-// The returned value is valid until the next URI method call.
+// The returned bytes are valid until the next URI method call.
 func (u *URI) Path() []byte {
 	path := u.path
 	if len(path) == 0 {
@@ -181,7 +187,7 @@ func (u *URI) SetPathBytes(path []byte) {
 
 // PathOriginal returns the original path from requestURI passed to URI.Parse().
 //
-// The returned value is valid until the next URI method call.
+// The returned bytes are valid until the next URI method call.
 func (u *URI) PathOriginal() []byte {
 	return u.pathOriginal
 }
@@ -190,7 +196,7 @@ func (u *URI) PathOriginal() []byte {
 //
 // Returned scheme is always lowercased.
 //
-// The returned value is valid until the next URI method call.
+// The returned bytes are valid until the next URI method call.
 func (u *URI) Scheme() []byte {
 	scheme := u.scheme
 	if len(scheme) == 0 {
@@ -211,6 +217,14 @@ func (u *URI) SetSchemeBytes(scheme []byte) {
 	lowercaseBytes(u.scheme)
 }
 
+func (u *URI) isHttps() bool {
+	return bytes.Equal(u.scheme, strHTTPS)
+}
+
+func (u *URI) isHttp() bool {
+	return len(u.scheme) == 0 || bytes.Equal(u.scheme, strHTTP)
+}
+
 // Reset clears uri.
 func (u *URI) Reset() {
 	u.pathOriginal = u.pathOriginal[:0]
@@ -236,6 +250,8 @@ func (u *URI) Reset() {
 // Host returns host part, i.e. aaa.com of http://aaa.com/foo/bar?baz=123#qwe .
 //
 // Host is always lowercased.
+//
+// The returned bytes are valid until the next URI method call.
 func (u *URI) Host() []byte {
 	return u.host
 }
@@ -275,14 +291,13 @@ func (u *URI) parse(host, uri []byte, isTLS bool) error {
 
 	if len(host) == 0 || bytes.Contains(uri, strColonSlashSlash) {
 		scheme, newHost, newURI := splitHostURI(host, uri)
-		u.scheme = append(u.scheme, scheme...)
-		lowercaseBytes(u.scheme)
+		u.SetSchemeBytes(scheme)
 		host = newHost
 		uri = newURI
 	}
 
 	if isTLS {
-		u.scheme = append(u.scheme[:0], strHTTPS...)
+		u.SetSchemeBytes(strHTTPS)
 	}
 
 	if n := bytes.IndexByte(host, '@'); n >= 0 {
@@ -299,6 +314,11 @@ func (u *URI) parse(host, uri []byte, isTLS bool) error {
 	}
 
 	u.host = append(u.host, host...)
+	if parsedHost, err := parseHost(u.host); err != nil {
+		return err
+	} else {
+		u.host = parsedHost
+	}
 	lowercaseBytes(u.host)
 
 	b := uri
@@ -338,6 +358,226 @@ func (u *URI) parse(host, uri []byte, isTLS bool) error {
 	return nil
 }
 
+// parseHost parses host as an authority without user
+// information. That is, as host[:port].
+//
+// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L619
+//
+// The host is parsed and unescaped in place overwriting the contents of the host parameter.
+func parseHost(host []byte) ([]byte, error) {
+	if len(host) > 0 && host[0] == '[' {
+		// Parse an IP-Literal in RFC 3986 and RFC 6874.
+		// E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
+		i := bytes.LastIndexByte(host, ']')
+		if i < 0 {
+			return nil, errors.New("missing ']' in host")
+		}
+		colonPort := host[i+1:]
+		if !validOptionalPort(colonPort) {
+			return nil, fmt.Errorf("invalid port %q after host", colonPort)
+		}
+
+		// RFC 6874 defines that %25 (%-encoded percent) introduces
+		// the zone identifier, and the zone identifier can use basically
+		// any %-encoding it likes. That's different from the host, which
+		// can only %-encode non-ASCII bytes.
+		// We do impose some restrictions on the zone, to avoid stupidity
+		// like newlines.
+		zone := bytes.Index(host[:i], []byte("%25"))
+		if zone >= 0 {
+			host1, err := unescape(host[:zone], encodeHost)
+			if err != nil {
+				return nil, err
+			}
+			host2, err := unescape(host[zone:i], encodeZone)
+			if err != nil {
+				return nil, err
+			}
+			host3, err := unescape(host[i:], encodeHost)
+			if err != nil {
+				return nil, err
+			}
+			return append(host1, append(host2, host3...)...), nil
+		}
+	} else if i := bytes.LastIndexByte(host, ':'); i != -1 {
+		colonPort := host[i:]
+		if !validOptionalPort(colonPort) {
+			return nil, fmt.Errorf("invalid port %q after host", colonPort)
+		}
+	}
+
+	var err error
+	if host, err = unescape(host, encodeHost); err != nil {
+		return nil, err
+	}
+	return host, nil
+}
+
+type encoding int
+
+const (
+	encodeHost encoding = 1 + iota
+	encodeZone
+)
+
+type EscapeError string
+
+func (e EscapeError) Error() string {
+	return "invalid URL escape " + strconv.Quote(string(e))
+}
+
+type InvalidHostError string
+
+func (e InvalidHostError) Error() string {
+	return "invalid character " + strconv.Quote(string(e)) + " in host name"
+}
+
+// unescape unescapes a string; the mode specifies
+// which section of the URL string is being unescaped.
+//
+// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L199
+//
+// Unescapes in place overwriting the contents of s and returning it.
+func unescape(s []byte, mode encoding) ([]byte, error) {
+	// Count %, check that they're well-formed.
+	n := 0
+	for i := 0; i < len(s); {
+		switch s[i] {
+		case '%':
+			n++
+			if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
+				s = s[i:]
+				if len(s) > 3 {
+					s = s[:3]
+				}
+				return nil, EscapeError(s)
+			}
+			// Per https://tools.ietf.org/html/rfc3986#page-21
+			// in the host component %-encoding can only be used
+			// for non-ASCII bytes.
+			// But https://tools.ietf.org/html/rfc6874#section-2
+			// introduces %25 being allowed to escape a percent sign
+			// in IPv6 scoped-address literals. Yay.
+			if mode == encodeHost && unhex(s[i+1]) < 8 && !bytes.Equal(s[i:i+3], []byte("%25")) {
+				return nil, EscapeError(s[i : i+3])
+			}
+			if mode == encodeZone {
+				// RFC 6874 says basically "anything goes" for zone identifiers
+				// and that even non-ASCII can be redundantly escaped,
+				// but it seems prudent to restrict %-escaped bytes here to those
+				// that are valid host name bytes in their unescaped form.
+				// That is, you can use escaping in the zone identifier but not
+				// to introduce bytes you couldn't just write directly.
+				// But Windows puts spaces here! Yay.
+				v := unhex(s[i+1])<<4 | unhex(s[i+2])
+				if !bytes.Equal(s[i:i+3], []byte("%25")) && v != ' ' && shouldEscape(v, encodeHost) {
+					return nil, EscapeError(s[i : i+3])
+				}
+			}
+			i += 3
+		default:
+			if (mode == encodeHost || mode == encodeZone) && s[i] < 0x80 && shouldEscape(s[i], mode) {
+				return nil, InvalidHostError(s[i : i+1])
+			}
+			i++
+		}
+	}
+
+	if n == 0 {
+		return s, nil
+	}
+
+	t := s[:0]
+	for i := 0; i < len(s); i++ {
+		switch s[i] {
+		case '%':
+			t = append(t, unhex(s[i+1])<<4|unhex(s[i+2]))
+			i += 2
+		default:
+			t = append(t, s[i])
+		}
+	}
+	return t, nil
+}
+
+// shouldEscape reports whether byte c must be %-escaped when it appears
+// in the URL component selected by mode, according to RFC 3986.
+//
+// Note that, for now, shouldEscape does not check all reserved
+// characters correctly. See golang.org/issue/5684.
+//
+// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L100
+func shouldEscape(c byte, mode encoding) bool {
+	// §2.3 Unreserved characters (alphanum)
+	if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
+		return false
+	}
+
+	if mode == encodeHost || mode == encodeZone {
+		// §3.2.2 Host allows
+		//	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
+		// as part of reg-name.
+		// We add : because we include :port as part of host.
+		// We add [ ] because we include [ipv6]:port as part of host.
+		// We add < > because they're the only characters left that
+		// we could possibly allow, and Parse will reject them if we
+		// escape them (because hosts can't use %-encoding for
+		// ASCII bytes).
+		switch c {
+		case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
+			return false
+		}
+	}
+
+	if c == '-' || c == '_' || c == '.' || c == '~' { // §2.3 Unreserved characters (mark)
+		return false
+	}
+
+	// Everything else must be escaped.
+	return true
+}
+
+// ishex reports whether c is an ASCII hexadecimal digit, i.e. one of
+// 0-9, a-f or A-F.
+func ishex(c byte) bool {
+	if '0' <= c && c <= '9' {
+		return true
+	}
+	if 'a' <= c && c <= 'f' {
+		return true
+	}
+	return 'A' <= c && c <= 'F'
+}
+
+// unhex returns the numeric value of the hexadecimal digit c.
+// Any byte that is not a hexadecimal digit yields 0.
+func unhex(c byte) byte {
+	if '0' <= c && c <= '9' {
+		return c - '0'
+	}
+	if lc := c | 0x20; 'a' <= lc && lc <= 'f' { // fold A-F onto a-f
+		return lc - 'a' + 10
+	}
+	return 0
+}
+
+// validOptionalPort reports whether port is empty or consists of a ':'
+// followed by zero or more ASCII decimal digits (i.e. matches /^:\d*$/).
+func validOptionalPort(port []byte) bool {
+	switch {
+	case len(port) == 0:
+		return true
+	case port[0] != ':':
+		return false
+	}
+	for i := 1; i < len(port); i++ {
+		if d := port[i]; d < '0' || d > '9' {
+			return false
+		}
+	}
+	return true
+}
+
 func normalizePath(dst, src []byte) []byte {
 	dst = dst[:0]
 	dst = addLeadingSlash(dst, src)
@@ -390,11 +630,65 @@ func normalizePath(dst, src []byte) []byte {
 	if n >= 0 && n+len(strSlashDotDot) == len(b) {
 		nn := bytes.LastIndexByte(b[:n], '/')
 		if nn < 0 {
-			return strSlash
+			return append(dst[:0], strSlash...)
 		}
 		b = b[:nn+1]
 	}
 
+	if filepath.Separator == '\\' {
+		// remove \.\ parts
+		b = dst
+		for {
+			n := bytes.Index(b, strBackSlashDotBackSlash)
+			if n < 0 {
+				break
+			}
+			nn := n + len(strSlashDotSlash) - 1
+			copy(b[n:], b[nn:])
+			b = b[:len(b)-nn+n]
+		}
+
+		// remove /foo/..\ parts
+		for {
+			n := bytes.Index(b, strSlashDotDotBackSlash)
+			if n < 0 {
+				break
+			}
+			nn := bytes.LastIndexByte(b[:n], '/')
+			if nn < 0 {
+				nn = 0
+			}
+			n += len(strSlashDotDotBackSlash) - 1
+			copy(b[nn:], b[n:])
+			b = b[:len(b)-n+nn]
+		}
+
+		// remove /foo\..\ parts
+		for {
+			n := bytes.Index(b, strBackSlashDotDotBackSlash)
+			if n < 0 {
+				break
+			}
+			nn := bytes.LastIndexByte(b[:n], '/')
+			if nn < 0 {
+				nn = 0
+			}
+			n += len(strBackSlashDotDotBackSlash) - 1
+			copy(b[nn:], b[n:])
+			b = b[:len(b)-n+nn]
+		}
+
+		// remove trailing \foo\..
+		n := bytes.LastIndex(b, strBackSlashDotDot)
+		if n >= 0 && n+len(strSlashDotDot) == len(b) {
+			nn := bytes.LastIndexByte(b[:n], '/')
+			if nn < 0 {
+				return append(dst[:0], strSlash...)
+			}
+			b = b[:nn+1]
+		}
+	}
+
 	return b
 }
 
@@ -424,6 +718,8 @@ func (u *URI) RequestURI() []byte {
 //    * For /foo/bar/baz.html path returns baz.html.
 //    * For /foo/bar/ returns empty byte slice.
 //    * For /foobar.js returns foobar.js.
+//
+// The returned bytes are valid until the next URI method call.
 func (u *URI) LastPathSegment() []byte {
 	path := u.Path()
 	n := bytes.LastIndexByte(path, '/')
@@ -525,6 +821,8 @@ func (u *URI) updateBytes(newURI, buf []byte) []byte {
 }
 
 // FullURI returns full uri in the form {Scheme}://{Host}{RequestURI}#{Hash}.
+//
+// The returned bytes are valid until the next URI method call.
 func (u *URI) FullURI() []byte {
 	u.fullURI = u.AppendBytes(u.fullURI[:0])
 	return u.fullURI
@@ -591,6 +889,8 @@ func splitHostURI(host, uri []byte) ([]byte, []byte, []byte) {
 }
 
 // QueryArgs returns query args.
+//
+// The returned args are valid until the next URI method call.
 func (u *URI) QueryArgs() *Args {
 	u.parseQueryArgs()
 	return &u.queryArgs
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri_unix.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri_unix.go
index 1e307332..c2ac8fa4 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri_unix.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri_unix.go
@@ -1,3 +1,4 @@
+//go:build !windows
 // +build !windows
 
 package fasthttp
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri_windows.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri_windows.go
index 95917a6b..e1391a7a 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri_windows.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/uri_windows.go
@@ -1,3 +1,4 @@
+//go:build windows
 // +build windows
 
 package fasthttp
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/userdata.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/userdata.go
index bd3e28aa..9a7c9883 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/userdata.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/userdata.go
@@ -22,6 +22,10 @@ func (d *userData) Set(key string, value interface{}) {
 		}
 	}
 
+	if value == nil {
+		return
+	}
+
 	c := cap(args)
 	if c > n {
 		args = args[:n+1]
@@ -69,3 +73,23 @@ func (d *userData) Reset() {
 	}
 	*d = (*d)[:0]
 }
+
+func (d *userData) Remove(key string) { // Remove deletes the first entry whose key equals key; no-op if absent.
+	args := *d
+	n := len(args)
+	for i := 0; i < n; i++ {
+		kv := &args[i]
+		if string(kv.key) == key {
+			n--                 // n is now the index of the last entry
+			args[i] = args[n]   // overwrite the match with the last entry; entry order is not preserved
+			args[n].value = nil // clear the value reference left behind in the vacated tail slot
+			args = args[:n]
+			*d = args // publish the shortened slice back through the receiver
+			return
+		}
+	}
+}
+
+func (d *userData) RemoveBytes(key []byte) { // RemoveBytes is Remove for a key given as a byte slice.
+	d.Remove(b2s(key)) // b2s turns key into a string; presumably without copying (confirm against its definition)
+}
diff --git a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/workerpool.go b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/workerpool.go
index 9b1987e8..f1a9a4cb 100644
--- a/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/workerpool.go
+++ b/http-benchmark/fasthttp/vendor/github.com/valyala/fasthttp/workerpool.go
@@ -1,6 +1,7 @@
 package fasthttp
 
 import (
+	"errors"
 	"net"
 	"runtime"
 	"strings"
@@ -226,7 +227,8 @@ func (wp *workerPool) workerFunc(ch *workerChan) {
 				strings.Contains(errStr, "reset by peer") ||
 				strings.Contains(errStr, "request headers: small read buffer") ||
 				strings.Contains(errStr, "unexpected EOF") ||
-				strings.Contains(errStr, "i/o timeout")) {
+				strings.Contains(errStr, "i/o timeout") ||
+				errors.Is(err, ErrBadTrailer)) {
 				wp.Logger.Printf("error when serving connection %q<->%q: %s", c.LocalAddr(), c.RemoteAddr(), err)
 			}
 		}
diff --git a/http-benchmark/fasthttp/vendor/modules.txt b/http-benchmark/fasthttp/vendor/modules.txt
index 6eb707bb..e4f17c75 100644
--- a/http-benchmark/fasthttp/vendor/modules.txt
+++ b/http-benchmark/fasthttp/vendor/modules.txt
@@ -1,12 +1,12 @@
-# github.com/andybalholm/brotli v1.0.1
+# github.com/andybalholm/brotli v1.0.4
 github.com/andybalholm/brotli
-# github.com/klauspost/compress v1.11.8
+# github.com/klauspost/compress v1.15.0
 github.com/klauspost/compress/flate
 github.com/klauspost/compress/gzip
 github.com/klauspost/compress/zlib
 # github.com/valyala/bytebufferpool v1.0.0
 github.com/valyala/bytebufferpool
-# github.com/valyala/fasthttp v1.23.0
+# github.com/valyala/fasthttp v1.34.0
 ## explicit
 github.com/valyala/fasthttp
 github.com/valyala/fasthttp/fasthttputil