Skip to content

Commit 440bba9

Browse files
committed
+ lz4, - lzo and zlib, mark 0.1.3
1 parent 241fa15 commit 440bba9

File tree

6 files changed

+131
-59
lines changed

6 files changed

+131
-59
lines changed

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,16 @@ Available methods can be found via `available-hash-functions`, `available-compre
2727

2828
```clj
2929
byte-transforms> (available-hash-functions)
30-
(:sha384 :md2 :crc32 :sha512 :sha1 :murmur32 :murmur128 :adler32 :sha256 :md5 :murmur64)
30+
(:sha384 :md2 :crc32 :crc64 :sha512 :sha1 :murmur32 :murmur128 :adler32 :sha256 :md5 :murmur64)
3131
byte-transforms> (available-compressors)
32-
(:lzo :bzip2 :snappy :gzip :zlib)
32+
(:lz4 :bzip2 :snappy :gzip)
3333
byte-transforms> (available-encoders)
3434
(:base64)
3535
```
3636

37-
When choosing a compression algorithm, `snappy` is typically the fastest, and `bzip2` yields the highest compression. Full stats on all methods can be found by cloning the project and running `lein test :benchmark`.
37+
When choosing a compression algorithm, `snappy` is typically the fastest, `bzip2` yields the highest compression, and `lz4` provides a good balance between compression ratio and decompression speed. All the compression algorithms except `lz4` are concatenable: multiple compressed segments can be concatenated and decompressed as a single stream.
38+
39+
Full stats on all methods can be found by cloning the project and running `lein test :benchmark`.
3840

3941
## License
4042

project.clj

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
1-
(defproject byte-transforms "0.1.3-SNAPSHOT"
1+
(defproject byte-transforms "0.1.3"
22
:description "Methods for hashing, compressing, and encoding bytes."
33
:license {:name "Apache License 2.0"
44
:url "http://www.apache.org/licenses/LICENSE-2.0.html"}
5-
:dependencies [[byte-streams "0.1.11-SNAPSHOT"]
5+
:dependencies [[byte-streams "0.1.11"]
66
[org.xerial.snappy/snappy-java "1.1.0.1"]
77
[commons-codec/commons-codec "1.9"]
8-
[org.anarres.lzo/lzo-core "1.0.0"]
8+
[net.jpountz.lz4/lz4 "1.2.0"]
99
[org.apache.commons/commons-compress "1.8"
1010
:exclusions [org.tukaani/xz]]]
11-
:profiles {:dev {:dependencies [[org.clojure/clojure "1.5.1"]
11+
:profiles {:dev {:dependencies [[org.clojure/clojure "1.6.0"]
1212
[criterium "0.4.3"]
1313
[reiddraper/simple-check "0.5.6"]
1414
[codox-md "0.2.0" :exclusions [org.clojure/clojure]]]}}

src/byte_transforms.clj

Lines changed: 42 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,26 @@
55
[primitive-math :as p])
66
(:import
77
[byte_transforms
8-
CassandraMurmurHash]
8+
CassandraMurmurHash
9+
CRC64]
910
[java.util
1011
UUID]
1112
[java.lang.reflect
1213
Array]
1314
[java.util.zip
1415
CRC32
15-
Adler32
16-
DeflaterInputStream
17-
InflaterInputStream]
16+
Adler32]
1817
[java.io
1918
OutputStream
2019
InputStream
2120
PipedInputStream
2221
PipedOutputStream
2322
ByteArrayOutputStream]
23+
[net.jpountz.lz4
24+
LZ4BlockOutputStream
25+
LZ4BlockInputStream
26+
LZ4Factory
27+
LZ4Compressor]
2428
[java.security
2529
MessageDigest]
2630
[java.nio
@@ -40,12 +44,7 @@
4044
[org.xerial.snappy
4145
Snappy
4246
SnappyInputStream
43-
SnappyOutputStream]
44-
[org.anarres.lzo
45-
LzoInputStream
46-
LzoOutputStream
47-
LzoDecompressor1x
48-
LzoLibrary]))
47+
SnappyOutputStream]))
4948

5049
;;;
5150

@@ -142,7 +141,17 @@
142141
(def-hash crc32
143142
[x options]
144143
(let [crc (CRC32.)]
145-
(when-let [seed (:seed options)]
144+
(when-let [seed (get options :seed)]
145+
(.update crc (byte seed)))
146+
(doseq [^bytes ary (bytes/to-byte-arrays x options)]
147+
(.update crc ary))
148+
(.getValue crc)))
149+
150+
;; CRC64 hash
151+
(def-hash crc64
152+
[x options]
153+
(let [crc (CRC64.)]
154+
(when-let [seed (get options :seed)]
146155
(.update crc (byte seed)))
147156
(doseq [^bytes ary (bytes/to-byte-arrays x options)]
148157
(.update crc ary))
@@ -152,14 +161,14 @@
152161
(def-hash adler32
153162
[x options]
154163
(let [adler (Adler32.)]
155-
(when-let [seed (:seed options)]
164+
(when-let [seed (get options :seed)]
156165
(.update adler (byte seed)))
157166
(doseq [^bytes ary (bytes/to-byte-arrays x options)]
158167
(.update adler ary))
159168
(.getValue adler)))
160169

161170
(defn- hash-digest [^MessageDigest digest bufs options]
162-
(when-let [seed (:seed options)]
171+
(when-let [seed (get options :seed)]
163172
(.update digest (byte seed)))
164173
(doseq [^ByteBuffer buf bufs]
165174
(.update digest buf))
@@ -222,7 +231,8 @@
222231

223232
(let [murmur32 (get @hash-functions :murmur32)
224233
murmur64 (get @hash-functions :murmur64)
225-
murmur128 (get @hash-functions :murmur128)]
234+
murmur128 (get @hash-functions :murmur128)
235+
crc64 (get @hash-functions :crc64)]
226236
(defn hash
227237
"Takes a byte stream, and returns a value representing its hash, which will be an integer if
228238
the hash is 32 or 64-bit, or a byte array otherwise. By default, this will use the murmur64
@@ -236,6 +246,7 @@
236246
:murmur32 (murmur32 bytes options)
237247
:murmur64 (murmur64 bytes options)
238248
:murmur128 (murmur128 bytes options)
249+
:crc64 (crc64 bytes options)
239250
(if-let [f (@hash-functions (keyword function))]
240251
(f bytes options)
241252
(throw
@@ -273,14 +284,6 @@
273284
(IllegalArgumentException.
274285
(str "Don't recognize decompressor '" (name algorithm) "'"))))))
275286

276-
(def-compressor zlib
277-
[x options]
278-
(DeflaterInputStream. (bytes/to-input-stream x options)))
279-
280-
(def-decompressor zlib
281-
[x options]
282-
(InflaterInputStream. (bytes/to-input-stream x options)))
283-
284287
(defn- in->wrapped-out->in
285288
[^InputStream stream output-wrapper options]
286289
(let [chunk-size (get options :chunk-size 65536)
@@ -374,19 +377,27 @@
374377
[x options]
375378
(BZip2CompressorInputStream. (bytes/to-input-stream x options) true))
376379

377-
(def-decompressor lzo
378-
[x options]
379-
(LzoInputStream. (bytes/to-input-stream x options) (LzoDecompressor1x.)))
380-
381-
(def-compressor lzo
382-
[x options]
380+
;; LZ4 compressor built on lz4-java's block stream format.
;;
;; Options (all optional):
;;   :safe?      - when true, use `LZ4Factory/safeInstance`; otherwise use
;;                 `LZ4Factory/fastestInstance`.
;;   :fastest?   - when true, use the factory's fast compressor; otherwise use
;;                 its high-compression compressor.
;;   :chunk-size - block size handed to `LZ4BlockOutputStream`, defaults to
;;                 100000.  It is coerced with `int` because the constructor
;;                 takes a primitive int; the previous default was the double
;;                 literal 1e5, which only worked through reflective coercion.
;;                 NOTE(review): lz4-java rejects block sizes outside its
;;                 supported range -- confirm the limits in its Javadoc.
;;
;; The full `options` map is still forwarded to `bytes->wrapped-out->bytes`.
(def-compressor lz4
  [x {:keys [safe? fastest? chunk-size]
      :or {safe? false, fastest? false, chunk-size 100000}
      :as options}]
  (bytes->wrapped-out->bytes
    x
    #(LZ4BlockOutputStream. ^OutputStream %
       (int chunk-size)
       (let [^LZ4Factory factory (if safe?
                                   (LZ4Factory/safeInstance)
                                   (LZ4Factory/fastestInstance))]
         ;; both branches return an LZ4Compressor; hinting the result lets the
         ;; three-argument constructor resolve without reflection
         ^LZ4Compressor (if fastest?
                          (.fastCompressor factory)
                          (.highCompressor factory))))
    options))
389395

396+
;; LZ4 decompressor: turns `x` into an InputStream and wraps it in lz4-java's
;; block-format reader, which is returned as the decompressed stream.
(def-decompressor lz4
  [x options]
  (-> x
      (bytes/to-input-stream options)
      (LZ4BlockInputStream.)))
399+
400+
390401
;;;
391402

392403
(defn available-encoders

src/byte_transforms/CRC64.java

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* CRC64
3+
*
4+
* Author: Lasse Collin <[email protected]>
5+
*
6+
* This file has been put into the public domain.
7+
* You can do whatever you want with this file.
8+
*/
9+
10+
// this file has been slightly modified, but is otherwise
11+
// as originally written by Lasse Collin
12+
package byte_transforms;
13+
14+
public class CRC64 {
15+
16+
private static final long poly = 0xC96C5795D7870F42L;
17+
private static final long crcTable[] = new long[256];
18+
19+
private long crc = -1;
20+
21+
static {
22+
for (int b = 0; b < crcTable.length; ++b) {
23+
long r = b;
24+
for (int i = 0; i < 8; ++i) {
25+
if ((r & 1) == 1)
26+
r = (r >>> 1) ^ poly;
27+
else
28+
r >>>= 1;
29+
}
30+
31+
crcTable[b] = r;
32+
}
33+
}
34+
35+
public CRC64() {
36+
}
37+
38+
public void update(byte b) {
39+
crc = crcTable[(b ^ (int)crc) & 0xFF] ^ (crc >>> 8);
40+
}
41+
42+
public void update(byte[] buf) {
43+
update(buf, 0, buf.length);
44+
}
45+
46+
public void update(byte[] buf, int off, int len) {
47+
int end = off + len;
48+
49+
while (off < end)
50+
crc = crcTable[(buf[off++] ^ (int)crc) & 0xFF] ^ (crc >>> 8);
51+
}
52+
53+
public long getValue() {
54+
return ~crc;
55+
}
56+
}

test/byte_transforms_simple_check.clj

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@
1010

1111
(def compression-type (gen/elements (bt/available-compressors)))
1212

13-
(def concat-compression-type (gen/elements [:gzip :bzip2 :lzo :snappy]))
13+
(def concat-compression-type (gen/elements [:gzip :bzip2 :snappy]))
1414

1515
(def not-empty-byte-array (gen/such-that not-empty gen/bytes))
1616

1717
(defn roundtrip-equiv
1818
[b comp-type]
1919
(java.util.Arrays/equals
20-
b
20+
^bytes b
2121
(-> b
2222
(bt/compress comp-type)
2323
(bt/decompress comp-type)
@@ -26,7 +26,7 @@
2626
(defn concat-roundtrip-equiv
2727
[b chunk-size comp-type]
2828
(java.util.Arrays/equals
29-
b
29+
^bytes b
3030
(->> (bs/to-byte-buffers b {:chunk-size chunk-size})
3131
(map #(bt/compress % comp-type))
3232
(#(bt/decompress % comp-type))

test/byte_transforms_test.clj

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -54,21 +54,20 @@
5454
(defn benchmark-compression-fn [algorithm ^bytes data]
5555
(let [len (alength data)
5656
now #(System/currentTimeMillis)
57-
start (now)
58-
iterations (loop [cnt 0]
59-
(if (< (- (now) start) 2000)
60-
(do
61-
(-> data
62-
(bt/compress algorithm)
63-
(bt/decompress algorithm)
64-
bs/to-byte-array)
65-
(recur (inc cnt)))
66-
cnt))
67-
end (now)]
68-
(float
69-
(/
70-
(* len iterations 1000 (Math/pow 2 -20))
71-
(- end start)))))
57+
measure (fn [f]
58+
(let [start (now)]
59+
(loop [cnt 0]
60+
(if (< (- (now) start) 2000)
61+
(do (f) (recur (inc cnt)))
62+
(let [end (now)]
63+
(float
64+
(/
65+
(* len cnt 1000 (Math/pow 2 -20))
66+
(- end start))))))))
67+
compressed-data (-> data (bt/compress algorithm) bs/to-byte-array)]
68+
[(measure #(-> data (bt/compress algorithm) (bt/decompress algorithm) bs/to-byte-array))
69+
(measure #(-> data (bt/compress algorithm) bs/to-byte-array))
70+
(measure #(-> compressed-data (bt/decompress algorithm) bs/to-byte-array))]))
7271

7372
(defn measure-compression-fn [algorithm ^bytes data]
7473
(float
@@ -77,12 +76,16 @@
7776
(-> data (bt/compress algorithm) bs/to-byte-array alength))))
7877

7978
(deftest ^:benchmark benchmark-compression-algorithms
80-
(println "\ncompression roundtrip throughput:")
79+
(println "\ncompression throughput:")
8180
(doseq [c (sort (bt/available-compressors))]
8281
;; warmup
8382
(bt/compress warmup-data c)
84-
(let [throughput (benchmark-compression-fn c world-facts)]
85-
(println (format "%12s: %.2f MB/s" c throughput))))
83+
(let [[roundtrip compress decompress] (benchmark-compression-fn c world-facts)]
84+
(println (format "%12s: roundtrip %.2f MB/s, compress %.2f MB/s, decompress %.2f MB/s"
85+
c
86+
roundtrip
87+
compress
88+
decompress))))
8689
(println "\ncompression factor:")
8790
(doseq [c (sort (bt/available-compressors))]
8891
(println (format "%12s: %.2fx" c (measure-compression-fn c world-facts)))))

0 commit comments

Comments
 (0)