mirage · mattiasdrp · Apr 23, 2021 · Apr 30, 2021 · Apr 23, 2021 · May 3, 2021
diff --git a/bench/dune b/bench/dune
@@ -2,7 +2,7 @@
  (name main)
  (public_name repr-bench)
  (package repr-bench)
- (libraries repr bechamel fpath yojson unix)
+ (libraries repr bechamel fpath yojson unix memtrace)
  (preprocess
   (pps ppx_repr)))
 

diff --git a/bench/main.ml b/bench/main.ml
@@ -12,6 +12,9 @@ module Generic_op = struct
       }
         -> op
 
+  let size_of ty v =
+    match T.(unstage (size_of ty)) v with None -> 1024 | Some n -> n
+
   type t = { name : string; operation : op }
 
   let bin_string : t =
@@ -24,12 +27,14 @@ module Generic_op = struct
 
   let bin : t =
     let encode (type a) (ty : a T.t) =
+      let size_of = size_of ty in
       let f = T.unstage (T.encode_bin ty) in
       T.stage
         (fun a ->
-           let buffer = Buffer.create 0 in
-           f a (Buffer.add_string buffer);
-           Buffer.contents buffer
+           let len = size_of a in
+           let byt = Bytes.create len in
+           let off = f a byt 0 in
+           Bytes.to_string (if len = off then byt else Bytes.sub byt 0 off)
           : a -> string)
     in
     let decode (type a) (ty : a T.t) =
@@ -40,12 +45,14 @@ module Generic_op = struct
 
   let pre_hash : t =
     let consume (type a) (ty : a T.t) =
+      let size_of = size_of ty in
       let f = T.unstage (T.pre_hash ty) in
       T.stage
         (fun a ->
-           let buffer = Buffer.create 0 in
-           f a (Buffer.add_string buffer);
-           Buffer.contents buffer
+           let len = size_of a in
+           let byt = Bytes.create len in
+           let off = f a byt 0 in
+           Bytes.to_string (if len = off then byt else Bytes.sub byt 0 off)
           : a -> string)
     in
     { name = "pre_hash"; operation = Consumer { consume } }
@@ -252,6 +259,7 @@ let benchmark () =
 let ignore_eexist f = try f () with Unix.Unix_error (EEXIST, _, _) -> ()
 
 let () =
+  Memtrace.trace_if_requested ();
   Random.self_init ();
   let output_formatter =
     match Sys.argv with

diff --git a/dune-project b/dune-project
@@ -49,6 +49,7 @@ guarantee.
   (ppx_repr (= :version))
   bechamel
   yojson
+  memtrace
   fpath)
  (synopsis "Benchmarks for the `repr` package")
  (description "Benchmarks for the `repr` package"))

diff --git a/repr-bench.opam b/repr-bench.opam
@@ -13,6 +13,7 @@ depends: [
   "ppx_repr" {= version}
   "bechamel"
   "yojson"
+  "memtrace"
   "fpath"
   "odoc" {with-doc}
 ]

diff --git a/rfc.org b/rfc.org
@@ -0,0 +1,265 @@
+#+TITLE:     Repr Encoding
+#+AUTHOR:    mattiasdrp
+#+EMAIL:     [email protected]
+#+DESCRIPTION: This document documents the Repr encoding and the solutions to make it zero(or one)-copy
+#+KEYWORDS:  repr, ocaml
+
+#+begin_src ocaml :results none :exports never
+  #use "topfind" ;;
+  #require "repr";;
+#+end_src
+
+* Current encoding
+
+Encoding right now has the following type: ~`a -> (string -> unit) -> unit~
+
+If you provide a value ~v~ of type ~`a~ and a function ~f~ to handle a string (that is the encoding of ~v~) it will apply ~f~ to the encoding of ~v~.
+
+** Example
+
+#+begin_src ocaml :results value verbatim :exports both :eval no-export
+  let buf = Buffer.create 2;;
+
+  let () =
+       Repr.((unstage @@ encode_bin int) 2 (Buffer.add_string buf));
+       Repr.((unstage @@ encode_bin int) 3 (Buffer.add_string buf));
+       Repr.((unstage @@ encode_bin int) 4 (Buffer.add_string buf));
+       Format.eprintf "%S@." (Buffer.contents buf)
+#+end_src
+
+#+RESULTS:
+: "\002\003\004"
+
+We created a new buffer ~buf~ and provided ~encode_bin~ the function to write in this buffer the encoding that it generated.
+
+Now, if we want to decode from this buffer:
+
+#+begin_src ocaml :results value verbatim :exports both :eval no-export
+  let () =
+    let buf_str = Buffer.contents buf in
+    let off, a = Repr.((unstage @@ decode_bin int) buf_str 0) in
+    let off, b = Repr.((unstage @@ decode_bin int) buf_str off) in
+    let _, c = Repr.((unstage @@ decode_bin int) buf_str off) in
+    Format.eprintf "%d %d %d@." a b c
+#+end_src
+
+#+RESULTS:
+: 2 3 4
+
+The decoder knows exactly the number of bytes it needs to read, will decode this number of bytes from the provided string and return the new offset where the remaining data should be.
+
+** Problem
+
+Now, as you can see, there's a flaw. ~encode_bin~ takes a function of type ~string -> unit~. This means that it needs to create this string to provide it to the function. Summarised, this would look like this:
+
+- Caller wants to encode a value ~v~
+- Caller has a buffer in which the encoding of ~v~ will be written
+- Caller provides ~encode_bin~ with a function to write in this buffer
+- ~encode_bin~ encodes ~v~ as a string
+- ~encode_bin~ applies the function it was provided to the string
+- the string is now useless and can be garbage collected
+
+As you can see in these implementations:
+
+#+begin_src ocaml :exports code :eval non-export
+  let int64 i =
+    let b = Bytes.create 8 in
+    Bytes.set_int64_be b 0 i;
+    unsafe_add_bytes b
+
+  let float f = int64 (Int64.bits_of_float f)
+#+end_src
+
+#+RESULTS:
+: Line 4, characters 2-18:
+: 4 |   unsafe_add_bytes b
+:       ^^^^^^^^^^^^^^^^
+: Error: Unbound value unsafe_add_bytes
+
+These functions allocate a new byte and end with creating a partial execution waiting for the ~string -> unit~ function
+
+This way of doing looks like it's doing useless allocations but how can we get rid of them?
+
+* Solution 1 - One-copy
+
+** Summary
+
+Let Repr allocate a big buffer and write in it letting the caller know at each offset and how many bytes it wrote. The caller can then read in this buffer (that we would call ~intermediate buffer~) to write in its own output (be it a file, a buffer, a stream etc).
+
+** Pros
+
+- The caller doesn't need to provide a write function, just the value and its type
+- The caller has full control on its output (where, when, how they want to write in it)
+
+** Cons
+
+- Since we don't want any extra allocation, when returning the buffer, the offset and the length to the caller we need to avoid allocating a triple for it
+
+This is usually bypassed with continuation passing style:
+
+#+begin_src ocaml :results none :exports code :eval no-export
+let f i j = if i > j then (j, i) else (i, j)
+#+end_src
+
+If we compile with ~ocamlopt -c -dcmm main.ml~ we obtain the following output:
+
+#+begin_src :eval no-export
+(function{main.ml:1,6-44} camlMain__f_81 (i/83: val j/84: val)
+ (if
+   (!= (extcall "caml_greaterthan"{main.ml:1,15-20} i/83 j/84 int,int->val)
+     1)
+   (alloc{main.ml:1,26-32} 2048 j/84 i/83)
+   (alloc{main.ml:1,38-44} 2048 i/83 j/84)))
+#+end_src
+
+Whereas if we write the following function:
+
+#+begin_src ocaml :results none :exports code :eval no-export
+let f k i j = if i > j then k j i else k i j
+#+end_src
+
+We obtain the following output (notice that there are no more allocations):
+
+#+begin_src :eval never-export
+(function{main.ml:1,6-44} camlMain__f_81 (k/83: val i/84: val j/85: val)
+ (if
+   (!= (extcall "caml_greaterthan"{main.ml:1,17-22} i/84 j/85 int,int->val)
+     1)
+   (app{main.ml:1,28-33} "caml_apply2" j/85 i/84 k/83 val)
+   (app{main.ml:1,39-44} "caml_apply2" i/84 j/85 k/83 val)))
+#+end_src
+
+The inconvenient of this way of doing is that it makes a bit harder to use but no allocation is performed (if we don't do it wrong as we'll see right now)
+
+*** Continuation implementation
+
+*Summary:* The continuation ~k~ needs to be a declared function and not a lambda-expression. Lambda-expressions will be created at each execution of ~f~ leading to a worst behaviour in allocations.
+
+**** Example
+
+#+begin_src ocaml :results none :exports code :eval no-export
+let f i j = if i > j then (j, i) else (i, j)
+
+let () =
+  Memtrace.trace_if_requested ();
+  let r = ref 0 in
+  for i = 1 to 1_000_000 do
+    let x, y = f i (i + 1) in
+    r := !r + y - x
+  done;
+  Format.eprintf "%d" !r
+#+end_src
+
+When executing with memtrace we obtain roughly 23M of allocations. If we refactor it to use naive CPS:
+
+#+begin_src ocaml :results none :exports code :eval no-export
+let f i j k = if i > j then k j i else k i j
+
+let () =
+  Memtrace.trace_if_requested ();
+  let r = ref 0 in
+  for i = 1 to 1_000_000 do
+    f i (i + 1) (fun x y -> r := !r + y - x)
+  done;
+  Format.eprintf "%d" !r
+#+end_src
+
+Memtrace will tell us that we allocated roughly 46M. A less naive solution:
+
+#+begin_src ocaml :results none :exports code :eval no-export
+let f i j k = if i > j then k j i else k i j
+
+let () =
+  Memtrace.trace_if_requested ();
+  let r = ref 0 in
+  let add_data x y = r := !r + y - x in
+  for i = 1 to 1_000_000 do
+    f i (i + 1) add_data
+  done;
+  Format.eprintf "%d" !r
+#+end_src
+
+Leading, this time, to 0M of allocations.
+
+* Solution 2 - Zero-copy
+
+** Summary
+
+Repr won't do any allocation. The caller should provide Repr a way to write exactly where it wants and Repr will write the encoding piece by piece directly in the caller owned output (once again, a file, a buffer, a stream etc)
+
+The solution should do something summarised like this:
+
+- Caller wants to encode a value ~v~
+- Caller has an output in which the encoding of ~v~ will be written
+- Caller provides ~encode_bin~ with a function ~f~ to write in this buffer
+- ~encode_bin~ encodes ~v~ directly in the buffer with the function ~f~
+- ~encode_bin~ returns telling the caller how many bytes it wrote allowing the caller to know where the new offset is
+
+** Pros
+
+- No allocations at all from the library. This leads to a finer control over allocations from anyone using it.
+
+** Cons
+
+- The caller needs to provide a way to write in its output
+  - Simple solution: assume we're appending in a buffer and just ask for the pointer to this caller-allocated buffer
+  - Pretty solution: create a functor with all the needed functions to write integers, characters, strings etc
+
+
+* Observations
+
+
+#+begin_src ocaml :results none :exports code :eval no-export
+    Bytes.set_uint32_be b 0 i was previously used
+
+    let set_uint32_be = set_int32_be
+
+    let set_int32_be b i x =
+      if not Sys.big_endian then set_int32_ne b i (swap32 x)
+      else set_int32_ne b i x
+
+    external swap32 : int32 -> int32 = "%bswap_int32"
+
+    static int32_t caml_swap32(int32_t x)
+    {
+      return (((x & 0x000000FF) << 24) |
+              ((x & 0x0000FF00) << 8) |
+              ((x & 0x00FF0000) >> 8) |
+              ((x & 0xFF000000) >> 24));
+    }
+
+    (* swap(C1C2C3C4) = C4C3C2C1 *)
+
+    external set_int32_ne : bytes -> int -> int32 -> unit = "%caml_bytes_set32"
+
+    CAMLprim value caml_bytes_set32(value str, value index, value newval)
+    {
+      unsigned char b1, b2, b3, b4;
+      intnat val;
+      intnat idx = Long_val(index);
+      if (idx < 0 || idx + 3 >= caml_string_length(str)) caml_array_bound_error();
+      val = Int32_val(newval);
+    #ifdef ARCH_BIG_ENDIAN
+      b1 = 0xFF & val >> 24;
+      b2 = 0xFF & val >> 16;
+      b3 = 0xFF & val >> 8;
+      b4 = 0xFF & val;
+    #else
+      b4 = 0xFF & val >> 24;
+      b3 = 0xFF & val >> 16;
+      b2 = 0xFF & val >> 8;
+      b1 = 0xFF & val;
+    #endif
+      Byte_u(str, idx) = b1;
+      Byte_u(str, idx + 1) = b2;
+      Byte_u(str, idx + 2) = b3;
+      Byte_u(str, idx + 3) = b4;
+      return Val_unit;
+    }
+#+end_src
+
+This shows that whatever the architecture, the encoding will be written according to the endianness of the system
+
+* Conclusion
+
+I'd rather implement the second solution since I'm not a huge fan of allocating from Repr. This is clearly a more complicated (implementation wise) solution but a much prettier one.
diff --git a/src/repr/type.ml b/src/repr/type.ml
@@ -36,9 +36,13 @@ let short_hash = function
       let pre_hash = unstage (pre_hash t) in
       stage @@ fun ?seed x ->
       let seed = match seed with None -> 0 | Some t -> t in
-      let h = ref seed in
-      pre_hash x (fun s -> h := Hashtbl.seeded_hash !h s);
-      !h
+      let len =
+        match unstage (Type_size.t t) x with None -> 1024 | Some n -> n
+      in
+      let byt = Bytes.create len in
+      let off = pre_hash x byt 0 in
+      let byt = if len = off then byt else Bytes.sub byt 0 off in
+      Hashtbl.seeded_hash seed (Bytes.to_string byt)
 
 (* Combinators for Repr types *)