diff --git a/bench/dune b/bench/dune index c854a68d..38d1eb10 100644 --- a/bench/dune +++ b/bench/dune @@ -2,7 +2,7 @@ (name main) (public_name repr-bench) (package repr-bench) - (libraries repr bechamel fpath yojson unix) + (libraries repr bechamel fpath yojson unix memtrace) (preprocess (pps ppx_repr))) diff --git a/bench/main.ml b/bench/main.ml index 1b1f586f..18fd5657 100644 --- a/bench/main.ml +++ b/bench/main.ml @@ -12,6 +12,9 @@ module Generic_op = struct } -> op + let size_of ty v = + match T.(unstage (size_of ty)) v with None -> 1024 | Some n -> n + type t = { name : string; operation : op } let bin_string : t = @@ -24,12 +27,14 @@ module Generic_op = struct let bin : t = let encode (type a) (ty : a T.t) = + let size_of = size_of ty in let f = T.unstage (T.encode_bin ty) in T.stage (fun a -> - let buffer = Buffer.create 0 in - f a (Buffer.add_string buffer); - Buffer.contents buffer + let len = size_of a in + let byt = Bytes.create len in + let off = f a byt 0 in + Bytes.to_string (if len = off then byt else Bytes.sub byt 0 off) : a -> string) in let decode (type a) (ty : a T.t) = @@ -40,12 +45,14 @@ module Generic_op = struct let pre_hash : t = let consume (type a) (ty : a T.t) = + let size_of = size_of ty in let f = T.unstage (T.pre_hash ty) in T.stage (fun a -> - let buffer = Buffer.create 0 in - f a (Buffer.add_string buffer); - Buffer.contents buffer + let len = size_of a in + let byt = Bytes.create len in + let off = f a byt 0 in + Bytes.to_string (if len = off then byt else Bytes.sub byt 0 off) : a -> string) in { name = "pre_hash"; operation = Consumer { consume } } @@ -252,6 +259,7 @@ let benchmark () = let ignore_eexist f = try f () with Unix.Unix_error (EEXIST, _, _) -> () let () = + Memtrace.trace_if_requested (); Random.self_init (); let output_formatter = match Sys.argv with diff --git a/dune-project b/dune-project index 61569028..59ce598d 100644 --- a/dune-project +++ b/dune-project @@ -49,6 +49,7 @@ guarantee. (ppx_repr (= :version)) bechamel yojson + memtrace fpath) (synopsis "Benchmarks for the `repr` package") (description "Benchmarks for the `repr` package")) diff --git a/repr-bench.opam b/repr-bench.opam index 2507c2cb..903277e7 100644 --- a/repr-bench.opam +++ b/repr-bench.opam @@ -13,6 +13,7 @@ depends: [ "ppx_repr" {= version} "bechamel" "yojson" + "memtrace" "fpath" "odoc" {with-doc} ] diff --git a/rfc.org b/rfc.org new file mode 100644 index 00000000..2240066e --- /dev/null +++ b/rfc.org @@ -0,0 +1,265 @@ +#+TITLE: Repr Encoding +#+AUTHOR: mattiasdrp +#+EMAIL: mattias@tarides.com +#+DESCRIPTION: This document documents the Repr encoding and the solutions to make it zero(or one)-copy +#+KEYWORDS: repr, ocaml + +#+begin_src ocaml :results none :exports never + #use "topfind" ;; + #require "repr";; +#+end_src + +* Current encoding + +Encoding right now has the following type: ~`a -> (string -> unit) -> unit~ + +If you provide a value ~v~ of type ~`a~ and a function ~f~ to handle a string (that is the encoding of ~v~) it will apply ~f~ to the encoding of ~v~. + +** Example + +#+begin_src ocaml :results value verbatim :exports both :eval no-export + let buf = Buffer.create 2;; + + let () = + Repr.((unstage @@ encode_bin int) 2 (Buffer.add_string buf)); + Repr.((unstage @@ encode_bin int) 3 (Buffer.add_string buf)); + Repr.((unstage @@ encode_bin int) 4 (Buffer.add_string buf)); + Format.eprintf "%S@." (Buffer.contents buf) +#+end_src + +#+RESULTS: +: "\002\003\004" + +We created a new buffer ~buf~ and provided ~encode_bin~ the function to write in this buffer the encoding that it generated. + +Now, if we want to decode from this buffer: + +#+begin_src ocaml :results value verbatim :exports both :eval no-export + let () = + let buf_str = Buffer.contents buf in + let off, a = Repr.((unstage @@ decode_bin int) buf_str 0) in + let off, b = Repr.((unstage @@ decode_bin int) buf_str off) in + let _, c = Repr.((unstage @@ decode_bin int) buf_str off) in + Format.eprintf "%d %d %d@." a b c +#+end_src + +#+RESULTS: +: 2 3 4 + +The decoder knows exactly the number of bytes it needs to read, will decode this number of bytes from the provided string and return the new offset where the remaining data should be. + +** Problem + +Now, as you can see, there's a flaw. ~encode_bin~ takes a function of type ~string -> unit~. This means that it needs to create this string to provide it to the function. Summarised, this would look like this: + +- Caller wants to encode a value ~v~ +- Caller has a buffer in which the encoding of ~v~ will be written +- Caller provides ~encode_bin~ with a function to write in this buffer +- ~encode_bin~ encodes ~v~ as a string +- ~encode_bin~ applies the function it was provided to the string +- the string is now useless and can be garbage collected + +As you can see in these implementations: + +#+begin_src ocaml :exports code :eval non-export + let int64 i = + let b = Bytes.create 8 in + Bytes.set_int64_be b 0 i; + unsafe_add_bytes b + + let float f = int64 (Int64.bits_of_float f) +#+end_src + +#+RESULTS: +: Line 4, characters 2-18: +: 4 | unsafe_add_bytes b +: ^^^^^^^^^^^^^^^^ +: Error: Unbound value unsafe_add_bytes + +These functions allocate a new byte and end with creating a partial execution waiting for the ~string -> unit~ function + +This way of doing looks like it's doing useless allocations but how can we get rid of them? + +* Solution 1 - One-copy + +** Summary + +Let Repr allocate a big buffer and write in it letting the caller know at each offset and how many bytes it wrote. The caller can then read in this buffer (that we would call ~intermediate buffer~) to write in its own output (be it a file, a buffer, a stream etc). + +** Pros + +- The caller doesn't need to provide a write function, just the value and its type +- The caller has full control on its output (where, when, how they want to write in it) + +** Cons + +- Since we don't want any extra allocation, when returning the buffer, the offset and the length to the caller we need to avoid allocating a triple for it + +This is usually bypassed with continuation passing style: + +#+begin_src ocaml :results none :exports code :eval no-export +let f i j = if i > j then (j, i) else (i, j) +#+end_src + +If we compile with ~ocamlopt -c -dcmm main.ml~ we obtain the following output: + +#+begin_src :eval no-export +(function{main.ml:1,6-44} camlMain__f_81 (i/83: val j/84: val) + (if + (!= (extcall "caml_greaterthan"{main.ml:1,15-20} i/83 j/84 int,int->val) + 1) + (alloc{main.ml:1,26-32} 2048 j/84 i/83) + (alloc{main.ml:1,38-44} 2048 i/83 j/84))) +#+end_src + +Whereas if we write the following function: + +#+begin_src ocaml :results none :exports code :eval no-export +let f k i j = if i > j then k j i else k i j +#+end_src + +We obtain the following output (notice that there are no more allocations): + +#+begin_src :eval never-export +(function{main.ml:1,6-44} camlMain__f_81 (k/83: val i/84: val j/85: val) + (if + (!= (extcall "caml_greaterthan"{main.ml:1,17-22} i/84 j/85 int,int->val) + 1) + (app{main.ml:1,28-33} "caml_apply2" j/85 i/84 k/83 val) + (app{main.ml:1,39-44} "caml_apply2" i/84 j/85 k/83 val))) +#+end_src + +The inconvenient of this way of doing is that it makes a bit harder to use but no allocation is performed (if we don't do it wrong as we'll see right now) + +*** Continuation implementation + +*Summary:* The continuation ~k~ needs to be a declared function and not a lambda-expression. Lambda-expressions will be created at each execution of ~f~ leading to a worst behaviour in allocations. + +**** Example + +#+begin_src ocaml :results none :exports code :eval no-export +let f i j = if i > j then (j, i) else (i, j) + +let () = + Memtrace.trace_if_requested (); + let r = ref 0 in + for i = 1 to 1_000_000 do + let x, y = f i (i + 1) in + r := !r + y - x + done; + Format.eprintf "%d" !r +#+end_src + +When executing with memtrace we obtain roughly 23M of allocations. If we refactor it to use naive CPS: + +#+begin_src ocaml :results none :exports code :eval no-export +let f i j k = if i > j then k j i else k i j + +let () = + Memtrace.trace_if_requested (); + let r = ref 0 in + for i = 1 to 1_000_000 do + f i (i + 1) (fun x y -> r := !r + y - x) + done; + Format.eprintf "%d" !r +#+end_src + +Memtrace will tell us that we allocated roughly 46M. A less naive solution: + +#+begin_src ocaml :results none :exports code :eval no-export +let f i j k = if i > j then k j i else k i j + +let () = + Memtrace.trace_if_requested (); + let r = ref 0 in + let add_data x y = r := !r + y - x in + for i = 1 to 1_000_000 do + f i (i + 1) add_data + done; + Format.eprintf "%d" !r +#+end_src + +Leading, this time, to 0M of allocations. + +* Solution 2 - Zero-copy + +** Summary + +Repr won't do any allocation. The caller should provide Repr a way to write exactly where it wants and Repr will write the encoding piece by piece directly in the caller owned output (once again, a file, a buffer, a stream etc) + +The solution should do something summarised like this: + +- Caller wants to encode a value ~v~ +- Caller has an output in which the encoding of ~v~ will be written +- Caller provides ~encode_bin~ with a function ~f~ to write in this buffer +- ~encode_bin~ encodes ~v~ directly in the buffer with the function ~f~ +- ~encode_bin~ returns telling the caller how many bytes it wrote allowing the caller to know where the new offset is + +** Pros + +- No allocations at all from the library. This leads to a finer control over allocations from anyone using it. + +** Cons + +- The caller needs to provide a way to write in its output + - Simple solution: assume we're appending in a buffer and just ask for the pointer to this caller-allocated buffer + - Pretty solution: create a functor with all the needed functions to write integers, characters, strings etc + + +* Observations + + +#+begin_src ocaml :results none :exports code :eval no-export + Bytes.set_uint32_be b 0 i was previously used + + let set_uint32_be = set_int32_be + + let set_int32_be b i x = + if not Sys.big_endian then set_int32_ne b i (swap32 x) + else set_int32_ne b i x + + external swap32 : int32 -> int32 = "%bswap_int32" + + static int32_t caml_swap32(int32_t x) + { + return (((x & 0x000000FF) << 24) | + ((x & 0x0000FF00) << 8) | + ((x & 0x00FF0000) >> 8) | + ((x & 0xFF000000) >> 24)); + } + + (* swap(C1C2C3C4) = C4C3C2C1 *) + + external set_int32_ne : bytes -> int -> int32 -> unit = "%caml_bytes_set32" + + CAMLprim value caml_bytes_set32(value str, value index, value newval) + { + unsigned char b1, b2, b3, b4; + intnat val; + intnat idx = Long_val(index); + if (idx < 0 || idx + 3 >= caml_string_length(str)) caml_array_bound_error(); + val = Int32_val(newval); + #ifdef ARCH_BIG_ENDIAN + b1 = 0xFF & val >> 24; + b2 = 0xFF & val >> 16; + b3 = 0xFF & val >> 8; + b4 = 0xFF & val; + #else + b4 = 0xFF & val >> 24; + b3 = 0xFF & val >> 16; + b2 = 0xFF & val >> 8; + b1 = 0xFF & val; + #endif + Byte_u(str, idx) = b1; + Byte_u(str, idx + 1) = b2; + Byte_u(str, idx + 2) = b3; + Byte_u(str, idx + 3) = b4; + return Val_unit; + } +#+end_src + +This shows that whatever the architecture, the encoding will be written according to the endianness of the system + +* Conclusion + +I'd rather implement the second solution since I'm not a huge fan of allocating from Repr. This is clearly a more complicated (implementation wise) solution but a much prettier one. diff --git a/src/repr/type.ml b/src/repr/type.ml index af81ddad..3bddb343 100644 --- a/src/repr/type.ml +++ b/src/repr/type.ml @@ -36,9 +36,13 @@ let short_hash = function let pre_hash = unstage (pre_hash t) in stage @@ fun ?seed x -> let seed = match seed with None -> 0 | Some t -> t in - let h = ref seed in - pre_hash x (fun s -> h := Hashtbl.seeded_hash !h s); - !h + let len = + match unstage (Type_size.t t) x with None -> 1024 | Some n -> n + in + let byt = Bytes.create len in + let off = pre_hash x byt 0 in + let byt = if len = off then byt else Bytes.sub byt 0 off in + Hashtbl.seeded_hash seed (Bytes.to_string byt) (* Combinators for Repr types *) diff --git a/src/repr/type_binary.ml b/src/repr/type_binary.ml index 7b503607..420587c7 100644 --- a/src/repr/type_binary.ml +++ b/src/repr/type_binary.ml @@ -19,45 +19,48 @@ open Staging open Utils module Encode = struct - let chars = - Array.init 256 (fun i -> Bytes.unsafe_to_string (Bytes.make 1 (Char.chr i))) + let unit () _byt off = off - let unit () _k = () - let unsafe_add_bytes b k = k (Bytes.unsafe_to_string b) - let add_string s k = k s - let char c k = k chars.(Char.code c) + let add_string s byt off = + let ls = String.length s in + Bytes.blit_string s 0 byt off ls; + off + ls - let int8 i k = - assert (i < 256); - k chars.(i) + let add_bytes b byt off = + let lb = Bytes.length b in + Bytes.blit b 0 byt off lb; + off + lb - let int16 i = - let b = Bytes.create 2 in - Bytes.set_uint16_be b 0 i; - unsafe_add_bytes b + let char c byt off = + Bytes.set byt off c; + off + 1 - let int32 i = - let b = Bytes.create 4 in - Bytes.set_int32_be b 0 i; - unsafe_add_bytes b + let byte n byt off = char (Char.chr n) byt off - let int64 i = - let b = Bytes.create 8 in - Bytes.set_int64_be b 0 i; - unsafe_add_bytes b + let int8 i byt off = + Bytes.set_uint8 byt off i; + off + 1 + + let int16 i byt off = + Bytes.set_uint16_be byt off i; + off + 2 + + let int32 (i : int32) byt off = + Bytes.set_int32_be byt off i; + off + 4 + + let int64 (i : int64) byt off = + Bytes.set_int64_be byt off i; + off + 8 let float f = int64 (Int64.bits_of_float f) let bool b = char (if b then '\255' else '\000') - let int i k = - let rec aux n k = - if n >= 0 && n < 128 then k chars.(n) - else - let out = 128 lor (n land 127) in - k chars.(out); - aux (n lsr 7) k - in - aux i k + let rec int n byt off = + if n >= 0 && n < 128 then byte n byt off + else + let out = 128 lor (n land 127) in + byte out byt off |> int (n lsr 7) byt let len n i = match n with @@ -72,57 +75,47 @@ module Encode = struct let unboxed_string _ = stage add_string let boxed_string n = - let len = len n in - stage @@ fun s k -> + stage @@ fun s byt off -> let i = String.length s in - len i k; - add_string s k + len n i byt off |> add_string s byt let string boxed = if boxed then boxed_string else unboxed_string - let unboxed_bytes _ = stage @@ fun b k -> add_string (Bytes.to_string b) k + let unboxed_bytes _ = stage @@ add_bytes let boxed_bytes n = let len = len n in - stage @@ fun s k -> - let i = Bytes.length s in - len i k; - unsafe_add_bytes s k + stage @@ fun b byt off -> + let lb = Bytes.length b in + len lb byt off |> add_bytes b byt let bytes boxed = if boxed then boxed_bytes else unboxed_bytes let list l n = let l = unstage l in - stage (fun x k -> - len n (List.length x) k; - List.iter (fun e -> l e k) x) + stage (fun x byt off -> + let off = len n (List.length x) byt off in + List.fold_left (fun off e -> l e byt off) off x) let array l n = let l = unstage l in - stage (fun x k -> - len n (Array.length x) k; - Array.iter (fun e -> l e k) x) + stage (fun x byt off -> + let off = len n (Array.length x) byt off in + Array.fold_left (fun off e -> l e byt off) off x) let pair a b = let a = unstage a and b = unstage b in - stage (fun (x, y) k -> - a x k; - b y k) + stage (fun (x, y) byt off -> a x byt off |> b y byt) let triple a b c = let a = unstage a and b = unstage b and c = unstage c in - stage (fun (x, y, z) k -> - a x k; - b y k; - c z k) + stage (fun (x, y, z) byt off -> a x byt off |> b y byt |> c z byt) let option o = let o = unstage o in - stage (fun v k -> + stage (fun v byt off -> match v with - | None -> char '\000' k - | Some x -> - char '\255' k; - o x k) + | None -> char '\000' byt off + | Some x -> char '\255' byt off |> o x byt) let rec t : type a. a t -> a encode_bin = function | Self s -> fst (self s) @@ -165,7 +158,7 @@ module Encode = struct and map : type a b. boxed:bool -> (a, b) map -> b encode_bin = fun ~boxed { x; g; _ } -> let encode_bin = unstage (if boxed then t x else unboxed x) in - stage (fun u k -> encode_bin (g u) k) + stage (fun y byt off -> encode_bin (g y) byt off) and prim : type a. boxed:bool -> a prim -> a encode_bin = fun ~boxed -> function @@ -181,21 +174,20 @@ module Encode = struct and record : type a. a record -> a encode_bin = fun r -> - let field_encoders : (a -> (string -> unit) -> unit) list = + let field_encoders : (a -> bytes -> int -> int) list = fields r |> List.map @@ fun (Field f) -> let field_encode = unstage (t f.ftype) in fun x -> field_encode (f.fget x) in - stage (fun x k -> List.iter (fun f -> f x k) field_encoders) + stage (fun x byt off -> + List.fold_left (fun off f -> f x byt off) off field_encoders) and variant : type a. a variant -> a encode_bin = let c0 { ctag0; _ } = stage (int ctag0) in let c1 c = let encode_arg = unstage (t c.ctype1) in - stage (fun v k -> - int c.ctag1 k; - encode_arg v k) + stage (fun v byt off -> int c.ctag1 byt off |> encode_arg v byt) in fun v -> fold_variant { c0; c1 } v end @@ -429,11 +421,10 @@ let to_bin size_of encode_bin = let size_of = unstage size_of in let encode_bin = unstage encode_bin in stage (fun x -> - let seq = encode_bin x in let len = match size_of x with None -> 1024 | Some n -> n in - let buf = Buffer.create len in - seq (Buffer.add_string buf); - Buffer.contents buf) + let byt = Bytes.create len in + let _off = encode_bin x byt 0 in + Bytes.to_string byt) let to_bin_string = let rec aux : type a. a t -> a to_bin_string = diff --git a/src/repr/type_core_intf.ml b/src/repr/type_core_intf.ml index 12d6dff9..3bebce74 100644 --- a/src/repr/type_core_intf.ml +++ b/src/repr/type_core_intf.ml @@ -8,7 +8,7 @@ module Types = struct type 'a encode_json = Jsonm.encoder -> 'a -> unit type json_decoder = { mutable lexemes : Jsonm.lexeme list; d : Jsonm.decoder } type 'a decode_json = json_decoder -> ('a, [ `Msg of string ]) result - type 'a bin_seq = 'a -> (string -> unit) -> unit + type 'a bin_seq = 'a -> bytes -> int -> int type 'a pre_hash = 'a bin_seq staged type 'a encode_bin = 'a bin_seq staged type 'a decode_bin = (string -> int -> int * 'a) staged diff --git a/src/repr/type_intf.ml b/src/repr/type_intf.ml index 8fa723ea..87430ad6 100644 --- a/src/repr/type_intf.ml +++ b/src/repr/type_intf.ml @@ -514,7 +514,7 @@ module type DSL = sig (** {2 Binary Converters} *) - type 'a encode_bin = ('a -> (string -> unit) -> unit) staged + type 'a encode_bin = ('a -> bytes -> int -> int) staged (** The type for binary encoders. *) type 'a decode_bin = (string -> int -> int * 'a) staged diff --git a/test/repr/main.ml b/test/repr/main.ml index 9e0ecefa..8359f0a7 100644 --- a/test/repr/main.ml +++ b/test/repr/main.ml @@ -20,11 +20,15 @@ let of_bin_string t = T.unstage (T.of_bin_string t) let encode_bin t = T.unstage (T.encode_bin t) let decode_bin t = T.unstage (T.decode_bin t) let size_of t = T.unstage (T.size_of t) +let sub_bytes len off byt = if len = off then byt else Bytes.sub byt 0 off +let sub_string len off byt = Bytes.to_string @@ sub_bytes len off byt -let with_buf f = - let buf = Buffer.create 10 in - f (Buffer.add_string buf); - Buffer.contents buf +let with_bytes t v size_of f = + let size_of t v = match (size_of t) v with None -> 1024 | Some n -> n in + let len = size_of t v in + let byt = Bytes.create len in + let off = f t v byt 0 in + sub_string len off byt module Unboxed = struct let decode_bin t = T.unstage (T.Unboxed.decode_bin t) @@ -58,12 +62,12 @@ let test_boxing () = Alcotest.(check string) "foo eq" s foo; Alcotest.(check bool) "foo physeq" true (foo == s); let check msg ty foo = - let msg f = Fmt.strf "%s: %s" msg f in - let buf = with_buf (encode_bin ty foo) in + let msg f = Fmt.str "%s: %s" msg f in + let buf = with_bytes ty foo size_of encode_bin in Alcotest.(check string) (msg "boxed") buf "\003foo"; - let buf = with_buf (Unboxed.encode_bin ty foo) in + let buf = with_bytes ty foo Unboxed.size_of Unboxed.encode_bin in Alcotest.(check string) (msg "unboxed") buf "foo"; - let buf = with_buf (Unboxed.encode_bin (T.boxed ty) foo) in + let buf = with_bytes (T.boxed ty) foo size_of Unboxed.encode_bin in Alcotest.(check string) (msg "force boxed") buf "\003foo" in check "string" T.string foo; @@ -240,6 +244,12 @@ let l = let tl = Alcotest.testable (T.pp l) T.(unstage (equal l)) +let to_bytes ty v f size_of = + let len = match (size_of ty) v with None -> 1024 | Some n -> n in + let byt = Bytes.create len in + let off = f ty v byt 0 in + sub_bytes len off byt + let test_bin () = let s = T.to_string l [ "foo"; "foo" ] in Alcotest.(check string) "hex list" "[\"666f6f\",\"666f6f\"]" s; @@ -250,12 +260,12 @@ let test_bin () = (size_of l [ "foo"; "bar" ]); let s = of_bin_string l "foobar" in Alcotest.(check (ok tl)) "decode list" (Ok [ "foo"; "bar" ]) s; - let buf = Buffer.create 10 in - encode_bin T.string "foo" (Buffer.add_string buf); - Alcotest.(check string) "foo 1" (Buffer.contents buf) "\003foo"; - let buf = Buffer.create 10 in - Unboxed.encode_bin T.string "foo" (Buffer.add_string buf); - Alcotest.(check string) "foo 1" (Buffer.contents buf) "foo"; + let s = Bytes.to_string (to_bytes T.string "foo" encode_bin size_of) in + Alcotest.(check string) "foo 1" s "\003foo"; + let s = + Bytes.to_string (to_bytes T.string "foo" Unboxed.encode_bin Unboxed.size_of) + in + Alcotest.(check string) "foo 1" s "foo"; let _, foo = Unboxed.decode_bin T.string "foo" 0 in Alcotest.(check string) "decode foo 0" foo "foo"; let _, foo = Unboxed.decode_bin T.string "123foo" 3 in @@ -276,9 +286,7 @@ let test_bin () = varints; List.iter (fun (k, v) -> - let buf = Buffer.create 10 in - encode_bin T.int k (Buffer.add_string buf); - let v' = Buffer.contents buf in + let v' = Bytes.to_string @@ to_bytes T.int k encode_bin size_of in Alcotest.(check string) (Fmt.str "decoding %S" v) v v') varints