diff --git a/src/functors.ml b/src/functors.ml index d950e30..ef41862 100644 --- a/src/functors.ml +++ b/src/functors.ml @@ -1306,3 +1306,127 @@ module MakeHashconsedMap(Key: KEY)(Value: HASHED_VALUE)() = struct let compare = Node.compare let to_int = Node.to_int end + +module MakeBucketedHeterogeneous + (Map: BASE_MAP_INTERFACE) + (Buckets: sig + val nb_buckets : int + val bucket_id : 'a Map.key -> int + end) = struct + include Map + type 'map t = 'map Map.t array + + let get = Array.unsafe_get + let set = Array.unsafe_set + + let empty = Obj.magic (Array.make Buckets.nb_buckets Map.empty) + (* This Obj.magic is required to avoid the '_weak type. + It is correct as long as we never mutate empty without copying it first. *) + let empty : 'a t = empty + + let is_empty arr = Array.for_all Map.is_empty arr + let cardinal arr = Array.fold_left (fun a b -> a + Map.cardinal b) 0 arr + + let singleton key value = + let a = Array.copy empty in + set a (Buckets.bucket_id key) (Map.singleton key value); + a + let is_singleton map = + let exception NotSingleton in + try + Array.fold_left (fun prev map -> match prev, Map.is_singleton map with + | None, x -> x + | Some _, None -> prev + | Some _, Some _ -> raise NotSingleton + ) None map + with NotSingleton -> None + + let find key arr = Map.find key (get arr (Buckets.bucket_id key)) + let find_opt key arr = Map.find_opt key (get arr (Buckets.bucket_id key)) + let mem key arr = Map.mem key (get arr (Buckets.bucket_id key)) + + let update key fvalue arr = + let bucket = Buckets.bucket_id key in + let old_map = get arr bucket in + let new_map = Map.update key fvalue old_map in + if old_map == new_map + then arr + else + let arr = Array.copy arr in + set arr bucket new_map; + arr + let add key value arr = update key (fun _ -> Some value) arr + let insert key f arr = update key (fun x -> Some (f x)) arr + let remove key map = update key (fun _ -> None) map + + let iter f arr = Array.iter (Map.iter f) arr + let map f arr = Array.map (Map.map f) arr + let mapi f arr = Array.map (Map.mapi f) arr + let map_no_share f arr = Array.map (Map.map_no_share f) arr + let mapi_no_share f arr = Array.map (Map.mapi_no_share f) arr + let filter f arr = Array.map (Map.filter f) arr + let filter_map f arr = Array.map (Map.filter_map f) arr + let filter_map_no_share f arr = Array.map (Map.filter_map_no_share f) arr + let fold f arr elt = Array.fold_left (fun elt map -> Map.fold f map elt) elt arr + let for_all f arr = Array.for_all (Map.for_all f) arr + + let array_fold_left_2 f a1 a2 acc = + let rec aux i acc = + if i = Buckets.nb_buckets then acc + else aux (i+1) (f (get a1 i) (get a2 i) acc) + in aux 0 acc + + let fold_on_nonequal_inter f arr1 arr2 elt = array_fold_left_2 (Map.fold_on_nonequal_inter f) arr1 arr2 elt + let fold_on_nonequal_union f arr1 arr2 elt = array_fold_left_2 (Map.fold_on_nonequal_union f) arr1 arr2 elt + + let pretty ?(pp_sep=Format.pp_print_cut) pp fmt m = + (* Compact version of array, with empty elements removed + this avoids extra calls to pp_sep *) + let non_empty = Array.fold_right (fun elt lst -> + if Map.is_empty elt then lst else elt::lst) m [] + in + Format.pp_print_list ~pp_sep (Map.pretty ~pp_sep pp) fmt non_empty + + let reflexive_same_domain_for_all2 f a b = Array.for_all2 (Map.reflexive_same_domain_for_all2 f) a b + let reflexive_subset_domain_for_all2 f a b = Array.for_all2 (Map.reflexive_subset_domain_for_all2 f) a b + let nonreflexive_same_domain_for_all2 f a b = Array.for_all2 (Map.nonreflexive_same_domain_for_all2 f) a b + let disjoint a b = Array.for_all2 Map.disjoint a b + + let idempotent_union f a b = Array.map2 (Map.idempotent_union f) a b + let idempotent_inter f a b = Array.map2 (Map.idempotent_inter f) a b + let nonidempotent_inter_no_share f a b = Array.map2 (Map.nonidempotent_inter_no_share f) a b + let idempotent_inter_filter f a b = Array.map2 (Map.idempotent_inter_filter f) a b + let slow_merge f a b = Array.map2 (Map.slow_merge f) a b + let difference f a b = Array.map2 (Map.difference f) a b + let symmetric_difference f a b = Array.map2 (Map.symmetric_difference f) a b + let split k m = + let s = Array.map (Map.split k) m in ( + Array.map (fun (l, _, _) -> l) s, + Array.find_map (fun (_, e, _) -> e) s, + Array.map (fun (_, _, r) -> r) s + ) + let to_seq m = Seq.flat_map Map.to_seq (Array.to_seq m) + + let array_rev_seq a = + let rec aux i () = + if i >= 0 + then Seq.Cons (get a i, aux (i-1)) + else Seq.Nil + in aux (Buckets.nb_buckets - 1) + let to_rev_seq m = Seq.flat_map Map.to_rev_seq (array_rev_seq m) + let rec add_seq: type a. a key_value_pair Seq.t -> a t -> a t = + fun s m -> match s () with + | Seq.Nil -> m + | Seq.Cons(KeyValue(key,value), s) -> add_seq s (add key value m) + let of_seq s = add_seq s empty + let of_list l = of_seq (List.to_seq l) + let to_list m = List.of_seq (to_seq m) + + let get_bucket arr i = get arr i + let set_bucket arr i m = + if m == get arr i then arr + else + let arr = Array.copy arr in + set arr i m; + arr +end diff --git a/src/functors.mli b/src/functors.mli index 6ca8cac..f7d8ab9 100644 --- a/src/functors.mli +++ b/src/functors.mli @@ -202,3 +202,63 @@ module MakeHashconsedHeterogeneousMap(Key: HETEROGENEOUS_KEY)(Value: HETEROGENEO include HASH_CONSED_OPERATIONS with type 'a t := 'a t (** @inline *) end + +(** {1:bucketed Bucketed maps and sets} *) +(** Bucketed maps replace one large map with + a fixed number [nb_buckets] of smaller maps, using a + [bucket_id : key -> 0 .. (nb_buckets - 1)] function which + assigns to each key to a unique bucket. + + This can offer performance improvements because the map operations are + performed on smaller maps. Especially if one bucket is small and frequently + accessed, and the others are much larger. + + Bucketed maps are created on top of a regular maps, so they can be used with + any of the previous map or sets. Their interface is a bit simpler as they + aren't true Patricia trees anymore: + - They lack the {!NODE.val-view} functions and the constructors ({!NODE.branch}, {!NODE.leaf}...) + As such, they can't be used for cross map/set operations in the + {{!HETEROGENEOUS_MAP.WithForeign}[WithForeign]} functors. + - Iteration is no longer guaranteed to be in increasing order of {!KEY.to_int}, + so the {{!BASE_MAP.unsigned_min_binding}[unsigned_min_binding]} and + {{!BASE_MAP.pop_unsigned_minimum}[pop_unsigned_minimum]} functions have been + removed. The other iteration functions are still here, but keep in mind + they no longer iterate in the order of {!KEY.to_int}. Instead, they + usually iterate on bucket order first, and then using {!KEY.to_int} + within a bucket. *) + +(** Create a bucketed map on top of the given map/set + + @since 0.11.0 *) +module MakeBucketedHeterogeneous + (Map: BASE_MAP_INTERFACE) + (Buckets: sig + val nb_buckets : int + (** The total number of buckets, + should be strictly positive and relatively small + (usually between 4 and 20). *) + + val bucket_id : 'a Map.key -> int + (** The bucket number of the given key, + {b must be between [0] and [nb_buckets-1] inclusive}. + Not respecting this constraint will lead to runtime errors, as we use it as + an index for [Array.unsafe_get] and [Array.unsafe_set]. *) + end) : sig + include BASE_MAP_INTERFACE + with type 'key key = 'key Map.key + and type ('key, 'map) value = ('key, 'map) Map.value + + val get_bucket : 'a t -> int -> 'a Map.t + (** [get_bucket a i] returns the [i]-th bucket, that is the submap of [a] containing + only the keys [k] which satisfy [bucket_id k = i]. This is equivalent to + {{!filter}[filter {f=fun k _ -> Buckets.bucket_id k = i}) m]}, but constant time instead + of [O(n)]. + + {b [i] must be between [0] and [nb_buckets-1] inclusive.} *) + + val set_bucket : 'a t -> int -> 'a Map.t -> 'a t + (** [set_bucket a i m] changes the [i]-th bucket to [m]. + {b [m] should only contain keys [k] which satisfy [bucket_id k = i]. } + + {b [i] must be between [0] and [nb_buckets-1] inclusive.} *) +end diff --git a/src/index.mld b/src/index.mld index d5ebfa4..a7b9d1a 100644 --- a/src/index.mld +++ b/src/index.mld @@ -85,11 +85,16 @@ See the {{!examples}examples} to jump right into using this library. be extended to store size information in nodes if needed.} {li Exposes a common interface ({!type:PatriciaTree.NODE.view}) to allow users to write their own pattern matching on the tree structure without depending on the {{!PatriciaTree.NODE}[NODE]} being used.} -{li Additionally, hashconsed versions of heterogeneous/homogeneous maps/sets are +{li Additionally, {{!PatriciaTree.section-hash_consed}hashconsed versions of heterogeneous/homogeneous maps/sets} are available. These provide constant time equality and comparison, and ensure maps/set with the same constants are always physically equal. It comes at the cost of a constant overhead in memory usage (at worst, as hash-consing may allow - memory gains) and constant time overhead when calling constructors.}} + memory gains) and constant time overhead when calling constructors.} +{li Finally, we allow {{!PatriciaTree.section-bucketed}bucketing maps and sets}. + i.e. splitting a large map into [n] smaller ones using a [bucket_id : key -> 0 .. (n-1)] + function. This allows some speed-up, especially if you know you will be + repeatedly accessing a small bucket. +}} {1 Quick overview} @@ -137,6 +142,10 @@ The functors used to build maps and sets are the following: create a new hash-table to store the created nodes. Calling a functor twice with same arguments will lead to two numbering systems for identifiers, and thus the types should not be considered compatible. +} +{li Bucketed maps and sets are built on top of any of the previous maps + using the {{!PatriciaTree.MakeBucketedHeterogeneous}MakeBucketedHeterogeneous} + functor }} {2 Interfaces} diff --git a/src/signatures.ml b/src/signatures.ml index 0761fb1..d11cb5f 100644 --- a/src/signatures.ml +++ b/src/signatures.ml @@ -23,6 +23,27 @@ open Ints +(** Standard names for the types in our maps + + @since 0.11.0 + @canonical PatriciaTree.MAP_TYPES *) +module type MAP_TYPES = sig + type 'key key + (** The type of keys. *) + + type ('key, 'map) value + (** The type of value, which depends on the type of the key and the type of the map. *) + + type 'map t + (** The type of the map, which is parameterized by a type. *) + + val empty : 'map t + (** The empty map *) + + val is_empty: 'map t -> bool + (** Check if the map is empty. Should be constant time. *) +end + (** {1 Nodes} *) (** Nodes are the underlying representation used to build a patricia-tree. The module type specifies the constructors they must provide, and a common @@ -40,20 +61,10 @@ module type NODE = sig (** {2 Types} *) - type 'key key - (** The type of keys. *) - - type ('key, 'map) value - (** The type of value, which depends on the type of the key and the type of the map. *) - - type 'map t - (** The type of the map, which is parameterized by a type. *) + include MAP_TYPES (** @inline *) (** {2 Constructors: build values} *) - val empty : 'map t - (** The empty map *) - val leaf : 'key key -> ('key, 'map) value -> 'map t (** A singleton leaf, similar to {!BASE_MAP.singleton} *) @@ -95,9 +106,6 @@ module type NODE = sig | Leaf : { key : 'key key; value : ('key, 'map) value; } -> 'map view (** A key -> value mapping. *) - val is_empty: 'map t -> bool - (** Check if the map is empty. Should be constant time. *) - val view: 'a t -> 'a view (** Convert the map to a view. Should be constant time. *) end @@ -165,21 +173,15 @@ end (** {2 Base map} *) -(** Base map signature: a generic ['b map] storing bindings +(** Base map interface: a generic ['b map] storing bindings of ['a key] to [('a,'b) values]. - All maps and set are a variation of this type, - sometimes with a simplified interface. - - {!HETEROGENEOUS_MAP} is just a {!BASE_MAP} with a functor {!HETEROGENEOUS_MAP.WithForeign} - for building operations that operate on two maps of different base types; - - {!MAP} specializes the interface for non-generic keys ([key] instead of ['a key]); - - {!HETEROGENEOUS_SET} specializes {!BASE_MAP} for sets ([('a,'b) value = unit]) and - removes the value argument from most operations; - - {!SET} specializes {!HETEROGENEOUS_SET} further by making elements (keys) - non-generic ([elt] instead of ['a elt]). + This is a stripped-down version of {!BASE_MAP}, which excludes functions + that rely too much on the patricia tree shape. - @canonical PatriciaTree.BASE_MAP *) -module type BASE_MAP = sig - include NODE (** @closed *) + @since 0.11.0 + @canonical PatriciaTree.BASE_MAP_INTERFACE *) +module type BASE_MAP_INTERFACE = sig + include MAP_TYPES (** @closed *) (** Existential wrapper for the ['a] parameter in a ['a key], [('a,'map) value] pair *) type 'map key_value_pair = @@ -187,16 +189,6 @@ module type BASE_MAP = sig (** {1 Basic functions} *) - val unsigned_min_binding : 'a t -> 'a key_value_pair - (** [unsigned_min_binding m] is minimal binding [KeyValue(k,v)] of the map, - using the {{!unsigned_lt}unsigned order} on {!KEY.to_int}. - @raises Not_found if the map is empty *) - - val unsigned_max_binding : 'a t -> 'a key_value_pair - (** [unsigned_max_binding m] is maximal binding [KeyValue(k,v)] of the map, - using the {{!unsigned_lt}unsigned order} on {!KEY.to_int}. - @raises Not_found if the map is empty *) - val singleton : 'a key -> ('a, 'b) value -> 'b t (** Create a map with a single binding. *) @@ -221,18 +213,6 @@ module type BASE_MAP = sig (** Returns a map with the element removed, O(log(n)) complexity. Returns a physically equal map if the element is absent. *) - val pop_unsigned_minimum: 'map t -> ('map key_value_pair * 'map t) option - (** [pop_unsigned_minimum m] returns [None] if [is_empty m], or [Some(key,value,m')] where - [(key,value) = unsigned_min_binding m] and [m' = remove m key]. - Uses the {{!unsigned_lt}unsigned order} on {!KEY.to_int}. - O(log(n)) complexity. *) - - val pop_unsigned_maximum: 'map t -> ('map key_value_pair * 'map t) option - (** [pop_unsigned_maximum m] returns [None] if [is_empty m], or [Some(key,value,m')] where - [(key,value) = unsigned_max_binding m] and [m' = remove m key]. - Uses the {{!unsigned_lt}unsigned order} on {!KEY.to_int}. - O(log(n)) complexity. *) - val insert: 'a key -> (('a,'map) value option -> ('a,'map) value) -> 'map t -> 'map t (** [insert key f map] modifies or insert an element of the map; [f] takes [None] if the value was not previously bound, and [Some old] @@ -550,6 +530,53 @@ module type BASE_MAP = sig (** [to_list m] returns the bindings of [m] as a list, in increasing {{!unsigned_lt}unsigned order} of {!KEY.to_int} *) end +(** Base map signature: a generic ['b map] storing bindings + of ['a key] to [('a,'b) values]. + All maps and set are a variation of this type, + sometimes with a simplified interface. + - {!HETEROGENEOUS_MAP} is just a {!BASE_MAP} with a functor {!HETEROGENEOUS_MAP.WithForeign} + for building operations that operate on two maps of different base types; + - {!MAP} specializes the interface for non-generic keys ([key] instead of ['a key]); + - {!HETEROGENEOUS_SET} specializes {!BASE_MAP} for sets ([('a,'b) value = unit]) and + removes the value argument from most operations; + - {!SET} specializes {!HETEROGENEOUS_SET} further by making elements (keys) + non-generic ([elt] instead of ['a elt]). + + @canonical PatriciaTree.BASE_MAP *) +module type BASE_MAP = sig + include NODE (** @closed *) + + include BASE_MAP_INTERFACE + with type 'map t := 'map t + and type 'key key := 'key key + and type ('key, 'map) value := ('key, 'map) value + (** @open *) + + (** {1 Min and Max} *) + + val unsigned_min_binding : 'a t -> 'a key_value_pair + (** [unsigned_min_binding m] is minimal binding [KeyValue(k,v)] of the map, + using the {{!unsigned_lt}unsigned order} on {!KEY.to_int}. + @raises Not_found if the map is empty *) + + val unsigned_max_binding : 'a t -> 'a key_value_pair + (** [unsigned_max_binding m] is maximal binding [KeyValue(k,v)] of the map, + using the {{!unsigned_lt}unsigned order} on {!KEY.to_int}. + @raises Not_found if the map is empty *) + + val pop_unsigned_minimum: 'map t -> ('map key_value_pair * 'map t) option + (** [pop_unsigned_minimum m] returns [None] if [is_empty m], or [Some(key,value,m')] where + [(key,value) = unsigned_min_binding m] and [m' = remove m key]. + Uses the {{!unsigned_lt}unsigned order} on {!KEY.to_int}. + O(log(n)) complexity. *) + + val pop_unsigned_maximum: 'map t -> ('map key_value_pair * 'map t) option + (** [pop_unsigned_maximum m] returns [None] if [is_empty m], or [Some(key,value,m')] where + [(key,value) = unsigned_max_binding m] and [m' = remove m key]. + Uses the {{!unsigned_lt}unsigned order} on {!KEY.to_int}. + O(log(n)) complexity. *) +end + (** {2 Heterogeneous maps and sets} *) (** Maps and sets with generic keys ['a key] and values [('a,'b) value] *)