From 2f70391f229522ab3a21256439a6d50c172cd604 Mon Sep 17 00:00:00 2001
From: Raimo Niskanen <raimo@erlang.org>
Date: Fri, 10 Oct 2025 17:19:59 +0200
Subject: [PATCH 1/3] Shuffle experiments

Write a few shuffle algorithms for comparison.

I have found no formal statement that it is bias free,
but have tried to reason around it.  The algorithm should be
equivalent to generating more random decimals to decide
the shuffle order for elements with the same random number.
It should make no difference if the random decimals are generated
always and ignored, or when needed.

Speed: 1.2 s for 2^20 integers on my laptop.

The classical textbook shuffle.

Speed: 5 s for 2^20 integers on my laptop.

Quite a beautiful algorithm since the `gb_tree` has all
the functionality in itself.

Speed: 5 s for 2^20 integers on my laptop.

The same as the `gb_tree` above, but with a map.  Uses
the map key order instead of the general term order,
which works just fine.

Speed: 2 s for 2^20 integers on my laptop.

Suggested by Richard A. O'Keefe on ErlangForums as
"a random variant of Quicksort".  Shall we name it Quickshuffle?

Really fast. Uses random numbers efficiently by looking at
individual bits for the random split.  Has no overhead
for tagging.  Just creates intermediate lists as garbage.

This generator appears to be equivalent with shuffle1,
using a random number generator with 1 bit.

Speed: 0.8 s for 2^20 integers on my laptop.

The classical textbook shuffle.

Our standard `array` module here outperforms map, probably
because keys does not have to be stored, they are implicit.

Speed: 2 s for 2^20 integers on my laptop.

shuffle3 and shuffle4 have the theoretical limitation that
when the length of the list approaches the generator size,
it will take catastrophically much longer time to generate
a random number that has not been used.

There is no check for the list length being larger than
the generator size in which case it will be impossible
to generate unique random numbers for all list elements,
and the algorithm will simply keep on failing forever.
This is for now a theoretical problem since already for
a list length with log half the generator size
(e.g 2^28 with a generator size 2^56), my laptop
runs out of memory with a VM of about 30 GB.

shuffle1 and shuffle5 avoids that limitation.  shuffle1 by recursing
over the duplicates sublists so it is not affected much
by fairly long lists of duplicates, shuffle5 by using only
individual bits and ranges 2, 6, or 24.

The classical Fisher-Yates algorithm in shuffle2 and shuffle6
does not have this limitation, but generating random numbers
of unlimited length gets increasingly expensive, but should not be
any problem for 2 or even 4 times the generator length, that is
list lengths of well over 2^200, which is well over ridiculous.
---
 lib/stdlib/src/rand.erl | 398 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 397 insertions(+), 1 deletion(-)

diff --git a/lib/stdlib/src/rand.erl b/lib/stdlib/src/rand.erl
index 415807ae27b8..dd642dae42ae 100644
--- a/lib/stdlib/src/rand.erl
+++ b/lib/stdlib/src/rand.erl
@@ -404,7 +404,13 @@ the generator's range:
          uniform_real/0, uniform_real_s/1,
          bytes/1, bytes_s/2,
          jump/0, jump/1,
-         normal/0, normal/2, normal_s/1, normal_s/3
+         normal/0, normal/2, normal_s/1, normal_s/3,
+         shuffle1/1, shuffle1_s/2,
+         shuffle2/1, shuffle2_s/2,
+         shuffle3/1, shuffle3_s/2,
+         shuffle4/1, shuffle4_s/2,
+         shuffle5/1, shuffle5_s/2,
+         shuffle6/1, shuffle6_s/2
 	]).
 
 %% Utilities
@@ -1315,6 +1321,396 @@ normal_s(Mean, Variance, State0) when 0 =< Variance ->
     {X, State} = normal_s(State0),
     {Mean + (math:sqrt(Variance) * X), State}.
 
+%% -------
+
+-spec shuffle1(list()) -> list().
+shuffle1(List) ->
+    {ShuffledList, State} = shuffle1_s(List, seed_get()),
+    _ = seed_put(State),
+    ShuffledList.
+
+-spec shuffle1_s(list(), state()) -> {list(), state()}.
+shuffle1_s(List, {#{bits:=_, next:=Next} = AlgHandler, R0} = State)
+  when is_list(List) ->
+    case List of
+        [] ->
+            {List, State};
+        [_] ->
+            {List, State};
+        _ ->
+            WeakLowBits = maps:get(weak_low_bits, AlgHandler, 0),
+            {ShuffledList, R1} = shuffle1_r(List, Next, R0, WeakLowBits, []),
+            {ShuffledList, {AlgHandler, R1}}
+    end;
+shuffle1_s(List, {#{max:=_, next:=Next} = AlgHandler, R0} = State)
+  when is_list(List) ->
+    case List of
+        [] ->
+            {List, State};
+        [_] ->
+            {List, State};
+        _ ->
+            %% Old spec - assume 2 weak low bits
+            WeakLowBits = 2,
+            {ShuffledList, R1} = shuffle1_r(List, Next, R0, WeakLowBits, []),
+            {ShuffledList, {AlgHandler, R1}}
+    end.
+
+%% See the Wikipedia article "Fisher-Yates shuffle", section "Sorting".
+%%
+%% To avoid bias due to duplicate random numbers, the resulting
+%% sorted list is checked for sequences of duplicate keys,
+%% which are recursively shuffled.  This algorithm also
+%% produces a bias free shuffle.
+
+%% Recursion entry point
+shuffle1_r([X, Y], Next, R0, _WeakLowBits, Acc) ->
+    %% Optimization for 2 elements; the most common case for duplicates
+    {V, R1} = Next(R0),
+    if
+        %% Bit 7 should not be weak in any of the generators
+        V band 128 =:= 0 -> {[Y, X | Acc], R1};
+        true             -> {[X, Y | Acc], R1}
+    end;
+shuffle1_r(L, Next, R0, WeakLowBits, Acc) ->
+    shuffle1_tag(L, Next, R0, WeakLowBits, Acc, []).
+
+%% Tag elements with random integers
+shuffle1_tag([], Next, R, WeakLowBits, Acc, TL) ->
+    %% Shuffle1; sort by random tag
+    shuffle1_untag(lists:keysort(1, TL), Next, R, WeakLowBits, Acc);
+shuffle1_tag([X | L], Next, R0, WeakLowBits, Acc, TL) ->
+    {V, R1} = Next(R0),
+    T = V bsr WeakLowBits,
+    shuffle1_tag(L, Next, R1, WeakLowBits, Acc, [{T,X} | TL]).
+
+%% Strip the tag integers
+shuffle1_untag([{T,X}, {T,Y} | TL], Next, R, WeakLowBits, Acc) ->
+    %% Random number duplicate
+    shuffle1_untag(TL, Next, R, WeakLowBits, Acc, [Y, X], T);
+shuffle1_untag([{_,X} | TL], Next, R, WeakLowBits, Acc) ->
+    shuffle1_untag(TL, Next, R, WeakLowBits, [X | Acc]);
+shuffle1_untag([], _Next, R, _WeakLowBits, Acc) ->
+    {Acc, R}.
+%%
+%% Collect duplicates
+shuffle1_untag([{T,X} | TL], Next, R, WeakLowBits, Acc, Dups, T) ->
+    shuffle1_untag(TL, Next, R, WeakLowBits, Acc, [X | Dups], T);
+shuffle1_untag(TL, Next, R0, WeakLowBits, Acc0, Dups, _T) ->
+    %% Shuffle1 the duplicates onto the result
+    {Acc1, R1} = shuffle1_r(Dups, Next, R0, WeakLowBits, Acc0),
+    shuffle1_untag(TL, Next, R1, WeakLowBits, Acc1).
+
+%% -------
+
+-spec shuffle2(list()) -> list().
+shuffle2(List) ->
+    {ShuffledList, State} = shuffle2_s(List, seed_get()),
+    _ = seed_put(State),
+    ShuffledList.
+
+-spec shuffle2_s(list(), state()) -> {list(), state()}.
+shuffle2_s(List, State)
+  when is_list(List) ->
+    case List of
+        [] ->
+            {List, State};
+        [_] ->
+            {List, State};
+        _ ->
+            M = maps:from_list(lists:enumerate(List)),
+            N = maps:size(M),
+            shuffle2_s(M, State, N, [])
+    end.
+
+%% Classical Fisher-Yates shuffle, a.k.a Knuth shuffle.
+%% See the Wikipedia article "Fisher-Yates shuffle".
+%%
+%% This variant uses a map with integer keys as array
+%% and is optimized in that it minimizes map updates
+%% since the high index is never used again, so an overwrite
+%% can be used instead of an exchange.
+
+shuffle2_s(M0, State0, N, Acc)
+  when is_map(M0), is_integer(N) ->
+    if
+        N =:= 0 -> {Acc, State0};
+        true ->
+            X = maps:get(N, M0),
+            case uniform_s(N, State0) of
+                {N, State1} ->
+                    shuffle2_s(M0, State1, N - 1, [X | Acc]);
+                {K, State1} when is_integer(K) ->
+                    Y = maps:get(K, M0),
+                    M1 = maps:update(K, X, M0),
+                    shuffle2_s(M1, State1, N - 1, [Y | Acc])
+            end
+    end.
+
+%% -------
+
+-spec shuffle3(list()) -> list().
+shuffle3(List) ->
+    {ShuffledList, State} = shuffle3_s(List, seed_get()),
+    _ = seed_put(State),
+    ShuffledList.
+
+-spec shuffle3_s(list(), state()) -> {list(), state()}.
+shuffle3_s(List, {#{bits:=_, next:=Next} = AlgHandler, R0} = State)
+  when is_list(List) ->
+    case List of
+        [] ->
+            {List, State};
+        [_] ->
+            {List, State};
+        _ ->
+            WeakLowBits = maps:get(weak_low_bits, AlgHandler, 0),
+            T = gb_trees:empty(),
+            {ShuffledList, R1} = shuffle3_r(List, Next, R0, WeakLowBits, T),
+            {ShuffledList, {AlgHandler, R1}}
+    end;
+shuffle3_s(List, {#{max:=Mask, next:=Next} = AlgHandler, R0} = State)
+  when is_list(List), ?MASK(58) =< Mask ->
+    case List of
+        [] ->
+            {List, State};
+        [_] ->
+            {List, State};
+        _ ->
+            %% Old spec - assume 2 weak low bits
+            WeakLowBits = 2,
+            T = gb_trees:empty(),
+            {ShuffledList, R1} = shuffle3_r(List, Next, R0, WeakLowBits, T),
+            {ShuffledList, {AlgHandler, R1}}
+    end.
+
+%% See the Wikipedia article "Fisher-Yates shuffle", section "Sorting".
+%%
+%% To avoid bias due to duplicate random numbers, a gb_tree
+%% is used to check if a random number has already been used,
+%% and if so generate a new random number.
+%%
+%% Because a gb_tree is sorted no sorting needs to be done,
+%% it is enough to extract the values of the gb_tree that are
+%% ordered in key sort order.
+
+shuffle3_r([], _Next, R, _WeakLowBits, T) ->
+    {gb_trees:values(T), R};
+shuffle3_r([X | L] , Next, R0, WeakLowBits, T) ->
+    {V, R1} = Next(R0),
+    K = V bsr WeakLowBits,
+    case gb_trees:is_defined(K, T) of
+        false ->
+            shuffle3_r(L, Next, R1, WeakLowBits, gb_trees:insert(K, X, T));
+        true ->
+            shuffle3_r([X | L], Next, R1, WeakLowBits, T)
+    end.
+
+%% -------
+
+-spec shuffle4(list()) -> list().
+shuffle4(List) ->
+    {ShuffledList, State} = shuffle4_s(List, seed_get()),
+    _ = seed_put(State),
+    ShuffledList.
+
+-spec shuffle4_s(list(), state()) -> {list(), state()}.
+shuffle4_s(List, {#{bits:=_, next:=Next} = AlgHandler, R0} = State)
+  when is_list(List) ->
+    case List of
+        [] ->
+            {List, State};
+        [_] ->
+            {List, State};
+        _ ->
+            WeakLowBits = maps:get(weak_low_bits, AlgHandler, 0),
+            {ShuffledList, R1} = shuffle4_r(List, Next, R0, WeakLowBits, #{}),
+            {ShuffledList, {AlgHandler, R1}}
+    end;
+shuffle4_s(List, {#{max:=Mask, next:=Next} = AlgHandler, R0} = State)
+  when is_list(List), ?MASK(58) =< Mask ->
+    case List of
+        [] ->
+            {List, State};
+        [_] ->
+            {List, State};
+        _ ->
+            %% Old spec - assume 2 weak low bits
+            WeakLowBits = 2,
+            {ShuffledList, R1} = shuffle4_r(List, Next, R0, WeakLowBits, #{}),
+            {ShuffledList, {AlgHandler, R1}}
+    end.
+
+%% See the Wikipedia article "Fisher-Yates shuffle", section "Sorting".
+%%
+%% To avoid bias due to duplicate random numbers, a map
+%% is used to check if a random number has already been used,
+%% and if so generate a new random number.
+%%
+%% Actual sorting doesn't is not needed.  A map is ordered by key
+%% and therefore it is enough to extract the values of the map.
+%% The internal map key order will do just fine.
+
+shuffle4_r([], _Next, R, _WeakLowBits, M) ->
+    {maps:values(M), R};
+shuffle4_r([X | L] , Next, R0, WeakLowBits, M) ->
+    {V, R1} = Next(R0),
+    K = V bsr WeakLowBits,
+    case maps:is_key(K, M) of
+        true ->
+            shuffle4_r([X | L], Next, R1, WeakLowBits, M);
+        false ->
+            shuffle4_r(L, Next, R1, WeakLowBits, maps:put(K, X, M))
+    end.
+
+%% -------
+
+-spec shuffle5(list()) -> list().
+shuffle5(List) ->
+    {ShuffledList, State} = shuffle5_s(List, seed_get()),
+    _ = seed_put(State),
+    ShuffledList.
+
+-spec shuffle5_s(list(), state()) -> {list(), state()}.
+shuffle5_s(List, State) when is_list(List) ->
+    shuffle5_r(List, State, []).
+
+%% Random-split-and-shuffle algorithm suggested by Richard A. O'Keefe
+%% on ErlangForums, as I interpreted it...
+%%
+%% Randomly split the list in two lists,
+%% recursively shuffle the two smaller lists,
+%% randomize the order between the lists according to their size.
+%%
+%% This is equivalent to assigning a random number to each
+%% element and sorting, but extending the numbers on demand
+%% while there still are duplicates.
+
+%% Leaf cases - random permutations for 0..4 elements
+shuffle5_r([], State, Acc) ->
+    {Acc, State};
+shuffle5_r([X], State, Acc) ->
+    {[X | Acc], State};
+shuffle5_r([X, Y], State0, Acc) ->
+    {V, State1} = uniform_s(2, State0),
+    {case V of
+         1 -> [Y, X | Acc];
+         2 -> [X, Y | Acc]
+     end, State1};
+shuffle5_r([X, Y, Z], State0, Acc) ->
+    {V, State1} = uniform_s(6, State0),
+    {case V of
+         1 -> [Z, Y, X | Acc];
+         2 -> [Y, Z, X | Acc];
+         3 -> [Z, X, Y | Acc];
+         4 -> [X, Z, Y | Acc];
+         5 -> [Y, X, Z | Acc];
+         6 -> [X, Y, Z | Acc]
+     end, State1};
+shuffle5_r([X, Y, Z, Q], State0, Acc) ->
+    {V, State1} = uniform_s(24, State0),
+    {case V of
+         1  -> [Q, Z, Y, X | Acc];
+         2  -> [Z, Q, Y, X | Acc];
+         3  -> [Q, Y, Z, X | Acc];
+         4  -> [Y, Q, Z, X | Acc];
+         5  -> [Z, Y, Q, X | Acc];
+         6  -> [Y, Z, Q, X | Acc];
+         7  -> [Q, Z, X, Y | Acc];
+         8  -> [Z, Q, X, Y | Acc];
+         9  -> [Q, X, Z, Y | Acc];
+         10 -> [X, Q, Z, Y | Acc];
+         11 -> [Z, X, Q, Y | Acc];
+         12 -> [X, Z, Q, Y | Acc];
+         13 -> [Q, Y, X, Z | Acc];
+         14 -> [Y, Q, X, Z | Acc];
+         15 -> [Q, X, Y, Z | Acc];
+         16 -> [X, Q, Y, Z | Acc];
+         17 -> [Y, X, Q, Z | Acc];
+         18 -> [X, Y, Q, Z | Acc];
+         19 -> [Z, Y, X, Q | Acc];
+         20 -> [Y, Z, X, Q | Acc];
+         21 -> [Z, X, Y, Q | Acc];
+         22 -> [X, Z, Y, Q | Acc];
+         23 -> [Y, X, Z, Q | Acc];
+         24 -> [X, Y, Z, Q | Acc]
+     end, State1};
+%%
+%% General case - split and recursive shuffle
+shuffle5_r([_, _, _, _ | _] = List, State0, Acc0) ->
+    {Left, Right, State1} = shuffle5_split(List, State0),
+    {Acc1, State2} = shuffle5_r(Left, State1, Acc0),
+    shuffle5_r(Right, State2, Acc1).
+
+%% Split L into two random subsets: Left and Right
+%%
+shuffle5_split(L, State) ->
+    shuffle5_split(L, State, 1, [], []).
+%%
+shuffle5_split([], State, _P, Left, Right) ->
+    {Left, Right, State};
+shuffle5_split([_ | _] = L, State0, 1, Left, Right) ->
+    M = 1 bsl 56,
+    case rand:uniform_s(M, State0) of
+        {V, State1} when is_integer(V), 1 =< V, V =< M ->
+            %% Setting the top bit M here provides the marker
+            %% for when we are out of random bits: P =:= 1
+            shuffle5_split(L, State1, (V - 1) + M, Left, Right)
+    end;
+shuffle5_split([X | L], State, P, Left, Right)
+  when is_integer(P), 1 =< P, P < 1 bsl 57 ->
+    case P band 1 of
+        0 ->
+            shuffle5_split(L, State, P bsr 1, [X | Left], Right);
+        1 ->
+            shuffle5_split(L, State, P bsr 1, Left, [X | Right])
+    end.
+
+%% -------
+
+-spec shuffle6(list()) -> list().
+shuffle6(List) ->
+    {ShuffledList, State} = shuffle6_s(List, seed_get()),
+    _ = seed_put(State),
+    ShuffledList.
+
+-spec shuffle6_s(list(), state()) -> {list(), state()}.
+shuffle6_s(List, State)
+  when is_list(List) ->
+    case List of
+        [] ->
+            {List, State};
+        [_] ->
+            {List, State};
+        _ ->
+            A = array:from_list([[] | List]), % Make it 1 based
+            N = array:size(A),
+            shuffle6_s(A, State, N, [])
+    end.
+
+%% Classical Fisher-Yates shuffle, a.k.a Knuth shuffle.
+%% See the Wikipedia article "Fisher-Yates shuffle".
+%%
+%% Use the 'array' module and insert a dummy element first
+%% to make it effectively 1-based.
+%%
+%% This is the fastest Fisher-Yates among the shuffle algorithms here.
+
+shuffle6_s(A0, State0, N, Acc) when is_integer(N), 0 =< N ->
+    if
+        N =:= 0 -> {Acc, State0};
+        true ->
+            X = array:get(N, A0),
+            case uniform_s(N, State0) of
+                {N, State1} ->
+                    shuffle6_s(A0, State1, N - 1, [X | Acc]);
+                {K, State1} when is_integer(K) ->
+                    Y = array:get(K, A0),
+                    A1 = array:set(K, X, A0),
+                    shuffle6_s(A1, State1, N - 1, [Y | Acc])
+            end
+    end.
+
 %% =====================================================================
 %% Internal functions
 

From a536947f45b1aa1425a55e6063b5b747070ef3d2 Mon Sep 17 00:00:00 2001
From: Raimo Niskanen <raimo@erlang.org>
Date: Thu, 16 Oct 2025 15:54:18 +0200
Subject: [PATCH 2/3] Select the algorithm for `rand:shuffle_s/2`

---
 lib/stdlib/src/rand.erl | 341 +++-------------------------------------
 1 file changed, 24 insertions(+), 317 deletions(-)

diff --git a/lib/stdlib/src/rand.erl b/lib/stdlib/src/rand.erl
index dd642dae42ae..cab7f677c61c 100644
--- a/lib/stdlib/src/rand.erl
+++ b/lib/stdlib/src/rand.erl
@@ -405,12 +405,7 @@ the generator's range:
          bytes/1, bytes_s/2,
          jump/0, jump/1,
          normal/0, normal/2, normal_s/1, normal_s/3,
-         shuffle1/1, shuffle1_s/2,
-         shuffle2/1, shuffle2_s/2,
-         shuffle3/1, shuffle3_s/2,
-         shuffle4/1, shuffle4_s/2,
-         shuffle5/1, shuffle5_s/2,
-         shuffle6/1, shuffle6_s/2
+         shuffle/1, shuffle_s/2
 	]).
 
 %% Utilities
@@ -1321,259 +1316,16 @@ normal_s(Mean, Variance, State0) when 0 =< Variance ->
     {X, State} = normal_s(State0),
     {Mean + (math:sqrt(Variance) * X), State}.
 
-%% -------
 
--spec shuffle1(list()) -> list().
-shuffle1(List) ->
-    {ShuffledList, State} = shuffle1_s(List, seed_get()),
+-spec shuffle(list()) -> list().
+shuffle(List) ->
+    {ShuffledList, State} = shuffle_s(List, seed_get()),
     _ = seed_put(State),
     ShuffledList.
 
--spec shuffle1_s(list(), state()) -> {list(), state()}.
-shuffle1_s(List, {#{bits:=_, next:=Next} = AlgHandler, R0} = State)
-  when is_list(List) ->
-    case List of
-        [] ->
-            {List, State};
-        [_] ->
-            {List, State};
-        _ ->
-            WeakLowBits = maps:get(weak_low_bits, AlgHandler, 0),
-            {ShuffledList, R1} = shuffle1_r(List, Next, R0, WeakLowBits, []),
-            {ShuffledList, {AlgHandler, R1}}
-    end;
-shuffle1_s(List, {#{max:=_, next:=Next} = AlgHandler, R0} = State)
-  when is_list(List) ->
-    case List of
-        [] ->
-            {List, State};
-        [_] ->
-            {List, State};
-        _ ->
-            %% Old spec - assume 2 weak low bits
-            WeakLowBits = 2,
-            {ShuffledList, R1} = shuffle1_r(List, Next, R0, WeakLowBits, []),
-            {ShuffledList, {AlgHandler, R1}}
-    end.
-
-%% See the Wikipedia article "Fisher-Yates shuffle", section "Sorting".
-%%
-%% To avoid bias due to duplicate random numbers, the resulting
-%% sorted list is checked for sequences of duplicate keys,
-%% which are recursively shuffled.  This algorithm also
-%% produces a bias free shuffle.
-
-%% Recursion entry point
-shuffle1_r([X, Y], Next, R0, _WeakLowBits, Acc) ->
-    %% Optimization for 2 elements; the most common case for duplicates
-    {V, R1} = Next(R0),
-    if
-        %% Bit 7 should not be weak in any of the generators
-        V band 128 =:= 0 -> {[Y, X | Acc], R1};
-        true             -> {[X, Y | Acc], R1}
-    end;
-shuffle1_r(L, Next, R0, WeakLowBits, Acc) ->
-    shuffle1_tag(L, Next, R0, WeakLowBits, Acc, []).
-
-%% Tag elements with random integers
-shuffle1_tag([], Next, R, WeakLowBits, Acc, TL) ->
-    %% Shuffle1; sort by random tag
-    shuffle1_untag(lists:keysort(1, TL), Next, R, WeakLowBits, Acc);
-shuffle1_tag([X | L], Next, R0, WeakLowBits, Acc, TL) ->
-    {V, R1} = Next(R0),
-    T = V bsr WeakLowBits,
-    shuffle1_tag(L, Next, R1, WeakLowBits, Acc, [{T,X} | TL]).
-
-%% Strip the tag integers
-shuffle1_untag([{T,X}, {T,Y} | TL], Next, R, WeakLowBits, Acc) ->
-    %% Random number duplicate
-    shuffle1_untag(TL, Next, R, WeakLowBits, Acc, [Y, X], T);
-shuffle1_untag([{_,X} | TL], Next, R, WeakLowBits, Acc) ->
-    shuffle1_untag(TL, Next, R, WeakLowBits, [X | Acc]);
-shuffle1_untag([], _Next, R, _WeakLowBits, Acc) ->
-    {Acc, R}.
-%%
-%% Collect duplicates
-shuffle1_untag([{T,X} | TL], Next, R, WeakLowBits, Acc, Dups, T) ->
-    shuffle1_untag(TL, Next, R, WeakLowBits, Acc, [X | Dups], T);
-shuffle1_untag(TL, Next, R0, WeakLowBits, Acc0, Dups, _T) ->
-    %% Shuffle1 the duplicates onto the result
-    {Acc1, R1} = shuffle1_r(Dups, Next, R0, WeakLowBits, Acc0),
-    shuffle1_untag(TL, Next, R1, WeakLowBits, Acc1).
-
-%% -------
-
--spec shuffle2(list()) -> list().
-shuffle2(List) ->
-    {ShuffledList, State} = shuffle2_s(List, seed_get()),
-    _ = seed_put(State),
-    ShuffledList.
-
--spec shuffle2_s(list(), state()) -> {list(), state()}.
-shuffle2_s(List, State)
-  when is_list(List) ->
-    case List of
-        [] ->
-            {List, State};
-        [_] ->
-            {List, State};
-        _ ->
-            M = maps:from_list(lists:enumerate(List)),
-            N = maps:size(M),
-            shuffle2_s(M, State, N, [])
-    end.
-
-%% Classical Fisher-Yates shuffle, a.k.a Knuth shuffle.
-%% See the Wikipedia article "Fisher-Yates shuffle".
-%%
-%% This variant uses a map with integer keys as array
-%% and is optimized in that it minimizes map updates
-%% since the high index is never used again, so an overwrite
-%% can be used instead of an exchange.
-
-shuffle2_s(M0, State0, N, Acc)
-  when is_map(M0), is_integer(N) ->
-    if
-        N =:= 0 -> {Acc, State0};
-        true ->
-            X = maps:get(N, M0),
-            case uniform_s(N, State0) of
-                {N, State1} ->
-                    shuffle2_s(M0, State1, N - 1, [X | Acc]);
-                {K, State1} when is_integer(K) ->
-                    Y = maps:get(K, M0),
-                    M1 = maps:update(K, X, M0),
-                    shuffle2_s(M1, State1, N - 1, [Y | Acc])
-            end
-    end.
-
-%% -------
-
--spec shuffle3(list()) -> list().
-shuffle3(List) ->
-    {ShuffledList, State} = shuffle3_s(List, seed_get()),
-    _ = seed_put(State),
-    ShuffledList.
-
--spec shuffle3_s(list(), state()) -> {list(), state()}.
-shuffle3_s(List, {#{bits:=_, next:=Next} = AlgHandler, R0} = State)
-  when is_list(List) ->
-    case List of
-        [] ->
-            {List, State};
-        [_] ->
-            {List, State};
-        _ ->
-            WeakLowBits = maps:get(weak_low_bits, AlgHandler, 0),
-            T = gb_trees:empty(),
-            {ShuffledList, R1} = shuffle3_r(List, Next, R0, WeakLowBits, T),
-            {ShuffledList, {AlgHandler, R1}}
-    end;
-shuffle3_s(List, {#{max:=Mask, next:=Next} = AlgHandler, R0} = State)
-  when is_list(List), ?MASK(58) =< Mask ->
-    case List of
-        [] ->
-            {List, State};
-        [_] ->
-            {List, State};
-        _ ->
-            %% Old spec - assume 2 weak low bits
-            WeakLowBits = 2,
-            T = gb_trees:empty(),
-            {ShuffledList, R1} = shuffle3_r(List, Next, R0, WeakLowBits, T),
-            {ShuffledList, {AlgHandler, R1}}
-    end.
-
-%% See the Wikipedia article "Fisher-Yates shuffle", section "Sorting".
-%%
-%% To avoid bias due to duplicate random numbers, a gb_tree
-%% is used to check if a random number has already been used,
-%% and if so generate a new random number.
-%%
-%% Because a gb_tree is sorted no sorting needs to be done,
-%% it is enough to extract the values of the gb_tree that are
-%% ordered in key sort order.
-
-shuffle3_r([], _Next, R, _WeakLowBits, T) ->
-    {gb_trees:values(T), R};
-shuffle3_r([X | L] , Next, R0, WeakLowBits, T) ->
-    {V, R1} = Next(R0),
-    K = V bsr WeakLowBits,
-    case gb_trees:is_defined(K, T) of
-        false ->
-            shuffle3_r(L, Next, R1, WeakLowBits, gb_trees:insert(K, X, T));
-        true ->
-            shuffle3_r([X | L], Next, R1, WeakLowBits, T)
-    end.
-
-%% -------
-
--spec shuffle4(list()) -> list().
-shuffle4(List) ->
-    {ShuffledList, State} = shuffle4_s(List, seed_get()),
-    _ = seed_put(State),
-    ShuffledList.
-
--spec shuffle4_s(list(), state()) -> {list(), state()}.
-shuffle4_s(List, {#{bits:=_, next:=Next} = AlgHandler, R0} = State)
-  when is_list(List) ->
-    case List of
-        [] ->
-            {List, State};
-        [_] ->
-            {List, State};
-        _ ->
-            WeakLowBits = maps:get(weak_low_bits, AlgHandler, 0),
-            {ShuffledList, R1} = shuffle4_r(List, Next, R0, WeakLowBits, #{}),
-            {ShuffledList, {AlgHandler, R1}}
-    end;
-shuffle4_s(List, {#{max:=Mask, next:=Next} = AlgHandler, R0} = State)
-  when is_list(List), ?MASK(58) =< Mask ->
-    case List of
-        [] ->
-            {List, State};
-        [_] ->
-            {List, State};
-        _ ->
-            %% Old spec - assume 2 weak low bits
-            WeakLowBits = 2,
-            {ShuffledList, R1} = shuffle4_r(List, Next, R0, WeakLowBits, #{}),
-            {ShuffledList, {AlgHandler, R1}}
-    end.
-
-%% See the Wikipedia article "Fisher-Yates shuffle", section "Sorting".
-%%
-%% To avoid bias due to duplicate random numbers, a map
-%% is used to check if a random number has already been used,
-%% and if so generate a new random number.
-%%
-%% Actual sorting doesn't is not needed.  A map is ordered by key
-%% and therefore it is enough to extract the values of the map.
-%% The internal map key order will do just fine.
-
-shuffle4_r([], _Next, R, _WeakLowBits, M) ->
-    {maps:values(M), R};
-shuffle4_r([X | L] , Next, R0, WeakLowBits, M) ->
-    {V, R1} = Next(R0),
-    K = V bsr WeakLowBits,
-    case maps:is_key(K, M) of
-        true ->
-            shuffle4_r([X | L], Next, R1, WeakLowBits, M);
-        false ->
-            shuffle4_r(L, Next, R1, WeakLowBits, maps:put(K, X, M))
-    end.
-
-%% -------
-
--spec shuffle5(list()) -> list().
-shuffle5(List) ->
-    {ShuffledList, State} = shuffle5_s(List, seed_get()),
-    _ = seed_put(State),
-    ShuffledList.
-
--spec shuffle5_s(list(), state()) -> {list(), state()}.
-shuffle5_s(List, State) when is_list(List) ->
-    shuffle5_r(List, State, []).
+-spec shuffle_s(list(), state()) -> {list(), state()}.
+shuffle_s(List, State) when is_list(List) ->
+    shuffle_r(List, State, []).
 
 %% Random-split-and-shuffle algorithm suggested by Richard A. O'Keefe
 %% on ErlangForums, as I interpreted it...
@@ -1587,17 +1339,17 @@ shuffle5_s(List, State) when is_list(List) ->
 %% while there still are duplicates.
 
 %% Leaf cases - random permutations for 0..4 elements
-shuffle5_r([], State, Acc) ->
+shuffle_r([], State, Acc) ->
     {Acc, State};
-shuffle5_r([X], State, Acc) ->
+shuffle_r([X], State, Acc) ->
     {[X | Acc], State};
-shuffle5_r([X, Y], State0, Acc) ->
+shuffle_r([X, Y], State0, Acc) ->
     {V, State1} = uniform_s(2, State0),
     {case V of
          1 -> [Y, X | Acc];
          2 -> [X, Y | Acc]
      end, State1};
-shuffle5_r([X, Y, Z], State0, Acc) ->
+shuffle_r([X, Y, Z], State0, Acc) ->
     {V, State1} = uniform_s(6, State0),
     {case V of
          1 -> [Z, Y, X | Acc];
@@ -1607,7 +1359,7 @@ shuffle5_r([X, Y, Z], State0, Acc) ->
          5 -> [Y, X, Z | Acc];
          6 -> [X, Y, Z | Acc]
      end, State1};
-shuffle5_r([X, Y, Z, Q], State0, Acc) ->
+shuffle_r([X, Y, Z, Q], State0, Acc) ->
     {V, State1} = uniform_s(24, State0),
     {case V of
          1  -> [Q, Z, Y, X | Acc];
@@ -1637,78 +1389,33 @@ shuffle5_r([X, Y, Z, Q], State0, Acc) ->
      end, State1};
 %%
 %% General case - split and recursive shuffle
-shuffle5_r([_, _, _, _ | _] = List, State0, Acc0) ->
-    {Left, Right, State1} = shuffle5_split(List, State0),
-    {Acc1, State2} = shuffle5_r(Left, State1, Acc0),
-    shuffle5_r(Right, State2, Acc1).
+shuffle_r([_, _, _, _ | _] = List, State0, Acc0) ->
+    {Left, Right, State1} = shuffle_split(List, State0),
+    {Acc1, State2} = shuffle_r(Left, State1, Acc0),
+    shuffle_r(Right, State2, Acc1).
 
 %% Split L into two random subsets: Left and Right
 %%
-shuffle5_split(L, State) ->
-    shuffle5_split(L, State, 1, [], []).
+shuffle_split(L, State) ->
+    shuffle_split(L, State, 1, [], []).
 %%
-shuffle5_split([], State, _P, Left, Right) ->
+shuffle_split([], State, _P, Left, Right) ->
     {Left, Right, State};
-shuffle5_split([_ | _] = L, State0, 1, Left, Right) ->
+shuffle_split([_ | _] = L, State0, 1, Left, Right) ->
     M = 1 bsl 56,
     case rand:uniform_s(M, State0) of
         {V, State1} when is_integer(V), 1 =< V, V =< M ->
             %% Setting the top bit M here provides the marker
             %% for when we are out of random bits: P =:= 1
-            shuffle5_split(L, State1, (V - 1) + M, Left, Right)
+            shuffle_split(L, State1, (V - 1) + M, Left, Right)
     end;
-shuffle5_split([X | L], State, P, Left, Right)
+shuffle_split([X | L], State, P, Left, Right)
   when is_integer(P), 1 =< P, P < 1 bsl 57 ->
     case P band 1 of
         0 ->
-            shuffle5_split(L, State, P bsr 1, [X | Left], Right);
+            shuffle_split(L, State, P bsr 1, [X | Left], Right);
         1 ->
-            shuffle5_split(L, State, P bsr 1, Left, [X | Right])
-    end.
-
-%% -------
-
--spec shuffle6(list()) -> list().
-shuffle6(List) ->
-    {ShuffledList, State} = shuffle6_s(List, seed_get()),
-    _ = seed_put(State),
-    ShuffledList.
-
--spec shuffle6_s(list(), state()) -> {list(), state()}.
-shuffle6_s(List, State)
-  when is_list(List) ->
-    case List of
-        [] ->
-            {List, State};
-        [_] ->
-            {List, State};
-        _ ->
-            A = array:from_list([[] | List]), % Make it 1 based
-            N = array:size(A),
-            shuffle6_s(A, State, N, [])
-    end.
-
-%% Classical Fisher-Yates shuffle, a.k.a Knuth shuffle.
-%% See the Wikipedia article "Fisher-Yates shuffle".
-%%
-%% Use the 'array' module and insert a dummy element first
-%% to make it effectively 1-based.
-%%
-%% This is the fastest Fisher-Yates among the shuffle algorithms here.
-
-shuffle6_s(A0, State0, N, Acc) when is_integer(N), 0 =< N ->
-    if
-        N =:= 0 -> {Acc, State0};
-        true ->
-            X = array:get(N, A0),
-            case uniform_s(N, State0) of
-                {N, State1} ->
-                    shuffle6_s(A0, State1, N - 1, [X | Acc]);
-                {K, State1} when is_integer(K) ->
-                    Y = array:get(K, A0),
-                    A1 = array:set(K, X, A0),
-                    shuffle6_s(A1, State1, N - 1, [Y | Acc])
-            end
+            shuffle_split(L, State, P bsr 1, Left, [X | Right])
     end.
 
 %% =====================================================================

From 689833907041a4ecdb52615c74875b3ff8770782 Mon Sep 17 00:00:00 2001
From: Raimo Niskanen <raimo@erlang.org>
Date: Fri, 17 Oct 2025 10:41:40 +0200
Subject: [PATCH 3/3] Finalize the shuffle algorithm

* Explain in comments
* Optimize
* Document
* Write test cases
* Write measurement test case to compare with runner-up algorithm
---
 lib/stdlib/src/rand.erl        | 279 +++++++++++++++--------
 lib/stdlib/test/rand_SUITE.erl | 391 +++++++++++++++++++++++++++------
 2 files changed, 515 insertions(+), 155 deletions(-)

diff --git a/lib/stdlib/src/rand.erl b/lib/stdlib/src/rand.erl
index cab7f677c61c..ac1bc20f2024 100644
--- a/lib/stdlib/src/rand.erl
+++ b/lib/stdlib/src/rand.erl
@@ -1317,106 +1317,209 @@ normal_s(Mean, Variance, State0) when 0 =< Variance ->
     {Mean + (math:sqrt(Variance) * X), State}.
 
 
--spec shuffle(list()) -> list().
+-doc """
+Shuffle a list.
+
+Like `shuffle_s/2` but operates on the state stored in
+the process dictionary.  Returns the shuffled list.
+""".
+-doc(#{group => <<"Plug-in framework API">>,since => <<"OTP 29.0">>}).
+-spec shuffle(List :: list()) -> ShuffledList :: list().
 shuffle(List) ->
     {ShuffledList, State} = shuffle_s(List, seed_get()),
     _ = seed_put(State),
     ShuffledList.
 
--spec shuffle_s(list(), state()) -> {list(), state()}.
-shuffle_s(List, State) when is_list(List) ->
-    shuffle_r(List, State, []).
+-doc """
+Shuffle a list.
+
+From the specified `State` shuffles the elements in argument `List` so that,
+given that the [PRNG algorithm](#algorithms) in `State` is perfect,
+every possible permutation of the elements in the returned `ShuffledList`
+has the same probability.
+
+In other words, the quality of the shuffling depends only on
+the quality of the backend [random number generator](#algorithms)
+and [seed](`seed_s/1`).  If a cryptographically unpredictable
+shuffling is needed, use for example `crypto:rand_seed_alg_s/1`
+to initialize the random number generator.
+
+Returns the shuffled list [`ShuffledList`](`t:list/0`)
+and the [`NewState`](`t:state/0`).
+""".
+-doc(#{group => <<"Plug-in framework API">>,since => <<"OTP 29.0">>}).
+-spec shuffle_s(List, State) ->
+          {ShuffledList :: list(), NewState :: state()}
+              when
+      List         :: list(),
+      State        :: state().
+shuffle_s(List, {#{bits:=_, next:=Next} = AlgHandler, R0})
+  when is_list(List) ->
+    WeakLowBits = maps:get(weak_low_bits, AlgHandler, 0),
+    [P0|S0] = shuffle_init_bitstream(R0, Next, WeakLowBits),
+    {ShuffledList, _P1, [R1|_]=_S1} = shuffle_r(List, [], P0, S0),
+    {ShuffledList, {AlgHandler, R1}};
+shuffle_s(List, {#{max:=_, next:=Next} = AlgHandler, R0})
+  when is_list(List) ->
+    %% Old spec - assume 2 weak low bits
+    WeakLowBits = 2,
+    [P0|S0] = shuffle_init_bitstream(R0, Next, WeakLowBits),
+    {ShuffledList, _P1, [R1|_]=_S1} = shuffle_r(List, [], P0, S0),
+    {ShuffledList, {AlgHandler, R1}}.
 
 %% Random-split-and-shuffle algorithm suggested by Richard A. O'Keefe
-%% on ErlangForums, as I interpreted it...
-%%
-%% Randomly split the list in two lists,
-%% recursively shuffle the two smaller lists,
-%% randomize the order between the lists according to their size.
-%%
-%% This is equivalent to assigning a random number to each
-%% element and sorting, but extending the numbers on demand
-%% while there still are duplicates.
-
-%% Leaf cases - random permutations for 0..4 elements
-shuffle_r([], State, Acc) ->
-    {Acc, State};
-shuffle_r([X], State, Acc) ->
-    {[X | Acc], State};
-shuffle_r([X, Y], State0, Acc) ->
-    {V, State1} = uniform_s(2, State0),
-    {case V of
-         1 -> [Y, X | Acc];
-         2 -> [X, Y | Acc]
-     end, State1};
-shuffle_r([X, Y, Z], State0, Acc) ->
-    {V, State1} = uniform_s(6, State0),
-    {case V of
-         1 -> [Z, Y, X | Acc];
-         2 -> [Y, Z, X | Acc];
-         3 -> [Z, X, Y | Acc];
-         4 -> [X, Z, Y | Acc];
-         5 -> [Y, X, Z | Acc];
-         6 -> [X, Y, Z | Acc]
-     end, State1};
-shuffle_r([X, Y, Z, Q], State0, Acc) ->
-    {V, State1} = uniform_s(24, State0),
-    {case V of
-         1  -> [Q, Z, Y, X | Acc];
-         2  -> [Z, Q, Y, X | Acc];
-         3  -> [Q, Y, Z, X | Acc];
-         4  -> [Y, Q, Z, X | Acc];
-         5  -> [Z, Y, Q, X | Acc];
-         6  -> [Y, Z, Q, X | Acc];
-         7  -> [Q, Z, X, Y | Acc];
-         8  -> [Z, Q, X, Y | Acc];
-         9  -> [Q, X, Z, Y | Acc];
-         10 -> [X, Q, Z, Y | Acc];
-         11 -> [Z, X, Q, Y | Acc];
-         12 -> [X, Z, Q, Y | Acc];
-         13 -> [Q, Y, X, Z | Acc];
-         14 -> [Y, Q, X, Z | Acc];
-         15 -> [Q, X, Y, Z | Acc];
-         16 -> [X, Q, Y, Z | Acc];
-         17 -> [Y, X, Q, Z | Acc];
-         18 -> [X, Y, Q, Z | Acc];
-         19 -> [Z, Y, X, Q | Acc];
-         20 -> [Y, Z, X, Q | Acc];
-         21 -> [Z, X, Y, Q | Acc];
-         22 -> [X, Z, Y, Q | Acc];
-         23 -> [Y, X, Z, Q | Acc];
-         24 -> [X, Y, Z, Q | Acc]
-     end, State1};
-%%
+%% on ErlangForums, as I interpreted it...  "basically a randomized
+%% quicksort", shall we name it Quickshuffle?
+%%
+%% Randomly split the list in two lists, and recursively shuffle
+%% the two smaller lists.
+%%
+%% How the algorithm works and why it is correct can be explained like this:
+%%
+%% The objective is, given a list of elements, to return a random
+%% permutation of those elements so that every possible permutation
+%% has the same probability to be returned.
+%%
+%% One of the two correct and bias free algorithms described on the Wikipedia
+%% page for Fisher-Yates shuffling is to assign a random number to each
+%% element in the list and order the elements according to the numbers.
+%% For this to be correct the generated numbers must not have duplicates.
+%%
+%% This algorithm does that, but assigning a number and ordering
+%% the elements is more or less the same step, which is taken
+%% one binary bit at the time.
+%%
+%% It can be seen as, to each element, assign a fixpoint number
+%% of infinite length starting with bit weight 1/2, continuing with 1/4,
+%% and so on..., but reveal it incrementally.
+%%
+%% The list to shuffle is traversed, and a random bit is generated
+%% for each element.  If it is a 0, the element is assigned the zero bit
+%% by moving it to the head of the list Zero, and if it is a 1,
+%% to the head of the list One.
+%%
+%% Then the list Zero is recursively shuffled onto the accumulator list Acc,
+%% after that the list One.  By that all elements in Zero are ordered
+%% before the ones in One, according to the generated numbers.
+%% The order is actually not important as long as it is consistent.
+%%
+%% The algorithm recurses until the Zero or One list has length
+%% 0 or 1, which is when the generated fixpoint number has no duplicate.
+%% The fixpoint number in itself only exists in the guise of the
+%% recursion call stack, that is whether an element belongs to list
+%% Zero or One on each recursion level.
+%% Here is the bare algorithm:
+%%
+%% quickshuffle([], Acc) -> Acc;
+%% quickshuffle([X], Acc) -> [X | Acc];
+%% quickshuffle([_|_] = L, Acc) ->
+%%     {Zero, One} = quickshuffle_split(L, [], []),
+%%     quickshuffle(One, quickshuffle(Zero, Acc)).
+%%
+%% quickshuffle_split([], Zero, One) ->
+%%     {Zero, One};
+%% quickshuffle_split([X | L], Zero, One) ->
+%%     case random_bit() of
+%%         0 -> quickshuffle_split(L, [X | Zero], One);
+%%         1 -> quickshuffle_split(L, Zero, [X | One])
+%%     end.
+%%
+%% As an optimization, since the algorithm is equivalent to its objective
+%% to randomly permute a list, we can when reaching a small enough list
+%% as in 3 or 2 instead do an explicit random permutation of the list.
+%%
+%% The `random_bit()` can be generated with small overhead by generating
+%% a random word and cache it, then shift out one bit at the time.
+%%
+%% Also, it is faster to do a 4-way split by 2 bits instead of,
+%% as described above, a 2-way split by 1 bit.
+
+%% Leaf cases - random permutations for 0..3 elements
+shuffle_r([], Acc, P, S) ->
+    {Acc, P, S};
+shuffle_r([X], Acc, P, S) ->
+    {[X | Acc], P, S};
+shuffle_r([X, Y], Acc, P, S) ->
+    shuffle_r_2(X, Acc, P, S, Y);
+shuffle_r([X, Y, Z], Acc, P, S) ->
+    shuffle_r_3(X, Acc, P, S, Y, Z);
 %% General case - split and recursive shuffle
-shuffle_r([_, _, _, _ | _] = List, State0, Acc0) ->
-    {Left, Right, State1} = shuffle_split(List, State0),
-    {Acc1, State2} = shuffle_r(Left, State1, Acc0),
-    shuffle_r(Right, State2, Acc1).
+shuffle_r([_, _, _ | _] = List, Acc, P, S) ->
+    %% P and S is bitstream cache and state
+    shuffle_r(List, Acc, P, S, [], [], [], []).
+%%
+%% Split L into 4 random subsets
+%%
+shuffle_r([], Acc0, P0, S0, Zero, One, Two, Three) ->
+    %% Split done, recursively shuffle the splitted lists onto Acc
+    {Acc1, P1, S1} = shuffle_r(Zero, Acc0, P0, S0),
+    {Acc2, P2, S2} = shuffle_r(One, Acc1, P1, S1),
+    {Acc3, P3, S3} = shuffle_r(Two, Acc2, P2, S2),
+    shuffle_r(Three, Acc3, P3, S3);
+shuffle_r([X | L], Acc, P0, S, Zero, One, Two, Three)
+  when is_integer(P0), 3 < P0, P0 < 1 bsl 57 ->
+    P1 = P0 bsr 2,
+    case P0 band 3 of
+        0 -> shuffle_r(L, Acc, P1, S, [X | Zero], One, Two, Three);
+        1 -> shuffle_r(L, Acc, P1, S, Zero, [X | One], Two, Three);
+        2 -> shuffle_r(L, Acc, P1, S, Zero, One, [X | Two], Three);
+        3 -> shuffle_r(L, Acc, P1, S, Zero, One, Two, [X | Three])
+    end;
+shuffle_r([_ | _] = L, Acc, _P, S0, Zero, One, Two, Three) ->
+    [P|S1] = shuffle_new_bits(S0),
+    shuffle_r(L, Acc, P, S1, Zero, One, Two, Three).
+
+%% Permute 2 elements
+shuffle_r_2(X, Acc, P, S, Y)
+  when is_integer(P), 1 < P, P < 1 bsl 57 ->
+    {case P band 1 of
+         0 -> [Y, X | Acc];
+         1 -> [X, Y | Acc]
+     end, P bsr 1, S};
+shuffle_r_2(X, Acc, _P, S0, Y) ->
+    [P|S1] = shuffle_new_bits(S0),
+    shuffle_r_2(X, Acc, P, S1, Y).
+
+%% Permute 3 elements
+%%
+%% Uses 3 random bits per iteration with a probability of 1/4
+%% to reject and retry, which on average is 3 * 4/3
+%% (infinite sum of (1/4)^k) = 4 bits per permutation
+shuffle_r_3(X, Acc, P0, S, Y, Z)
+  when is_integer(P0), 7 < P0, P0 < 1 bsl 57 ->
+    P1 = P0 bsr 3,
+    case P0 band 7 of
+        0 -> {[Z, Y, X | Acc], P1, S};
+        1 -> {[Y, Z, X | Acc], P1, S};
+        2 -> {[Z, X, Y | Acc], P1, S};
+        3 -> {[X, Z, Y | Acc], P1, S};
+        4 -> {[Y, X, Z | Acc], P1, S};
+        5 -> {[X, Y, Z | Acc], P1, S};
+        _ -> % Reject and retry
+            shuffle_r_3(X, Acc, P1, S, Y, Z)
+    end;
+shuffle_r_3(X, Acc, _P, S0, Y, Z) ->
+    [P|S1] = shuffle_new_bits(S0),
+    shuffle_r_3(X, Acc, P, S1, Y, Z).
 
-%% Split L into two random subsets: Left and Right
+-dialyzer({no_improper_lists, shuffle_init_bitstream/3}).
 %%
-shuffle_split(L, State) ->
-    shuffle_split(L, State, 1, [], []).
+shuffle_init_bitstream(R, Next, WeakLowBits) ->
+    P = 1,                      % Marker for out of random bits
+    W = {Next,WeakLowBits},     % Generator
+    S = [R|W],                  % Generator state
+    [P|S].                      % Bit cash and state
+
+-dialyzer({no_improper_lists, shuffle_new_bits/1}).
 %%
-shuffle_split([], State, _P, Left, Right) ->
-    {Left, Right, State};
-shuffle_split([_ | _] = L, State0, 1, Left, Right) ->
+shuffle_new_bits([R0|{Next,WeakLowBits}=W]) ->
+    {V, R1} = Next(R0),
+    %% Setting the top bit M here provides the marker
+    %% for when we are out of random bits: P =:= 1
     M = 1 bsl 56,
-    case rand:uniform_s(M, State0) of
-        {V, State1} when is_integer(V), 1 =< V, V =< M ->
-            %% Setting the top bit M here provides the marker
-            %% for when we are out of random bits: P =:= 1
-            shuffle_split(L, State1, (V - 1) + M, Left, Right)
-    end;
-shuffle_split([X | L], State, P, Left, Right)
-  when is_integer(P), 1 =< P, P < 1 bsl 57 ->
-    case P band 1 of
-        0 ->
-            shuffle_split(L, State, P bsr 1, [X | Left], Right);
-        1 ->
-            shuffle_split(L, State, P bsr 1, Left, [X | Right])
-    end.
+    P = ((V bsr WeakLowBits) band (M-1)) bor M,
+    S = [R1|W],
+    [P|S].
 
 %% =====================================================================
 %% Internal functions
diff --git a/lib/stdlib/test/rand_SUITE.erl b/lib/stdlib/test/rand_SUITE.erl
index c81ec771bb75..5985f350997c 100644
--- a/lib/stdlib/test/rand_SUITE.erl
+++ b/lib/stdlib/test/rand_SUITE.erl
@@ -27,7 +27,26 @@
 
 -include_lib("common_test/include/ct.hrl").
 
--define(LOOP, 1000000).
+%% Testcases
+-export(
+   [seed/1, interval_int/1, interval_float/1,
+    bytes_count/1,
+    shuffle_elements/1, shuffle_reference/1,
+    basic_stats_shuffle/1, measure_shuffle/1,
+    api_eq/1,
+    mwc59_api/1,
+    exsp_next_api/1, exsp_jump_api/1,
+    splitmix64_next_api/1,
+    reference/1,
+    uniform_real_conv/1,
+    plugin/1, measure/1,
+    short_jump/1
+   ]).
+
+%% Manual test functions
+-export([measure_shuffle/2, measure_shuffle/4]).
+
+-define(LOOP, 1000_000).
 
 suite() ->
     [{ct_hooks,[ts_install_cth]},
@@ -36,6 +55,8 @@ suite() ->
 all() ->
     [seed, interval_int, interval_float,
      bytes_count,
+     shuffle_elements, shuffle_reference,
+     basic_stats_shuffle, measure_shuffle,
      api_eq,
      mwc59_api,
      exsp_next_api, exsp_jump_api,
@@ -389,6 +410,155 @@ bytes_count(Config) when is_list(Config) ->
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
+%% Check that shuffle doesn't loose or duplicate elements
+
+shuffle_elements(Config) when is_list(Config) ->
+    M = 20,
+    SortedList = lists:seq(0, (1 bsl M) - 1),
+    State = rand:seed(default),
+    case lists:sort(rand:shuffle(SortedList)) of
+        SortedList -> ok;
+        _ ->
+            error({mismatch, State})
+    end.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%% Check that shuffle is repeatable
+
+shuffle_reference(Config) when is_list(Config) ->
+    M = 20,
+    Seed = {1,2,3},
+    MD5 = <<56,202,188,237,192,69,132,182,227,54,33,68,45,74,208,89>>,
+    %%
+    SortedList = lists:seq(0, (1 bsl M) - 1),
+    S = rand:seed_s(default, Seed),
+    {ShuffledList, NewS} = rand:shuffle_s(SortedList, S),
+    Data = mk_iolist(ShuffledList, M),
+    case erlang:md5(Data) of
+        MD5 -> ok;
+        WrongMD5 ->
+            error({wrong_checksum, WrongMD5, NewS})
+    end.
+
+mk_iolist([], _M) -> [];
+mk_iolist([X, Y | L], M) ->
+    [<<X:M, Y:M>> | mk_iolist(L, M)].
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%% Check basic stats for shuffle
+
+basic_stats_shuffle(Config) when is_list(Config) ->
+    ct:timetrap({minutes,15}), %% valgrind needs a lot of time
+    Loop            = ?LOOP div 10,
+    Buckets         = 113,
+    CountTolerance  = 0.18,
+    Alg             = default,
+    %%
+    %% One array per position.
+    %% The array is a histogram that counts how many times
+    %% a random number has occured in that position,
+    %% where the random number is the index in the array.
+    SortedList = lists:seq(1, Buckets),
+    S = rand:seed(Alg),
+    Result =
+        lists:filter(
+          fun (R) -> R =/= [] end,
+          [begin
+               Sum             = basic_shuffle_sum(1, Counters),
+               Buckets         = length(Counters),
+               ExpectedAverage = (Buckets + 1) / 2,
+               basic_verify(
+                 Pos, Loop, Sum, ExpectedAverage, Counters, CountTolerance)
+           end ||
+              {Pos, Counters} <-
+                  lists:zip(
+                    SortedList,
+                    basic_shuffle(Loop, SortedList, S))]),
+    Result =:= [] orelse
+        ct:fail({Result, S}),
+    ok.
+
+basic_shuffle(N, SortedList, S) ->
+    Buckets = length(SortedList),
+    AL = lists:duplicate(Buckets, array:new([Buckets + 1, {default,0}])),
+    basic_shuffle(N, SortedList, S, AL).
+%%
+basic_shuffle(N, _SortedList, _S, AL) when N =< 0 ->
+    [tl(array:to_list(A)) || A <- AL];
+basic_shuffle(N, SortedList, S0, AL0) ->
+    {ShuffledList, S1} = rand:shuffle_s(SortedList, S0),
+    AL1 = basic_shuffle_count(ShuffledList, AL0),
+    basic_shuffle(N - 1, SortedList, S1, AL1).
+
+basic_shuffle_count([], [])            -> [];
+basic_shuffle_count([X | L], [A | AL]) ->
+    [array_add(X, 1, A) | basic_shuffle_count(L, AL)].
+
+basic_shuffle_sum(_Number, [])                  -> 0;
+basic_shuffle_sum(Number, [Counter | Counters]) ->
+    Number*Counter + basic_shuffle_sum(Number + 1, Counters).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%% Measure shuffle with PRNG algorithms and against
+%% a known reference shuffle implementation
+
+measure_shuffle(Config) when is_list(Config) ->
+    ct:timetrap({minutes,60}), %% valgrind needs a lot of time
+    case ct:get_timetrap_info() of
+        {_,{_,1}} -> % No scaling
+            Effort = proplists:get_value(measure_effort, Config, 1),
+            measure_shuffle(Effort);
+        {_,{_,Scale}} ->
+            {skip,{will_not_run_in_scaled_time,Scale}}
+    end;
+measure_shuffle(Effort) when is_integer(Effort) ->
+    Algs =
+        [default, exs1024 |
+         case crypto_support() of
+             ok -> [crypto];
+             _  -> []
+         end],
+    measure_shuffle(Effort, Algs).
+
+measure_shuffle(Effort, Algs)
+  when is_integer(Effort), is_list(Algs) ->
+    _ = measure_shuffle(100, us, Algs, 10000 * Effort),
+    _ = measure_shuffle(10_000, ms, Algs, 100 * Effort),
+    _ = measure_shuffle(1000_000, ms, Algs, Effort),
+    ok.
+
+measure_shuffle(Size, Unit, Algs, I)
+  when is_integer(Size), is_atom(Unit), is_list(Algs), is_integer(I) ->
+    ct:log("~nShuffle ~w performance~n", [Size]),
+    [TMark, Overhead | _] = RandResults =
+        measure_1(
+          fun (_Mod, _State) ->
+                  List = lists:seq(1, Size),
+                  fun (St0) ->
+                          case rand:shuffle_s(List, St0) of
+                              {L, St1} when is_list(L) ->
+                                  St1
+                          end
+                  end
+          end, Algs, {I, Unit}),
+    RandResults ++
+        [measure_1(
+           fun (_Mod, _State) ->
+                   List = lists:seq(1, Size),
+                   fun (St0) ->
+                           case shuffle_ref(List, St0) of
+                               {L, St1} when is_list(L) ->
+                                   St1
+                           end
+                   end
+           end, {shuffle_ref,Alg}, {I, Unit}, TMark, Overhead)
+         || Alg <- Algs].
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
 %% Check if each algorithm generates the proper sequence.
 reference(Config) when is_list(Config) ->
     [reference_1(Alg) || Alg <- algs()],
@@ -455,33 +625,58 @@ gen(_, _, _, Acc) -> lists:reverse(Acc).
 
 basic_stats_uniform_1(Config) when is_list(Config) ->
     ct:timetrap({minutes,15}), %% valgrind needs a lot of time
+    Loop            = ?LOOP,
+    Buckets         = 100,
+    CountTolerance  = 0.05,
+    ExpectedAverage = 1/2,
     Result =
         lists:filter(
           fun (R) -> R =/= [] end,
-          [basic_uniform_1(Alg, ?LOOP, 100)
-           || Alg <- [default|algs()]]),
+          [begin
+               {Sum, Counters} = basic_uniform_1(Loop, Buckets, Alg),
+               basic_verify(
+                 Alg, Loop, Sum, ExpectedAverage, Counters, CountTolerance)
+           end ||
+              Alg <- [default|algs()]]),
     Result =:= [] orelse
         ct:fail(Result),
     ok.
 
 basic_stats_uniform_2(Config) when is_list(Config) ->
     ct:timetrap({minutes,15}), %% valgrind needs a lot of time
+    Loop            = ?LOOP,
+    Buckets         = 100,
+    CountTolerance  = 0.05,
+    ExpectedAverage = (1 + Buckets) / 2,
     Result =
         lists:filter(
           fun (R) -> R =/= [] end,
-          [basic_uniform_2(Alg, ?LOOP, 100)
-           || Alg <- [default|algs()]]),
+          [begin
+               {Sum, Counters} = basic_uniform_2(Loop, Buckets, Alg),
+               basic_verify(
+                 Alg, Loop, Sum, ExpectedAverage, Counters, CountTolerance)
+           end ||
+              Alg <- [default|algs()]]),
     Result =:= [] orelse
         ct:fail(Result),
     ok.
 
 basic_stats_bytes(Config) when is_list(Config) ->
     ct:timetrap({minutes,15}), %% valgrind needs a lot of time
+    Loop           = ?LOOP div 100,
+    BinSize        = 113,
+    CountTolerance = 0.07,
     Result =
         lists:filter(
           fun (R) -> R =/= [] end,
-          [basic_bytes(Alg, ?LOOP div 100, 113)
-           || Alg <- [default|algs()]]),
+          [begin
+               {ExpectedAverage, Sum, Counters} =
+                   basic_bytes(Loop, BinSize, Alg),
+               basic_verify(
+                 Alg, Loop * BinSize, Sum, ExpectedAverage,
+                 Counters, CountTolerance)
+           end ||
+              Alg <- [default|algs()]]),
     Result =:= [] orelse
         ct:fail(Result),
     ok.
@@ -529,13 +724,13 @@ basic_stats_normal(Config) when is_list(Config) ->
         ct:fail(Result),
     ok.
 
-basic_uniform_1(Alg, Loop, Buckets) ->
+basic_uniform_1(Loop, Buckets, Alg) ->
     basic_uniform_1(
-      0, Loop, Buckets, rand:seed_s(Alg), 0.0,
+      Loop, Buckets, rand:seed_s(Alg), 0.0,
       array:new(Buckets, [{default, 0}])).
 %%
-basic_uniform_1(N, Loop, Buckets, S0, Sum, A0) when N < Loop ->
-    {X,S} =
+basic_uniform_1(N, Buckets, S0, Sum, A) when 0 < N ->
+    {X,S1} =
         case N band 1 of
             0 ->
                 rand:uniform_s(S0);
@@ -543,81 +738,84 @@ basic_uniform_1(N, Loop, Buckets, S0, Sum, A0) when N < Loop ->
                 rand:uniform_real_s(S0)
         end,
     I = trunc(X*Buckets),
-    A = array:set(I, 1+array:get(I,A0), A0),
-    basic_uniform_1(N+1, Loop, Buckets, S, Sum+X, A);
-basic_uniform_1(_N, Loop, Buckets, {#{type:=Alg}, _}, Sum, A) ->
-    AverExp = 1.0 / 2,
-    Counters = array:to_list(A),
-    basic_verify(Alg, Loop, Sum, AverExp, Buckets, Counters).
+    basic_uniform_1(N-1, Buckets, S1, Sum+X, array_add(I, 1, A));
+basic_uniform_1(_0, _Buckets, _S, Sum, A) ->
+    {Sum, array:to_list(A)}.
 
-basic_uniform_2(Alg, Loop, Buckets) ->
+basic_uniform_2(Loop, Buckets, Alg) ->
     basic_uniform_2(
-      0, Loop, Buckets, rand:seed_s(Alg), 0,
-      array:new(Buckets, [ {default, 0}])).
+      Loop, Buckets, rand:seed_s(Alg), 0,
+      array:new(Buckets + 1, [{default, 0}])).
 %%
-basic_uniform_2(N, Loop, Buckets, S0, Sum, A0) when N < Loop ->
+basic_uniform_2(N, Buckets, S0, Sum, A0) when 0 < N ->
     {X,S} = rand:uniform_s(Buckets, S0),
-    A = array:set(X-1, 1+array:get(X-1,A0), A0),
-    basic_uniform_2(N+1, Loop, Buckets, S, Sum+X, A);
-basic_uniform_2(_N, Loop, Buckets, {#{type:=Alg}, _}, Sum, A) ->
-    AverExp = ((Buckets - 1) / 2) + 1,
-    Counters = tl(array:to_list(A)),
-    basic_verify(Alg, Loop, Sum, AverExp, Buckets, Counters).
-
-basic_bytes(Alg, Loop, BytesSize) ->
+    A = array_add(X, 1, A0),
+    basic_uniform_2(N-1, Buckets, S, Sum+X, A);
+basic_uniform_2(_N, _Buckets, _S, Sum, A) ->
+    {Sum, tl(array:to_list(A))}.
+
+basic_bytes(Loop, BinSize, Alg) ->
     basic_bytes(
-      0, Loop, BytesSize, rand:seed_s(Alg), 0,
+      Loop, BinSize, rand:seed_s(Alg), 0,
       array:new(256, [{default, 0}])).
-basic_bytes(N, Loop, BytesSize, S0, Sum0, A0) when N < Loop ->
-    {Bin,S} = rand:bytes_s(BytesSize, S0),
+%%
+basic_bytes(N, BinSize, S0, Sum0, A0) when 0 < N ->
+    {Bin,S} = rand:bytes_s(BinSize, S0),
     {Sum,A} = basic_bytes_incr(Bin, Sum0, A0),
-    basic_bytes(N+1, Loop, BytesSize, S, Sum, A);
-basic_bytes(_N, Loop, BytesSize, {#{type:=Alg}, _}, Sum, A) ->
-    Buckets = 256,
-    AverExp = (Buckets - 1) / 2,
+    basic_bytes(N-1, BinSize, S, Sum, A);
+basic_bytes(_N, _BinSize, _S, Sum, A) ->
     Counters = array:to_list(A),
-    basic_verify(Alg, Loop * BytesSize, Sum, AverExp, Buckets, Counters).
+    ExpectedAverage = (0 + (array:size(A) - 1)) / 2,
+    {ExpectedAverage, Sum, Counters}.
 
 basic_bytes_incr(Bin, Sum, A) ->
     basic_bytes_incr(Bin, Sum, A, 0).
 %%
-basic_bytes_incr(Bin, Sum, A, N) ->
+basic_bytes_incr(Bin, Sum, A, I) ->
     case Bin of
-        <<_:N/binary, B, _/binary>> ->
-            basic_bytes_incr(
-              Bin, Sum+B, array:set(B, array:get(B, A)+1, A), N+1);
+        <<_:I/binary, B, _/binary>> ->
+            basic_bytes_incr(Bin, Sum+B, array_add(B, 1, A), I+1);
         <<_/binary>> ->
             {Sum,A}
     end.
 
-basic_verify(Alg, Loop, Sum, AverExp, Buckets, Counters) ->
-    AverDiff = AverExp * 0.01,
-    Aver = Sum / Loop,
+array_add(I, S, A) ->
+    array:set(I, array:get(I, A) + S, A).
+
+basic_verify(
+  Tag, Loop, Sum, ExpectedAverage, Counters, CountTolerance) ->
+    %%
+    AllowedAverageDiff = ExpectedAverage * 10 / math:sqrt(Loop),
+    Average = Sum / Loop,
     io:format(
       "~.12w: Expected Average: ~.4f, Allowed Diff: ~.4f, Average: ~.4f~n",
-      [Alg, AverExp, AverDiff, Aver]),
+      [Tag, ExpectedAverage, AllowedAverageDiff, Average]),
     %%
-    CountExp = Loop / Buckets,
-    CountDiff = CountExp * 0.1,
+    %% XXX It would be nice and possible, but perhaps too much
+    %% math-stat to calculate a good CountTolerance
+    ExpectedCount = Loop / length(Counters),
+    MinCount = (1 - CountTolerance) * ExpectedCount,
+    MaxCount = (1 + CountTolerance) * ExpectedCount,
     {MinBucket, Min} = lists_where(fun erlang:min/2, Counters),
     {MaxBucket, Max} = lists_where(fun erlang:max/2, Counters),
     io:format(
-      "~.12w: Expected Count: ~p, Allowed Diff: ~p, Min: ~p, Max: ~p~n",
-      [Alg, CountExp, CountDiff, Min, Max]),
+      "~.12w: Expected Count: ~p, MinCount: ~p, MaxCount: ~p, "
+      "Min: ~p, Max: ~p~n",
+      [Tag, ExpectedCount, MinCount, MaxCount, Min, Max]),
     %%
     %% Verify that the basic statistics are ok
     %% be gentle - we don't want to see to many failing tests
     if
-        abs(Aver - AverExp) < AverDiff -> [];
-        true -> [{average, Alg, Aver, AverExp, AverDiff}]
+        abs(Average - ExpectedAverage) < AllowedAverageDiff -> [];
+        true -> [{average, Tag, Average, ExpectedAverage, AllowedAverageDiff}]
     end ++
         if
-            abs(Min - CountExp) < CountDiff -> [];
-            true -> [{min, Alg, {MinBucket,Min}, CountExp, CountDiff}]
+            Min > MinCount -> [];
+            true -> [{min, Tag, {MinBucket,Min}, MinCount}]
         end ++
         if
-            abs(Max - CountExp) < CountDiff -> [];
-            true -> [{max, Alg, {MaxBucket,Max}, CountExp, CountDiff}]
+            Max < MaxCount -> [];
+            true -> [{max, Tag, {MaxBucket,Max}, MaxCount}]
         end.
 
 lists_where(Fun, [X | L]) ->
@@ -1073,8 +1271,8 @@ measure(Config) when is_list(Config) ->
             {skip,{will_not_run_in_scaled_time,Scale}}
     end;
 measure(Effort) when is_integer(Effort) ->
-    Iterations = ?LOOP div 5,
-    do_measure(Iterations * Effort).
+    I = ?LOOP div 5,
+    do_measure(I * Effort).
 
 
 -define(CHECK_UNIFORM_RANGE(Gen, Range, X, St),
@@ -1103,7 +1301,8 @@ measure(Effort) when is_integer(Effort) ->
                 St
         end).
 
-do_measure(Iterations) ->
+do_measure(I) ->
+    Iterations = {I, ns},
     Algs =
         case crypto_support() of
             ok ->
@@ -1645,7 +1844,7 @@ do_measure(Iterations) ->
                   Algs ++ [crypto_bytes, crypto_bytes_cached];
               _ ->
                   Algs
-          end, Iterations div 50),
+          end, setelement(1, Iterations, I div 50)),
     _ =
         measure_1(
           fun (_Mod, _State) ->
@@ -1653,7 +1852,7 @@ do_measure(Iterations) ->
                           ?CHECK_BYTE_SIZE(
                              mwc59_bytes(ByteSize2, St0), ByteSize2, Bin, St1)
                   end
-          end, {mwc59,bytes}, Iterations div 50,
+          end, {mwc59,bytes}, setelement(1, Iterations, I div 50),
           TMarkBytes2, OverheadBytes2),
     %%
     ct:log("~nRNG uniform float performance~n",[]),
@@ -1738,12 +1937,12 @@ do_measure(Iterations) ->
           TMarkNormalFloat, OverheadNormalFloat),
     ok.
 
-measure_loop(State, Fun, I) when 10 =< I ->
+measure_loop(State, Fun, I) when is_integer(I), 10 =< I ->
     %% Loop unrolling to dilute benchmark overhead...
     measure_loop(
       Fun(Fun(Fun(Fun(Fun(  Fun(Fun(Fun(Fun(Fun(State))))) ))))),
       Fun, I - 10);
-measure_loop(State, Fun, I) when 1 =< I ->
+measure_loop(State, Fun, I) when is_integer(I), 1 =< I ->
     measure_loop(Fun(State), Fun, I - 1);
 measure_loop(_, _, _) ->
     ok.
@@ -1764,7 +1963,8 @@ measure_1(InitFun, Algs, Iterations) ->
         [measure_1(InitFun, Alg, Iterations, TMark, Overhead)
          || Alg <- tl(Algs)].
 
-measure_1(InitFun, Alg, Iterations, TMark, Overhead) ->
+measure_1(InitFun, Alg, {I,Unit}, TMark, Overhead)
+  when is_integer(I), is_atom(Unit) ->
     Parent = self(),
     MeasureFun =
         fun () ->
@@ -1773,7 +1973,7 @@ measure_1(InitFun, Alg, Iterations, TMark, Overhead) ->
                 {T, ok} =
                     timer:tc(
                       fun () ->
-                              measure_loop(State, IterFun, Iterations)
+                              measure_loop(State, IterFun, I)
                       end),
                 Time = T - Overhead,
                 Percent =
@@ -1784,9 +1984,12 @@ measure_1(InitFun, Alg, Iterations, TMark, Overhead) ->
                             io_lib:format(
                               "~8.1f%", [(Time * 100 + 50) / TMark])
                     end,
+                {Scale, UnitStr} = scale(Unit),
                 io:format(
-                  "~.24w: ~8.1f ns ~s~n",
-                  [Alg, (Time * 1000 + 500) / Iterations, Percent]),
+                  "~.24w: ~8.1f ~s ~s~n",
+                  [Alg,
+                   (Time + 0.5) / 1000_000 * Scale / I,
+                   UnitStr, Percent]),
                 Parent ! {self(), Time},
                 ok
         end,
@@ -1795,6 +1998,11 @@ measure_1(InitFun, Alg, Iterations, TMark, Overhead) ->
 	{Pid, Msg} -> Msg
     end.
 
+scale(s)  -> {1, "s"};
+scale(ms) -> {1000, "ms"};
+scale(us) -> {1000_000, "µs"};
+scale(ns) -> {1000_000_000, "ns"}.
+
 measure_init(Alg) ->
     case Alg of
         overhead ->
@@ -1832,7 +2040,9 @@ measure_init(Alg) ->
                     {_, S} = rand:seed_s(exsp),
                     {rand, S};
                 splitmix64 ->
-                    {rand, erlang:unique_integer()}
+                    {rand, erlang:unique_integer()};
+                shuffle_ref ->
+                    measure_init(Tag)
             end;
         _ ->
             {rand, rand:seed_s(Alg)}
@@ -2489,3 +2699,50 @@ half_range({#{bits:=Bits}, _}) -> 1 bsl (Bits - 1);
 half_range({#{max:=Max}, _}) -> (Max bsr 1) + 1;
 half_range({#{}, _}) -> 1 bsl 63; % crypto
 half_range({_, _, _}) -> 1 bsl 50. % random
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%% Reference shuffle algorithm
+%%
+%% Decorate, sort, undecorate and shuffle duplicates.
+%% O(N) random number generations if there are no duplicates.
+%% O(N log N) for the whole algorithm due to the sort,
+%% which is as good as it gets.
+
+shuffle_ref([],        State) -> {[], State};
+shuffle_ref([_] = L,   State) -> {L,  State};
+shuffle_ref([_|_] = L, State) -> shuffle_r(L, State, []).
+
+%% Recursion entry point
+shuffle_r([X, Y], State0, Acc) ->
+    %% Optimization for 2 elements; the most common case for duplicates
+    {V, State1} = rand:uniform_s(2, State0),
+    {case V of
+         1 -> [Y, X | Acc];
+         2 -> [X, Y | Acc]
+     end, State1};
+shuffle_r([_, _ | _] = L, State, Acc) ->
+    shuffle_tag(L, State, Acc, []).
+
+%% Tag elements with random integers
+shuffle_tag([X | L], State0, Acc, TL) ->
+    {T, State1} = rand:uniform_s(1 bsl 56, State0),
+    shuffle_tag(L, State1, Acc, [{T,X} | TL]);
+shuffle_tag([], State, Acc, TL) ->
+    shuffle_untag(lists:keysort(1, TL), State, Acc).
+
+%% Strip the tag integers
+shuffle_untag([{T,X}, {T,Y} | TL], State, Acc) ->
+    shuffle_duplicates(TL, State, Acc, T, [Y, X]);
+shuffle_untag([{_,X} | TL], State, Acc) ->
+    shuffle_untag(TL, State, [X | Acc]);
+shuffle_untag([], State, Acc) ->
+    {Acc, State}.
+%%
+%% Collect duplicates
+shuffle_duplicates([{T,Z} | TL], State, Acc, T, Dups) ->
+    shuffle_duplicates(TL, State, Acc, T, [Z | Dups]);
+shuffle_duplicates(TL, State0, Acc0, _T, Dups) when is_list(TL) ->
+    %% Shuffle duplicates onto the result
+    {Acc1, State1} = shuffle_r(Dups, State0, Acc0),
+    shuffle_untag(TL, State1, Acc1).