@@ -17,6 +17,10 @@ const Endian = std.builtin.Endian;
1717const Signedness = std .builtin .Signedness ;
1818const native_endian = builtin .cpu .arch .endian ();
1919
20+ // value based only on a few tests, could probably be adjusted
21+ // it may also depend on the cpu
22+ const recursive_division_threshold = 100 ;
23+
2024/// Returns the number of limbs needed to store `scalar`, which must be a
2125/// primitive integer value.
2226/// Note: A comptime-known upper bound of this value that may be used
@@ -979,18 +983,12 @@ pub const Mutable = struct {
979983 /// The upper bound for q limb count is given by `a.limbs`.
980984 ///
981985 /// `limbs_buffer` is used for temporary storage. The amount required is given by `calcDivLimbsBufferLen`.
982- pub fn divFloor (
983- q : * Mutable ,
984- r : * Mutable ,
985- a : Const ,
986- b : Const ,
987- limbs_buffer : []Limb ,
988- ) void {
986+ pub fn divFloor (q : * Mutable , r : * Mutable , a : Const , b : Const , limbs_buffer : []Limb , opt_allocator : ? Allocator ) void {
989987 const sep = a .limbs .len + 2 ;
990988 const x = a .toMutable (limbs_buffer [0.. sep ]);
991989 const y = b .toMutable (limbs_buffer [sep .. ]);
992990
993- div (q , r , x , y );
991+ div (q , r , x , y , opt_allocator );
994992
995993 // Note, `div` performs truncating division, which satisfies
996994 // @divTrunc(a, b) * b + @rem(a, b) = a
@@ -1106,18 +1104,12 @@ pub const Mutable = struct {
11061104 /// The upper bound for q limb count is given by `a.limbs.len`.
11071105 ///
11081106 /// `limbs_buffer` is used for temporary storage. The amount required is given by `calcDivLimbsBufferLen`.
1109- pub fn divTrunc (
1110- q : * Mutable ,
1111- r : * Mutable ,
1112- a : Const ,
1113- b : Const ,
1114- limbs_buffer : []Limb ,
1115- ) void {
1107+ pub fn divTrunc (q : * Mutable , r : * Mutable , a : Const , b : Const , limbs_buffer : []Limb , opt_allocator : ? Allocator ) void {
11161108 const sep = a .limbs .len + 2 ;
11171109 const x = a .toMutable (limbs_buffer [0.. sep ]);
11181110 const y = b .toMutable (limbs_buffer [sep .. ]);
11191111
1120- div (q , r , x , y );
1112+ div (q , r , x , y , opt_allocator );
11211113 }
11221114
11231115 /// r = a << shift, in other words, r = a * 2^shift
@@ -1441,7 +1433,8 @@ pub const Mutable = struct {
14411433 };
14421434
14431435 while (true ) {
1444- t .divFloor (& rem , a , s .toConst (), limbs_buffer [buf_index .. ]);
1436+ // TODO: pass an allocator or remove the need for it in the division
1437+ t .divFloor (& rem , a , s .toConst (), limbs_buffer [buf_index .. ], null );
14451438 t .add (t .toConst (), s .toConst ());
14461439 u .shiftRight (t .toConst (), 1 );
14471440
@@ -1566,7 +1559,7 @@ pub const Mutable = struct {
15661559 // Truncates by default.
15671560 // Requires no aliasing between all variables
15681561 // a must have the capacity to store a one limb shift
1569- fn div (q : * Mutable , r : * Mutable , a : Mutable , b : Mutable ) void {
1562+ fn div (q : * Mutable , r : * Mutable , a : Mutable , b : Mutable , opt_allocator : ? Allocator ) void {
15701563 if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
15711564 assert (! b .eqlZero ()); // division by zero
15721565 assert (q != r ); // illegal aliasing
@@ -1621,7 +1614,20 @@ pub const Mutable = struct {
16211614 a_limbs [1 ] = result .r [0 ];
16221615 a_limbs [0 ] = result .r [1 ];
16231616 } else {
1624- basecaseDivRem (q .limbs , a_limbs , b_limbs );
1617+ // Currently, an allocator is required to use karatsuba.
1618+ // Recursive division is only faster than the basecase division thanks to better
1619+ // multiplication algorithms. Without them, it is worse due to overhead, so we just
1620+ // default to the basecase
1621+ if (opt_allocator ) | allocator | {
1622+ // if `B.limbs.len` < `recursive_division_threshold`, the recursiveDivRem calls from unbalancedDivision
1623+ // will always immediatly default to basecaseDivRem, so just using the basecase is faster
1624+ if (b_limbs .len < recursive_division_threshold )
1625+ basecaseDivRem (q .limbs , a_limbs , b_limbs )
1626+ else
1627+ unbalancedDivision (q .limbs , a_limbs , b_limbs , allocator );
1628+ } else {
1629+ basecaseDivRem (q .limbs , a_limbs , b_limbs );
1630+ }
16251631 }
16261632
16271633 // we have r < b, so there is at most b.len() limbs
@@ -2195,7 +2201,8 @@ pub const Const = struct {
21952201 while (q .len >= 2 ) {
21962202 // Passing an allocator here would not be helpful since this division is destroying
21972203 // information, not creating it. [TODO citation needed]
2198- q .divTrunc (& r , q .toConst (), b , rest_of_the_limbs_buf );
2204+ // passing an allocator is not useful since b is one limb, so it will use lldiv1
2205+ q .divTrunc (& r , q .toConst (), b , rest_of_the_limbs_buf , null );
21992206
22002207 var r_word = r .limbs [0 ];
22012208 var i : usize = 0 ;
@@ -2903,7 +2910,7 @@ pub const Managed = struct {
29032910 var mr = r .toMutable ();
29042911 const limbs_buffer = try q .allocator .alloc (Limb , calcDivLimbsBufferLen (a .len (), b .len ()));
29052912 defer q .allocator .free (limbs_buffer );
2906- mq .divFloor (& mr , a .toConst (), b .toConst (), limbs_buffer );
2913+ mq .divFloor (& mr , a .toConst (), b .toConst (), limbs_buffer , q . allocator );
29072914 q .setMetadata (mq .positive , mq .len );
29082915 r .setMetadata (mr .positive , mr .len );
29092916 }
@@ -2920,7 +2927,7 @@ pub const Managed = struct {
29202927 var mr = r .toMutable ();
29212928 const limbs_buffer = try q .allocator .alloc (Limb , calcDivLimbsBufferLen (a .len (), b .len ()));
29222929 defer q .allocator .free (limbs_buffer );
2923- mq .divTrunc (& mr , a .toConst (), b .toConst (), limbs_buffer );
2930+ mq .divTrunc (& mr , a .toConst (), b .toConst (), limbs_buffer , q . allocator );
29242931 q .setMetadata (mq .positive , mq .len );
29252932 r .setMetadata (mr .positive , mr .len );
29262933 }
@@ -3116,7 +3123,7 @@ pub const Managed = struct {
31163123/// Different operators which can be used in accumulation style functions
31173124/// (llmulacc, llmulaccKaratsuba, llmulaccLong, llmulLimb). In all these functions,
31183125/// a computed value is accumulated with an existing result.
3119- const AccOp = enum {
3126+ pub const AccOp = enum {
31203127 /// The computed value is added to the result.
31213128 add ,
31223129
@@ -3667,7 +3674,7 @@ fn getllmulLimbAsm(comptime op: AccOp) []const u8 {
36673674
36683675/// r = r (op) a.
36693676/// The result is computed modulo `r.len`.
3670- fn llaccum (comptime op : AccOp , r : []Limb , a : []const Limb ) bool {
3677+ pub fn llaccum (comptime op : AccOp , r : []Limb , a : []const Limb ) bool {
36713678 assert (! slicesOverlap (r , a ) or @intFromPtr (r .ptr ) <= @intFromPtr (a .ptr ));
36723679 assert (r .len >= a .len );
36733680
@@ -3752,7 +3759,7 @@ pub fn llcmp(a: []const Limb, b: []const Limb) math.Order {
37523759/// r = r (op) y * xi
37533760/// returns whether the operation overflowed
37543761/// The result is computed modulo `r.len`.
3755- fn llmulaccLong (comptime op : AccOp , r : []Limb , a : []const Limb , b : []const Limb ) bool {
3762+ pub fn llmulaccLong (comptime op : AccOp , r : []Limb , a : []const Limb , b : []const Limb ) bool {
37563763 assert (r .len >= a .len );
37573764 assert (a .len >= b .len );
37583765
@@ -3770,7 +3777,7 @@ fn llmulaccLong(comptime op: AccOp, r: []Limb, a: []const Limb, b: []const Limb)
37703777//
37713778// usually, if y.len > acc.len, the caller wants a modular operation, and therefore does not care
37723779// about the overflow anyway
3773- fn llmulLimb (comptime op : AccOp , acc : []Limb , y : []const Limb , xi : Limb ) bool {
3780+ pub fn llmulLimb (comptime op : AccOp , acc : []Limb , y : []const Limb , xi : Limb ) bool {
37743781 assert (! slicesOverlap (acc , y ) or @intFromPtr (acc .ptr ) <= @intFromPtr (y .ptr ));
37753782
37763783 if (y .len == 0 ) return false ;
@@ -3877,7 +3884,7 @@ fn llnormalize(a: []const Limb) usize {
38773884}
38783885
38793886/// Knuth 4.3.1, Algorithm S.
3880- fn llsubcarry (r : []Limb , a : []const Limb , b : []const Limb ) Limb {
3887+ pub fn llsubcarry (r : []Limb , a : []const Limb , b : []const Limb ) Limb {
38813888 assert (a .len != 0 and b .len != 0 );
38823889 assert (a .len >= b .len );
38833890 assert (r .len >= a .len );
@@ -3942,6 +3949,121 @@ fn lladd(r: []Limb, a: []const Limb, b: []const Limb) void {
39423949 r [a .len ] = lladdcarry (r , a , b );
39433950}
39443951
3952+ /// Algorithm UnbalancedDivision from "Modern Computer Arithmetic" by Richard P. Brent and Paul Zimmermann
3953+ ///
3954+ /// `q` = `a` / `b` rem `r`
3955+ ///
3956+ /// Normalization and unnormalization steps are handled by the caller.
3957+ /// `r` is written in `a[0..b.len]` (`a[b.len..]` is NOT zeroed out).
3958+ /// The most significant limbs of `a` (input) can be zeroes.
3959+ ///
3960+ /// requires:
3961+ /// - `b.len` >= 2
3962+ /// - `a.len` >= 3
3963+ /// - `a.len` >= `b.len`
3964+ /// - `b` must be normalized (most significant bit of `b[b.len - 1]` must be set)
3965+ /// - `q.len >= calcDivQLenExact(a, b)` (the quotient must be able to fit in `q`)
3966+ /// a valid bound for q can be obtained more cheaply using `calcDivQLen`
3967+ /// - no overlap between q, a and b
3968+ fn unbalancedDivision (q : []Limb , a : []Limb , b : []const Limb , allocator : std.mem.Allocator ) void {
3969+ if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
3970+ assert (! slicesOverlap (q , a ));
3971+ assert (! slicesOverlap (q , b ));
3972+ assert (! slicesOverlap (a , b ));
3973+ assert (b .len >= 2 );
3974+ assert (a .len >= 3 );
3975+ assert (a .len >= b .len );
3976+ assert (q .len >= calcDivQLenExact (a , b ));
3977+ // b must be normalized
3978+ assert (@clz (b [b .len - 1 ]) == 0 );
3979+ }
3980+ const n = b .len ;
3981+ var m = a .len - b .len ;
3982+
3983+ // We slightly deviate from the paper, by allowing `m <= n`, and, instead of doing a division after
3984+ // the loop, we do it before, in case the quotient takes up m - n + 1 Limbs.
3985+ // For the next loops, the quotient is always guaranteed to fit in n Limbs.
3986+ //
3987+ // `q.len` may be only m limbs instead of m + 1 if the caller know the result will fit
3988+ // (which has already been asserted).
3989+ const k = m % n ;
3990+ recursiveDivRem (q [m - k .. @min (m + 1 , q .len )], a [m - k .. m + n ], b , allocator );
3991+ m -= k ;
3992+
3993+ while (m > 0 ) {
3994+ // At each loop, we divide <r, a[m - n .. m]> by `b`, with r = a[m .. m + n],
3995+ // the remainder from the previous loop. This is effectively a 2 word by 1 word division,
3996+ // except each word is n Limbs long. The process is analogous to `lldiv1`.
3997+ //
3998+ // The quotient is guaranteed to fit in `n` Limbs since r < b (from the previous iteration).
3999+ recursiveDivRem (q [m - n .. m ], a [m - n .. m + n ], b , allocator );
4000+ m -= n ;
4001+ }
4002+ }
4003+
4004+ /// Algorithm RecursiveDivRem from "Modern Computer Arithmetic" by Richard P. Brent and Paul Zimmermann
4005+ ///
4006+ /// `q` = `a` / `b` rem `r`
4007+ ///
4008+ /// Normalization and unnormalization steps are handled by the caller.
4009+ /// `r` is written in `a[0..b.len]` (`a[b.len..]` is NOT zeroed out).
4010+ /// The most significant limbs of `a` (input) can be zeroes.
4011+ ///
4012+ /// requires:
4013+ /// - `b.len` >= 2
4014+ /// - `a.len` >= 3
4015+ /// - `a.len` >= `b.len` and 2 * `b.len` >= `a.len`
4016+ /// - `b` must be normalized (most significant bit of `b[b.len - 1]` must be set)
4017+ /// - `q.len >= calcDivQLenExact(a, b)` (the quotient must be able to fit in `q`)
4018+ /// a valid bound for q can be obtained more cheaply using `calcDivQLen`
4019+ /// - no overlap between q, a and b
4020+ fn recursiveDivRem (q : []Limb , a : []Limb , b : []const Limb , allocator : std.mem.Allocator ) void {
4021+ if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
4022+ assert (! slicesOverlap (q , a ));
4023+ assert (! slicesOverlap (q , b ));
4024+ assert (! slicesOverlap (a , b ));
4025+ assert (b .len >= 2 );
4026+ assert (a .len >= 3 );
4027+ assert (a .len >= b .len );
4028+ // n >= m
4029+ assert (2 * b .len >= a .len );
4030+ assert (q .len >= std .math .big .int .calcDivQLenExact (a , b ));
4031+ // b must be normalized
4032+ assert (@clz (b [b .len - 1 ]) == 0 );
4033+ }
4034+
4035+ const n = b .len ;
4036+ const m = a .len - n ;
4037+
4038+ if (m < recursive_division_threshold ) return basecaseDivRem (q , a , b );
4039+
4040+ const k = m / 2 ;
4041+
4042+ const b0 = b [0.. k ];
4043+ const b1 = b [k .. ];
4044+ const q1 = q [k .. @min (q .len , m + 1 )];
4045+ const q0 = q [0.. k ];
4046+
4047+ // It is possible to reduce the probability of `a_is_negative`
4048+ // by adding a Limb to a[2*k..] and b[k..]. In practice, I did not
4049+ // see any meaningful speed difference
4050+ recursiveDivRem (q1 , a [2 * k .. ], b1 , allocator );
4051+ var a_is_negative = llmulacc (.sub , allocator , a [k .. ], q1 , b0 );
4052+
4053+ while (a_is_negative ) {
4054+ _ = llaccum (.sub , q1 , &.{1 });
4055+ a_is_negative = ! llaccum (.add , a [k .. ], b );
4056+ }
4057+
4058+ recursiveDivRem (q0 , a [k .. ][0.. n ], b1 , allocator );
4059+ a_is_negative = llmulacc (.sub , allocator , a , q0 , b0 );
4060+
4061+ while (a_is_negative ) {
4062+ _ = llaccum (.sub , q0 , &.{1 });
4063+ a_is_negative = ! llaccum (.add , a , b );
4064+ }
4065+ }
4066+
39454067/// Algorithm BasecaseDivRem from "Modern Computer Arithmetic" by Richard P. Brent and Paul Zimmermann
39464068/// modified to use Algorithm 5 from "Improved division by invariant integers"
39474069/// by Niels Möller and Torbjörn Granlund
@@ -3962,7 +4084,7 @@ fn lladd(r: []Limb, a: []const Limb, b: []const Limb) void {
39624084/// - no overlap between q, a and b
39634085// note: it is probably possible to make a and q overlap, by having q = a[m..a.len+1]
39644086// but not sure if it is worth it
3965- fn basecaseDivRem (q : []Limb , a : []Limb , b : []const Limb ) void {
4087+ pub fn basecaseDivRem (q : []Limb , a : []Limb , b : []const Limb ) void {
39664088 if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
39674089 assert (! slicesOverlap (q , a ));
39684090 assert (! slicesOverlap (q , b ));
@@ -4041,7 +4163,7 @@ fn basecaseDivRem(q: []Limb, a: []Limb, b: []const Limb) void {
40414163/// Requires:
40424164/// - b to be normalized (its most significant bit must be set)
40434165/// - the quotient must be able to fit in `q`
4044- fn lldiv1 (q : []Limb , a : []Limb , b : Limb ) void {
4166+ pub fn lldiv1 (q : []Limb , a : []Limb , b : Limb ) void {
40454167 if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
40464168 assert (q .len >= calcDivQLenExact (a , &.{b }));
40474169 // b must be normalized
@@ -4087,7 +4209,7 @@ fn lldiv1(q: []Limb, a: []Limb, b: Limb) void {
40874209/// `v` is computed from `d` using `reciprocal_word_3by2`
40884210///
40894211/// `r` is returned in big endian (`r[0]` is the high part of `r` and `r[1]` is its low one)
4090- fn div3by2 (U2 : Limb , U1 : Limb , U0 : Limb , d1 : Limb , d0 : Limb , v : Limb ) struct { q : Limb , r : [2 ]Limb } {
4212+ pub fn div3by2 (U2 : Limb , U1 : Limb , U0 : Limb , d1 : Limb , d0 : Limb , v : Limb ) struct { q : Limb , r : [2 ]Limb } {
40914213 if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
40924214 assert (@clz (d1 ) == 0 );
40934215 assert (order2 (U2 , U1 , d1 , d0 ) == .lt );
@@ -4186,7 +4308,7 @@ fn order2(ahi: Limb, alo: Limb, bhi: Limb, blo: Limb) math.Order {
41864308///
41874309/// Computes (B^3 - 1) / d - B, with B = 2^@bitSizeOf(T) and d = d1 * B + d0
41884310/// `d` (therefore `d1`) must be normalized
4189- fn reciprocalWord3by2 (d1 : Limb , d0 : Limb ) Limb {
4311+ pub fn reciprocalWord3by2 (d1 : Limb , d0 : Limb ) Limb {
41904312 assert (@clz (d1 ) == 0 );
41914313
41924314 var v = reciprocalWord (d1 );
0 commit comments