@@ -17,6 +17,10 @@ const Endian = std.builtin.Endian;
17
17
const Signedness = std .builtin .Signedness ;
18
18
const native_endian = builtin .cpu .arch .endian ();
19
19
20
+ // value based only on a few tests, could probably be adjusted
21
+ // it may also depend on the cpu
22
+ const recursive_division_threshold = 100 ;
23
+
20
24
/// Returns the number of limbs needed to store `scalar`, which must be a
21
25
/// primitive integer value.
22
26
/// Note: A comptime-known upper bound of this value that may be used
@@ -979,18 +983,12 @@ pub const Mutable = struct {
979
983
/// The upper bound for q limb count is given by `a.limbs`.
980
984
///
981
985
/// `limbs_buffer` is used for temporary storage. The amount required is given by `calcDivLimbsBufferLen`.
982
- pub fn divFloor (
983
- q : * Mutable ,
984
- r : * Mutable ,
985
- a : Const ,
986
- b : Const ,
987
- limbs_buffer : []Limb ,
988
- ) void {
986
+ pub fn divFloor (q : * Mutable , r : * Mutable , a : Const , b : Const , limbs_buffer : []Limb , opt_allocator : ? Allocator ) void {
989
987
const sep = a .limbs .len + 2 ;
990
988
const x = a .toMutable (limbs_buffer [0.. sep ]);
991
989
const y = b .toMutable (limbs_buffer [sep .. ]);
992
990
993
- div (q , r , x , y );
991
+ div (q , r , x , y , opt_allocator );
994
992
995
993
// Note, `div` performs truncating division, which satisfies
996
994
// @divTrunc(a, b) * b + @rem(a, b) = a
@@ -1106,18 +1104,12 @@ pub const Mutable = struct {
1106
1104
/// The upper bound for q limb count is given by `a.limbs.len`.
1107
1105
///
1108
1106
/// `limbs_buffer` is used for temporary storage. The amount required is given by `calcDivLimbsBufferLen`.
1109
- pub fn divTrunc (
1110
- q : * Mutable ,
1111
- r : * Mutable ,
1112
- a : Const ,
1113
- b : Const ,
1114
- limbs_buffer : []Limb ,
1115
- ) void {
1107
+ pub fn divTrunc (q : * Mutable , r : * Mutable , a : Const , b : Const , limbs_buffer : []Limb , opt_allocator : ? Allocator ) void {
1116
1108
const sep = a .limbs .len + 2 ;
1117
1109
const x = a .toMutable (limbs_buffer [0.. sep ]);
1118
1110
const y = b .toMutable (limbs_buffer [sep .. ]);
1119
1111
1120
- div (q , r , x , y );
1112
+ div (q , r , x , y , opt_allocator );
1121
1113
}
1122
1114
1123
1115
/// r = a << shift, in other words, r = a * 2^shift
@@ -1441,7 +1433,8 @@ pub const Mutable = struct {
1441
1433
};
1442
1434
1443
1435
while (true ) {
1444
- t .divFloor (& rem , a , s .toConst (), limbs_buffer [buf_index .. ]);
1436
+ // TODO: pass an allocator or remove the need for it in the division
1437
+ t .divFloor (& rem , a , s .toConst (), limbs_buffer [buf_index .. ], null );
1445
1438
t .add (t .toConst (), s .toConst ());
1446
1439
u .shiftRight (t .toConst (), 1 );
1447
1440
@@ -1566,7 +1559,7 @@ pub const Mutable = struct {
1566
1559
// Truncates by default.
1567
1560
// Requires no aliasing between all variables
1568
1561
// a must have the capacity to store a one limb shift
1569
- fn div (q : * Mutable , r : * Mutable , a : Mutable , b : Mutable ) void {
1562
+ fn div (q : * Mutable , r : * Mutable , a : Mutable , b : Mutable , opt_allocator : ? Allocator ) void {
1570
1563
if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
1571
1564
assert (! b .eqlZero ()); // division by zero
1572
1565
assert (q != r ); // illegal aliasing
@@ -1621,7 +1614,20 @@ pub const Mutable = struct {
1621
1614
a_limbs [1 ] = result .r [0 ];
1622
1615
a_limbs [0 ] = result .r [1 ];
1623
1616
} else {
1624
- basecaseDivRem (q .limbs , a_limbs , b_limbs );
1617
+ // Currently, an allocator is required to use karatsuba.
1618
+ // Recursive division is only faster than the basecase division thanks to better
1619
+ // multiplication algorithms. Without them, it is worse due to overhead, so we just
1620
+ // default to the basecase
1621
+ if (opt_allocator ) | allocator | {
1622
+ // if `B.limbs.len` < `recursive_division_threshold`, the recursiveDivRem calls from unbalancedDivision
1623
+ // will always immediatly default to basecaseDivRem, so just using the basecase is faster
1624
+ if (b_limbs .len < recursive_division_threshold )
1625
+ basecaseDivRem (q .limbs , a_limbs , b_limbs )
1626
+ else
1627
+ unbalancedDivision (q .limbs , a_limbs , b_limbs , allocator );
1628
+ } else {
1629
+ basecaseDivRem (q .limbs , a_limbs , b_limbs );
1630
+ }
1625
1631
}
1626
1632
1627
1633
// we have r < b, so there is at most b.len() limbs
@@ -2195,7 +2201,8 @@ pub const Const = struct {
2195
2201
while (q .len >= 2 ) {
2196
2202
// Passing an allocator here would not be helpful since this division is destroying
2197
2203
// information, not creating it. [TODO citation needed]
2198
- q .divTrunc (& r , q .toConst (), b , rest_of_the_limbs_buf );
2204
+ // passing an allocator is not useful since b is one limb, so it will use lldiv1
2205
+ q .divTrunc (& r , q .toConst (), b , rest_of_the_limbs_buf , null );
2199
2206
2200
2207
var r_word = r .limbs [0 ];
2201
2208
var i : usize = 0 ;
@@ -2903,7 +2910,7 @@ pub const Managed = struct {
2903
2910
var mr = r .toMutable ();
2904
2911
const limbs_buffer = try q .allocator .alloc (Limb , calcDivLimbsBufferLen (a .len (), b .len ()));
2905
2912
defer q .allocator .free (limbs_buffer );
2906
- mq .divFloor (& mr , a .toConst (), b .toConst (), limbs_buffer );
2913
+ mq .divFloor (& mr , a .toConst (), b .toConst (), limbs_buffer , q . allocator );
2907
2914
q .setMetadata (mq .positive , mq .len );
2908
2915
r .setMetadata (mr .positive , mr .len );
2909
2916
}
@@ -2920,7 +2927,7 @@ pub const Managed = struct {
2920
2927
var mr = r .toMutable ();
2921
2928
const limbs_buffer = try q .allocator .alloc (Limb , calcDivLimbsBufferLen (a .len (), b .len ()));
2922
2929
defer q .allocator .free (limbs_buffer );
2923
- mq .divTrunc (& mr , a .toConst (), b .toConst (), limbs_buffer );
2930
+ mq .divTrunc (& mr , a .toConst (), b .toConst (), limbs_buffer , q . allocator );
2924
2931
q .setMetadata (mq .positive , mq .len );
2925
2932
r .setMetadata (mr .positive , mr .len );
2926
2933
}
@@ -3116,7 +3123,7 @@ pub const Managed = struct {
3116
3123
/// Different operators which can be used in accumulation style functions
3117
3124
/// (llmulacc, llmulaccKaratsuba, llmulaccLong, llmulLimb). In all these functions,
3118
3125
/// a computed value is accumulated with an existing result.
3119
- const AccOp = enum {
3126
+ pub const AccOp = enum {
3120
3127
/// The computed value is added to the result.
3121
3128
add ,
3122
3129
@@ -3667,7 +3674,7 @@ fn getllmulLimbAsm(comptime op: AccOp) []const u8 {
3667
3674
3668
3675
/// r = r (op) a.
3669
3676
/// The result is computed modulo `r.len`.
3670
- fn llaccum (comptime op : AccOp , r : []Limb , a : []const Limb ) bool {
3677
+ pub fn llaccum (comptime op : AccOp , r : []Limb , a : []const Limb ) bool {
3671
3678
assert (! slicesOverlap (r , a ) or @intFromPtr (r .ptr ) <= @intFromPtr (a .ptr ));
3672
3679
assert (r .len >= a .len );
3673
3680
@@ -3752,7 +3759,7 @@ pub fn llcmp(a: []const Limb, b: []const Limb) math.Order {
3752
3759
/// r = r (op) y * xi
3753
3760
/// returns whether the operation overflowed
3754
3761
/// The result is computed modulo `r.len`.
3755
- fn llmulaccLong (comptime op : AccOp , r : []Limb , a : []const Limb , b : []const Limb ) bool {
3762
+ pub fn llmulaccLong (comptime op : AccOp , r : []Limb , a : []const Limb , b : []const Limb ) bool {
3756
3763
assert (r .len >= a .len );
3757
3764
assert (a .len >= b .len );
3758
3765
@@ -3770,7 +3777,7 @@ fn llmulaccLong(comptime op: AccOp, r: []Limb, a: []const Limb, b: []const Limb)
3770
3777
//
3771
3778
// usually, if y.len > acc.len, the caller wants a modular operation, and therefore does not care
3772
3779
// about the overflow anyway
3773
- fn llmulLimb (comptime op : AccOp , acc : []Limb , y : []const Limb , xi : Limb ) bool {
3780
+ pub fn llmulLimb (comptime op : AccOp , acc : []Limb , y : []const Limb , xi : Limb ) bool {
3774
3781
assert (! slicesOverlap (acc , y ) or @intFromPtr (acc .ptr ) <= @intFromPtr (y .ptr ));
3775
3782
3776
3783
if (y .len == 0 ) return false ;
@@ -3877,7 +3884,7 @@ fn llnormalize(a: []const Limb) usize {
3877
3884
}
3878
3885
3879
3886
/// Knuth 4.3.1, Algorithm S.
3880
- fn llsubcarry (r : []Limb , a : []const Limb , b : []const Limb ) Limb {
3887
+ pub fn llsubcarry (r : []Limb , a : []const Limb , b : []const Limb ) Limb {
3881
3888
assert (a .len != 0 and b .len != 0 );
3882
3889
assert (a .len >= b .len );
3883
3890
assert (r .len >= a .len );
@@ -3942,6 +3949,121 @@ fn lladd(r: []Limb, a: []const Limb, b: []const Limb) void {
3942
3949
r [a .len ] = lladdcarry (r , a , b );
3943
3950
}
3944
3951
3952
+ /// Algorithm UnbalancedDivision from "Modern Computer Arithmetic" by Richard P. Brent and Paul Zimmermann
3953
+ ///
3954
+ /// `q` = `a` / `b` rem `r`
3955
+ ///
3956
+ /// Normalization and unnormalization steps are handled by the caller.
3957
+ /// `r` is written in `a[0..b.len]` (`a[b.len..]` is NOT zeroed out).
3958
+ /// The most significant limbs of `a` (input) can be zeroes.
3959
+ ///
3960
+ /// requires:
3961
+ /// - `b.len` >= 2
3962
+ /// - `a.len` >= 3
3963
+ /// - `a.len` >= `b.len`
3964
+ /// - `b` must be normalized (most significant bit of `b[b.len - 1]` must be set)
3965
+ /// - `q.len >= calcDivQLenExact(a, b)` (the quotient must be able to fit in `q`)
3966
+ /// a valid bound for q can be obtained more cheaply using `calcDivQLen`
3967
+ /// - no overlap between q, a and b
3968
+ fn unbalancedDivision (q : []Limb , a : []Limb , b : []const Limb , allocator : std.mem.Allocator ) void {
3969
+ if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
3970
+ assert (! slicesOverlap (q , a ));
3971
+ assert (! slicesOverlap (q , b ));
3972
+ assert (! slicesOverlap (a , b ));
3973
+ assert (b .len >= 2 );
3974
+ assert (a .len >= 3 );
3975
+ assert (a .len >= b .len );
3976
+ assert (q .len >= calcDivQLenExact (a , b ));
3977
+ // b must be normalized
3978
+ assert (@clz (b [b .len - 1 ]) == 0 );
3979
+ }
3980
+ const n = b .len ;
3981
+ var m = a .len - b .len ;
3982
+
3983
+ // We slightly deviate from the paper, by allowing `m <= n`, and, instead of doing a division after
3984
+ // the loop, we do it before, in case the quotient takes up m - n + 1 Limbs.
3985
+ // For the next loops, the quotient is always guaranteed to fit in n Limbs.
3986
+ //
3987
+ // `q.len` may be only m limbs instead of m + 1 if the caller know the result will fit
3988
+ // (which has already been asserted).
3989
+ const k = m % n ;
3990
+ recursiveDivRem (q [m - k .. @min (m + 1 , q .len )], a [m - k .. m + n ], b , allocator );
3991
+ m -= k ;
3992
+
3993
+ while (m > 0 ) {
3994
+ // At each loop, we divide <r, a[m - n .. m]> by `b`, with r = a[m .. m + n],
3995
+ // the remainder from the previous loop. This is effectively a 2 word by 1 word division,
3996
+ // except each word is n Limbs long. The process is analogous to `lldiv1`.
3997
+ //
3998
+ // The quotient is guaranteed to fit in `n` Limbs since r < b (from the previous iteration).
3999
+ recursiveDivRem (q [m - n .. m ], a [m - n .. m + n ], b , allocator );
4000
+ m -= n ;
4001
+ }
4002
+ }
4003
+
4004
+ /// Algorithm RecursiveDivRem from "Modern Computer Arithmetic" by Richard P. Brent and Paul Zimmermann
4005
+ ///
4006
+ /// `q` = `a` / `b` rem `r`
4007
+ ///
4008
+ /// Normalization and unnormalization steps are handled by the caller.
4009
+ /// `r` is written in `a[0..b.len]` (`a[b.len..]` is NOT zeroed out).
4010
+ /// The most significant limbs of `a` (input) can be zeroes.
4011
+ ///
4012
+ /// requires:
4013
+ /// - `b.len` >= 2
4014
+ /// - `a.len` >= 3
4015
+ /// - `a.len` >= `b.len` and 2 * `b.len` >= `a.len`
4016
+ /// - `b` must be normalized (most significant bit of `b[b.len - 1]` must be set)
4017
+ /// - `q.len >= calcDivQLenExact(a, b)` (the quotient must be able to fit in `q`)
4018
+ /// a valid bound for q can be obtained more cheaply using `calcDivQLen`
4019
+ /// - no overlap between q, a and b
4020
+ fn recursiveDivRem (q : []Limb , a : []Limb , b : []const Limb , allocator : std.mem.Allocator ) void {
4021
+ if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
4022
+ assert (! slicesOverlap (q , a ));
4023
+ assert (! slicesOverlap (q , b ));
4024
+ assert (! slicesOverlap (a , b ));
4025
+ assert (b .len >= 2 );
4026
+ assert (a .len >= 3 );
4027
+ assert (a .len >= b .len );
4028
+ // n >= m
4029
+ assert (2 * b .len >= a .len );
4030
+ assert (q .len >= std .math .big .int .calcDivQLenExact (a , b ));
4031
+ // b must be normalized
4032
+ assert (@clz (b [b .len - 1 ]) == 0 );
4033
+ }
4034
+
4035
+ const n = b .len ;
4036
+ const m = a .len - n ;
4037
+
4038
+ if (m < recursive_division_threshold ) return basecaseDivRem (q , a , b );
4039
+
4040
+ const k = m / 2 ;
4041
+
4042
+ const b0 = b [0.. k ];
4043
+ const b1 = b [k .. ];
4044
+ const q1 = q [k .. @min (q .len , m + 1 )];
4045
+ const q0 = q [0.. k ];
4046
+
4047
+ // It is possible to reduce the probability of `a_is_negative`
4048
+ // by adding a Limb to a[2*k..] and b[k..]. In practice, I did not
4049
+ // see any meaningful speed difference
4050
+ recursiveDivRem (q1 , a [2 * k .. ], b1 , allocator );
4051
+ var a_is_negative = llmulacc (.sub , allocator , a [k .. ], q1 , b0 );
4052
+
4053
+ while (a_is_negative ) {
4054
+ _ = llaccum (.sub , q1 , &.{1 });
4055
+ a_is_negative = ! llaccum (.add , a [k .. ], b );
4056
+ }
4057
+
4058
+ recursiveDivRem (q0 , a [k .. ][0.. n ], b1 , allocator );
4059
+ a_is_negative = llmulacc (.sub , allocator , a , q0 , b0 );
4060
+
4061
+ while (a_is_negative ) {
4062
+ _ = llaccum (.sub , q0 , &.{1 });
4063
+ a_is_negative = ! llaccum (.add , a , b );
4064
+ }
4065
+ }
4066
+
3945
4067
/// Algorithm BasecaseDivRem from "Modern Computer Arithmetic" by Richard P. Brent and Paul Zimmermann
3946
4068
/// modified to use Algorithm 5 from "Improved division by invariant integers"
3947
4069
/// by Niels Möller and Torbjörn Granlund
@@ -3962,7 +4084,7 @@ fn lladd(r: []Limb, a: []const Limb, b: []const Limb) void {
3962
4084
/// - no overlap between q, a and b
3963
4085
// note: it is probably possible to make a and q overlap, by having q = a[m..a.len+1]
3964
4086
// but not sure if it is worth it
3965
- fn basecaseDivRem (q : []Limb , a : []Limb , b : []const Limb ) void {
4087
+ pub fn basecaseDivRem (q : []Limb , a : []Limb , b : []const Limb ) void {
3966
4088
if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
3967
4089
assert (! slicesOverlap (q , a ));
3968
4090
assert (! slicesOverlap (q , b ));
@@ -4041,7 +4163,7 @@ fn basecaseDivRem(q: []Limb, a: []Limb, b: []const Limb) void {
4041
4163
/// Requires:
4042
4164
/// - b to be normalized (its most significant bit must be set)
4043
4165
/// - the quotient must be able to fit in `q`
4044
- fn lldiv1 (q : []Limb , a : []Limb , b : Limb ) void {
4166
+ pub fn lldiv1 (q : []Limb , a : []Limb , b : Limb ) void {
4045
4167
if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
4046
4168
assert (q .len >= calcDivQLenExact (a , &.{b }));
4047
4169
// b must be normalized
@@ -4087,7 +4209,7 @@ fn lldiv1(q: []Limb, a: []Limb, b: Limb) void {
4087
4209
/// `v` is computed from `d` using `reciprocal_word_3by2`
4088
4210
///
4089
4211
/// `r` is returned in big endian (`r[0]` is the high part of `r` and `r[1]` is its low one)
4090
- fn div3by2 (U2 : Limb , U1 : Limb , U0 : Limb , d1 : Limb , d0 : Limb , v : Limb ) struct { q : Limb , r : [2 ]Limb } {
4212
+ pub fn div3by2 (U2 : Limb , U1 : Limb , U0 : Limb , d1 : Limb , d0 : Limb , v : Limb ) struct { q : Limb , r : [2 ]Limb } {
4091
4213
if (builtin .mode == .Debug or builtin .mode == .ReleaseSafe ) {
4092
4214
assert (@clz (d1 ) == 0 );
4093
4215
assert (order2 (U2 , U1 , d1 , d0 ) == .lt );
@@ -4186,7 +4308,7 @@ fn order2(ahi: Limb, alo: Limb, bhi: Limb, blo: Limb) math.Order {
4186
4308
///
4187
4309
/// Computes (B^3 - 1) / d - B, with B = 2^@bitSizeOf(T) and d = d1 * B + d0
4188
4310
/// `d` (therefore `d1`) must be normalized
4189
- fn reciprocalWord3by2 (d1 : Limb , d0 : Limb ) Limb {
4311
+ pub fn reciprocalWord3by2 (d1 : Limb , d0 : Limb ) Limb {
4190
4312
assert (@clz (d1 ) == 0 );
4191
4313
4192
4314
var v = reciprocalWord (d1 );
0 commit comments