forked from sensorium/Mozzi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcogl_sqrti.h
238 lines (196 loc) · 7.15 KB
/
cogl_sqrti.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
/**
* cogl_sqrti:
* @x: integer value
*
* Very fast fixed point implementation of square root for integers.
*
* This function is at least 6x faster than clib sqrt() on x86, and (this is
* not a typo!) about 500x faster on ARM without FPU. It's error is < 5%
* for arguments < %COGL_SQRTI_ARG_5_PERCENT and < 10% for arguments <
* %COGL_SQRTI_ARG_10_PERCENT. The maximum argument that can be passed to
* this function is COGL_SQRTI_ARG_MAX.
*
* Return value: integer square root.
*
*
* Since: 0.2
*/
#include "mozzi_fixmath.h"
//http://www.codecodex.com/wiki/Calculate_an_integer_square_root
//see Integer Square Roots by Jack W. Crenshaw, figure 2, http://www.embedded.com/electronics-blogs/programmer-s-toolbox/4219659/Integer-Square-Roots
typedef uint8_t uint8_t;
typedef unsigned short int uint16;
typedef unsigned long int uint32;
uint32 // OR uint16 OR uint8_t
isqrt32 (uint32 n) // OR isqrt16 ( uint16 n ) OR isqrt8 ( uint8_t n ) - respectively [ OR overloaded as isqrt (uint16_t?? n) in C++ ]
{
register uint32 // OR register uint16 OR register uint8_t - respectively
root, remainder, place;
root = 0;
remainder = n;
place = 0x40000000; // OR place = 0x4000; OR place = 0x40; - respectively
while (place > remainder)
place = place >> 2;
while (place)
{
if (remainder >= root + place)
{
remainder = remainder - root - place;
root = root + (place << 1);
}
root = root >> 1;
place = place >> 2;
}
return root;
}
//http://www.codecodex.com/wiki/Calculate_an_integer_square_root
uint16 // OR uint16 OR uint8_t
isqrt16 (uint16 n) // OR isqrt16 ( uint16 n ) OR isqrt8 ( uint8_t n ) - respectively [ OR overloaded as isqrt (uint16_t?? n) in C++ ]
{
register uint16 // OR register uint16 OR register uint8_t - respectively
root, remainder, place;
root = 0;
remainder = n;
place = 0x4000; // OR place = 0x4000; OR place = 0x40; - respectively
while (place > remainder)
place = place >> 2;
while (place)
{
if (remainder >= root + place)
{
remainder = remainder - root - place;
root = root + (place << 1);
}
root = root >> 1;
place = place >> 2;
}
return root;
}
/*-- isqrt -----------------------------------------------------------------*/
unsigned long isqrt(unsigned long n) {
unsigned long s, t;
#define sqrtBit(k) \
t = s+(1UL<<(k-1)); t <<= k+1; if (n >= t) { n -= t; s |= 1UL<<k; }
s = 0UL;
#ifdef __alpha
if (n >= 1UL<<62) { n -= 1UL<<62; s = 1UL<<31; }
sqrtBit(30); sqrtBit(29); sqrtBit(28); sqrtBit(27); sqrtBit(26);
sqrtBit(25); sqrtBit(24); sqrtBit(23); sqrtBit(22); sqrtBit(21);
sqrtBit(20); sqrtBit(19); sqrtBit(18); sqrtBit(17); sqrtBit(16);
sqrtBit(15);
#else
if (n >= 1UL<<30) { n -= 1UL<<30; s = 1UL<<15; }
#endif
sqrtBit(14); sqrtBit(13); sqrtBit(12); sqrtBit(11); sqrtBit(10);
sqrtBit(9); sqrtBit(8); sqrtBit(7); sqrtBit(6); sqrtBit(5);
sqrtBit(4); sqrtBit(3); sqrtBit(2); sqrtBit(1);
if (n > s<<1) s |= 1UL;
#undef sqrtBit
return s;
} /* end isqrt */
/**
http://stackoverflow.com/questions/1100090/looking-for-an-efficient-integer-square-root-algorithm-for-arm-thumb2
It's essentially from the Wikipedia article on square-root computing methods.
But it has been changed to use stdint.h types uint32_t etc. Strictly speaking
the return type could be changed to uint16_t.
* \brief Fast Square root algorithm
*
* Fractional parts of the answer are discarded. That is:
* - SquareRoot(3) --> 1
* - SquareRoot(4) --> 2
* - SquareRoot(5) --> 2
* - SquareRoot(8) --> 2
* - SquareRoot(9) --> 3
*
* \param[in] a_nInput - unsigned integer for which to find the square root
*
* \return Integer square root of the input value.
*/
uint16_t SquareRoot(uint32_t a_nInput)
{
uint32_t op = a_nInput;
uint32_t res = 0;
uint32_t one = 1uL << 30; // The second-to-top bit is set: use 1u << 14 for uint16_t type; use 1uL<<30 for uint32_t type
// "one" starts at the highest power of four <= than the argument.
while (one > op)
{
one >>= 2;
}
while (one != 0)
{
if (op >= res + one)
{
op = op - (res + one);
res = res + 2 * one;
}
res >>= 1;
one >>= 2;
}
return res;
}
int cogl_sqrti (int number)
{
/* This is a fixed point implementation of the Quake III sqrt algorithm,
* described, for example, at
* http://www.codemaestro.com/reviews/review00000105.html
*
* While the original QIII is extremely fast, the use of floating division
* and multiplication makes it perform very on arm processors without FPU.
*
* The key to successfully replacing the floating point operations with
* fixed point is in the choice of the fixed point format. The QIII
* algorithm does not calculate the square root, but its reciprocal ('y'
* below), which is only at the end turned to the inverse value. In order
* for the algorithm to produce satisfactory results, the reciprocal value
* must be represented with sufficient precission; the 16.16 we use
* elsewhere in clutter is not good enough, and 10.22 is used instead.
*/
//CoglFixed x;
long x;
uint32_t y_1; /* 10.22 fixed point */
uint32_t f = 0x600000; /* '1.5' as 10.22 fixed */
union
{
float f;
uint32_t i;
} flt, flt2;
flt.f = number;
x = Q15n0_to_Q15n16(number) / 2;
/* The QIII initial estimate */
flt.i = 0x5f3759df - ( flt.i >> 1 );
/* Now, we convert the float to 10.22 fixed. We exploit the mechanism
* described at http://www.d6.com/users/checker/pdfs/gdmfp.pdf.
*
* We want 22 bit fraction; a single precission float uses 23 bit
* mantisa, so we only need to add 2^(23-22) (no need for the 1.5
* multiplier as we are only dealing with positive numbers).
*
* Note: we have to use two separate variables here -- for some reason,
* if we try to use just the flt variable, gcc on ARM optimises the whole
* addition out, and it all goes pear shape, since without it, the bits
* in the float will not be correctly aligned.
*/
flt2.f = flt.f + 2.0;
flt2.i &= 0x7FFFFF;
/* Now we correct the estimate */
y_1 = (flt2.i >> 11) * (flt2.i >> 11);
y_1 = (y_1 >> 8) * (x >> 8);
y_1 = f - y_1;
flt2.i = (flt2.i >> 11) * (y_1 >> 11);
/* If the original argument is less than 342, we do another
* iteration to improve precission (for arguments >= 342, the single
* iteration produces generally better results).
*/
if (x < 171)
{
y_1 = (flt2.i >> 11) * (flt2.i >> 11);
y_1 = (y_1 >> 8) * (x >> 8);
y_1 = f - y_1;
flt2.i = (flt2.i >> 11) * (y_1 >> 11);
}
/* Invert, round and convert from 10.22 to an integer
* 0x1e3c68 is a magical rounding constant that produces slightly
* better results than 0x200000.
*/
return (number * flt2.i + 0x1e3c68) >> 22;
}