Skip to content

Commit 067d881

Browse files
committed
LIB: fast_log2: simplify loop while keeping it auto vectorizable
Signed-off-by: Stefan Westerfeld <[email protected]>
1 parent 586de6d commit 067d881

File tree

1 file changed

+22
-29
lines changed

1 file changed

+22
-29
lines changed

lib/smmath.hh

Lines changed: 22 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -585,36 +585,29 @@ union FloatIEEE754 {
585585
extern inline void
586586
fast_log2 (float *values, int n_values)
587587
{
588-
const int block_size = 4096; // guarantee fixed amount of stack space
589-
int fexp[block_size];
590-
while (n_values)
588+
/* this loop is written in a way that both gcc and clang should auto-vectorize it */
589+
for (int k = 0; k < n_values; k++)
591590
{
592-
int todo = std::min (n_values, block_size);
593-
int *values_i = reinterpret_cast<int *> (values);
594-
for (int k = 0; k < todo; k++)
595-
{
596-
const int EXPONENT_MASK = 0x7F800000;
597-
fexp[k] = (values_i[k] >> 23) - FloatIEEE754::BIAS; // extract exponent without bias (rely on sign bit == 0)
598-
values_i[k] = (values_i[k] & ~EXPONENT_MASK) | FloatIEEE754::BIAS << 23; // reset exponent to 2^0 so v_float is mantissa in [1..2]
599-
}
600-
for (int k = 0; k < todo; k++)
601-
{
602-
float r, x = values[k] - 1.0f;
603-
// x=[0..1]; r = log2 (x + 1);
604-
// h=0.0113916; // offset to reduce error at origin
605-
// f=(1/log(2)) * log(x+1); dom=[0-h;1+h]; p=remez(f, 6, dom, 1);
606-
// p = p - p(0); // discard non-0 offset
607-
// err=p-f; plot(err,[0;1]); plot(f,p,dom); // result in sollya
608-
r = x * -0.0259366993544709205147977455165000143561553284592936f;
609-
r = x * (+0.122047857676447181074792747820717519424533931189428f + r);
610-
r = x * (-0.27814297685064327713977752916286528359628147166014f + r);
611-
r = x * (+0.45764712300320092992105460899527194244236573556309f + r);
612-
r = x * (-0.71816105664624015087225994551041120290062342459945f + r);
613-
r = x * (+1.44254540258782520489769598315182363877204824648687f + r);
614-
values[k] = fexp[k] + r; // log2 (i) + log2 (x)
615-
}
616-
values += todo;
617-
n_values -= todo;
591+
const int EXPONENT_MASK = 0x7F800000;
592+
int iv;
593+
memcpy (&iv, &values[k], sizeof (float)); // iv = *(int *) &values[k]
594+
int fexp = (iv >> 23) - FloatIEEE754::BIAS; // extract exponent without bias (rely on sign bit == 0)
595+
iv = (iv & ~EXPONENT_MASK) | FloatIEEE754::BIAS << 23; // reset exponent to 2^0 so v_float is mantissa in [1..2]
596+
float r, x;
597+
memcpy (&x, &iv, sizeof (float)); // x = *(float *) &iv;
598+
x -= 1;
599+
// x=[0..1]; r = log2 (x + 1);
600+
// h=0.0113916; // offset to reduce error at origin
601+
// f=(1/log(2)) * log(x+1); dom=[0-h;1+h]; p=remez(f, 6, dom, 1);
602+
// p = p - p(0); // discard non-0 offset
603+
// err=p-f; plot(err,[0;1]); plot(f,p,dom); // result in sollya
604+
r = x * -0.0259366993544709205147977455165000143561553284592936f;
605+
r = x * (+0.122047857676447181074792747820717519424533931189428f + r);
606+
r = x * (-0.27814297685064327713977752916286528359628147166014f + r);
607+
r = x * (+0.45764712300320092992105460899527194244236573556309f + r);
608+
r = x * (-0.71816105664624015087225994551041120290062342459945f + r);
609+
r = x * (+1.44254540258782520489769598315182363877204824648687f + r);
610+
values[k] = fexp + r; // log2 (i) + log2 (x)
618611
}
619612
}
620613
////////////// end: code based on log2 code from Anklang/ASE by Tim Janik

0 commit comments

Comments
 (0)