@@ -585,36 +585,29 @@ union FloatIEEE754 {
585
585
extern inline void
586
586
fast_log2 (float *values, int n_values)
587
587
{
588
- const int block_size = 4096 ; // guarantee fixed amount of stack space
589
- int fexp[block_size];
590
- while (n_values)
588
+ /* this loop is written in a way that both, gcc and clang should auto vectorize it */
589
+ for (int k = 0 ; k < n_values; k++)
591
590
{
592
- int todo = std::min (n_values, block_size);
593
- int *values_i = reinterpret_cast <int *> (values);
594
- for (int k = 0 ; k < todo; k++)
595
- {
596
- const int EXPONENT_MASK = 0x7F800000 ;
597
- fexp[k] = (values_i[k] >> 23 ) - FloatIEEE754::BIAS; // extract exponent without bias (rely on sign bit == 0)
598
- values_i[k] = (values_i[k] & ~EXPONENT_MASK) | FloatIEEE754::BIAS << 23 ; // reset exponent to 2^0 so v_float is mantissa in [1..2]
599
- }
600
- for (int k = 0 ; k < todo; k++)
601
- {
602
- float r, x = values[k] - 1 .0f ;
603
- // x=[0..1]; r = log2 (x + 1);
604
- // h=0.0113916; // offset to reduce error at origin
605
- // f=(1/log(2)) * log(x+1); dom=[0-h;1+h]; p=remez(f, 6, dom, 1);
606
- // p = p - p(0); // discard non-0 offset
607
- // err=p-f; plot(err,[0;1]); plot(f,p,dom); // result in sollya
608
- r = x * -0 .0259366993544709205147977455165000143561553284592936f ;
609
- r = x * (+0 .122047857676447181074792747820717519424533931189428f + r);
610
- r = x * (-0 .27814297685064327713977752916286528359628147166014f + r);
611
- r = x * (+0 .45764712300320092992105460899527194244236573556309f + r);
612
- r = x * (-0 .71816105664624015087225994551041120290062342459945f + r);
613
- r = x * (+1 .44254540258782520489769598315182363877204824648687f + r);
614
- values[k] = fexp[k] + r; // log2 (i) + log2 (x)
615
- }
616
- values += todo;
617
- n_values -= todo;
591
+ const int EXPONENT_MASK = 0x7F800000 ;
592
+ int iv;
593
+ memcpy (&iv, &values[k], sizeof (float )); // iv = *(int *) &values[k]
594
+ int fexp = (iv >> 23 ) - FloatIEEE754::BIAS; // extract exponent without bias (rely on sign bit == 0)
595
+ iv = (iv & ~EXPONENT_MASK) | FloatIEEE754::BIAS << 23 ; // reset exponent to 2^0 so v_float is mantissa in [1..2]
596
+ float r, x;
597
+ memcpy (&x, &iv, sizeof (float )); // x = *(float *) &iv;
598
+ x -= 1 ;
599
+ // x=[0..1]; r = log2 (x + 1);
600
+ // h=0.0113916; // offset to reduce error at origin
601
+ // f=(1/log(2)) * log(x+1); dom=[0-h;1+h]; p=remez(f, 6, dom, 1);
602
+ // p = p - p(0); // discard non-0 offset
603
+ // err=p-f; plot(err,[0;1]); plot(f,p,dom); // result in sollya
604
+ r = x * -0 .0259366993544709205147977455165000143561553284592936f ;
605
+ r = x * (+0 .122047857676447181074792747820717519424533931189428f + r);
606
+ r = x * (-0 .27814297685064327713977752916286528359628147166014f + r);
607
+ r = x * (+0 .45764712300320092992105460899527194244236573556309f + r);
608
+ r = x * (-0 .71816105664624015087225994551041120290062342459945f + r);
609
+ r = x * (+1 .44254540258782520489769598315182363877204824648687f + r);
610
+ values[k] = fexp + r; // log2 (i) + log2 (x)
618
611
}
619
612
}
620
613
// //////////// end: code based on log2 code from Anklang/ASE by Tim Janik
0 commit comments