/*
build with gcc -O3 -mfloat-abi=softfp -mfpu=neon -march=armv7-a -mtune=cortex-a9 -Wall -W neon_mathfun_test.c -lm

results on a macbook with 1.83GHz Core 1 Duo (apple gcc 4.0.1)
command line: gcc -O2 -Wall -W -DHAVE_VECLIB -msse sse_mathfun_test.c -framework Accelerate

checking sines on [0*Pi, 1*Pi]
max deviation from sinf(x): 5.96046e-08 at 0.679296388013*Pi, max deviation from cephes_sin(x): 0
max deviation from cosf(x): 5.96046e-08 at 0.755605310477*Pi, max deviation from cephes_cos(x): 0
deviation of sin(x)^2+cos(x)^2-1: 1.78814e-07 (ref deviation is 1.19209e-07)
   ->> precision OK for the sin_ps / cos_ps / sincos_ps <<-

checking sines on [-1000*Pi, 1000*Pi]
max deviation from sinf(x): 5.96046e-08 at  1821.37678456*Pi, max deviation from cephes_sin(x): 0
max deviation from cosf(x): 5.96046e-08 at -999.820056289*Pi, max deviation from cephes_cos(x): 0
deviation of sin(x)^2+cos(x)^2-1: 1.78814e-07 (ref deviation is 1.19209e-07)
   ->> precision OK for the sin_ps / cos_ps / sincos_ps <<-

checking exp/log [-60, 60]
max (relative) deviation from expf(x): 1.18758e-07 at  45.0583610535, max deviation from cephes_expf(x): 0
max (absolute) deviation from logf(x): 1.19209e-07 at  1.66063511372, max deviation from cephes_logf(x): 0
deviation of x - log(exp(x)): 1.19209e-07 (ref deviation is 5.96046e-08)
   ->> precision OK for the exp_ps / log_ps <<-

exp([        -1000,          -100,           100,          1000]) = [            0,             0, 2.4061436e+38, 2.4061436e+38]
exp([          nan,           inf,          -inf,           nan]) = [2.4061436e+38, 2.4061436e+38,             0, 2.4061436e+38]
log([            0,           -10,         1e+30, 1.0005271e-42]) = [          nan,           nan,     69.077553,    -87.336548]
log([          nan,           inf,          -inf,           nan]) = [   -87.336548,     88.722839,           nan,    -87.336548]
sin([          nan,           inf,          -inf,           nan]) = [          nan,           nan,           nan,           nan]
cos([          nan,           inf,          -inf,           nan]) = [          nan,           nan,           nan,           nan]
sin([       -1e+30,       -100000,         1e+30,        100000]) = [          inf,  -0.035749275,          -inf,   0.035749275]
cos([       -1e+30,       -100000,         1e+30,        100000]) = [          nan,    -0.9993608,           nan,    -0.9993608]
benching                 sinf .. ->    3.2 millions of vector evaluations/second -> 142 cycles/value on a 1830MHz computer
benching                 cosf .. ->    3.2 millions of vector evaluations/second -> 142 cycles/value on a 1830MHz computer
benching         sincos (x87) .. ->    2.8 millions of vector evaluations/second -> 161 cycles/value on a 1830MHz computer
benching                 expf .. ->    3.0 millions of vector evaluations/second -> 148 cycles/value on a 1830MHz computer
benching                 logf .. ->    3.0 millions of vector evaluations/second -> 150 cycles/value on a 1830MHz computer
benching          cephes_sinf .. ->    4.5 millions of vector evaluations/second -> 100 cycles/value on a 1830MHz computer
benching          cephes_cosf .. ->    4.9 millions of vector evaluations/second ->  92 cycles/value on a 1830MHz computer
benching          cephes_expf .. ->    3.0 millions of vector evaluations/second -> 151 cycles/value on a 1830MHz computer
benching          cephes_logf .. ->    2.6 millions of vector evaluations/second -> 172 cycles/value on a 1830MHz computer
benching               sin_ps .. ->   18.1 millions of vector evaluations/second ->  25 cycles/value on a 1830MHz computer
benching               cos_ps .. ->   18.2 millions of vector evaluations/second ->  25 cycles/value on a 1830MHz computer
benching            sincos_ps .. ->   15.0 millions of vector evaluations/second ->  30 cycles/value on a 1830MHz computer
benching               exp_ps .. ->   17.6 millions of vector evaluations/second ->  26 cycles/value on a 1830MHz computer
benching               log_ps .. ->   15.5 millions of vector evaluations/second ->  29 cycles/value on a 1830MHz computer
benching                vsinf .. ->   14.3 millions of vector evaluations/second ->  32 cycles/value on a 1830MHz computer
benching                vcosf .. ->   14.4 millions of vector evaluations/second ->  32 cycles/value on a 1830MHz computer
benching                vexpf .. ->   12.0 millions of vector evaluations/second ->  38 cycles/value on a 1830MHz computer
benching                vlogf .. ->   13.1 millions of vector evaluations/second ->  35 cycles/value on a 1830MHz computer


on a 2600MHz opteron running linux, with the 64 bits acml math vector lib (gcc 4.2):
command line: gcc-4.2 -msse -O3 -Wall -W sse_mathfun_test.c -lm -DHAVE_ACML -I /usr/local/acml3.6.0/pathscale64/include /usr/local/acml3.6.0/pathscale64/lib/libacml_mv.a

benching                 sinf .. ->    6.3 millions of vector evaluations/second -> 103 cycles/value on a 2600MHz computer
benching                 cosf .. ->    5.6 millions of vector evaluations/second -> 115 cycles/value on a 2600MHz computer
benching         sincos (x87) .. ->    4.2 millions of vector evaluations/second -> 153 cycles/value on a 2600MHz computer
benching                 expf .. ->    1.1 millions of vector evaluations/second -> 546 cycles/value on a 2600MHz computer
benching                 logf .. ->    4.7 millions of vector evaluations/second -> 138 cycles/value on a 2600MHz computer
benching          cephes_sinf .. ->   11.6 millions of vector evaluations/second ->  56 cycles/value on a 2600MHz computer
benching          cephes_cosf .. ->    8.7 millions of vector evaluations/second ->  74 cycles/value on a 2600MHz computer
benching          cephes_expf .. ->    3.7 millions of vector evaluations/second -> 172 cycles/value on a 2600MHz computer
benching          cephes_logf .. ->    5.5 millions of vector evaluations/second -> 117 cycles/value on a 2600MHz computer
benching               sin_ps .. ->   26.1 millions of vector evaluations/second ->  25 cycles/value on a 2600MHz computer
benching               cos_ps .. ->   26.1 millions of vector evaluations/second ->  25 cycles/value on a 2600MHz computer
benching            sincos_ps .. ->   23.7 millions of vector evaluations/second ->  27 cycles/value on a 2600MHz computer
benching               exp_ps .. ->   22.9 millions of vector evaluations/second ->  28 cycles/value on a 2600MHz computer
benching               log_ps .. ->   21.6 millions of vector evaluations/second ->  30 cycles/value on a 2600MHz computer
benching       acml vrs4_sinf .. ->   17.9 millions of vector evaluations/second ->  36 cycles/value on a 2600MHz computer
benching       acml vrs4_cosf .. ->   18.3 millions of vector evaluations/second ->  35 cycles/value on a 2600MHz computer
benching       acml vrs4_expf .. ->   28.6 millions of vector evaluations/second ->  23 cycles/value on a 2600MHz computer
benching       acml vrs4_logf .. ->   23.6 millions of vector evaluations/second ->  27 cycles/value on a 2600MHz computer

on a 2GHz athlon-xp 2400+ , using mingw (gcc 3.4.5)
command line:  gcc -mfpmath=sse -msse -O2 -Wall -W sse_mathfun_test.c
benching                 sinf .. ->    3.4 millions of vector evaluations/second -> 144 cycles/value on a 2000MHz computer
benching                 cosf .. ->    5.1 millions of vector evaluations/second ->  97 cycles/value on a 2000MHz computer
benching         sincos (x87) .. ->    2.3 millions of vector evaluations/second -> 214 cycles/value on a 2000MHz computer
benching                 expf .. ->    1.8 millions of vector evaluations/second -> 272 cycles/value on a 2000MHz computer
benching                 logf .. ->    2.5 millions of vector evaluations/second -> 200 cycles/value on a 2000MHz computer
benching          cephes_sinf .. ->    3.7 millions of vector evaluations/second -> 132 cycles/value on a 2000MHz computer
benching          cephes_cosf .. ->    3.2 millions of vector evaluations/second -> 153 cycles/value on a 2000MHz computer
benching          cephes_expf .. ->    1.2 millions of vector evaluations/second -> 407 cycles/value on a 2000MHz computer
benching          cephes_logf .. ->    1.4 millions of vector evaluations/second -> 355 cycles/value on a 2000MHz computer
benching               sin_ps .. ->   17.2 millions of vector evaluations/second ->  29 cycles/value on a 2000MHz computer
benching               cos_ps .. ->   17.0 millions of vector evaluations/second ->  29 cycles/value on a 2000MHz computer
benching            sincos_ps .. ->   14.7 millions of vector evaluations/second ->  34 cycles/value on a 2000MHz computer
benching               exp_ps .. ->   17.2 millions of vector evaluations/second ->  29 cycles/value on a 2000MHz computer
benching               log_ps .. ->   14.7 millions of vector evaluations/second ->  34 cycles/value on a 2000MHz computer


on the same 2GHz athlon-xp 2400+ , using cl.exe (visual c++ express 2005)
command line: cl.exe //arch:SSE //O2 //TP //MD sse_mathfun_test.c

benching                 sinf .. ->    3.1 millions of vector evaluations/second -> 160 cycles/value on a 2000MHz computer
benching                 cosf .. ->    3.9 millions of vector evaluations/second -> 127 cycles/value on a 2000MHz computer
benching         sincos (x87) .. ->    2.8 millions of vector evaluations/second -> 175 cycles/value on a 2000MHz computer
benching                 expf .. ->    2.0 millions of vector evaluations/second -> 239 cycles/value on a 2000MHz computer
benching                 logf .. ->    2.6 millions of vector evaluations/second -> 192 cycles/value on a 2000MHz computer
benching          cephes_sinf .. ->    2.5 millions of vector evaluations/second -> 198 cycles/value on a 2000MHz computer
benching          cephes_cosf .. ->    2.8 millions of vector evaluations/second -> 176 cycles/value on a 2000MHz computer
benching          cephes_expf .. ->    0.9 millions of vector evaluations/second -> 546 cycles/value on a 2000MHz computer
benching          cephes_logf .. ->    1.3 millions of vector evaluations/second -> 370 cycles/value on a 2000MHz computer
benching               sin_ps .. ->   17.2 millions of vector evaluations/second ->  29 cycles/value on a 2000MHz computer
benching               cos_ps .. ->   17.3 millions of vector evaluations/second ->  29 cycles/value on a 2000MHz computer
benching            sincos_ps .. ->   15.5 millions of vector evaluations/second ->  32 cycles/value on a 2000MHz computer
benching               exp_ps .. ->   17.8 millions of vector evaluations/second ->  28 cycles/value on a 2000MHz computer
benching               log_ps .. ->   10.4 millions of vector evaluations/second ->  48 cycles/value on a 2000MHz computer

*/

#include <stdio.h>
#include <math.h>
#include <arm_neon.h>

/* useful when debuggin.. */
void print4(float32x4_t v) {
  float *p = (float*)&v; 
  printf("[%13.8g, %13.8g, %13.8g, %13.8g]", vgetq_lane_f32(v,0), vgetq_lane_f32(v, 1), vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
}

void print2i(uint32x2_t v) {
  unsigned *p = (unsigned*)&v;
  printf("[%08x %08x]", p[0], p[1]);
}

void print4i(uint32x4_t v) {
  unsigned *p = (unsigned*)&v;
  printf("[%08x %08x %08x %08x]", p[0], p[1], p[2], p[3]);
}

#include "neon_mathfun.h"
#include <stdlib.h>
#include <time.h>
#include <assert.h>

#ifdef HAVE_SYS_TIMES
#include <sys/times.h>
#include <unistd.h>
#endif

#ifdef HAVE_VECLIB
# include <vecLib/vfp.h>
#endif

typedef union {
  float f[4];
  int i[4];
  v4sf  v;
} V4SF;

#define MAX(a,b) (((a)>(b))?(a):(b))

double frand() {
  return rand()/(double)RAND_MAX;
}

#if defined(HAVE_SYS_TIMES)
  inline double uclock_sec(void) {
    static double ttclk = 0.;
    if (ttclk == 0.) ttclk = sysconf(_SC_CLK_TCK);
    struct tms t; return ((double)times(&t)) / ttclk;
  }
# else
  inline double uclock_sec(void)
{ return (double)clock()/(double)CLOCKS_PER_SEC; }
#endif

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

#ifndef M_LN2
#define M_LN2 0.69314718055994530942
#endif

int bitdiff(float a, float b) {
  if (a == b) return 24;
  else if (a == 0) { int j = -log(fabs(b))/M_LN2; if (j > 24) j = 24; return j; }
  else return log(fabs(a))/M_LN2 - log(fabs(b-a))/M_LN2;
}

/* they are defined at the bottom of the file */
float cephes_sinf(float);
float cephes_cosf(float);
float cephes_logf(float);
float cephes_expf(float);

int check_sincos_precision(float xmin, float xmax) {
  unsigned nb_trials = 100000;
  printf("checking sines on [%g*Pi, %g*Pi]\n", xmin, xmax);
  

  float max_err_sin_ref = 0, max_err_sin_cep = 0, max_err_sin_x = 0;
  float max_err_cos_ref = 0, max_err_cos_cep = 0, max_err_cos_x = 0;
  float max_err_sum_sqr_test = 0;
  float max_err_sum_sqr_ref = 0;
  xmin *= M_PI; xmax *= M_PI;
  unsigned i;
  for (i=0; i < nb_trials; ++i) {
    V4SF vx, sin4, cos4, sin4_2, cos4_2;
    vx.f[0] = i*(xmax-xmin)/(nb_trials-1) + xmin;
    vx.f[1] = (i+.5)*(xmax-xmin)/(nb_trials-1) + xmin;
    vx.f[2] = frand()*(xmax-xmin);
    vx.f[3] = (i / 32)*M_PI/((i%32)+1);
    if (vx.f[3] < xmin || vx.f[3] > xmax) vx.f[3] = frand()*(xmax-xmin);
    
    /*
    vx.f[0] = M_PI/2;
    vx.f[1] = M_PI;
    vx.f[2] = M_PI/3;
    vx.f[3] = M_PI/4;
    */
    sin4.v = sin_ps(vx.v);
    cos4.v = cos_ps(vx.v);
    sincos_ps(vx.v, &sin4_2.v, &cos4_2.v);
    unsigned j;
    for (j=0; j < 4; ++j) {
      float x = vx.f[j];
      float sin_test = sin4.f[j];
      float cos_test = cos4.f[j];
      if (sin_test != sin4_2.f[j]) {
        printf("sin / sincos mismatch at x=%g\n", x);
        exit(1); return 1;
      }
      if (cos_test != cos4_2.f[j]) {
        printf("cos / sincos mismatch at x=%g\n", x);
        return 1;
      }
      float sin_ref = sinf(x);
      float sin_cep = cephes_sinf(x);
      float err_sin_ref = fabs(sin_ref - sin_test);
      float err_sin_cep = fabs(sin_cep - sin_test);
      if (err_sin_ref > max_err_sin_ref) {
        max_err_sin_ref = err_sin_ref;
        max_err_sin_x = x;
      }
      max_err_sin_cep = MAX(max_err_sin_cep, err_sin_cep);
      float cos_ref = cosf(x);
      float cos_cep = cephes_cosf(x);
      float err_cos_ref = fabs(cos_ref - cos_test);
      float err_cos_cep = fabs(cos_cep - cos_test);
      if (err_cos_ref > max_err_cos_ref) {
        max_err_cos_ref = err_cos_ref;
        max_err_cos_x = x;
      }
      max_err_cos_cep = MAX(max_err_cos_cep, err_cos_cep);
      float err_sum_sqr_test = fabs(1 - cos_test*cos_test - sin_test*sin_test);
      float err_sum_sqr_ref = fabs(1 - cos_ref*cos_ref - sin_ref*sin_ref);
      max_err_sum_sqr_ref = MAX(max_err_sum_sqr_ref, err_sum_sqr_ref);
      max_err_sum_sqr_test = MAX(max_err_sum_sqr_test, err_sum_sqr_test);
      //printf("sin(%g) = %g %g err=%g\n", x, sin_ref, sin_test, err_sin_ref);
    }
  }
  printf("max deviation from sinf(x): %g at %14.12g*Pi, max deviation from cephes_sin(x): %g\n",
         max_err_sin_ref, max_err_sin_x/M_PI, max_err_sin_cep);
  printf("max deviation from cosf(x): %g at %14.12g*Pi, max deviation from cephes_cos(x): %g\n",
         max_err_cos_ref, max_err_cos_x/M_PI, max_err_cos_cep);

  printf("deviation of sin(x)^2+cos(x)^2-1: %g (ref deviation is %g)\n",
         max_err_sum_sqr_test, max_err_sum_sqr_ref); 

  if (max_err_sum_sqr_ref < 2e-7 && max_err_sin_ref < 2e-7 && max_err_cos_ref < 2e-7) {
    printf("   ->> precision OK for the sin_ps / cos_ps / sincos_ps <<-\n\n");
    return 0;
  } else {
    printf("\n   WRONG PRECISION !! there is a problem\n\n");
    return 1;
  }
}

union float_int_union {
  int i;
  float f;
} QNAN = { 0xFFC00000 }, QNAN2 = { 0x7FC00000 }, PINF = { 0x7F800000 }, MINF = { 0xFF800000 };

int check_explog_precision(float xmin, float xmax) {
  unsigned nb_trials = 100000;
  printf("checking exp/log [%g, %g]\n", xmin, xmax);
  
  float max_err_exp_ref = 0, max_err_exp_cep = 0, max_err_exp_x = 0;
  float max_err_log_ref = 0, max_err_log_cep = 0, max_err_log_x = 0;
  float max_err_logexp_test = 0;
  float max_err_logexp_ref = 0;
  unsigned i;
  for (i=0; i < nb_trials; ++i) {
    V4SF vx, exp4, log4;
    vx.f[0] = frand()*(xmax-xmin)+xmin;
    vx.f[1] = frand()*(xmax-xmin)+xmin;
    vx.f[2] = frand()*(xmax-xmin)+xmin;
    vx.f[3] = frand()*(xmax-xmin)+xmin;
    exp4.v = exp_ps(vx.v);
    log4.v = log_ps(exp4.v);
    unsigned j;
    for (j=0; j < 4; ++j) {
      float x = vx.f[j];
      float exp_test = exp4.f[j];
      float log_test = log4.f[j];
      float exp_ref = expf(x);
      float exp_cep = cephes_expf(x);
      float err_exp_ref = fabs(exp_ref - exp_test)/exp_ref;
      float err_exp_cep = fabs(exp_cep - exp_test)/exp_ref;
      if (err_exp_ref > max_err_exp_ref) {
        max_err_exp_ref = err_exp_ref;
        max_err_exp_x = x;
      }
      max_err_exp_cep = MAX(max_err_exp_cep, err_exp_cep);

      float log_ref = logf(exp_test);
      float log_cep = cephes_logf(exp_test);
      float err_log_ref = fabs(log_ref - log_test);
      float err_log_cep = fabs(log_cep - log_test);
      if (err_log_ref > max_err_log_ref) {
        max_err_log_ref = err_log_ref;
        max_err_log_x = x;
      }
      max_err_log_cep = MAX(max_err_log_cep, err_log_cep);
      float err_logexp_test = fabs(x - log_test);
      float err_logexp_ref = fabs(x - logf(expf(x)));
      max_err_logexp_ref = MAX(max_err_logexp_ref, err_logexp_ref);
      max_err_logexp_test = MAX(max_err_logexp_test, err_logexp_test);
    }
  }
  printf("max (relative) deviation from expf(x): %g at %14.12g, max deviation from cephes_expf(x): %g\n",
         max_err_exp_ref, max_err_exp_x, max_err_exp_cep);
  printf("max (absolute) deviation from logf(x): %g at %14.12g, max deviation from cephes_logf(x): %g\n",
         max_err_log_ref, max_err_log_x, max_err_log_cep);

  printf("deviation of x - log(exp(x)): %g (ref deviation is %g)\n",
         max_err_logexp_test, max_err_logexp_ref); 

  if (max_err_logexp_test < 2e-7 && max_err_exp_ref < 2e-7 && max_err_log_ref < 2e-7) {
    printf("   ->> precision OK for the exp_ps / log_ps <<-\n\n");
    return 0;
  } else {
    printf("\n   WRONG PRECISION !! there is a problem\n\n");
    return 1;
  }
}


void dumb() {
  V4SF x = {{ 0.0903333798051, 0.0903333798051, 0.0903333798051, 0.0903333798051 }};
  V4SF w; w.v = log_ps(x.v);
  float z = cephes_logf(x.f[0]);
  printf("log_ps returned "); print4(w.v); 
  printf("\ncephes returned: %14.12g and logf(%g)=%14.12g\n", z, x.f[0], logf(x.f[0]));

  print4i(vceqq_f32(x.v, x.v)); printf("\n");
  exit(1);
}

void check_special_values() {
  V4SF vx; 
  vx.f[0] = -1000;
  vx.f[1] = -100;
  vx.f[2] = 100;
  vx.f[3] = 1000;
  printf("exp("); print4(vx.v); printf(") = "); print4(exp_ps(vx.v)); printf("\n");
  vx.f[0] = QNAN.f;
  vx.f[1] = PINF.f;
  vx.f[2] = MINF.f;
  vx.f[3] = QNAN2.f;
  printf("exp("); print4(vx.v); printf(") = "); print4(exp_ps(vx.v)); printf("\n");
  vx.f[0] = 0;
  vx.f[1] = -10;
  vx.f[2] = 1e30f;
  vx.f[3] = 1e-42f;
  printf("log("); print4(vx.v); printf(") = "); print4(log_ps(vx.v)); printf("\n");
  vx.f[0] = QNAN.f;
  vx.f[1] = PINF.f;
  vx.f[2] = MINF.f;
  vx.f[3] = QNAN2.f;
  printf("log("); print4(vx.v); printf(") = "); print4(log_ps(vx.v)); printf("\n");
  printf("sin("); print4(vx.v); printf(") = "); print4(sin_ps(vx.v)); printf("\n");
  printf("cos("); print4(vx.v); printf(") = "); print4(cos_ps(vx.v)); printf("\n");
  vx.f[0] = -1e30;
  vx.f[1] = -100000;
  vx.f[2] = 1e30;
  vx.f[3] = 100000;
  printf("sin("); print4(vx.v); printf(") = "); print4(sin_ps(vx.v)); printf("\n");
  printf("cos("); print4(vx.v); printf(") = "); print4(cos_ps(vx.v)); printf("\n");
}

v4sf set_ps1(float f) {
  return vdupq_n_f32(f);
}

v4sf max_ps(v4sf a, v4sf b) {
  return vmaxq_f32(a, b);
}

v4sf min_ps(v4sf a, v4sf b) {
  return vminq_f32(a, b);
}

#define DECL_SCALAR_FN_BENCH(fn)                     \
  int bench_##fn() {                                 \
    int niter = 10000,i,j;                           \
    float x = 0.5f, y=0;                             \
    for (i=0; i < niter; ++i) {                      \
      for (j=0; j < 4; ++j) {                        \
        x += 1e-6f;                                  \
        y += fn(x+5*(j&1));                                  \
      }                                              \
    }                                                \
    if (y == 2.32132323232f) niter--;                \
    return niter;                                    \
  } 

#define DECL_VECTOR_FN_BENCH(fn)                                        \
  int bench_##fn() {                                                    \
    int niter = 10000,i;                                                \
    v4sf bmin = set_ps1(0.5), bmax = set_ps1(1.0);                      \
    v4sf x = set_ps1(0.75);                                             \
    for (i=0; i < niter; ++i) {                                         \
      x = fn(x); x = min_ps(x, bmax); x = max_ps(x, bmin);              \
    }                                                                   \
    V4SF xx; xx.v = x;                                                  \
    if (xx.f[0] == 2.32132323232f) niter--;                             \
    return niter;                                                       \
  } 


v4sf stupid_sincos_ps(v4sf x) {
  v4sf s, c;
  sincos_ps(x, &s, &c);
  return s;
}

DECL_SCALAR_FN_BENCH(sinf);
DECL_SCALAR_FN_BENCH(cosf);
DECL_SCALAR_FN_BENCH(logf);
DECL_SCALAR_FN_BENCH(expf);
DECL_SCALAR_FN_BENCH(cephes_sinf);
DECL_SCALAR_FN_BENCH(cephes_cosf);
DECL_SCALAR_FN_BENCH(cephes_expf);
DECL_SCALAR_FN_BENCH(cephes_logf);
DECL_VECTOR_FN_BENCH(sin_ps);
DECL_VECTOR_FN_BENCH(cos_ps);
DECL_VECTOR_FN_BENCH(stupid_sincos_ps);
DECL_VECTOR_FN_BENCH(exp_ps);
DECL_VECTOR_FN_BENCH(log_ps);
#ifdef HAVE_VECLIB
DECL_VECTOR_FN_BENCH(vsinf);
DECL_VECTOR_FN_BENCH(vcosf);
DECL_VECTOR_FN_BENCH(vlogf);
DECL_VECTOR_FN_BENCH(vexpf);
#endif
#ifdef HAVE_ACML
DECL_VECTOR_FN_BENCH(__vrs4_sinf);
DECL_VECTOR_FN_BENCH(__vrs4_cosf);
DECL_VECTOR_FN_BENCH(__vrs4_expf);
DECL_VECTOR_FN_BENCH(__vrs4_logf);
#endif

void run_bench(const char *s, int (*fn)()) {
  printf("benching %20s ..", s); fflush(stdout);
  double t0 = uclock_sec(), t1, tmax = 0.3;
  double niter = 0;
  do {
    niter += fn();
    t1 = uclock_sec();
  } while (t1 - t0 < tmax);
#define REF_FREQ_MHZ 1000.0
  printf(" -> %6.1f millions of vector evaluations/second -> %3.0f cycles/value on a %gMHz computer\n", floor(niter/(t1-t0)/1e5)/10, (t1-t0)*REF_FREQ_MHZ*1e6/niter/4, REF_FREQ_MHZ);
}

void sanity_check() {
  printf("doing some sanity checks...\n");
}

int main() {
  //dumb();
  //sanity_check();
  int err = 0;
  err += check_sincos_precision(0., 1.0);
  err += check_sincos_precision(-1000, 1000);
  err += check_explog_precision(-60, 60);

  if (err) {
    printf("some precision tests have failed\n");
  }

  check_special_values();
  run_bench("sinf", bench_sinf);
  run_bench("cosf", bench_cosf);
#ifdef HAVE_SINCOS_X86_FPU
  run_bench("sincos (x87)", bench_stupid_sincos_x86_fpu);
#endif
  run_bench("expf", bench_expf);
  run_bench("logf", bench_logf);

  run_bench("cephes_sinf", bench_cephes_sinf);
  run_bench("cephes_cosf", bench_cephes_cosf);
  run_bench("cephes_expf", bench_cephes_expf);
  run_bench("cephes_logf", bench_cephes_logf);


  run_bench("sin_ps", bench_sin_ps);
  run_bench("cos_ps", bench_cos_ps);
  run_bench("sincos_ps", bench_stupid_sincos_ps);
  run_bench("exp_ps", bench_exp_ps);
  run_bench("log_ps", bench_log_ps);

#ifdef HAVE_VECLIB
  run_bench("vsinf", bench_vsinf);
  run_bench("vcosf", bench_vcosf);
  run_bench("vexpf", bench_vexpf);
  run_bench("vlogf", bench_vlogf);
#endif
#ifdef HAVE_ACML
  run_bench("acml vrs4_sinf", bench___vrs4_sinf);
  run_bench("acml vrs4_cosf", bench___vrs4_cosf);
  run_bench("acml vrs4_expf", bench___vrs4_expf);
  run_bench("acml vrs4_logf", bench___vrs4_logf);
#endif
  return err;
}


/* cephes functions, copied here to serve as a reference */

/*							sinf.c
 *
 *	Circular sine
 *
 *
 *
 * SYNOPSIS:
 *
 * float x, y, sinf();
 *
 * y = sinf( x );
 *
 *
 *
 * DESCRIPTION:
 *
 * Range reduction is into intervals of pi/4.  The reduction
 * error is nearly eliminated by contriving an extended precision
 * modular arithmetic.
 *
 * Two polynomial approximating functions are employed.
 * Between 0 and pi/4 the sine is approximated by
 *      x  +  x**3 P(x**2).
 * Between pi/4 and pi/2 the cosine is represented as
 *      1  -  x**2 Q(x**2).
 *
 *
 * ACCURACY:
 *
 *                      Relative error:
 * arithmetic   domain      # trials      peak       rms
 *    IEEE    -4096,+4096   100,000      1.2e-7     3.0e-8
 *    IEEE    -8192,+8192   100,000      3.0e-7     3.0e-8
 * 
 * ERROR MESSAGES:
 *
 *   message           condition        value returned
 * sin total loss      x > 2^24              0.0
 *
 * Partial loss of accuracy begins to occur at x = 2^13
 * = 8192. Results may be meaningless for x >= 2^24
 * The routine as implemented flags a TLOSS error
 * for x >= 2^24 and returns 0.0.
 */

/*							cosf.c
 *
 *	Circular cosine
 *
 *
 *
 * SYNOPSIS:
 *
 * float x, y, cosf();
 *
 * y = cosf( x );
 *
 *
 *
 * DESCRIPTION:
 *
 * Range reduction is into intervals of pi/4.  The reduction
 * error is nearly eliminated by contriving an extended precision
 * modular arithmetic.
 *
 * Two polynomial approximating functions are employed.
 * Between 0 and pi/4 the cosine is approximated by
 *      1  -  x**2 Q(x**2).
 * Between pi/4 and pi/2 the sine is represented as
 *      x  +  x**3 P(x**2).
 *
 *
 * ACCURACY:
 *
 *                      Relative error:
 * arithmetic   domain      # trials      peak         rms
 *    IEEE    -8192,+8192   100,000      3.0e-7     3.0e-8
 */

/*
Cephes Math Library Release 2.2:  June, 1992
Copyright 1985, 1987, 1988, 1992 by Stephen L. Moshier
Direct inquiries to 30 Frost Street, Cambridge, MA 02140
*/


/* Single precision circular sine
 * test interval: [-pi/4, +pi/4]
 * trials: 10000
 * peak relative error: 6.8e-8
 * rms relative error: 2.6e-8
 */


static float FOPI = 1.27323954473516;
static float PIO4F = 0.7853981633974483096;
/* Note, these constants are for a 32-bit significand: */
/*
  static float DP1 =  0.7853851318359375;
  static float DP2 =  1.30315311253070831298828125e-5;
  static float DP3 =  3.03855025325309630e-11;
  static float lossth = 65536.;
*/

/* These are for a 24-bit significand: */
static float DP1 = 0.78515625;
static float DP2 = 2.4187564849853515625e-4;
static float DP3 = 3.77489497744594108e-8;
static float lossth = 8192.;
static float T24M1 = 16777215.;

static float sincof[] = {
  -1.9515295891E-4,
  8.3321608736E-3,
  -1.6666654611E-1
};
static float coscof[] = {
  2.443315711809948E-005,
  -1.388731625493765E-003,
  4.166664568298827E-002
};

float cephes_sinf( float xx )
{
  float *p;
  float x, y, z;
  register unsigned long j;
  register int sign;

  sign = 1;
  x = xx;
  if( xx < 0 )
    {
      sign = -1;
      x = -xx;
    }
  if( x > T24M1 )
    {
      //mtherr( "sinf", TLOSS );
      return(0.0);
    }
  j = FOPI * x; /* integer part of x/(PI/4) */
  y = j;
  /* map zeros to origin */
  if( j & 1 )
    {
      j += 1;
      y += 1.0;
    }
  j &= 7; /* octant modulo 360 degrees */
  /* reflect in x axis */
  if( j > 3)
    {
      sign = -sign;
      j -= 4;
    }
  if( x > lossth )
    {
      //mtherr( "sinf", PLOSS );
      x = x - y * PIO4F;
    }
  else
    {
      /* Extended precision modular arithmetic */
      x = ((x - y * DP1) - y * DP2) - y * DP3;
    }
  /*einits();*/
  z = x * x;
  //printf("my_sinf: corrected oldx, x, y = %14.10g, %14.10g, %14.10g\n", oldx, x, y);
  if( (j==1) || (j==2) )
    {
      /* measured relative error in +/- pi/4 is 7.8e-8 */
      /*
        y = ((  2.443315711809948E-005 * z
        - 1.388731625493765E-003) * z
        + 4.166664568298827E-002) * z * z;
      */
      p = coscof;
      y = *p++;
      y = y * z + *p++;
      y = y * z + *p++;
      y *= z; y *= z;
      y -= 0.5 * z;
      y += 1.0;
    }
  else
    {
      /* Theoretical relative error = 3.8e-9 in [-pi/4, +pi/4] */
      /*
        y = ((-1.9515295891E-4 * z
        + 8.3321608736E-3) * z
        - 1.6666654611E-1) * z * x;
        y += x;
      */
      p = sincof;
      y = *p++;
      y = y * z + *p++;
      y = y * z + *p++;
      y *= z; y *= x;
      y += x;
    }
  /*einitd();*/
  //printf("my_sinf: j=%d result = %14.10g * %d\n", j, y, sign);
  if(sign < 0)
    y = -y;
  return( y);
}


/* Single precision circular cosine
 * test interval: [-pi/4, +pi/4]
 * trials: 10000
 * peak relative error: 8.3e-8
 * rms relative error: 2.2e-8
 */

float cephes_cosf( float xx )
{
  float x, y, z;
  int j, sign;

  /* make argument positive */
  sign = 1;
  x = xx;
  if( x < 0 )
    x = -x;

  if( x > T24M1 )
    {
      //mtherr( "cosf", TLOSS );
      return(0.0);
    }

  j = FOPI * x; /* integer part of x/PIO4 */
  y = j;
  /* integer and fractional part modulo one octant */
  if( j & 1 )	/* map zeros to origin */
    {
      j += 1;
      y += 1.0;
    }
  j &= 7;
  if( j > 3)
    {
      j -=4;
      sign = -sign;
    }

  if( j > 1 )
    sign = -sign;

  if( x > lossth )
    {
      //mtherr( "cosf", PLOSS );
      x = x - y * PIO4F;
    }
  else
    /* Extended precision modular arithmetic */
    x = ((x - y * DP1) - y * DP2) - y * DP3;

  //printf("xx = %g -> x corrected = %g sign=%d j=%d y=%g\n", xx, x, sign, j, y);

  z = x * x;

  if( (j==1) || (j==2) )
    {
      y = (((-1.9515295891E-4f * z
             + 8.3321608736E-3f) * z
            - 1.6666654611E-1f) * z * x)
        + x;
    }
  else
    {
      y = ((  2.443315711809948E-005f * z
              - 1.388731625493765E-003f) * z
           + 4.166664568298827E-002f) * z * z;
      y -= 0.5 * z;
      y += 1.0;
    }
  if(sign < 0)
    y = -y;
  return( y );
}

/*							expf.c
 *
 *	Exponential function
 *
 *
 *
 * SYNOPSIS:
 *
 * float x, y, expf();
 *
 * y = expf( x );
 *
 *
 *
 * DESCRIPTION:
 *
 * Returns e (2.71828...) raised to the x power.
 *
 * Range reduction is accomplished by separating the argument
 * into an integer k and fraction f such that
 *
 *     x    k  f
 *    e  = 2  e.
 *
 * A polynomial is used to approximate exp(f)
 * in the basic range [-0.5, 0.5].
 *
 *
 * ACCURACY:
 *
 *                      Relative error:
 * arithmetic   domain     # trials      peak         rms
 *    IEEE      +- MAXLOG   100000      1.7e-7      2.8e-8
 *
 *
 * Error amplification in the exponential function can be
 * a serious matter.  The error propagation involves
 * exp( X(1+delta) ) = exp(X) ( 1 + X*delta + ... ),
 * which shows that a 1 lsb error in representing X produces
 * a relative error of X times 1 lsb in the function.
 * While the routine gives an accurate result for arguments
 * that are exactly represented by a double precision
 * computer number, the result contains amplified roundoff
 * error for large arguments not exactly represented.
 *
 *
 * ERROR MESSAGES:
 *
 *   message         condition      value returned
 * expf underflow    x < MINLOGF         0.0
 * expf overflow     x > MAXLOGF         MAXNUMF
 *
 */

/*
Cephes Math Library Release 2.2:  June, 1992
Copyright 1984, 1987, 1989 by Stephen L. Moshier
Direct inquiries to 30 Frost Street, Cambridge, MA 02140
*/

/* Single precision exponential function.
 * test interval: [-0.5, +0.5]
 * trials: 80000
 * peak relative error: 7.6e-8
 * rms relative error: 2.8e-8
 */

static float MAXNUMF = 3.4028234663852885981170418348451692544e38;
static float MAXLOGF = 88.72283905206835;
static float MINLOGF = -103.278929903431851103; /* log(2^-149) */


static float LOG2EF = 1.44269504088896341;

static float C1 =   0.693359375;
static float C2 =  -2.12194440e-4;


float cephes_expf(float xx) {
float x, z;
int n;

x = xx;


if( x > MAXLOGF)
	{
    //mtherr( "expf", OVERFLOW );
	return( MAXNUMF );
	}

if( x < MINLOGF )
	{
    //mtherr( "expf", UNDERFLOW );
	return(0.0);
	}

/* Express e**x = e**g 2**n
 *   = e**g e**( n loge(2) )
 *   = e**( g + n loge(2) )
 */
z = floorf( LOG2EF * x + 0.5 ); /* floor() truncates toward -infinity. */

x -= z * C1;
x -= z * C2;
n = z;

z = x * x;
/* Theoretical peak relative error in [-0.5, +0.5] is 4.2e-9. */
z =
((((( 1.9875691500E-4f  * x
   + 1.3981999507E-3f) * x
   + 8.3334519073E-3f) * x
   + 4.1665795894E-2f) * x
   + 1.6666665459E-1f) * x
   + 5.0000001201E-1f) * z
   + x
   + 1.0;

/* multiply by power of 2 */
x = ldexpf( z, n );

return( x );
}

/*							logf.c
 *
 *	Natural logarithm
 *
 *
 *
 * SYNOPSIS:
 *
 * float x, y, logf();
 *
 * y = logf( x );
 *
 *
 *
 * DESCRIPTION:
 *
 * Returns the base e (2.718...) logarithm of x.
 *
 * The argument is separated into its exponent and fractional
 * parts.  If the exponent is between -1 and +1, the logarithm
 * of the fraction is approximated by
 *
 *     log(1+x) = x - 0.5 x**2 + x**3 P(x)
 *
 *
 *
 * ACCURACY:
 *
 *                      Relative error:
 * arithmetic   domain     # trials      peak         rms
 *    IEEE      0.5, 2.0    100000       7.6e-8     2.7e-8
 *    IEEE      1, MAXNUMF  100000                  2.6e-8
 *
 * In the tests over the interval [1, MAXNUM], the logarithms
 * of the random arguments were uniformly distributed over
 * [0, MAXLOGF].
 *
 * ERROR MESSAGES:
 *
 * logf singularity:  x = 0; returns MINLOG
 * logf domain:       x < 0; returns MINLOG
 */

/*
Cephes Math Library Release 2.2:  June, 1992
Copyright 1984, 1987, 1988, 1992 by Stephen L. Moshier
Direct inquiries to 30 Frost Street, Cambridge, MA 02140
*/

/* Single precision natural logarithm
 * test interval: [sqrt(2)/2, sqrt(2)]
 * trials: 10000
 * peak relative error: 7.1e-8
 * rms relative error: 2.7e-8
 */
float LOGE2F = 0.693147180559945309;
float SQRTHF = 0.707106781186547524;
float PIF = 3.141592653589793238;
float PIO2F = 1.5707963267948966192;
float MACHEPF = 5.9604644775390625E-8;

float cephes_logf( float xx ) {
register float y;
float x, z, fe;
int e;

x = xx;
fe = 0.0;
/* Test for domain */
if( x <= 0.0 )
	{
    // ERROR
    return( MINLOGF );
	}

x = frexpf( x, &e );
// printf("\nmy_logf: frexp -> e = %d x = %g\n", e, x);
if( x < SQRTHF )
	{
	e -= 1;
	x = x + x - 1.0; /*  2x - 1  */
	}	
else
	{
	x = x - 1.0;
	}
z = x * x;
/* 3.4e-9 */
/*
p = logfcof;
y = *p++ * x;
for( i=0; i<8; i++ )
	{
	y += *p++;
	y *= x;
	}
y *= z;
*/

y =
(((((((( 7.0376836292E-2f * x
- 1.1514610310E-1f) * x
+ 1.1676998740E-1f) * x
- 1.2420140846E-1f) * x
+ 1.4249322787E-1f) * x
- 1.6668057665E-1f) * x
+ 2.0000714765E-1f) * x
- 2.4999993993E-1f) * x
+ 3.3333331174E-1f) * x * z;

// printf("my_logf: poly = %g\n", y);

if( e )
	{
	fe = e;
	y += -2.12194440e-4f * fe;
	}
y +=  -0.5 * z;  /* y - 0.5 x^2 */

// printf("my_logf: x = %g y = %g\n", x, y);
z = x + y;   /* ... + x  */

if( e )
	z += 0.693359375f * fe;


return( z );
}