#ifndef _INC_RECSQRT_H
#define _INC_RECSQRT_H
/*------------------------------------------------------

  recsqrt.h -- 
  A faster-than-blazes implementation of reciprocal
  square-root (1 / sqrt(f)), float recsqrt(float).
  Using recsqrt() on binary-irrational numbers can save ~40%!
  NOTE: 1/sqrt() is faster, however, on ints cast to floats!
  Profile your code both ways to be sure.

  If you wish to use this, one of your source-files must
  #define RECSQRT_INIT prior to including this header.
  
  Note: the algorithm assumes input lies in (0.5 to 2.56M]
  (CRecSqrtTable_s_fLowBound, CRecSqrtTable_s_fHighBound],
  numbers chosen as a typical range occupied by vector
  magnitudes on high-resolution video.  
  For inputs outside this range, the algorithm falls-back
  to 1.0f / sqrt(f), only with added overhead.

  Note: REVIEW marks design decisions I haven't profiled.
  Need to run Intel's VTune2.1 dynamic-assembler analysis.

  Author:  Norm Bryar       Feb., '97

  ----------------------------------------------------*/

  #ifndef _INC_MATH
    #include <math.h>    // pow, sqrt
  #endif // _INC_MATH


    // The recsqrt() algorithm does a small number of 
    // Newton-Raphson iterations on an initial guess
    // generated by a piece-wise linear polynomial over
    // a restricted input range (a <= x <= b].
    // The range (a,b] is partitioned into sub-intervals
    // according to the paper
    //    "Optimal Partitioning of Newton's Method
    //     for Calculating Roots,"  
    //     Gunter Meinardus and G.D. Taylor,
    //     Mathematics of Computation, v35 no 152 October 1980,
    //     pp 1221-1230.
    // This partioning is critical to outperforming the Pentium's
    // intrinsic form of 1.0/sqrt(f).
    // Essentially, the subintervals are given by
    // ( a*(b/a)^j/v, a*(b/a)^(j+1)/v], each of which has a
    // linear equation optimally-adjusted (via gamma)
    // to provide the minimum-error first-guess for
    // Newton-Raphson iteration.
    // The Newton-Raphson algorithm, xN+1 = xN - f/f',
    // uses f = (1/x^2 - R) as the reciprocal root of R.
    // This form for f yields iterations w/ no divisions!

    class CRecSqrtTable
    {                
    public:
        enum { s_ctIntervals = 7 };   // chosen so m_fsubinterval
                                      // fits on one cache line

        CRecSqrtTable( );

		// If __inline is honored, __fastcall is ignored
		// Debug builds don't honor inlining.
        __inline float __fastcall GetStartingPoint( float x );        

    private:
        void CalcApproxCoeficients( );
        void CalcSubIntervals( );                

    private:
        struct lineareq 
        {
            float   a;
            float   b;
        };
		// We search m_fsubinterval first.
		// To maximize elements found in cache
		// we make it a seperate array.
		// REVIEW: profile it where interval is in lineareq
		// which might minimize cache misses getting a,b.
        static float       m_fsubinterval[ s_ctIntervals + 1 ];
        static lineareq    m_lineareq[ s_ctIntervals ];
    };
    
    #define CRecSqrtTable_s_fLowBound    0.5f
    #define CRecSqrtTable_s_fHighBound   2560000.0f

#ifdef RECSQRT_INIT

    inline CRecSqrtTable::CRecSqrtTable()
    {
        CalcSubIntervals( );
        CalcApproxCoeficients( );
    }

    inline void CRecSqrtTable::CalcSubIntervals( )
    {
        double  dGenerator;
        int     iInterval;
        
        m_fsubinterval[0] = CRecSqrtTable_s_fLowBound;

        dGenerator = CRecSqrtTable_s_fHighBound / 
                     CRecSqrtTable_s_fLowBound;
        iInterval = 1;
        while( iInterval <= s_ctIntervals )
        {
            m_fsubinterval[iInterval] = (float) 
                ((double) CRecSqrtTable_s_fLowBound * 
                 pow( dGenerator, 
                      ((double) iInterval) / s_ctIntervals ));
            ++iInterval;
        }
    }

        // REVIEW: We can always calculate this once, offline,
        // and just make a const array of initialized data.
        // This would probably go in .rdata and shorten
        // load-times and shrink workingset.
    inline void CRecSqrtTable::CalcApproxCoeficients( )
    {
        const  double three_3_2 =  5.196152422707;  // 3^3/2
        double alpha;
        double beta;
        double gamma;
        double lamda;
        double lamda_ba_geommean;
        double lamda_ba_normmean;
        float  a;
        float  b;
        int    iInterval = 0;

        while( iInterval < s_ctIntervals )
        {
            a = m_fsubinterval[ iInterval ];
            b = m_fsubinterval[ iInterval + 1 ];

            lamda_ba_geommean = three_3_2 * sqrt(a * b ) * 
                                (sqrt(b) + sqrt(a));
            lamda_ba_normmean = (b + sqrt(a * b) + a);
            lamda_ba_normmean *= 2 * sqrt(lamda_ba_normmean);
            lamda = (lamda_ba_normmean - lamda_ba_geommean) /
                    (lamda_ba_normmean + lamda_ba_geommean);

            alpha = (lamda - 1.0) / 
                    (sqrt(a*b) * (sqrt(b) + sqrt(a)));
            beta  = -(b + sqrt(a*b) + a) * alpha;
            gamma = 1.0; // sqrt( 3.0 / (3.0 - lamda*lamda) );

            m_lineareq[iInterval].a = (float) (gamma * alpha);
            m_lineareq[iInterval].b = (float) (gamma * beta);
            ++iInterval;
        }
    }    

    float                   CRecSqrtTable::m_fsubinterval[ CRecSqrtTable::s_ctIntervals + 1 ];
    CRecSqrtTable::lineareq CRecSqrtTable::m_lineareq[ CRecSqrtTable::s_ctIntervals ];
    CRecSqrtTable           g_recsqrttable;

#else  // user must have defined RECSQRT_INIT in another cpp file

    extern CRecSqrtTable g_recsqrttable;

#endif // RECSQRT_INIT

       // __fastcall for use in _DEBUG builds (ie /Ob0 - no inlining)
    inline float __fastcall CRecSqrtTable::GetStartingPoint( float x )
    {
            // intentional bit-wise AND, 
            // produces less asm instructions and 
            // no short-circuit branch to foul branch-prediction.
            // Pentium assumes this if will pass and will have
            // prefetched the instructions inside the if.
        if( (x >  CRecSqrtTable_s_fLowBound) & 
            (x <= CRecSqrtTable_s_fHighBound) )
        {
            register int  i = s_ctIntervals;

                // As the sub-intervals are much larger at the top
                // we begin at the top and walk down.
                // For even-distribution of x, yields fewest loops
                // Range test above ensures i < s_ctIntervals
                // If you *know* you're args cluster around 1,
                // you may want while(x > m[++i]) NULL; --i;
                // REVIEW: Cache assumes sequential, increasing access;
                // Consider ordering 2M at [0] and 0.5 at [N] instead
	            // thus maximizing cache hits as we increment i.
            while( x <= m_fsubinterval[--i] )
                NULL;
            
            return (m_lineareq[i].a * x) +m_lineareq[i].b;
        }
        return (float) (1.0f / sqrt((double) x));
    }


    inline float __fastcall recsqrt( float flt )
    {
        register float  x;
    
        x = g_recsqrttable.GetStartingPoint( flt );
        flt *= 0.5f;    
        x = (1.5f - flt * x * x) * x;
        x = (1.5f - flt * x * x) * x;
        return x;
    }

#endif // _INC_RECSQRT_H