cbloom rants

11/30/2010

11-30-10 - Tchebychev

A little utility, and example of how I use templates : (this is not meant to be efficient, it's for offline apps, not realtime use)

// Tchebychev polynomial of the first kind, T_n(x), evaluated through the
// three-term recurrence T_n(x) = 2*x*T_{n-1}(x) - T_{n-2}(x).
// The recursion is unfolded at compile time via template instantiation.
template < int n >
struct Tchebychev
{
    double operator () ( double x )
    {
        double prev1 = Tchebychev< n-1 >()(x);
        double prev2 = Tchebychev< n-2 >()(x);
        return 2.0 * x * prev1 - prev2;
    }
};

// base case : T_0(x) = 1
template < >
struct Tchebychev < 0 >
{
    double operator () ( double x )
    {
        return 1.0;
    }
};

// base case : T_1(x) = x
template < >
struct Tchebychev < 1 >
{
    double operator () ( double x )
    {
        return x;
    }
};

// Evaluate the Tchebychev polynomial T_n(x) for a runtime-variable degree n.
// Replaces the switch-based dispatch to the Tchebychev<n> templates : the
// iterative three-term recurrence T_i = 2*x*T_{i-1} - T_{i-2} gives identical
// results for n in [0,9] and also handles any larger degree, in O(n) time and
// O(1) space (and no longer needs the project NO_DEFAULT_CASE macro).
// n < 0 returns 0, matching the old fall-through return.
double TchebychevN(int n,double x)
{
    if ( n < 0 )  return 0.0; // invalid degree, old code fell through to 0
    if ( n == 0 ) return 1.0; // T_0
    if ( n == 1 ) return x;   // T_1

    double t2 = 1.0; // T_{i-2}
    double t1 = x;   // T_{i-1}
    double t  = x;
    for(int i=2;i<=n;i++)
    {
        t  = 2.0 * x * t1 - t2;
        t2 = t1;
        t1 = t;
    }
    return t;
}

// Fit the first NUM_T Tchebychev-series coefficients of f over [lo,hi] by
// Gauss-Tchebychev quadrature with N sample nodes, print the coefficients,
// then report the approximation error at each truncation order.
// Depends on project helpers : PI, lprintf, fsquare, MAX.
template < typename functor >
void FindTchebychevCoefficients(functor f,double lo = 0.0, double hi = 1.0, int N = (1<<20))
{
double PI_over_N = PI/N;

// number of coefficients t_0 .. t_{NUM_T-1} to fit
#define NUM_T   6

double t[NUM_T] = { 0 };

// accumulate t_j ~ sum_k f(x_k) * cos(j * pi * (k+0.5) / N) , where
// x_k = cos(pi*(k+0.5)/N) are the Tchebychev nodes on [-1,1]
for(int k=0;k < N;k++)
{
double pk = PI_over_N * (k + 0.5);
double x_k = cos(pk);
// map the node from [-1,1] to the requested [lo,hi] interval
double farg = lo + (hi - lo) * 0.5 * (x_k+1.0);
double fval = f( farg );
for(int j=0;j < NUM_T;j++)
{
t[j] += fval * cos( pk * j );
}
}
// normalization : weight 1/N for t_0 and 2/N for j > 0 (DCT-style weights)
for(int j=0;j < NUM_T;j++)
{
t[j] *= 1.0 / N;
if ( j != 0 )
t[j] *= 2.0;

//lprintfvar(t[j]);
lprintf("t %d: %16.9f , %16.8g\n",j,t[j],t[j]);
}

// measure the error of each truncated series : errSqr[j]/errMax[j] are the
// errors of the partial sum using t_0..t_j only (p accumulates term by term)
double errSqr[NUM_T] = { 0 };
double errMax[NUM_T] = { 0 };
for(int i=0;i < N;i++)
{
double xunit = (i+0.5)/N;
double farg = lo + (hi - lo) * xunit;
double xt = xunit*2.0 - 1.0; // same point mapped to [-1,1] for the polynomials
double y = f(farg);
double p = 0.0;
for(int j=0;j < NUM_T;j++)
{
p += t[j] * TchebychevN(j,xt);
errSqr[j] += fsquare(y-p);
errMax[j] = MAX( errMax[j] , fabs(y-p) );
}
}
// errSqr is printed divided by N, ie. as a mean squared error
for(int j=0;j < NUM_T;j++)
{
lprintf("%d : errSqr = %g errMax = %g\n",j,errSqr[j]/N,errMax[j]);
}
}

You can also very easily find inner products by using the Integrate<> template I posted earlier plus this simple product adaptor :

// Functor returning f1(x) * f2(x) : used to form inner products against basis
// functions (Tchebychev, Legendre, ...) with an Integrate<> driver.
// Stores the two functor instances so stateful functors (eg. RemapUnitTo,
// which carries its range) keep their state.  The previous version
// default-constructed f1()/f2() inside operator(), which silently discarded
// any state and did not even compile for functors without a default ctor
// (as required by MakeFunctorProduct and the ShiftedLegendre0 usage below).
template < typename f1, typename f2 >
struct FunctorProduct
{
    f1 m_f1;
    f2 m_f2;

    // default ctor kept for backward compatibility with the old usage
    // (requires f1 and f2 to be default-constructible) :
    FunctorProduct() { }
    FunctorProduct( f1 _f1 , f2 _f2 ) : m_f1(_f1), m_f2(_f2) { }

    double operator () ( double x )
    {
        return m_f1(x) * m_f2(x);
    }
};

(eg. you can trivially find Hermite or Legendre series this way by doing inner products).

Another handy helper is a range remapper :

// Range adaptor : the argument is taken in [fmLo,fmHi], linearly remapped
// into [toLo,toHi], and the (default-constructed) wrapped functor is
// evaluated at the remapped point.
template < typename functor >
struct RemapFmTo
{
    double m_fmLo; double m_fmHi;
    double m_toLo; double m_toHi;

    RemapFmTo( double fmLo, double fmHi,
               double toLo, double toHi )
        : m_fmLo(fmLo), m_fmHi(fmHi), m_toLo(toLo), m_toHi(toHi)
    {
    }

    double operator () ( double x )
    {
        // fraction of the way through the source interval :
        double frac = (x - m_fmLo) / (m_fmHi - m_fmLo);
        // same fraction of the way through the destination interval :
        double remapped = m_toLo + frac * (m_toHi - m_toLo);
        return functor()(remapped);
    }
};

// Like RemapFmTo but with the source range fixed to the unit interval :
// x in [0,1] is remapped to [toLo,toHi] before calling the wrapped functor.
template < typename functor >
struct RemapUnitTo
{
    double m_toLo; double m_toHi;

    RemapUnitTo( double toLo, double toHi )
        : m_toLo(toLo), m_toHi(toHi)
    {
    }

    double operator () ( double x )
    {
        double remapped = m_toLo + x * (m_toHi - m_toLo);
        return functor()(remapped);
    }
};

Now we can trivially do what we did before to find the optimal approximation in a known range :

// Example target function : f(x) = sqrt(1+x), handy for fitting
// approximations around x = 0.
struct SqrtOnePlusX
{
    double operator () ( double x )
    {
        return sqrt( x + 1.0 );
    }
};

RemapUnitTo< SqrtOnePlusX > tf(-0.075f,0.075f);
FindTchebychevCoefficients( tf );

and the output is :

t 0:      0.999647973 ,       0.99964797
t 1:      0.037519818 ,      0.037519818
t 2:     -0.000352182 ,   -0.00035218223
t 3:      0.000006612 ,   6.6121459e-006
t 4:     -0.000000155 ,  -1.5518238e-007
t 5:      0.000000004 ,   4.0790168e-009
0 : errSqr = 0.000469204 errMax = 0.0378787
1 : errSqr = 5.78832e-008 errMax = 0.000358952
2 : errSqr = 2.12381e-011 errMax = 6.77147e-006
3 : errSqr = 1.18518e-014 errMax = 1.59378e-007
4 : errSqr = 8.23726e-018 errMax = 4.19794e-009
5 : errSqr = 6.54835e-021 errMax = 1.19024e-010

I guess Remez minimax polynomials are better, but this is so easy and it gives you a good starting point, then you can just numerically optimize after this anyway.

obviously the TchebychevN dispatcher sucks and is silly, but you don't actually use it in practice; you know which polynomials you want and you use something like :

double approx =
c0 * Tchebychev<0>()(x) +
c1 * Tchebychev<1>()(x) +
c2 * Tchebychev<2>()(x) +
c3 * Tchebychev<3>()(x);

which the compiler handles quite well.

I also did Legendre polynomials :

// Legendre polynomial P_n(x) via Bonnet's recurrence,
//   n*P_n(x) = (2n-1)*x*P_{n-1}(x) - (n-1)*P_{n-2}(x) ,
// written with B = (n-1)/n so that P_n = (1+B)*x*P_{n-1} - B*P_{n-2}.
// Compile-time unrolled through template instantiation.
template < int n >
struct Legendre
{
    double operator () ( double x )
    {
        const double B = (n-1)/(double)n; // B = (n-1)/n
        double prev1 = Legendre< n-1 >()(x);
        double prev2 = Legendre< n-2 >()(x);
        return (1+B)*x*prev1 - B*prev2;
    }
};

// base case : P_0(x) = 1
template< > struct Legendre<0>
{
    double operator () ( double x ) { return 1.0; }
};

// base case : P_1(x) = x
template< > struct Legendre<1>
{
    double operator () ( double x ) { return x; }
};

// but ryg's loop variants are mostly just better :

// Legendre polynomial P_n(x), iterative form of the Bonnet recurrence :
// P_i = (1+B)*x*P_{i-1} - B*P_{i-2} with B = i/(i+1) for degree i+1.
// O(n) time, O(1) space; works for any n >= 0.
// Portability fix : 'inline' replaces the MSVC-only '__forceinline'
// (wrap it in a compiler-specific FORCEINLINE macro if forcing is needed).
inline double legendre_ryg(int n, double x)
{
    if (n == 0) return 1.0;
    if (n == 1) return x;

    double t = x, t1 = 1, t2;
    for (int i=1; i < n; i++) {
        t2 = t1;
        t1 = t;
        const double B = (i)/(double)(i+1);
        t = (1+B)*x*t1 - B*t2;
    }
    return t;
}

// Tchebychev polynomial T_n(x), iterative three-term recurrence
// T_i = 2*x*T_{i-1} - T_{i-2}.  O(n) time, O(1) space; any n >= 0.
// Portability fix : 'inline' replaces the MSVC-only '__forceinline'
// (wrap it in a compiler-specific FORCEINLINE macro if forcing is needed).
inline double chebyshev_ryg(int n, double x)
{
    if (n == 0) return 1.0;
    if (n == 1) return x;

    double t = x, t1 = 1, t2;
    for (int i=1; i < n; i++) {
        t2 = t1;
        t1 = t;
        t = 2.0 * x * t1 - t2;
    }
    return t;
}

// you can find Legendre coefficients like so :

RemapUnitTo< Legendre<0> > ShiftedLegendre0(-1,1);

double c0 = Integrate(0.0,1.0,steps,MakeFunctorProduct(functor(),ShiftedLegendre0));
// (don't forget normalization)

// using new helper :

// Deduction helper : builds a FunctorProduct from two functor instances
// without spelling out the template arguments.
// NOTE(review): the FunctorProduct posted above has no (f1,f2) constructor
// and default-constructs its functors inside operator(), discarding any
// instance state - this helper only compiles and behaves as intended once
// FunctorProduct stores the two instances passed here; confirm that
// version is the one in use.
template < typename f1, typename f2 >
FunctorProduct< f1,f2 > MakeFunctorProduct( f1 _f1 , f2 _f2 ) { return FunctorProduct< f1,f2 >(_f1,_f2); }

Per Vognsen said...

Neat! I was worried the compiler wouldn't be able to detect the common subexpressions due to the Fibonacci-like template recursion pattern, but MSVC compiles it beautifully with either x87 or SSE instructions.

Before checking the quality of the assembly generated for your code, I tried my hand at a "clever" variation, but the "cleverness" thwarted the compiler's CSE attempts, so the generated code ballooned to several times the expected size. Yet another reminder that simpler is often better.

ryg said...

What's the point in doing the Tchebychev polynomial evaluation using templates? The obvious implementation (loop) is more readable, works for arbitrary user-defined N without a wrapper function, and the compiler unrolls+inlines it just fine for constant N. I'm not opposed to TMP, but in this case, KISS applies - especially if it's for tool use only.

Per Vognsen said...

Ryg: I agree it probably isn't worth it for an offline tool. But if you're saying you can get the same code with a constant n, inlining and loop unrolling, I cannot replicate that under MSVC 2008 with all the relevant optimization settings cranked up.

Here's my test code: http://gist.github.com/723088

I assume chebyshev_ryg() is what you had in mind.

ryg said...

Use a for. The VC++ loop optimizer isn't good with "non-standard loop shapes" and the same is true for other compilers, so it's become a habit for me.

My code is here: https://gist.github.com/723121

With VC++ 2005 at /arch:SSE2 /O2, I get one unnecessary extra movapd per loop iteration (probably an artifact of the way I shuffle the values around, using your implementation might work better there) but otherwise I'm happy with the result.

Per Vognsen said...

Good catch. Based on that, I agree that the loop version is strictly superior.

Yeah, most compilers have heuristics to detect 'repeat N times' loop shapes and don't seem to do bounded loop simulation very well in general. I've run into this before but I have the regrettable habit of using while(n--) loops in situations like these because I have an aesthetic aversion to loops with unused iterator variables.

ryg said...

Improved version without extra movs: https://gist.github.com/723131
Turns out the problem was that the compiler didn't hoist the 2*x out of the loop for some reason.

ryg said...

Per: VC++ will also unroll if you use "while (n-- > 0)", but not for "n--" which would also allow negative values (and hence overflows). The exact condition is that you need a loop with an induction variable that the compiler knows can't overflow/wrap. From my experience, GCC isn't as good at this, and the for variant is a safer bet.

Per Vognsen said...

Ryg: I see. My n is a signed integer but its value is known at compile time to be 10, a positive number, so it's annoying that the variable type's signedness enters into it.

One feature that would be a cool addition to Clang/LLVM would be heuristic diagnostics from the optimizer with a humane touch. For example, it could try to detect closely missed opportunities like the one in my loop code, or it could explain the sequence of steps by which it arrived at a given chunk of generated code.

One of the biggest problems with modern optimizing compilers is that they are often too magical and impenetrable. I've long wished for a move in a different direction where optimizers would become more like interactive assistants to programmers that increase their productivity and augment their intelligence rather than trying to replace it.

ryg said...

The signedness enters into the way the IR looks, and only certain patterns are eligible for loop unrolling. The problem is that the heavily multi-pass design of current optimizers has a hard time expressing "global" knowledge about code; every pass just expects something in a (more or less) normal form and leaves it alone if it doesn't understand it. That significantly reduces complexity (and optimizing compilers are way too complex already, so it's a good call!) but leads to this kind of "missed opportunities".

Reporting back information is a hard problem. All loop optimization takes place after lowering (and other optimizations!), so it's very hard to point at a source location when you can't optimize something.

"One feature that would be a cool addition to Clang/LLVM would be heuristic diagnostics from the optimizer with a humane touch."
Some special compilers for DSPs do this for high-yield optimizations (e.g. "I could unroll this loop and make it much faster if only 'x' was marked as restrict"). And it's awesome!

"One of the biggest problems with modern optimizing compilers is that they are often too magical and impenetrable."
Agreed. Two orthogonal issues:
1. Source language -> ASM is skipping too many layers when you're working in a HLL (and even plain C is too high-level once your compiler starts breaking out the sophisticated alias analysis or polyhedral loop optimizations). Solution: Expose the IR, support tracing through the opt. passes to see what happens when. (LLVM does this right!)
2. Tell, don't infer. Compilers don't need a better crystal ball; it's fine to have codegen annotations in a systems programming language. Think HLSL. I'm totally fine with writing "[force_unroll] while (n--) {" - that's way better (and less brittle, and faster to compile, and more readable) than having to break out TMP to force it.

cbloom said...

" What's the point in doing the Tchebychev polynomial evaluation using templates? The obvious implementation (loop) is more readable, "

I guess I just disagree that the loop is more readable. To me the recursion reads the way you would write the math, and is extremely readable and obvious. I would have to think carefully to write the loop right.

Also, IMO the knowledge required to understand and write the template is very small, and it's also useful general purpose knowledge. OTOH the knowledge to know when the loop optimization works is large and platform-specific.

I mean, you guys are asking for a way to control the compiler more, to manually tell it "this is a constant, please unfold this math" , when in fact that mechanism exists already, it's called templates.

Also, compilers generally handle the template method better and it's more reliable. Constant-loop optimizations seem to always flake out on me because a function got too complex or whatever nonsense. Followup in next comment cuz this is getting too long.

cbloom said...

.. some of my comments deleted because I don't want to talk about efficiency, it's a red herring.

You're right that the switch to dispatch from variable back to template arg is ugly.

That piece is a fundamental problem with template-loop-unrolling (eg. variable-to-constant hoisting), which is a pattern I like.

ryg said...

"To me the recursion reads the way you would write the math, and is extremely readable and obvious."
Agreed, but I still find the amount of syntactic red tape for templates appalling. (If only C/C++ compilers were better with recursive functions...).

"OTOH the knowledge to know when the loop optimization works is large and platform-specific."
Such is the knowledge of what C++ subset you can actually use on which platform. Yes, in theory C++ is one standard, but in practice every compiler supports a different subset and for a lot of compilers you're stuck with old g++ versions or other compilers, some of which just blatantly miscompile things like partial template specializations. Saw lots of fun bugs with pointer-to-members too, and that's been in C++ forever!

"I mean, you guys are asking for a way to control the compiler more, to manually tell it "this is a constant, please unfold this math" , when in fact that mechanism exists already, it's called templates."
Templates "solve" this problem by accident not design and only with severe caveats:
1. Unrolling a loop N times using templates will instantiate N template functions. If they're static and unreferenced, the compiler will throw them away (but still waste the time generating them), if not, they're left in the object file. If they're inlined, that's O(N^2) unrolled iterations compiled and left to be dead-stripped by the linker (plus symbols, plus debug info). In short, unrolling a loop N times via templates is not just syntactically awkward, it also implies tons of extra work for the compiler and possibly the linker, more symbols, and more IO (all of which significantly affects build times for the worse).
2. It doesn't actually "unroll" at all in non-release builds. This way of writing code is critically dependent on aggressive inlining to work at all. Extra-bad in this case because the two recursive calls turn a recurrence that can be calculated by 1 function using O(1) space in O(N) time into a monster with O(N) functions that use O(N) stack space and O(2^N) time! If you're really just using templates to unroll loops, that's simply slow in debug builds, but the performance with recursive functions like this is just completely unacceptable for production code IMO.

"Also, compilers generally handle the template method better and it's more reliable."
More reliable only in the sense that it always unrolls in some way, but the degenerate case where you hit inline limits at some point and get calls for the remaining iterations is still bad, and the cutoff point depends on the compiler/platform (and compiler switches, even!). Fundamentally, everything that relies on optimizations that the compiler might or might not do is flaky, and while inlining is a bit more consistent than loop optimizations in that regard, you still need to babysit the compiler and always read the generated code because you just can't trust it to do the right thing. That's precisely why I'd like there to be a way to actually force the compiler to do certain transforms on code no matter what.

cbloom said...

"Extra-bad in this case because the two recursive calls turn a recurrence that can be calculated by 1 function using O(1) space in O(N) time into a monster with O(N) functions that use O(N) stack space and O(2^N) time!"

Yeah this is the big issue, it makes the loop method just vastly superior in this case.