[Boost-users] boost.proto optimization on MSVC (2005/2008)

5 Dec 2007

      Hi,

I've been trying out boost.proto lately and I've noticed that for my
expressions it tends to force my operands to be created on the stack. For
example, instead of optimizing out an object completely when everything 
about it
is known at compile-time, the compiler (MSVC 2005 or 2008) is creating that
object on the stack anyway.. even though it's not necessary. Even worse, it
actually folds the simpler expressions at compile-time and never even 
touches
the data that it just put on the stack. The problem is compounded with 
larger
expressions involving many operands. It even seems to build the larger
expression objects on the stack unnecessarily (and this looks like it starts 
to
break constant folding).

Here's some of the generated assembly:

int main()
{
00401800  sub         esp,18h
        vector2< float[3] > v1 = { 0, 1, 2 };
00401803  xorps       xmm0,xmm0
        vector2< float[3] > v2 = { 3, 4, 5 };
        vector2< float[3] > v3 = { 6, 7, 8 };

        // Add two vectors lazily and get the 2nd element.
#if 1
        // this sticks temporary junk all over the stack, even with 2 terms
        std::cout << (v1 + v2)[0];
00401806  fld         dword ptr [__real@40400000 (402128h)]
0040180C  movss       dword ptr [esp],xmm0
00401811  movss       xmm0,dword ptr [__real@3f800000 (402124h)]
00401819  movss       dword ptr [esp+4],xmm0
0040181F  movss       xmm0,dword ptr [__real@40000000 (402120h)]
00401827  movss       dword ptr [esp+8],xmm0
0040182D  movss       xmm0,dword ptr [__real@40400000 (402128h)]
00401835  movss       dword ptr [esp+0Ch],xmm0
0040183B  movss       xmm0,dword ptr [__real@40800000 (40211Ch)]
00401843  push        ecx
00401844  mov         ecx,dword ptr [__imp_std::cout (402044h)]
0040184A  fstp        dword ptr [esp]
0040184D  movss       dword ptr [esp+14h],xmm0
00401853  movss       xmm0,dword ptr [__real@40a00000 (402118h)]
0040185B  movss       dword ptr [esp+18h],xmm0
00401861  call        dword ptr
[__imp_std::basic_ostream<char,std::char_traits<char> >::operator<< 
(402040h)]

        std::cin.get();
00401867  mov         ecx,dword ptr [__imp_std::cin (40203Ch)]
0040186D  call        dword ptr
[__imp_std::basic_istream<char,std::char_traits<char> >::get (402038h)]

        return 0;
00401873  xor         eax,eax
}
00401875  add         esp,18h
00401878  ret

I've come up with a couple workarounds but these only work for specific 
cases:

// (1) This optimizes out nicely, but only works for adding 2 objects.
typedef proto::terminal< vector2< float[3] > >::type expr_type;
std::cout << (vector_expr< expr_type >( expr_type::make(v1) ) + vector_expr<
expr_type >( expr_type::make(v2) ))[0];

// (2) This never optimizes out
std::cout << (v1 + v2)[0];

I'd noticed that the reference at traits.hpp line 144 seemed to play a major
role (it was the only significant difference between the expression trees
generated by the above two snippets.)

// (3) This makes (2) optimize just like (1), but doesn't help 3+
// HACK TO REMOVE REF AT boost\xpressive\proto\traits.hpp LINE 144
namespace boost { namespace proto { namespace result_of {
template<typename T, typename EnableIf>
struct as_arg<T, VectorDomain, EnableIf>
{
        typedef expr<proto::tag::terminal, args0<T /*&*/> > expr_type;
        typedef typename VectorDomain::template apply<expr_type>::type type;

        template<typename T2>
        static type call(T2 &t)
        {
                return VectorDomain::make(expr_type::make(t));
        }
};
}}}

Finally, here is the entire source I'm using:

#include <cstddef>
#include <iostream>
#include <boost/xpressive/proto/proto.hpp>
#include <boost/xpressive/proto/context.hpp>
using namespace boost;

template< typename T>
struct vector2;

template< typename T >
struct vector2<T[3]>
{
        T v[3];

        typedef T value_type;

        T const & operator [](std::size_t i) const
        {
                return this->v[i];
        }

        T & operator [](std::size_t i)
        {
                return this->v[i];
        }
};

// This grammar describes which lazy vector expressions
// are allowed; namely, vector terminals and addition
// and subtraction of lazy vector expressions.
struct VectorGrammar
: proto::or_
         <
                 proto::terminal< vector2< proto::_ > >
         ,       proto::plus< VectorGrammar, VectorGrammar >
         >
{
};

// Expressions in the lazy vector domain must conform
// to the lazy vector grammar
struct VectorDomain;

// Here is an evaluation context that indexes into a lazy vector
// expression, and combines the result.
struct subscript_context
{
private:
        std::size_t i;
public:
        subscript_context(std::size_t i) : i(i)
        {
        }

        // Use default_eval for all the operations ...
        template< typename Expr, typename Tag = typename Expr::proto_tag >
        struct eval : proto::default_eval< Expr, subscript_context const >
        {
        };

        // ... except for terminals, which we index with our subscript
        template< typename Expr >
        struct eval< Expr, proto::tag::terminal >
        {
                typedef typename proto::result_of::arg< Expr
...
::type::value_type
result_type;
result_type operator ()(Expr const & expr, subscript_context
const & ctx) const
                {
                        return proto::arg(expr)[ctx.i];
                }
        };
};

// Here is the domain-specific expression wrapper, which overrides
// operator [] to evaluate the expression using the subscript_context.
template< typename Expr >
struct vector_expr : proto::extends<Expr, vector_expr< Expr >, VectorDomain
...
{
        typedef proto::extends< Expr, vector_expr< Expr >, VectorDomain > 
base_type;

        vector_expr(Expr const & expr) : base_type(expr)
        {
        }

        // Use the subscript_context to implement subscripting
        // of a lazy vector expression tree.
        typename proto::result_of::eval< Expr, subscript_context const
...
::type
        operator [](std::size_t i) const
        {
                subscript_context const ctx(i);
                return proto::eval(*this, ctx);
        }
};
// Tell proto that in the VectorDomain, all
// expressions should be wrapped in vector_expr<>
struct VectorDomain : proto::domain< proto::generator< vector_expr >,
VectorGrammar >
{
};

template< typename T >
struct is_vector
: mpl::false_
{
};

template< typename T >
struct is_vector< vector2< T > >
: mpl::true_
{
};

BOOST_PROTO_DEFINE_BINARY_OPERATOR(+, boost::proto::tag::plus, is_vector,
VectorDomain);

int main()
{
        vector2< float[3] > v1 = { 0, 1, 2 };
        vector2< float[3] > v2 = { 3, 4, 5 };
        vector2< float[3] > v3 = { 6, 7, 8 };

        // Add two vectors lazily and get the 2nd element.
#if 1
        // this sticks write-only temporary junk all over the stack, even 
with 2
terms
        std::cout << (v1 + v2)[0];// + v3 + v3 + v3 + v3 + v3)[0];
#else
        // this optimizes out completely for 2 terms... expanding to 3 terms
does not
        typedef proto::terminal< vector2< float[3] > >::type expr_type;
        std::cout << (vector_expr< expr_type >( expr_type::make(v2) )
                    + vector_expr< expr_type >( expr_type::make(v3) ))[0];
#endif

        std::cin.get();

        return 0;
}

Thanks in advance.
//

AJD < n.outpost@gmail.com >