diff --git a/booster/tree/xgboost_svdf_tree.hpp b/booster/tree/xgboost_svdf_tree.hpp new file mode 100644 index 000000000..fd42402c0 --- /dev/null +++ b/booster/tree/xgboost_svdf_tree.hpp @@ -0,0 +1,556 @@ +#ifndef _XGBOOST_APEX_TREE_HPP_ +#define _XGBOOST_APEX_TREE_HPP_ +/*! + * \file xgboost_svdf_tree.hpp + * \brief implementation of regression tree, with layerwise support + * this file is adapted from GBRT implementation in SVDFeature project + * \author Tianqi Chen: tqchen@apex.sjtu.edu.cn, tianqi.tchen@gmail.com + */ +#include +#include "xgboost_tree_model.h" +#include "../../utils/xgboost_random.h" +#include "../../utils/xgboost_matrix_csr.h" + +namespace xgboost{ + namespace booster{ + const bool rt_debug = false; + // whether to check bugs + const bool check_bug = false; + + const float rt_eps = 1e-5f; + const float rt_2eps = rt_eps * 2.0f; + + inline double sqr( double a ){ + return a * a; + } + + inline void assert_sorted( unsigned *idset, int len ){ + if( !rt_debug || !check_bug ) return; + for( int i = 1; i < len; i ++ ){ + utils::Assert( idset[i-1] < idset[i], "idset not sorted" ); + } + } + }; + + namespace booster{ + // node stat used in rtree + struct RTreeNodeStat{ + // loss chg caused by current split + float loss_chg; + // weight of current node + float base_weight; + // number of child that is leaf node known up to now + int leaf_child_cnt; + }; + + // structure of Regression Tree + class RTree: public TreeModel{ + }; + + // selecter of rtree to find the suitable candidate + class RTSelecter{ + public: + struct Entry{ + float loss_chg; + size_t start; + int len; + unsigned sindex; + float split_value; + Entry(){} + Entry( float loss_chg, size_t start, int len, unsigned split_index, float split_value, bool default_left ){ + this->loss_chg = loss_chg; + this->start = start; + this->len = len; + if( default_left ) split_index |= (1U << 31); + this->sindex = split_index; + this->split_value = split_value; + } + inline unsigned split_index( void ) const{ + return sindex & ( (1U<<31) - 1U ); + } + inline bool default_left( void ) const{ + return (sindex >> 31) != 0; + } + }; + private: + Entry best_entry; + const TreeParamTrain ¶m; + public: + RTSelecter( const TreeParamTrain &p ):param( p ){ + memset( &best_entry, 0, sizeof(best_entry) ); + best_entry.loss_chg = 0.0f; + } + inline void push_back( const Entry &e ){ + if( e.loss_chg > best_entry.loss_chg ) best_entry = e; + } + inline const Entry & select( void ){ + return best_entry; + } + }; + + // updater of rtree, allows the parameters to be stored inside, key solver + class RTreeUpdater{ + protected: + // training task, element of single task + struct Task{ + // node id in tree + int nid; + // idset pointer, instance id in [idset,idset+len) + unsigned *idset; + // length of idset + unsigned len; + // base_weight of parent + float parent_base_weight; + Task(){} + Task( int nid, unsigned *idset, unsigned len, float pweight = 0.0f ){ + this->nid = nid; + this->idset = idset; + this->len = len; + this->parent_base_weight = pweight; + } + }; + + // sparse column entry + struct SCEntry{ + // feature value + float fvalue; + // row index in grad + unsigned rindex; + SCEntry(){} + SCEntry( float fvalue, unsigned rindex ){ + this->fvalue = fvalue; this->rindex = rindex; + } + inline bool operator<( const SCEntry &p ) const{ + return fvalue < p.fvalue; + } + }; + private: + // training parameter + const TreeParamTrain ¶m; + // parameters, reference + RTree &tree; + std::vector &grad; + std::vector &hess; + const FMatrixS::Image 
&smat; + const std::vector &group_id; + private: + // maximum depth up to now + int max_depth; + // number of nodes being pruned + int num_pruned; + // stack to store current task + std::vector task_stack; + // temporal space for index set + std::vector idset; + private: + // task management: NOTE DFS here + inline void add_task( Task tsk ){ + task_stack.push_back( tsk ); + } + inline bool next_task( Task &tsk ){ + if( task_stack.size() == 0 ) return false; + tsk = task_stack.back(); + task_stack.pop_back(); + return true; + } + private: + // try to prune off current leaf, return true if successful + inline void try_prune_leaf( int nid, int depth ){ + if( tree[ nid ].is_root() ) return; + int pid = tree[ nid ].parent(); + RTree::NodeStat &s = tree.stat( pid ); + s.leaf_child_cnt ++; + + if( s.leaf_child_cnt >= 2 && param.need_prune( s.loss_chg, depth - 1 ) ){ + // need to be pruned + tree.ChangeToLeaf( pid, param.learning_rate * s.base_weight ); + // add statistics to number of nodes pruned + num_pruned += 2; + // tail recursion + this->try_prune_leaf( pid, depth - 1 ); + } + } + // make leaf for current node :) + inline void make_leaf( Task tsk, double sum_grad, double sum_hess, bool compute ){ + for( unsigned i = 0; i < tsk.len; i ++ ){ + const unsigned ridx = tsk.idset[i]; + if( compute ){ + sum_grad += grad[ ridx ]; + sum_hess += hess[ ridx ]; + } + } + tree[ tsk.nid ].set_leaf( param.learning_rate * param.CalcWeight( sum_grad, sum_hess, tsk.parent_base_weight ) ); + this->try_prune_leaf( tsk.nid, tree.GetDepth( tsk.nid ) ); + } + private: + // make split for current task, re-arrange positions in idset + inline void make_split( Task tsk, const SCEntry *entry, int num, float loss_chg, double base_weight ){ + // before split, first prepare statistics + RTree::NodeStat &s = tree.stat( tsk.nid ); + s.loss_chg = loss_chg; + s.leaf_child_cnt = 0; + s.base_weight = static_cast( base_weight ); + + // add childs to current node + tree.AddChilds( tsk.nid ); + // assert that idset is sorted + assert_sorted( tsk.idset, tsk.len ); + // use merge sort style to get the solution + std::vector qset; + for( int i = 0; i < num; i ++ ){ + qset.push_back( entry[i].rindex ); + } + std::sort( qset.begin(), qset.end() ); + // do merge sort style, make the other set, remove elements in qset + for( unsigned i = 0, top = 0; i < tsk.len; i ++ ){ + if( top < qset.size() ){ + if( tsk.idset[ i ] != qset[ top ] ){ + tsk.idset[ i - top ] = tsk.idset[ i ]; + }else{ + top ++; + } + }else{ + tsk.idset[ i - qset.size() ] = tsk.idset[ i ]; + } + } + // get two parts + RTree::Node &n = tree[ tsk.nid ]; + Task def_part( n.default_left() ? n.cleft() : n.cright(), tsk.idset, tsk.len - qset.size(), s.base_weight ); + Task spl_part( n.default_left() ? 
n.cright(): n.cleft() , tsk.idset + def_part.len, qset.size(), s.base_weight ); + // fill back split part + for( unsigned i = 0; i < spl_part.len; i ++ ){ + spl_part.idset[ i ] = qset[ i ]; + } + // add tasks to the queue + this->add_task( def_part ); + this->add_task( spl_part ); + } + + // enumerate split point of the tree + inline void enumerate_split( RTSelecter &sglobal, int tlen, + double rsum_grad, double rsum_hess, double root_cost, + const SCEntry *entry, size_t start, size_t end, + int findex, float parent_base_weight ){ + // local selecter + RTSelecter slocal( param ); + + if( param.default_direction != 1 ){ + // forward process, default right + double csum_grad = 0.0, csum_hess = 0.0; + for( size_t j = start; j < end; j ++ ){ + const unsigned ridx = entry[ j ].rindex; + csum_grad += grad[ ridx ]; + csum_hess += hess[ ridx ]; + // check for split + if( j == end - 1 || entry[j].fvalue + rt_2eps < entry[ j + 1 ].fvalue ){ + if( csum_hess < param.min_child_weight ) continue; + const double dsum_hess = rsum_hess - csum_hess; + if( dsum_hess < param.min_child_weight ) break; + // change of loss + double loss_chg = + param.CalcCost( csum_grad, csum_hess, parent_base_weight ) + + param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost; + + const int clen = static_cast( j + 1 - start ); + // add candidate to selecter + slocal.push_back( RTSelecter::Entry( loss_chg, start, clen, findex, + j == end - 1 ? entry[j].fvalue + rt_eps : 0.5 * (entry[j].fvalue+entry[j+1].fvalue), + false ) ); + } + } + } + + if( param.default_direction != 2 ){ + // backward process, default left + double csum_grad = 0.0, csum_hess = 0.0; + for( size_t j = end; j > start; j -- ){ + const unsigned ridx = entry[ j - 1 ].rindex; + csum_grad += grad[ ridx ]; + csum_hess += hess[ ridx ]; + // check for split + if( j == start + 1 || entry[ j - 2 ].fvalue + rt_2eps < entry[ j - 1 ].fvalue ){ + if( csum_hess < param.min_child_weight ) continue; + const double dsum_hess = rsum_hess - csum_hess; + if( dsum_hess < param.min_child_weight ) break; + double loss_chg = param.CalcCost( csum_grad, csum_hess, parent_base_weight ) + + param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost; + const int clen = static_cast( end - j + 1 ); + // add candidate to selecter + slocal.push_back( RTSelecter::Entry( loss_chg, j - 1, clen, findex, + j == start + 1 ? 
entry[j-1].fvalue - rt_eps : 0.5 * (entry[j-2].fvalue + entry[j-1].fvalue), + true ) ); + } + } + } + sglobal.push_back( slocal.select() ); + } + + private: + // temporal storage for expand column major + std::vector tmp_rptr; + // find split for current task, another implementation of expand in column major manner + // should be more memory frugal, avoid global sorting across feature + inline void expand( Task tsk ){ + // assert that idset is sorted + // if reach maximum depth, make leaf from current node + int depth = tree.GetDepth( tsk.nid ); + // update statistiss + if( depth > max_depth ) max_depth = depth; + // if bigger than max depth + if( depth >= param.max_depth ){ + this->make_leaf( tsk, 0.0, 0.0, true ); return; + } + // convert to column major CSR format + const int nrows = tree.param.num_feature; + if( tmp_rptr.size() == 0 ){ + // initialize tmp storage in first usage + tmp_rptr.resize( nrows + 1 ); + std::fill( tmp_rptr.begin(), tmp_rptr.end(), 0 ); + } + // records the columns + std::vector entry; + // records the active features + std::vector aclist; + utils::SparseCSRMBuilder builder( tmp_rptr, entry, aclist ); + builder.InitBudget( nrows ); + // statistics of root + double rsum_grad = 0.0, rsum_hess = 0.0; + for( unsigned i = 0; i < tsk.len; i ++ ){ + const unsigned ridx = tsk.idset[i]; + rsum_grad += grad[ ridx ]; + rsum_hess += hess[ ridx ]; + + FMatrixS::Line sp = smat[ ridx ]; + for( unsigned j = 0; j < sp.len; j ++ ){ + builder.AddBudget( sp.findex[j] ); + } + } + + // if minimum split weight is not meet + if( param.cannot_split( rsum_hess, depth ) ){ + this->make_leaf( tsk, rsum_grad, rsum_hess, false ); builder.Cleanup(); return; + } + + builder.InitStorage(); + for( unsigned i = 0; i < tsk.len; i ++ ){ + const unsigned ridx = tsk.idset[i]; + FMatrixS::Line sp = smat[ ridx ]; + for( unsigned j = 0; j < sp.len; j ++ ){ + builder.PushElem( sp.findex[j], SCEntry( sp.fvalue[j], ridx ) ); + } + } + // --- end of building column major matrix --- + // after this point, tmp_rptr and entry is ready to use + + // global selecter + RTSelecter sglobal( param ); + // cost root + const double root_cost = param.CalcRootCost( rsum_grad, rsum_hess ); + // KEY: layerwise, weight of current node if it is leaf + const double base_weight = param.CalcWeight( rsum_grad, rsum_hess, tsk.parent_base_weight ); + // enumerate feature index + for( size_t i = 0; i < aclist.size(); i ++ ){ + int findex = static_cast( aclist[i] ); + size_t start = tmp_rptr[ findex ]; + size_t end = tmp_rptr[ findex + 1 ]; + utils::Assert( start < end, "bug" ); + // local sort can be faster when the features are sparse + std::sort( entry.begin() + start, entry.begin() + end ); + // local selecter + this->enumerate_split( sglobal, tsk.len, + rsum_grad, rsum_hess, root_cost, + &entry[0], start, end, findex, base_weight ); + } + // Cleanup tmp_rptr for next use + builder.Cleanup(); + // get the best solution + const RTSelecter::Entry &e = sglobal.select(); + // allowed to split + if( e.loss_chg > rt_eps ){ + // add splits + tree[ tsk.nid ].set_split( e.split_index(), e.split_value, e.default_left() ); + // re-arrange idset, push tasks + this->make_split( tsk, &entry[ e.start ], e.len, e.loss_chg, base_weight ); + }else{ + // make leaf if we didn't meet requirement + this->make_leaf( tsk, rsum_grad, rsum_hess, false ); + } + } + private: + // initialize the tasks + inline void init_tasks( size_t ngrads ){ + // add group partition if necessary + if( group_id.size() == 0 ){ + if( param.subsample > 1.0f - 1e-6f ){ + 
idset.resize( 0 ); + for( size_t i = 0; i < ngrads; i ++ ){ + if( hess[i] < 0.0f ) continue; + idset.push_back( (unsigned)i ); + } + }else{ + idset.resize( 0 ); + for( size_t i = 0; i < ngrads; i ++ ){ + if( random::SampleBinary( param.subsample ) != 0 ){ + idset.push_back( (unsigned)i ); + } + } + } + this->add_task( Task( 0, &idset[0], idset.size() ) ); return; + } + + utils::Assert( group_id.size() == ngrads, "number of groups must be exact" ); + {// new method for grouping, use CSR builder + std::vector rptr; + utils::SparseCSRMBuilder builder( rptr, idset ); + builder.InitBudget( tree.param.num_roots ); + for( size_t i = 0; i < group_id.size(); i ++ ){ + // drop invalid elements + if( hess[ i ] < 0.0f ) continue; + utils::Assert( group_id[ i ] < (unsigned)tree.param.num_roots, + "group id exceed number of roots" ); + builder.AddBudget( group_id[ i ] ); + } + builder.InitStorage(); + for( size_t i = 0; i < group_id.size(); i ++ ){ + // drop invalid elements + if( hess[ i ] < 0.0f ) continue; + builder.PushElem( group_id[ i ], static_cast(i) ); + } + for( size_t i = 1; i < rptr.size(); i ++ ){ + const size_t start = rptr[ i - 1 ], end = rptr[ i ]; + if( start < end ){ + this->add_task( Task( i - 1, &idset[ start ], end - start ) ); + } + } + } + } + public: + RTreeUpdater( const TreeParamTrain &pparam, + RTree &ptree, + std::vector &pgrad, + std::vector &phess, + const FMatrixS::Image &psmat, + const std::vector &pgroup_id ): + param( pparam ), tree( ptree ), grad( pgrad ), hess( phess ), + smat( psmat ), group_id( pgroup_id ){ + } + inline int do_boost( int &num_pruned ){ + this->init_tasks( grad.size() ); + this->max_depth = 0; + this->num_pruned = 0; + Task tsk; + while( this->next_task( tsk ) ){ + this->expand( tsk ); + } + num_pruned = this->num_pruned; + return max_depth; + } + }; + + + + class RTreeTrainer : public IBooster{ + private: + int silent; + // tree of current shape + RTree tree; + TreeParamTrain param; + private: + std::vector tmp_feat; + std::vector tmp_funknown; + inline void init_tmpfeat( void ){ + if( tmp_feat.size() != (size_t)tree.param.num_feature ){ + tmp_feat.resize( tree.param.num_feature ); + tmp_funknown.resize( tree.param.num_feature ); + std::fill( tmp_funknown.begin(), tmp_funknown.end(), true ); + } + } + public: + virtual void SetParam( const char *name, const char *val ){ + if( !strcmp( name, "silent") ) silent = atoi( val ); + param.SetParam( name, val ); + tree.param.SetParam( name, val ); + } + virtual void LoadModel( utils::IStream &fi ){ + tree.LoadModel( fi ); + } + virtual void SaveModel( utils::IStream &fo ) const{ + tree.SaveModel( fo ); + } + virtual void InitModel( void ){ + tree.InitModel(); + } + private: + inline int get_next( int pid, float fvalue, bool is_unknown ){ + float split_value = tree[ pid ].split_cond(); + if( is_unknown ){ + if( tree[ pid ].default_left() ) return tree[ pid ].cleft(); + else return tree[ pid ].cright(); + }else{ + if( fvalue < split_value ) return tree[ pid ].cleft(); + else return tree[ pid ].cright(); + } + } + public: + virtual void DoBoost( std::vector &grad, + std::vector &hess, + const FMatrixS::Image &smat, + const std::vector &group_id ){ + utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" ); + if( !silent ){ + printf( "\nbuild GBRT with %u instances\n", (unsigned)grad.size() ); + } + // start with a id set + RTreeUpdater updater( param, tree, grad, hess, smat, group_id ); + int num_pruned; + tree.param.max_depth = updater.do_boost( num_pruned ); + + if( !silent ){ 
+ printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n", + tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth ); + } + } + + virtual int GetLeafIndex( const std::vector &feat, + const std::vector &funknown, + unsigned gid = 0 ){ + // start from groups that belongs to current data + int pid = (int)gid; + // tranverse tree + while( !tree[ pid ].is_leaf() ){ + unsigned split_index = tree[ pid ].split_index(); + pid = this->get_next( pid, feat[ split_index ], funknown[ split_index ] ); + } + return pid; + } + virtual float Predict( const FMatrixS::Line &feat, unsigned gid = 0 ){ + this->init_tmpfeat(); + for( unsigned i = 0; i < feat.len; i ++ ){ + utils::Assert( feat.findex[i] < (unsigned)tmp_funknown.size() , "input feature execeed bound" ); + tmp_funknown[ feat.findex[i] ] = false; + tmp_feat[ feat.findex[i] ] = feat.fvalue[i]; + } + int pid = this->GetLeafIndex( tmp_feat, tmp_funknown, gid ); + // set back + for( unsigned i = 0; i < feat.len; i ++ ){ + tmp_funknown[ feat.findex[i] ] = true; + } + return tree[ pid ].leaf_value(); + } + virtual float Predict( const std::vector &feat, + const std::vector &funknown, + unsigned gid = 0 ){ + utils::Assert( feat.size() >= (size_t)tree.param.num_feature, + "input data smaller than num feature" ); + int pid = this->GetLeafIndex( feat, funknown, gid ); + return tree[ pid ].leaf_value(); + } + public: + RTreeTrainer( void ){ silent = 0; } + virtual ~RTreeTrainer( void ){} + }; + }; +}; +#endif + + diff --git a/booster/tree/xgboost_tree_model.h b/booster/tree/xgboost_tree_model.h new file mode 100644 index 000000000..fb9f4fdf5 --- /dev/null +++ b/booster/tree/xgboost_tree_model.h @@ -0,0 +1,415 @@ +#ifndef _XGBOOST_TREE_MODEL_H_ +#define _XGBOOST_TREE_MODEL_H_ +/*! + * \file xgboost_tree_model.h + * \brief generic definition of model structure used in tree models + * used to support learning of boosting tree + * \author Tianqi Chen: tianqi.tchen@gmail.com + */ +#include +#include "../../utils/xgboost_utils.h" +#include "../../utils/xgboost_stream.h" + +namespace xgboost{ + namespace booster{ + /*! + * \brief template class of TreeModel + * \tparam TSplitCond data type to indicate split condition + * \tparam TNodeStat auxiliary statistics of node to help tree building + */ + template + class TreeModel{ + public: + /*! \brief data type to indicate split condition */ + typedef TNodeStat NodeStat; + /*! \brief auxiliary statistics of node to help tree building */ + typedef TSplitCond SplitCond; + public: + /*! \brief parameters of the tree */ + struct Param{ + /*! \brief number of start root */ + int num_roots; + /*! \brief total number of nodes */ + int num_nodes; + /*!\brief number of deleted nodes */ + int num_deleted; + /*! \brief maximum depth, this is a statistics of the tree */ + int max_depth; + /*! \brief number of features used for tree construction */ + int num_feature; + /*! \brief reserved part */ + int reserved[ 32 ]; + /*! \brief constructor */ + Param( void ){ + max_depth = 0; + memset( reserved, 0, sizeof( reserved ) ); + } + /*! + * \brief set parameters from outside + * \param name name of the parameter + * \param val value of the parameter + */ + inline void SetParam( const char *name, const char *val ){ + if( !strcmp("num_roots", name ) ) num_roots = atoi( val ); + if( !strcmp("num_feature", name ) ) num_feature = atoi( val ); + } + }; + /*! \brief tree node */ + class Node{ + private: + friend class TreeModel; + /*! 
+ * \brief in leaf node, we have weights, in non-leaf nodes, + * we have split condition + */ + union Info{ + float leaf_value; + TSplitCond split_cond; + }; + private: + // pointer to parent, highest bit is used to indicate whether it's a left child or not + int sparent; + // pointer to left, right + int left, right; + // split feature index, left split or right split depends on the highest bit + unsigned sindex; + // extra info + Info info; + private: + inline void set_parent( int pidx, bool is_left_child = true ){ + if( is_left_child ) pidx |= (1U << 31); + this->sparent = pidx; + } + public: + /*! \brief index of left child */ + inline int cleft( void ) const{ + return this->left; + } + /*! \brief index of right child */ + inline int cright( void ) const{ + return this->right; + } + /*! \brief feature index of split condition */ + inline unsigned split_index( void ) const{ + return sindex & ( (1U<<31) - 1U ); + } + /*! \brief when feature is unknown, whether goes to left child */ + inline bool default_left( void ) const{ + return (sindex >> 31) != 0; + } + /*! \brief whether current node is leaf node */ + inline bool is_leaf( void ) const{ + return left == -1; + } + /*! \brief get leaf value of leaf node */ + inline float leaf_value( void ) const{ + return (this->info).leaf_value; + } + /*! \brief get split condition of the node */ + inline TSplitCond split_cond( void ) const{ + return (this->info).split_cond; + } + /*! \brief get parent of the node */ + inline int parent( void ) const{ + return sparent & ( (1U << 31) - 1 ); + } + /*! \brief whether current node is left child */ + inline bool is_left_child( void ) const{ + return ( sparent & (1U << 31)) != 0; + } + /*! \brief whether current node is root */ + inline bool is_root( void ) const{ + return sparent == -1; + } + /*! + * \brief set the right child + * \param nide node id to right child + */ + inline void set_right_child( int nid ){ + this->right = nid; + } + /*! + * \brief set split condition of current node + * \param split_index feature index to split + * \param split_cond split condition + * \param default_left the default direction when feature is unknown + */ + inline void set_split( unsigned split_index, TSplitCond split_cond, bool default_left = false ){ + if( default_left ) split_index |= (1U << 31); + this->sindex = split_index; + (this->info).split_cond = split_cond; + } + /*! + * \brief set the leaf value of the node + * \param value leaf value + * \param right right index, could be used to store + * additional information + */ + inline void set_leaf( float value, int right = -1 ){ + (this->info).leaf_value = value; + this->left = -1; + this->right = right; + } + }; + protected: + // vector of nodes + std::vector nodes; + // stats of nodes + std::vector stats; + protected: + // free node space, used during training process + std::vector deleted_nodes; + // allocate a new node, + // !!!!!! NOTE: may cause BUG here, nodes.resize + inline int AllocNode( void ){ + if( param.num_deleted != 0 ){ + int nd = deleted_nodes.back(); + deleted_nodes.pop_back(); + param.num_deleted --; + return nd; + } + int nd = param.num_nodes ++; + nodes.resize( param.num_nodes ); + stats.resize( param.num_nodes ); + return nd; + } + // delete a tree node + inline void DeleteNode( int nid ){ + utils::Assert( nid >= param.num_roots, "can not delete root"); + deleted_nodes.push_back( nid ); + nodes[ nid ].set_parent( -1 ); + param.num_deleted ++; + } + public: + /*! 
+ * \brief change a non leaf node to a leaf node, delete its children + * \param rid node id of the node + * \param new leaf value + */ + inline void ChangeToLeaf( int rid, float value ){ + utils::Assert( nodes[ nodes[rid].left ].is_leaf(), "can not delete a non termial child"); + utils::Assert( nodes[ nodes[rid].right ].is_leaf(), "can not delete a non termial child"); + this->DeleteNode( nodes[ rid ].left ); + this->DeleteNode( nodes[ rid ].right ); + nodes[ rid ].set_leaf( value ); + } + public: + /*! \brief model parameter */ + Param param; + public: + /*! \brief constructor */ + TreeModel( void ){ + param.num_nodes = 1; + param.num_roots = 1; + param.num_deleted = 0; + nodes.resize( 1 ); + } + /*! \brief get node given nid */ + inline Node &operator[]( int nid ){ + return nodes[ nid ]; + } + /*! \brief get node statistics given nid */ + inline NodeStat &stat( int nid ){ + return stats[ nid ]; + } + /*! \brief initialize the model */ + inline void InitModel( void ){ + param.num_nodes = param.num_roots; + nodes.resize( param.num_nodes ); + stats.resize( param.num_nodes ); + for( int i = 0; i < param.num_nodes; i ++ ){ + nodes[i].set_leaf( 0.0f ); + nodes[i].set_parent( -1 ); + } + } + /*! + * \brief load model from stream + * \param fi input stream + */ + inline void LoadModel( utils::IStream &fi ){ + utils::Assert( fi.Read( ¶m, sizeof(Param) ) > 0, "TreeModel" ); + nodes.resize( param.num_nodes ); + utils::Assert( fi.Read( &nodes[0], sizeof(Node) * nodes.size() ) > 0, "TreeModel::Node" ); + + deleted_nodes.resize( 0 ); + for( int i = param.num_roots; i < param.num_nodes; i ++ ){ + if( nodes[i].is_root() ) deleted_nodes.push_back( i ); + } + utils::Assert( (int)deleted_nodes.size() == param.num_deleted, "number of deleted nodes do not match" ); + } + /*! + * \brief save model to stream + * \param fo output stream + */ + inline void SaveModel( utils::IStream &fo ) const{ + utils::Assert( param.num_nodes == (int)nodes.size() ); + fo.Write( ¶m, sizeof(Param) ); + fo.Write( &nodes[0], sizeof(Node) * nodes.size() ); + } + /*! + * \brief add child nodes to node + * \param nid node id to add childs + */ + inline void AddChilds( int nid ){ + int pleft = this->AllocNode(); + int pright = this->AllocNode(); + nodes[ nid ].left = pleft; + nodes[ nid ].right = pright; + nodes[ nodes[ nid ].left ].set_parent( nid, true ); + nodes[ nodes[ nid ].right ].set_parent( nid, false ); + } + /*! + * \brief only add a right child to a leaf node + * \param node id to add right child + */ + inline void AddRightChild( int nid ){ + int pright = this->AllocNode(); + nodes[ nid ].right = pright; + nodes[ nodes[ nid ].right ].set_parent( nid, false ); + } + /*! + * \brief get current depth + * \param nid node id + * \param pass_rchild whether right child is not counted in depth + */ + inline int GetDepth( int nid, bool pass_rchild = false ) const{ + int depth = 0; + while( !nodes[ nid ].is_root() ){ + if( !pass_rchild || nodes[ nid ].is_left_child() ) depth ++; + nid = nodes[ nid ].parent(); + } + return depth; + } + /*! \brief number of extra nodes besides the root */ + inline int num_extra_nodes( void ) const { + return param.num_nodes - param.num_roots - param.num_deleted; + } + }; + }; + + namespace booster{ + /*! 
\brief training parameters for regression tree */ + struct TreeParamTrain{ + // learning step size for a time + float learning_rate; + // minimum loss change required for a split + float min_split_loss; + // maximum depth of a tree + int max_depth; + //----- the rest parameters are less important ---- + // minimum amount of hessian(weight) allowed in a child + float min_child_weight; + // weight decay parameter used to control leaf fitting + float reg_lambda; + // reg method + int reg_method; + // default direction choice + int default_direction; + // whether we want to do subsample + float subsample; + // whether to use layerwise aware regularization + int use_layerwise; + /*! \brief constructor */ + TreeParamTrain( void ){ + learning_rate = 0.3f; + min_child_weight = 1.0f; + max_depth = 6; + reg_lambda = 1.0f; + reg_method = 2; + default_direction = 0; + subsample = 1.0f; + use_layerwise = 0; + } + /*! + * \brief set parameters from outside + * \param name name of the parameter + * \param val value of the parameter + */ + inline void SetParam( const char *name, const char *val ){ + // sync-names + if( !strcmp( name, "gamma") ) min_split_loss = (float)atof( val ); + if( !strcmp( name, "eta") ) learning_rate = (float)atof( val ); + if( !strcmp( name, "lambda") ) reg_lambda = (float)atof( val ); + // normal tree prameters + if( !strcmp( name, "learning_rate") ) learning_rate = (float)atof( val ); + if( !strcmp( name, "min_child_weight") ) min_child_weight = (float)atof( val ); + if( !strcmp( name, "min_split_loss") ) min_split_loss = (float)atof( val ); + if( !strcmp( name, "max_depth") ) max_depth = atoi( val ); + if( !strcmp( name, "reg_lambda") ) reg_lambda = (float)atof( val ); + if( !strcmp( name, "reg_method") ) reg_method = (float)atof( val ); + if( !strcmp( name, "subsample") ) subsample = (float)atof( val ); + if( !strcmp( name, "use_layerwise") ) use_layerwise = atoi( val ); + if( !strcmp( name, "default_direction") ) { + if( !strcmp( val, "learn") ) default_direction = 0; + if( !strcmp( val, "left") ) default_direction = 1; + if( !strcmp( val, "right") ) default_direction = 2; + } + } + protected: + // functions for L1 cost + static inline double ThresholdL1( double w, double lambda ){ + if( w > +lambda ) return w - lambda; + if( w < -lambda ) return w + lambda; + return 0.0; + } + inline double CalcWeight( double sum_grad, double sum_hess )const{ + if( sum_hess < min_child_weight ){ + return 0.0; + }else{ + switch( reg_method ){ + case 1: return - ThresholdL1( sum_grad, reg_lambda ) / sum_hess; + case 2: return - sum_grad / ( sum_hess + reg_lambda ); + // elstic net + case 3: return - ThresholdL1( sum_grad, 0.5 * reg_lambda ) / ( sum_hess + 0.5 * reg_lambda ); + default: return - sum_grad / sum_hess; + } + } + } + private: + inline static double Sqr( double a ){ + return a * a; + } + public: + // calculate the cost of loss function + inline double CalcCost( double sum_grad, double sum_hess ) const{ + if( sum_hess < min_child_weight ){ + return 0.0; + } + switch( reg_method ){ + case 1 : return Sqr( ThresholdL1( sum_grad, reg_lambda ) ) / sum_hess; + case 2 : return Sqr( sum_grad ) / ( sum_hess + reg_lambda ); + // elstic net + case 3 : return Sqr( ThresholdL1( sum_grad, 0.5 * reg_lambda ) ) / ( sum_hess + 0.5 * reg_lambda ); + default: return Sqr( sum_grad ) / sum_hess; + } + } + // KEY:layerwise + // calculate cost of root + inline double CalcRootCost( double sum_grad, double sum_hess ) const{ + if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess ); + else return 
0.0; + } + // KEY:layerwise + // calculate the cost after split + // base_weight: the base_weight of parent + inline double CalcCost( double sum_grad, double sum_hess, double base_weight ) const{ + if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess ); + else return this->CalcCost( sum_grad + sum_hess * base_weight, sum_hess ); + } + // calculate the weight of leaf + inline double CalcWeight( double sum_grad, double sum_hess, double parent_base_weight )const{ + if( use_layerwise == 0 ) return CalcWeight( sum_grad, sum_hess ); + else return parent_base_weight + CalcWeight( sum_grad + parent_base_weight * sum_hess, sum_hess ); + } + /*! \brief given the loss change, whether we need to invode prunning */ + inline bool need_prune( double loss_chg, int depth ) const{ + return loss_chg < min_split_loss; + } + /*! \brief whether we can split with current hessian */ + inline bool cannot_split( double sum_hess, int depth ) const{ + return sum_hess < min_child_weight * 2.0; + } + }; + }; +}; +#endif diff --git a/booster/xgboost.cpp b/booster/xgboost.cpp index 381067d08..6e04b3166 100644 --- a/booster/xgboost.cpp +++ b/booster/xgboost.cpp @@ -10,6 +10,9 @@ #include "xgboost.h" #include "../utils/xgboost_utils.h" #include "xgboost_gbmbase.h" +// implementations of boosters +#include "tree/xgboost_svdf_tree.hpp" + namespace xgboost{ namespace booster{ /*! diff --git a/booster/xgboost.h b/booster/xgboost.h index f1bfdb7ba..5e7bd47f6 100644 --- a/booster/xgboost.h +++ b/booster/xgboost.h @@ -3,6 +3,7 @@ /*! * \file xgboost.h * \brief the general gradient boosting interface + * * \author Tianqi Chen: tianqi.tchen@gmail.com */ #include diff --git a/booster/xgboost_gbmbase.h b/booster/xgboost_gbmbase.h index 66d51ca7a..c43f79871 100644 --- a/booster/xgboost_gbmbase.h +++ b/booster/xgboost_gbmbase.h @@ -53,7 +53,7 @@ namespace xgboost{ /*! \brief type of tree used */ int booster_type; /*! \brief number of root: default 0, means single tree */ - int num_root; + int num_roots; /*! \brief number of features to be used by boosters */ int num_feature; /*! \brief size of predicton buffer allocated for buffering boosting computation */ @@ -69,7 +69,7 @@ namespace xgboost{ Param( void ){ num_boosters = 0; booster_type = 0; - num_root = num_feature = 0; + num_roots = num_feature = 0; do_reboost = 0; num_pbuffer = 0; memset( reserved, 0, sizeof( reserved ) ); @@ -83,7 +83,7 @@ namespace xgboost{ if( !strcmp("booster_type", name ) ) booster_type = atoi( val ); if( !strcmp("num_pbuffer", name ) ) num_pbuffer = atoi( val ); if( !strcmp("do_reboost", name ) ) do_reboost = atoi( val ); - if( !strcmp("bst:num_root", name ) ) num_root = atoi( val ); + if( !strcmp("bst:num_roots", name ) ) num_roots = atoi( val ); if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val ); } }; @@ -98,9 +98,12 @@ namespace xgboost{ * \param val value of the parameter */ inline void SetParam( const char *name, const char *val ){ - if( strncmp( name, "bst:", 4 ) == 0 ){ + if( !strncmp( name, "bst:", 4 ) ){ cfg.PushBack( name + 4, val ); } + if( !strcmp( name, "silent") ){ + cfg.PushBack( name, val ); + } if( boosters.size() == 0 ) param.SetParam( name, val ); } /*! diff --git a/utils/xgboost_matrix_csr.h b/utils/xgboost_matrix_csr.h new file mode 100644 index 000000000..58c54f77b --- /dev/null +++ b/utils/xgboost_matrix_csr.h @@ -0,0 +1,152 @@ +/*! 
+ * \file xgboost_matrix_csr.h + * \brief this file defines some easy to use STL based class for in memory sparse CSR matrix + * \author Tianqi Chen: tianqi.tchen@gmail.com +*/ +#ifndef _XGBOOST_MATRIX_CSR_H_ +#define _XGBOOST_MATRIX_CSR_H_ +#include +#include +#include "xgboost_utils.h" + +namespace xgboost{ + namespace utils{ + /*! + * \brief a class used to help construct CSR format matrix, + * can be used to convert row major CSR to column major CSR + * \tparam IndexType type of index used to store the index position, usually unsigned or size_t + * \tparam whether enabling the usage of aclist, this option must be enabled manually + */ + template + struct SparseCSRMBuilder{ + private: + /*! \brief dummy variable used in the indicator matrix construction */ + std::vector dummy_aclist; + /*! \brief pointer to each of the row */ + std::vector &rptr; + /*! \brief index of nonzero entries in each row */ + std::vector &findex; + /*! \brief a list of active rows, used when many rows are empty */ + std::vector &aclist; + public: + SparseCSRMBuilder( std::vector &p_rptr, + std::vector &p_findex ) + :rptr(p_rptr), findex( p_findex ), aclist( dummy_aclist ){ + Assert( !UseAcList, "enabling bug" ); + } + /*! \brief use with caution! rptr must be cleaned before use */ + SparseCSRMBuilder( std::vector &p_rptr, + std::vector &p_findex, + std::vector &p_aclist ) + :rptr(p_rptr), findex( p_findex ), aclist( p_aclist ){ + Assert( UseAcList, "must manually enable the option use aclist" ); + } + public: + /*! + * \brief step 1: initialize the number of rows in the data + * \nrows number of rows in the matrix + */ + inline void InitBudget( size_t nrows ){ + if( !UseAcList ){ + rptr.resize( nrows + 1 ); + std::fill( rptr.begin(), rptr.end(), 0 ); + }else{ + Assert( nrows + 1 == rptr.size(), "rptr must be initialized already" ); + this->Cleanup(); + } + } + /*! + * \brief step 2: add budget to each rows, this function is called when aclist is used + * \param row_id the id of the row + * \param nelem number of element budget add to this row + */ + inline void AddBudget( size_t row_id, size_t nelem = 1 ){ + if( UseAcList ){ + if( rptr[ row_id + 1 ] == 0 ) aclist.push_back( row_id ); + } + rptr[ row_id + 1 ] += nelem; + } + /*! \brief step 3: initialize the necessary storage */ + inline void InitStorage( void ){ + // initialize rptr to be beginning of each segment + size_t start = 0; + if( !UseAcList ){ + for( size_t i = 1; i < rptr.size(); i ++ ){ + size_t rlen = rptr[ i ]; + rptr[ i ] = start; + start += rlen; + } + }else{ + // case with active list + std::sort( aclist.begin(), aclist.end() ); + + for( size_t i = 0; i < aclist.size(); i ++ ){ + size_t ridx = aclist[ i ]; + size_t rlen = rptr[ ridx + 1 ]; + rptr[ ridx + 1 ] = start; + // set previous rptr to right position if previous feature is not active + if( i == 0 || ridx != aclist[i-1] + 1 ) rptr[ ridx ] = start; + start += rlen; + } + } + findex.resize( start ); + } + /*! + * \brief step 4: + * used in indicator matrix construction, add new + * element to each row, the number of calls shall be exactly same as add_budget + */ + inline void PushElem( size_t row_id, IndexType col_id ){ + size_t &rp = rptr[ row_id + 1 ]; + findex[ rp ++ ] = col_id; + } + /*! 
+ * \brief step 5: only needed when aclist is used + * clean up the rptr for next usage + */ + inline void Cleanup( void ){ + Assert( UseAcList, "this function can only be called use AcList" ); + for( size_t i = 0; i < aclist.size(); i ++ ){ + const size_t ridx = aclist[i]; + rptr[ ridx ] = 0; rptr[ ridx + 1 ] = 0; + } + aclist.clear(); + } + }; + }; + + namespace utils{ + /*! + * \brief simple sparse matrix container + * \tparam IndexType type of index used to store the index position, usually unsigned or size_t + */ + template + struct SparseCSRMat{ + private: + /*! \brief pointer to each of the row */ + std::vector rptr; + /*! \brief index of nonzero entries in each row */ + std::vector findex; + public: + /*! \brief matrix builder*/ + SparseCSRMBuilder builder; + public: + SparseCSRMat( void ):builder( rptr, findex ){ + } + public: + /*! \return number of rows in the matrx */ + inline size_t NumRow( void ) const{ + return rptr.size() - 1; + } + /*! \return number of elements r-th row */ + inline size_t NumElem( size_t r ) const{ + return rptr[ r + 1 ] - rptr[ r ]; + } + /*! \return r-th row */ + inline const IndexType *operator[]( size_t r ) const{ + return &findex[ rptr[r] ]; + } + }; + }; +}; +#endif diff --git a/utils/xgboost_random.h b/utils/xgboost_random.h new file mode 100644 index 000000000..e7caef757 --- /dev/null +++ b/utils/xgboost_random.h @@ -0,0 +1,131 @@ +#ifndef _XGBOOST_RANDOM_H_ +#define _XGBOOST_RANDOM_H_ +/*! + * \file xgboost_random.h + * \brief PRNG to support random number generation + * \author Tianqi Chen: tianqi.tchen@gmail.com + * + * Use standard PRNG from stdlib + */ +#include +#include +#include + +#ifdef _MSC_VER +typedef unsigned char uint8_t; +typedef unsigned short int uint16_t; +typedef unsigned int uint32_t; +#else +#include +#endif + +/*! namespace of PRNG */ +namespace xgboost{ + namespace random{ + /*! \brief seed the PRNG */ + inline void Seed( uint32_t seed ){ + srand( seed ); + } + + /*! \brief return a real number uniform in [0,1) */ + inline double NextDouble(){ + return static_cast( rand() ) / (static_cast( RAND_MAX )+1.0); + } + /*! \brief return a real numer uniform in (0,1) */ + inline double NextDouble2(){ + return (static_cast( rand() ) + 1.0 ) / (static_cast(RAND_MAX) + 2.0); + } + }; + + namespace random{ + /*! \brief return a random number */ + inline uint32_t NextUInt32( void ){ + return (uint32_t)rand(); + } + /*! \brief return a random number in n */ + inline uint32_t NextUInt32( uint32_t n ){ + return (uint32_t) floor( NextDouble() * n ) ; + } + /*! \brief return x~N(0,1) */ + inline double SampleNormal(){ + double x,y,s; + do{ + x = 2 * NextDouble2() - 1.0; + y = 2 * NextDouble2() - 1.0; + s = x*x + y*y; + }while( s >= 1.0 || s == 0.0 ); + + return x * sqrt( -2.0 * log(s) / s ) ; + } + + /*! \brief return iid x,y ~N(0,1) */ + inline void SampleNormal2D( double &xx, double &yy ){ + double x,y,s; + do{ + x = 2 * NextDouble2() - 1.0; + y = 2 * NextDouble2() - 1.0; + s = x*x + y*y; + }while( s >= 1.0 || s == 0.0 ); + double t = sqrt( -2.0 * log(s) / s ) ; + xx = x * t; + yy = y * t; + } + /*! \brief return x~N(mu,sigma^2) */ + inline double SampleNormal( double mu, double sigma ){ + return SampleNormal() * sigma + mu; + } + + /*! \brief return 1 with probability p, coin flip */ + inline int SampleBinary( double p ){ + return NextDouble() < p; + } + + /*! 
\brief return distribution from Gamma( alpha, beta ) */
+        inline double SampleGamma( double alpha, double beta ) {
+            if ( alpha < 1.0 ) {
+                double u;
+                do {
+                    u = NextDouble();
+                } while (u == 0.0);
+                return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha);
+            } else {
+                double d,c,x,v,u;
+                d = alpha - 1.0/3.0;
+                c = 1.0 / sqrt( 9.0 * d );
+                do {
+                    do {
+                        x = SampleNormal();
+                        v = 1.0 + c*x;
+                    } while ( v <= 0.0 );
+                    v = v * v * v;
+                    u = NextDouble();
+                } while ( (u >= (1.0 - 0.0331 * (x*x) * (x*x)))
+                          && (log(u) >= (0.5 * x * x + d * (1.0 - v + log(v)))) );
+                return d * v / beta;
+            }
+        }
+
+        template<typename T>
+        inline void Exchange( T &a, T &b ){
+            T c;
+            c = a;
+            a = b;
+            b = c;
+        }
+
+        template<typename T>
+        inline void Shuffle( T *data, size_t sz ){
+            if( sz == 0 ) return;
+            for( uint32_t i = (uint32_t)sz - 1; i > 0; i-- ){
+                Exchange( data[i], data[ NextUInt32( i+1 ) ] );
+            }
+        }
+        // random shuffle the data inside, require PRNG
+        template<typename T>
+        inline void Shuffle( std::vector<T> &data ){
+            Shuffle( &data[0], data.size() );
+        }
+    };
+};
+
+#endif
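
RTSelecter::Entry and TreeModel::Node in the diff above both fold the default-direction flag into the top bit of a 32-bit index (sindex), and Node::sparent reuses the same trick to mark whether a node is a left child. A minimal standalone sketch of that encoding; the PackedSplit name and the main() driver are illustrative, not part of the diff:

    #include <cassert>
    #include <cstdint>

    // Low 31 bits store the feature index, the top bit stores "default left",
    // mirroring how Node::sindex / Entry::sindex are packed above.
    struct PackedSplit {
        uint32_t sindex;

        void set(uint32_t split_index, bool default_left) {
            if (default_left) split_index |= (1U << 31);
            sindex = split_index;
        }
        uint32_t split_index() const { return sindex & ((1U << 31) - 1U); }
        bool default_left() const { return (sindex >> 31) != 0; }
    };

    int main() {
        PackedSplit s;
        s.set(12345, true);
        assert(s.split_index() == 12345 && s.default_left());
        s.set(7, false);
        assert(s.split_index() == 7 && !s.default_left());
        return 0;
    }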
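
TreeParamTrain computes leaf weights and split costs from summed gradients and hessians, with reg_method selecting L1, L2, or elastic-net shrinkage, and the "layerwise" variants shifting the statistics by the parent's base weight. A standalone restatement of those formulas under the assumption that reg_method == 2 (plain L2); function names here are illustrative, not the original class API:

    #include <cstdio>

    // Soft-thresholding used for the L1 / elastic-net cases above.
    static double ThresholdL1(double w, double lambda) {
        if (w > +lambda) return w - lambda;
        if (w < -lambda) return w + lambda;
        return 0.0;
    }
    // L2-regularized leaf weight and the cost it contributes to the objective.
    static double CalcWeightL2(double G, double H, double lambda) { return -G / (H + lambda); }
    static double CalcCostL2(double G, double H, double lambda) { return G * G / (H + lambda); }

    // Layerwise variants: statistics are shifted by the parent's base weight, so the
    // child weight is expressed as a delta on top of parent_base_weight.
    static double CalcWeightLayerwise(double G, double H, double parent_w, double lambda) {
        return parent_w + CalcWeightL2(G + parent_w * H, H, lambda);
    }
    static double CalcCostLayerwise(double G, double H, double base_w, double lambda) {
        return CalcCostL2(G + H * base_w, H, lambda);
    }

    int main() {
        double G = -4.0, H = 9.0, lambda = 1.0;
        printf("plain weight = %g\n", CalcWeightL2(G, H, lambda));            // 0.4
        printf("plain cost   = %g\n", CalcCostL2(G, H, lambda));              // 1.6
        printf("layer weight = %g\n", CalcWeightLayerwise(G, H, 0.1, lambda));
        printf("L1 threshold = %g\n", ThresholdL1(G, lambda));                // -3
        return 0;
    }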
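
enumerate_split() above scans each feature's sorted column twice, once forward (missing values default right) and once backward (missing values default left), accumulating child gradient/hessian sums and scoring each distinct-value boundary as Cost(left) + Cost(right) - Cost(root). A simplified single-direction sketch of that search, assuming plain L2 cost; the Entry/Split/FindSplit names are illustrative and the backward pass is symmetric:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // (feature value, gradient, hessian) for rows that HAVE this feature;
    // rows with a missing value implicitly fall to the other side of the split.
    struct Entry { float fvalue; double grad, hess; };

    static double Cost(double G, double H, double lambda) { return G * G / (H + lambda); }

    struct Split { double gain = 0.0; float value = 0.0f; };

    Split FindSplit(std::vector<Entry> entries, double root_G, double root_H,
                    double lambda, double min_child_weight) {
        std::sort(entries.begin(), entries.end(),
                  [](const Entry &a, const Entry &b) { return a.fvalue < b.fvalue; });
        const double root_cost = Cost(root_G, root_H, lambda);
        const float eps = 1e-5f;
        Split best;
        double cG = 0.0, cH = 0.0;                       // stats of the scanned side
        for (size_t j = 0; j < entries.size(); ++j) {
            cG += entries[j].grad;
            cH += entries[j].hess;
            const bool boundary =
                j + 1 == entries.size() || entries[j].fvalue + 2 * eps < entries[j + 1].fvalue;
            if (!boundary) continue;
            const double dH = root_H - cH;               // everything else, incl. missing
            if (cH < min_child_weight || dH < min_child_weight) continue;
            const double gain = Cost(cG, cH, lambda) + Cost(root_G - cG, dH, lambda) - root_cost;
            if (gain > best.gain) {
                best.gain = gain;
                best.value = (j + 1 == entries.size())
                                 ? entries[j].fvalue + eps
                                 : 0.5f * (entries[j].fvalue + entries[j + 1].fvalue);
            }
        }
        return best;
    }

    int main() {
        std::vector<Entry> e = {{1.0f, -1.0, 1.0}, {2.0f, -1.0, 1.0},
                                {5.0f,  2.0, 1.0}, {6.0f,  2.5, 1.0}};
        Split s = FindSplit(e, 2.5, 4.0, 1.0, 1.0);
        printf("gain=%g, split at %g\n", s.gain, s.value);   // best split between 2 and 5
        return 0;
    }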
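
make_split() above re-arranges the node's id set in place: the instances that appeared in the winning feature's column (qset) are peeled off with a single merge-style sweep so that both the default-direction part and the split part stay sorted. A standalone sketch of that partition step, using illustrative names rather than the Task/idset machinery of the diff:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    void PartitionIdSet(std::vector<unsigned> &idset, std::vector<unsigned> qset) {
        std::sort(qset.begin(), qset.end());
        size_t top = 0;
        // compact the ids NOT in qset to the front, preserving their order
        for (size_t i = 0; i < idset.size(); ++i) {
            if (top < qset.size() && idset[i] == qset[top]) {
                ++top;
            } else {
                idset[i - top] = idset[i];
            }
        }
        // the tail is then overwritten with qset itself
        std::copy(qset.begin(), qset.end(), idset.end() - qset.size());
    }

    int main() {
        std::vector<unsigned> idset = {1, 3, 4, 7, 9, 12};
        PartitionIdSet(idset, {4, 9});
        for (unsigned v : idset) printf("%u ", v);   // prints: 1 3 7 12 4 9
        printf("\n");
        return 0;
    }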
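
try_prune_leaf() above prunes bottom-up: whenever a node is turned into a leaf it bumps its parent's leaf_child_cnt, and once both children are leaves the parent's split is undone if the recorded gain is below min_split_loss, with pruning then propagating upward. A simplified sketch of that rule (the PNode layout and the explicit is_leaf flag are stand-ins for the diff's Node/NodeStat pair):

    #include <cstdio>
    #include <vector>

    struct PNode {
        int parent = -1, cleft = -1, cright = -1;
        float loss_chg = 0.0f;     // gain recorded when the node was split
        int leaf_child_cnt = 0;
        bool is_leaf = false;
    };

    void TryPruneLeaf(std::vector<PNode> &tree, int nid, float min_split_loss, int &num_pruned) {
        int pid = tree[nid].parent;
        if (pid < 0) return;                       // reached a root
        PNode &p = tree[pid];
        p.leaf_child_cnt += 1;
        if (p.leaf_child_cnt >= 2 && p.loss_chg < min_split_loss) {
            p.is_leaf = true;                      // collapse the split back into a leaf
            num_pruned += 2;                       // the two deleted children
            TryPruneLeaf(tree, pid, min_split_loss, num_pruned);
        }
    }

    int main() {
        // node 0 splits into 1 and 2 with a tiny gain; both children turn into leaves
        std::vector<PNode> tree(3);
        tree[0].cleft = 1; tree[0].cright = 2; tree[0].loss_chg = 0.05f;
        tree[1].parent = 0; tree[2].parent = 0;
        tree[1].is_leaf = tree[2].is_leaf = true;
        int num_pruned = 0;
        TryPruneLeaf(tree, 1, /*min_split_loss=*/0.1f, num_pruned);
        TryPruneLeaf(tree, 2, /*min_split_loss=*/0.1f, num_pruned);
        printf("pruned %d nodes, root is now %s\n",
               num_pruned, tree[0].is_leaf ? "a leaf" : "a split");
        return 0;
    }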
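
SparseCSRMBuilder in xgboost_matrix_csr.h drives the column-major conversion in expand(): pass one budgets a count per output row (InitBudget/AddBudget), InitStorage turns the counts into offsets, and pass two scatters the elements (PushElem). A minimal independent sketch of the same two-pass idea, transposing a row-major CSR matrix; the CSR/Transpose names are illustrative, not the builder class itself:

    #include <cstdio>
    #include <vector>

    struct CSR {
        std::vector<size_t> rptr;      // rptr[i]..rptr[i+1] bounds row i
        std::vector<unsigned> cindex;  // column index of each entry
    };

    CSR Transpose(const CSR &in, size_t ncols) {
        CSR out;
        out.rptr.assign(ncols + 1, 0);
        // pass 1: budget, count how many entries land in each output row (= input column)
        for (unsigned c : in.cindex) out.rptr[c + 1] += 1;
        for (size_t i = 1; i < out.rptr.size(); ++i) out.rptr[i] += out.rptr[i - 1];
        // pass 2: fill, using a moving cursor per output row
        out.cindex.resize(in.cindex.size());
        std::vector<size_t> cursor(out.rptr.begin(), out.rptr.end() - 1);
        for (size_t r = 0; r + 1 < in.rptr.size(); ++r) {
            for (size_t j = in.rptr[r]; j < in.rptr[r + 1]; ++j) {
                out.cindex[cursor[in.cindex[j]]++] = static_cast<unsigned>(r);
            }
        }
        return out;
    }

    int main() {
        // 2 x 3 matrix with entries at (0,0), (0,2), (1,1)
        CSR m; m.rptr = {0, 2, 3}; m.cindex = {0, 2, 1};
        CSR t = Transpose(m, 3);
        for (size_t c = 0; c + 1 < t.rptr.size(); ++c)
            for (size_t j = t.rptr[c]; j < t.rptr[c + 1]; ++j)
                printf("col %zu has row %u\n", c, t.cindex[j]);
        return 0;
    }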
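
Prediction in RTreeTrainer walks from a root to a leaf via GetLeafIndex()/get_next(), taking the stored default direction whenever a feature is marked unknown. A sketch of that traversal over a simplified flat node array (SimpleNode and Predict are stand-ins, not the TreeModel::Node class):

    #include <cstdio>
    #include <vector>

    struct SimpleNode {
        int cleft = -1, cright = -1;    // cleft == -1 marks a leaf
        unsigned split_index = 0;
        float split_value = 0.0f;
        bool default_left = false;
        float leaf_value = 0.0f;
        bool is_leaf() const { return cleft == -1; }
    };

    float Predict(const std::vector<SimpleNode> &tree,
                  const std::vector<float> &feat,
                  const std::vector<bool> &missing,
                  int root = 0) {
        int nid = root;
        while (!tree[nid].is_leaf()) {
            const SimpleNode &n = tree[nid];
            if (missing[n.split_index]) {
                nid = n.default_left ? n.cleft : n.cright;
            } else {
                nid = feat[n.split_index] < n.split_value ? n.cleft : n.cright;
            }
        }
        return tree[nid].leaf_value;
    }

    int main() {
        // root splits on feature 0 at 0.5, children are leaves with values -1 and +1
        std::vector<SimpleNode> tree(3);
        tree[0].cleft = 1; tree[0].cright = 2;
        tree[0].split_index = 0; tree[0].split_value = 0.5f; tree[0].default_left = true;
        tree[1].leaf_value = -1.0f;
        tree[2].leaf_value = +1.0f;
        printf("%g %g %g\n",
               Predict(tree, {0.2f}, {false}),    // goes left  -> -1
               Predict(tree, {0.9f}, {false}),    // goes right -> +1
               Predict(tree, {0.0f}, {true}));    // missing    -> default left -> -1
        return 0;
    }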
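
init_tasks() builds the initial id set by keeping every instance with a valid hessian and, when subsample < 1, retaining each one via an independent coin flip from random::SampleBinary(). A small sketch of that row-subsampling step, reusing the rand()-based helpers defined in xgboost_random.h; BuildRowSet is an illustrative name:

    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    static double NextDouble() {
        return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX) + 1.0);
    }
    static int SampleBinary(double p) { return NextDouble() < p; }

    std::vector<unsigned> BuildRowSet(const std::vector<float> &hess, float subsample) {
        std::vector<unsigned> idset;
        for (size_t i = 0; i < hess.size(); ++i) {
            if (hess[i] < 0.0f) continue;               // deleted/invalid instance
            if (subsample > 1.0f - 1e-6f || SampleBinary(subsample)) {
                idset.push_back(static_cast<unsigned>(i));
            }
        }
        return idset;
    }

    int main() {
        srand(0);
        std::vector<float> hess(1000, 1.0f);
        printf("kept %zu of %zu rows at subsample=0.5\n",
               BuildRowSet(hess, 0.5f).size(), hess.size());
        return 0;
    }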