diff --git a/Makefile b/Makefile
index b55cb3ac4..ef37a002d 100644
--- a/Makefile
+++ b/Makefile
@@ -4,14 +4,13 @@ export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
 
 # specify tensor path
 BIN = xgboost
-OBJ = xgboost.o
+OBJ =
 .PHONY: clean all
 
 all: $(BIN) $(OBJ)
 export LDFLAGS= -pthread -lm
 
-xgboost.o: booster/xgboost.h booster/xgboost_data.h booster/xgboost.cpp booster/*/*.hpp booster/*/*.h
-xgboost: regression/xgboost_reg_main.cpp regression/*.h xgboost.o
+xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp
 
 $(BIN) :
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
diff --git a/booster/linear/xgboost_linear.hpp b/booster/linear/xgboost_linear.hpp
index eb76cf8d7..8979bee72 100644
--- a/booster/linear/xgboost_linear.hpp
+++ b/booster/linear/xgboost_linear.hpp
@@ -15,7 +15,8 @@ namespace xgboost{
     namespace booster{
         /*! \brief linear model, with L1/L2 regularization */
-        class LinearBooster : public IBooster{
+        template<typename FMatrix>
+        class LinearBooster : public InterfaceBooster<FMatrix>{
         public:
             LinearBooster( void ){ silent = 0;}
             virtual ~LinearBooster( void ){}
@@ -37,15 +38,15 @@ namespace xgboost{
         public:
             virtual void DoBoost( std::vector<float> &grad,
                                   std::vector<float> &hess,
-                                  const FMatrixS &smat,
+                                  const FMatrix &fmat,
                                   const std::vector<unsigned> &root_index ){
                 utils::Assert( grad.size() < UINT_MAX, "number of instances exceeds what we can handle" );
-                this->UpdateWeights( grad, hess, smat );
-            }
-            virtual float Predict( const FMatrixS::Line &sp, unsigned rid = 0 ){
+                this->UpdateWeights( grad, hess, fmat );
+            }
+            inline float Predict( const FMatrix &fmat, bst_uint ridx, unsigned root_index ){
                 float sum = model.bias();
-                for( unsigned i = 0; i < sp.len; i ++ ){
-                    sum += model.weight[ sp[i].findex ] * sp[i].fvalue;
+                for( typename FMatrix::RowIter it = fmat.GetRow(ridx); it.Next(); ){
+                    sum += model.weight[ it.findex() ] * it.fvalue();
                 }
                 return sum;
             }
@@ -59,6 +60,7 @@ namespace xgboost{
                 }
                 return sum;
             }
+
         protected:
             // training parameter
             struct ParamTrain{
@@ -155,7 +157,6 @@ namespace xgboost{
             ParamTrain param;
         protected:
             // update weights, should work for any FMatrix
-            template<typename FMatrix>
             inline void UpdateWeights( std::vector<float> &grad,
                                        const std::vector<float> &hess,
                                        const FMatrix &smat ){
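
The key contract in this refactor is the FMatrix "concept": any type offering GetRow(ridx) that returns a RowIter with Next()/findex()/fvalue() can back a booster. Below is a minimal self-contained sketch of that contract and of the shape the new LinearBooster::Predict takes; ToyFMatrix and LinearPredict are illustrative names, not part of the patch.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    typedef unsigned bst_uint;

    // toy CSR matrix exposing the assumed iterator contract
    struct ToyFMatrix{
        std::vector<size_t>   row_ptr;   // nrow+1 offsets into findex/fvalue
        std::vector<bst_uint> findex;
        std::vector<float>    fvalue;

        struct RowIter{
            const ToyFMatrix *p;
            size_t at, end;              // 'at' sits one before the next entry
            inline bool Next( void ){ return ++at < end; }
            inline bst_uint findex( void ) const{ return p->findex[at]; }
            inline float    fvalue( void ) const{ return p->fvalue[at]; }
        };
        inline RowIter GetRow( bst_uint ridx ) const{
            RowIter it; it.p = this;
            it.at  = row_ptr[ridx] - 1;  // unsigned wrap is fine: Next() pre-increments
            it.end = row_ptr[ridx + 1];
            return it;
        }
    };

    // the shape of the new LinearBooster::Predict, written against that contract
    template<typename FMatrix>
    float LinearPredict( const FMatrix &fmat, bst_uint ridx,
                         const std::vector<float> &weight, float bias ){
        float sum = bias;
        for( typename FMatrix::RowIter it = fmat.GetRow(ridx); it.Next(); ){
            sum += weight[ it.findex() ] * it.fvalue();
        }
        return sum;
    }

    int main( void ){
        ToyFMatrix m;                    // one row: {f0:1, f2:3}
        m.row_ptr.push_back(0); m.row_ptr.push_back(2);
        m.findex.push_back(0);  m.findex.push_back(2);
        m.fvalue.push_back(1.0f); m.fvalue.push_back(3.0f);
        std::vector<float> w( 3, 0.5f );
        printf( "%g\n", LinearPredict( m, 0, w, 0.1f ) );  // 0.1 + 0.5*1 + 0.5*3 = 2.1
        return 0;
    }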
diff --git a/booster/tree/xgboost_tree.hpp b/booster/tree/xgboost_tree.hpp
index 5daf759c7..31cfcf01f 100644
--- a/booster/tree/xgboost_tree.hpp
+++ b/booster/tree/xgboost_tree.hpp
@@ -29,7 +29,8 @@ namespace xgboost{
     namespace booster{
         // regression tree, construction algorithm is separated from this class
         // see RegTreeUpdater
-        class RegTreeTrainer : public IBooster{
+        template<typename FMatrix>
+        class RegTreeTrainer : public InterfaceBooster<FMatrix>{
         public:
             RegTreeTrainer( void ){
                 silent = 0; tree_maker = 1;
@@ -55,62 +56,6 @@ namespace xgboost{
             }
         public:
             virtual void DoBoost( std::vector<float> &grad,
-                                  std::vector<float> &hess,
-                                  const FMatrixS &smat,
-                                  const std::vector<unsigned> &root_index ){
-                this->DoBoost_( grad, hess, smat, root_index );
-            }
-
-            virtual int GetLeafIndex( const std::vector<float> &feat,
-                                      const std::vector<bool> &funknown,
-                                      unsigned gid = 0 ){
-                // start from the group that belongs to the current data
-                int pid = (int)gid;
-                // traverse tree
-                while( !tree[ pid ].is_leaf() ){
-                    unsigned split_index = tree[ pid ].split_index();
-                    pid = this->GetNext( pid, feat[ split_index ], funknown[ split_index ] );
-                }
-                return pid;
-            }
-
-            virtual void PredPath( std::vector<int> &path, const FMatrixS::Line &feat, unsigned gid = 0 ){
-                path.clear();
-                ThreadEntry &e = this->InitTmp();
-                this->PrepareTmp( feat, e );
-
-                int pid = (int)gid;
-                path.push_back( pid );
-                // traverse tree
-                while( !tree[ pid ].is_leaf() ){
-                    unsigned split_index = tree[ pid ].split_index();
-                    pid = this->GetNext( pid, e.feat[ split_index ], e.funknown[ split_index ] );
-                    path.push_back( pid );
-                }
-                this->DropTmp( feat, e );
-            }
-            // make it OpenMP thread-safe, but not thread-safe in general
-            virtual float Predict( const FMatrixS::Line &feat, unsigned gid = 0 ){
-                ThreadEntry &e = this->InitTmp();
-                this->PrepareTmp( feat, e );
-                int pid = this->GetLeafIndex( e.feat, e.funknown, gid );
-                this->DropTmp( feat, e );
-                return tree[ pid ].leaf_value();
-            }
-            virtual float Predict( const std::vector<float> &feat,
-                                   const std::vector<bool> &funknown,
-                                   unsigned gid = 0 ){
-                utils::Assert( feat.size() >= (size_t)tree.param.num_feature,
-                               "input data smaller than num feature" );
-                int pid = this->GetLeafIndex( feat, funknown, gid );
-                return tree[ pid ].leaf_value();
-            }
-            virtual void DumpModel( FILE *fo, const utils::FeatMap &fmap, bool with_stats ){
-                tree.DumpModel( fo, fmap, with_stats );
-            }
-        private:
-            template<typename FMatrix>
-            inline void DoBoost_( std::vector<float> &grad,
                                   std::vector<float> &hess,
                                   const FMatrix &smat,
                                   const std::vector<unsigned> &root_index ){
@@ -131,6 +76,52 @@ namespace xgboost{
                     printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes, max_depth=%d\n",
                             tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
                 }
+            }
+            virtual float Predict( const FMatrix &fmat, bst_uint ridx, unsigned gid = 0 ){
+                ThreadEntry &e = this->InitTmp();
+                this->PrepareTmp( fmat.GetRow(ridx), e );
+                int pid = this->GetLeafIndex( e.feat, e.funknown, gid );
+                this->DropTmp( fmat.GetRow(ridx), e );
+                return tree[ pid ].leaf_value();
+            }
+            virtual int GetLeafIndex( const std::vector<float> &feat,
+                                      const std::vector<bool> &funknown,
+                                      unsigned gid = 0 ){
+                // start from the group that belongs to the current data
+                int pid = (int)gid;
+                // traverse tree
+                while( !tree[ pid ].is_leaf() ){
+                    unsigned split_index = tree[ pid ].split_index();
+                    pid = this->GetNext( pid, feat[ split_index ], funknown[ split_index ] );
+                }
+                return pid;
+            }
+
+            virtual void PredPath( std::vector<int> &path, const FMatrix &fmat, bst_uint ridx, unsigned gid = 0 ){
+                path.clear();
+                ThreadEntry &e = this->InitTmp();
+                this->PrepareTmp( fmat.GetRow(ridx), e );
+
+                int pid = (int)gid;
+                path.push_back( pid );
+                // traverse tree
+                while( !tree[ pid ].is_leaf() ){
+                    unsigned split_index = tree[ pid ].split_index();
+                    pid = this->GetNext( pid, e.feat[ split_index ], e.funknown[ split_index ] );
+                    path.push_back( pid );
+                }
+                this->DropTmp( fmat.GetRow(ridx), e );
+            }
+            virtual float Predict( const std::vector<float> &feat,
+                                   const std::vector<bool> &funknown,
+                                   unsigned gid = 0 ){
+                utils::Assert( feat.size() >= (size_t)tree.param.num_feature,
+                               "input data smaller than num feature" );
+                int pid = this->GetLeafIndex( feat, funknown, gid );
+                return tree[ pid ].leaf_value();
+            }
+            virtual void DumpModel( FILE *fo, const utils::FeatMap &fmap, bool with_stats ){
+                tree.DumpModel( fo, fmap, with_stats );
             }
         private:
             int silent;
@@ -144,7 +135,6 @@ namespace xgboost{
             };
             std::vector<ThreadEntry> threadtemp;
         private:
-
             inline ThreadEntry& InitTmp( void ){
                 const int tid = omp_get_thread_num();
                 utils::Assert( tid < (int)threadtemp.size(), "RTreeUpdater: threadtemp pool is too small" );
@@ -156,16 +146,17 @@ namespace xgboost{
                 }
                 return e;
             }
-            inline void PrepareTmp( const FMatrixS::Line &feat, ThreadEntry &e ){
-                for( unsigned i = 0; i < feat.len; i ++ ){
-                    utils::Assert( feat[i].findex < (unsigned)tree.param.num_feature, "input feature exceeds bound" );
-                    e.funknown[ feat[i].findex ] = false;
-                    e.feat[ feat[i].findex ] = feat[i].fvalue;
+            inline void PrepareTmp( typename FMatrix::RowIter it, ThreadEntry &e ){
+                while( it.Next() ){
+                    const bst_uint findex = it.findex();
+                    utils::Assert( findex < (unsigned)tree.param.num_feature, "input feature exceeds bound" );
+                    e.funknown[ findex ] = false;
+                    e.feat[ findex ] = it.fvalue();
                 }
             }
-            inline void DropTmp( const FMatrixS::Line &feat, ThreadEntry &e ){
-                for( unsigned i = 0; i < feat.len; i ++ ){
-                    e.funknown[ feat[i].findex ] = true;
+            inline void DropTmp( typename FMatrix::RowIter it, ThreadEntry &e ){
+                while( it.Next() ){
+                    e.funknown[ it.findex() ] = true;
                 }
             }
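
Worth calling out why tree prediction is OpenMP-safe but not thread-safe in general: each OpenMP thread scatters the sparse row into its own dense ThreadEntry (InitTmp/PrepareTmp), and DropTmp resets exactly the entries that were written, so the buffer returns to the all-unknown state at a cost proportional to the row length rather than to num_feature. A self-contained sketch of that pattern, generic over any RowIter; the sizing of threadtemp to the thread count is assumed to happen at setup:

    #include <omp.h>
    #include <algorithm>
    #include <vector>

    struct ThreadEntry{
        std::vector<float> feat;      // dense feature values
        std::vector<bool>  funknown;  // true = feature missing
    };
    // one slot per OpenMP thread; assumed resized to omp_get_max_threads() at setup
    static std::vector<ThreadEntry> threadtemp;

    inline ThreadEntry& InitTmp( int num_feature ){
        ThreadEntry &e = threadtemp[ omp_get_thread_num() ];
        if( e.feat.size() != (size_t)num_feature ){
            e.feat.resize( num_feature );
            e.funknown.resize( num_feature );
            std::fill( e.funknown.begin(), e.funknown.end(), true );
        }
        return e;
    }
    // scatter the sparse row into the dense per-thread buffer
    template<typename RowIter>
    inline void PrepareTmp( RowIter it, ThreadEntry &e ){
        while( it.Next() ){
            e.funknown[ it.findex() ] = false;
            e.feat    [ it.findex() ] = it.fvalue();
        }
    }
    // undo only what PrepareTmp touched: O(row length) instead of O(num_feature)
    template<typename RowIter>
    inline void DropTmp( RowIter it, ThreadEntry &e ){
        while( it.Next() ){
            e.funknown[ it.findex() ] = true;
        }
    }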
diff --git a/booster/xgboost.cpp b/booster/xgboost-inl.hpp
similarity index 53%
rename from booster/xgboost.cpp
rename to booster/xgboost-inl.hpp
index 61f42004b..eee64ac7b 100644
--- a/booster/xgboost.cpp
+++ b/booster/xgboost-inl.hpp
@@ -1,16 +1,13 @@
+#ifndef XGBOOST_INL_HPP
+#define XGBOOST_INL_HPP
 /*!
-* \file xgboost.cpp
-* \brief booster implementations
-* \author Tianqi Chen: tianqi.tchen@gmail.com
-*/
+ * \file xgboost-inl.hpp
+ * \brief booster implementations
+ * \author Tianqi Chen: tianqi.tchen@gmail.com
+ */
 // implementations of boosters go here
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#include <climits>
 #include "xgboost.h"
 #include "../utils/xgboost_utils.h"
-#include "xgboost_gbmbase.h"
-// implementations of boosters
 #include "tree/xgboost_tree.hpp"
 #include "linear/xgboost_linear.hpp"
 
@@ -19,16 +16,18 @@ namespace xgboost{
         /*!
          * \brief create a gradient booster, given the type of booster
          * \param booster_type type of gradient booster, used to select the implementation
+         * \tparam FMatrix input data type for the booster
         * \return the pointer to the gradient booster created
         */
-        IBooster *CreateBooster( int booster_type ){
+        template<typename FMatrix>
+        inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type ){
             switch( booster_type ){
-            case 0: return new RegTreeTrainer();
-            case 1: return new LinearBooster();
+            case 0: return new RegTreeTrainer<FMatrix>();
+            case 1: return new LinearBooster<FMatrix>();
             default: utils::Error("unknown booster_type"); return NULL;
             }
         }
     };
 };
+#endif // XGBOOST_INL_HPP
diff --git a/booster/xgboost.h b/booster/xgboost.h
index 800fa13e0..587cefdc3 100644
--- a/booster/xgboost.h
+++ b/booster/xgboost.h
@@ -3,7 +3,9 @@
 /*!
  * \file xgboost.h
  * \brief the general gradient boosting interface
- *
+ *
+ * common practice for this header: use IBooster and CreateBooster<FMatrixS>
+ *
  * \author Tianqi Chen: tianqi.tchen@gmail.com
  */
 #include <vector>
@@ -19,8 +21,10 @@ namespace xgboost{
     namespace booster{
         /*!
          * \brief interface of a gradient boosting learner
+         * \tparam FMatrix the feature matrix format that the booster takes
          */
-        class IBooster{
+        template<typename FMatrix>
+        class InterfaceBooster{
         public:
             // interface for model setting and loading
             // calling procedure:
@@ -69,9 +73,12 @@ namespace xgboost{
             /*!
              * \brief predict the path of node ids along a tree, for a given sparse feature vector; valid when the booster is a tree
              * \param path the resulting path
-             * \param rid root id of current instance, default = 0
+             * \param feats feature matrix
+             * \param row_index row index in the feature matrix
+             * \param root_index root id of current instance, default = 0
              */
-            virtual void PredPath( std::vector<int> &path, const FMatrixS::Line &feat, unsigned rid = 0 ){
+            virtual void PredPath( std::vector<int> &path, const FMatrix &feats,
+                                   bst_uint row_index, unsigned root_index = 0 ){
                 utils::Error( "not implemented" );
             }
             /*!
@@ -79,11 +86,12 @@ namespace xgboost{
              *
              * NOTE: in the tree implementation, the sparse Predict is OpenMP thread-safe, but not thread-safe in general;
              * the dense version of Predict ensures thread safety
-             * \param feat vector in sparse format
-             * \param rid root id of current instance, default = 0
+             * \param feats feature matrix
+             * \param row_index row index in the feature matrix
+             * \param root_index root id of current instance, default = 0
              * \return prediction
              */
-            virtual float Predict( const FMatrixS::Line &feat, unsigned rid = 0 ){
+            virtual float Predict( const FMatrix &feats, bst_uint row_index, unsigned root_index = 0 ){
                 utils::Error( "not implemented" );
                 return 0.0f;
             }
@@ -92,12 +100,12 @@ namespace xgboost{
              * \param feat feature vector in dense format
              * \param funknown indicator that the feature is missing
              * \param rid root id of current instance, default = 0
-             * \return prediction
-             */
+             * \return prediction
+             */
             virtual float Predict( const std::vector<float> &feat,
                                    const std::vector<bool> &funknown,
                                    unsigned rid = 0 ){
-                utils::Error( "not implemented" );
+                utils::Error( "not implemented" );
                 return 0.0f;
             }
             /*!
@@ -116,8 +124,15 @@ namespace xgboost{
             }
         public:
             /*! \brief virtual destructor */
-            virtual ~IBooster( void ){}
-        };
+            virtual ~InterfaceBooster( void ){}
+        };
+    };
+    namespace booster{
+        /*!
+         * \brief the most commonly used booster interface;
+         *  we try to keep boosters invariant of the data structure, but in most cases FMatrixS is what we want
+         */
+        typedef InterfaceBooster<FMatrixS> IBooster;
     };
 };
 
@@ -125,10 +140,24 @@ namespace xgboost{
     namespace booster{
         /*!
          * \brief create a gradient booster, given the type of booster
+         * normally we use FMatrixS, by calling CreateBooster<FMatrixS>
         * \param booster_type type of gradient booster, used to select the implementation
+        * \tparam FMatrix input data type for the booster
         * \return the pointer to the gradient booster created
         */
-        IBooster *CreateBooster( int booster_type );
+        template<typename FMatrix>
+        inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type );
    };
};
+
+// A good design would expose a minimal interface and have users operate only on that interface.
+// I break this a bit by using templates, which lets the user 'see' the implementation;
+// the user should pretend that they can only use the interface, and we are all cool.
+// So far this is the only way I can think of to keep boosters invariant of the data structure
+// while keeping everything fast.
+
+// this file includes the template implementations of all boosters;
+// the cost of using templates is that the user can 'see' all the implementations, which is OK
+// ignore the implementations and focus on the interface :)
+#include "xgboost-inl.hpp"
 #endif
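
The net effect: xgboost.h stays the single entry point, now pulling in the header-only implementations at the end, while the old non-template API survives as typedef InterfaceBooster<FMatrixS> IBooster. Intended usage might look like the sketch below; MyFMatrix is hypothetical, and the InitModel call follows the header's "calling procedure" comment:

    #include "booster/xgboost.h"        // now pulls in xgboost-inl.hpp at the end
    #include "booster/xgboost_data.h"

    using namespace xgboost;

    void Example( void ){
        // the common case keeps its old spelling through the typedef:
        booster::IBooster *tree = booster::CreateBooster<booster::FMatrixS>( 0 );
        // a custom matrix type would only need GetRow(ridx) returning a RowIter:
        //   booster::InterfaceBooster<MyFMatrix> *b =
        //       booster::CreateBooster<MyFMatrix>( 0 );
        tree->InitModel();   // per the "calling procedure" comment in the header
        delete tree;
    }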
diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h
index 5bd95d48b..128544e55 100644
--- a/booster/xgboost_data.h
+++ b/booster/xgboost_data.h
@@ -341,7 +341,7 @@ namespace xgboost{
                 utils::Assert( fi.Read( &data[0], data.size() * sizeof(REntry) ), "Load FMatrixS" );
             }
         }
-    private:
+    protected:
         /*! \brief row pointer of CSR sparse storage */
         std::vector<size_t> row_ptr_;
         /*! \brief data in the row */
diff --git a/booster/xgboost_gbmbase.h b/booster/xgboost_gbmbase.h
index 4bbd8a8a2..50746e59e 100644
--- a/booster/xgboost_gbmbase.h
+++ b/booster/xgboost_gbmbase.h
@@ -3,6 +3,7 @@
 #include <vector>
 #include "xgboost.h"
+#include "xgboost_data.h"
 #include "../utils/xgboost_omp.h"
 #include "../utils/xgboost_config.h"
 /*!
@@ -128,7 +129,7 @@ namespace xgboost{
             utils::Assert( fi.Read( &param, sizeof(Param) ) != 0 );
             boosters.resize( param.num_boosters );
             for( size_t i = 0; i < boosters.size(); i ++ ){
-                boosters[ i ] = booster::CreateBooster( param.booster_type );
+                boosters[ i ] = booster::CreateBooster<FMatrixS>( param.booster_type );
                 boosters[ i ]->LoadModel( fi );
             }
             {// load info
@@ -207,7 +208,7 @@ namespace xgboost{
             for( size_t j = 0; j < boosters.size(); ++ j ){
                 if( j != 0 ) fprintf( fo, "\t" );
                 std::vector<int> path;
-                boosters[j]->PredPath( path, data[i] );
+                boosters[j]->PredPath( path, data, i );
                 fprintf( fo, "%d", path[0] );
                 for( size_t k = 1; k < path.size(); ++ k ){
                     fprintf( fo, ",%d", path[k] );
@@ -236,12 +237,13 @@ namespace xgboost{
         /*!
          * \brief predict values for a given sparse feature vector
          *   NOTE: in the tree implementation, this is not thread-safe
-         * \param feat vector in sparse format
+         * \param feats feature matrix
+         * \param row_index row index in the feature matrix
          * \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned
-         * \param rid root id of current instance, default = 0
+         * \param root_index root id of current instance, default = 0
          * \return prediction
-         */
-        virtual float Predict( const booster::FMatrixS::Line &feat, int buffer_index = -1, unsigned rid = 0 ){
+         */
+        inline float Predict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
             size_t istart = 0;
             float  psum = 0.0f;
@@ -253,7 +255,7 @@ namespace xgboost{
             }
 
             for( size_t i = istart; i < this->boosters.size(); i ++ ){
-                psum += this->boosters[ i ]->Predict( feat, rid );
+                psum += this->boosters[ i ]->Predict( feats, row_index, root_index );
             }
 
             // update the buffered results
@@ -320,7 +322,7 @@ namespace xgboost{
         inline booster::IBooster *GetUpdateBooster( void ){
             if( param.do_reboost == 0 || boosters.size() == 0 ){
                 param.num_boosters += 1;
-                boosters.push_back( booster::CreateBooster( param.booster_type ) );
+                boosters.push_back( booster::CreateBooster<FMatrixS>( param.booster_type ) );
                 booster_info.push_back( 0 );
                 this->ConfigBooster( boosters.back() );
                 boosters.back()->InitModel();
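
The Predict shown above resumes from a per-instance cache when buffer_index >= 0: istart and psum pick up where the previous call left off, so after adding one booster per round only the newest booster needs to be evaluated. A self-contained sketch of that caching idea; pred_buffer, pred_counter, and SumRange are illustrative names, since the real state lives outside the shown hunks:

    #include <cstddef>
    #include <vector>

    struct BufferedPredictor{
        std::vector<float>    pred_buffer;   // cached partial sum per buffer slot
        std::vector<unsigned> pred_counter;  // number of boosters the cache covers

        // stand-in for summing boosters[i]->Predict(...) over i in [istart, nbooster)
        inline float SumRange( size_t istart, size_t nbooster ) const{
            float s = 0.0f;
            for( size_t i = istart; i < nbooster; ++ i ) s += 1.0f;  // dummy outputs
            return s;
        }
        inline float Predict( int buffer_index, size_t nbooster ){
            size_t istart = 0; float psum = 0.0f;
            if( buffer_index >= 0 ){             // resume from the cached prefix
                istart = pred_counter[ buffer_index ];
                psum   = pred_buffer [ buffer_index ];
            }
            psum += SumRange( istart, nbooster );
            if( buffer_index >= 0 ){             // remember the new prefix
                pred_counter[ buffer_index ] = (unsigned)nbooster;
                pred_buffer [ buffer_index ] = psum;
            }
            return psum;
        }
    };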
diff --git a/regression/xgboost_reg.h b/regression/xgboost_reg.h
index 5096aec5e..7d1a680dc 100644
--- a/regression/xgboost_reg.h
+++ b/regression/xgboost_reg.h
@@ -174,7 +174,7 @@ namespace xgboost{
                 #pragma omp parallel for schedule( static )
                 for( unsigned j = 0; j < ndata; ++ j ){
                     preds[j] = mparam.PredTransform
-                        ( mparam.base_score + base_model.Predict( data.data[j], -1 ) );
+                        ( mparam.base_score + base_model.Predict( data.data, j, -1 ) );
                 }
             }
         private:
@@ -186,7 +186,7 @@ namespace xgboost{
                 #pragma omp parallel for schedule( static )
                 for( unsigned j = 0; j < ndata; ++ j ){
                     preds[j] = mparam.PredTransform
-                        ( mparam.base_score + base_model.Predict( data.data[j], buffer_offset + j ) );
+                        ( mparam.base_score + base_model.Predict( data.data, j, buffer_offset + j ) );
                 }
             }
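
These two call sites differ only in the buffer index they pass: training data gets a persistent slot per instance (buffer_offset + j), while one-off evaluation passes -1 to bypass the cache. The OpenMP loop is race-free because iteration j writes only preds[j] and its own buffer slot. A compressed sketch of that pattern, with PredictOne as a hypothetical stand-in for base_model.Predict:

    #include <omp.h>
    #include <vector>

    typedef unsigned bst_uint;

    // hypothetical stand-in for base_model.Predict( data.data, j, buffer_index )
    inline float PredictOne( bst_uint row, int buffer_index ){
        (void)buffer_index;          // a real model would read/update its buffer here
        return 0.1f * (float)row;    // dummy score
    }

    void PredictAll( std::vector<float> &preds, unsigned ndata, int buffer_offset ){
        preds.resize( ndata );
        // race-free: iteration j writes only preds[j] and buffer slot buffer_offset+j
        #pragma omp parallel for schedule( static )
        for( unsigned j = 0; j < ndata; ++ j ){
            const int bid = buffer_offset < 0 ? -1 : buffer_offset + (int)j;
            preds[j] = PredictOne( j, bid );
        }
    }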