diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h index c879c2d53..777fa2b0a 100644 --- a/booster/xgboost_data.h +++ b/booster/xgboost_data.h @@ -321,6 +321,8 @@ namespace xgboost{ fi.Read(&col_access, sizeof(int)); if (col_access != 0){ FMatrixS::LoadBinary(fi, col_ptr_, col_data_); + }else{ + this->InitData(); } } /*! diff --git a/python/Makefile b/python/Makefile index 4b90f7017..0db0a1ed0 100644 --- a/python/Makefile +++ b/python/Makefile @@ -1,19 +1,18 @@ export CC = gcc export CXX = g++ -export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp +export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fopenmp # specify tensor path -SLIB = xgboostpy.so -OBJ = xgboost_python.o +SLIB = libxgboostpy.so .PHONY: clean all all: $(SLIB) export LDFLAGS= -pthread -lm -xgboostpy.so: xgboost_python.cpp ../regrank/*.h ../booster/*.h ../booster/*/*.hpp ../booster/*.hpp +libxgboostpy.so: xgboost_python.cpp ../regrank/*.h ../booster/*.h ../booster/*/*.hpp ../booster/*.hpp $(SLIB) : - $(CXX) $(CFLAGS) $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^) + $(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^) $(BIN) : $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) diff --git a/python/xgboost.py b/python/xgboost.py index ab7024d4d..580b80306 100644 --- a/python/xgboost.py +++ b/python/xgboost.py @@ -2,7 +2,7 @@ import ctypes # load in xgboost library -#xglib = ctypes.cdll.LoadLibrary('./libxgboostpy.so') +xglib = ctypes.cdll.LoadLibrary('./libxgboostpy.so') # entry type of sparse matrix class REntry(ctypes.Structure): @@ -10,6 +10,13 @@ class REntry(ctypes.Structure): class DMatrix: - def __init__(fname = None): - self.__handle = xglib. 
- + def __init__(self,fname = None): + self.__handle = xglib.XGDMatrixCreate(); + if fname != None: + xglib.XGDMatrixLoad(self.__handle, ctypes.c_char_p(fname), 0) + def __del__(self): + xglib.XGDMatrixFree(self.__handle) + +dmata = DMatrix('xx.buffer') + + diff --git a/python/xgboost_python.cpp b/python/xgboost_python.cpp index 5d020066f..ee97c68d3 100644 --- a/python/xgboost_python.cpp +++ b/python/xgboost_python.cpp @@ -1,12 +1,41 @@ #include "xgboost_python.h" +#include "../regrank/xgboost_regrank.h" +#include "../regrank/xgboost_regrank_data.h" -void* XGDMatrixCreate(void){ - return NULL; -} -void XGDMatrixFree(void *handle){ -} -void XGDMatrixLoad(void *handle, const char *fname){ -} -void XGDMatrixSaveBinary( void *handle, const char *fname ){ -} +namespace xgboost{ + namespace python{ + class DMatrix: public regrank::DMatrix{ + public: + // whether column is initialized + bool init_col_; + public: + DMatrix(void){ + init_col_ = false; + } + ~DMatrix(void){} + public: + inline void Load(const char *fname, bool silent){ + this->CacheLoad(fname, silent); + init_col_ = this->data.HaveColAccess(); + } + }; + }; +}; + +using namespace xgboost::python; + +extern "C"{ + void* XGDMatrixCreate(void){ + return new DMatrix(); + } + void XGDMatrixFree(void *handle){ + delete static_cast<DMatrix*>(handle); + } + void XGDMatrixLoad(void *handle, const char *fname, int silent){ + static_cast<DMatrix*>(handle)->Load(fname, silent!=0); + } + void XGDMatrixSaveBinary(void *handle, const char *fname, int silent){ + static_cast<DMatrix*>(handle)->SaveBinary(fname, silent!=0); + } +}; diff --git a/python/xgboost_python.h b/python/xgboost_python.h index 56af7a095..ead07200d 100644 --- a/python/xgboost_python.h +++ b/python/xgboost_python.h @@ -7,37 +7,51 @@ * use c style interface */ #include "../booster/xgboost_data.h" -/*! \brief type of row entry */ -typedef xgboost::booster::FMatrixS::REntry XGEntry; - -/*!
- * \brief create a data matrix - * \return a new data matrix - */ -void* XGDMatrixCreate(void); -/*! - * \brief free space in data matrix - */ -void XGDMatrixFree(void *handle); -/*! - * \brief load a data matrix from text file or buffer(if exists) - * \param handle a instance of data matrix - * \param fname file name - */ -void XGDMatrixLoad(void *handle, const char *fname); -/*! - * \brief load a data matrix into binary file - * \param handle a instance of data matrix - * \param fname file name - */ -void XGDMatrixSaveBinary( void *handle, const char *fname ); -/*! - * \brief add row - * \param handle a instance of data matrix - * \param fname file name - * \return a new data matrix - */ -//void XGDMatrixPush( void *handle, const std::pair ); +extern "C"{ + /*! \brief type of row entry */ + typedef xgboost::booster::FMatrixS::REntry XGEntry; + + /*! + * \brief create a data matrix + * \return a new data matrix + */ + void* XGDMatrixCreate(void); + /*! + * \brief free space in data matrix + */ + void XGDMatrixFree(void *handle); + /*! + * \brief load a data matrix from text file or buffer(if exists) + * \param handle a instance of data matrix + * \param fname file name + * \param silent print statistics when loading + */ + void XGDMatrixLoad(void *handle, const char *fname, int silent); + /*! + * \brief load a data matrix into binary file + * \param handle a instance of data matrix + * \param fname file name + * \param silent print statistics when saving + */ + void XGDMatrixSaveBinary(void *handle, const char *fname, int silent); + /*! + * \brief add row + * \param handle a instance of data matrix + * \param fname file name + * \return a new data matrix + */ + void XGDMatrixPush(void *handle, const XGEntry *data, int len); + + /*! + * \brief create a booster + */ + void* XGBoostCreate(void); + /*! 
+ * \brief create a booster + */ + void* XGBoost(void); + +}; #endif diff --git a/regrank/xgboost_regrank.h b/regrank/xgboost_regrank.h index ea8f3dd3d..c99241a4a 100644 --- a/regrank/xgboost_regrank.h +++ b/regrank/xgboost_regrank.h @@ -28,40 +28,36 @@ namespace xgboost{ name_obj_ = "reg"; } /*! - * \brief a regression booter associated with training and evaluating data - * \param train pointer to the training data - * \param evals array of evaluating data - * \param evname name of evaluation data, used print statistics - */ - RegRankBoostLearner(const DMatrix *train, - const std::vector<const DMatrix *> &evals, - const std::vector<std::string> &evname){ + * \brief a regression booter associated with training and evaluating data + * \param mats array of pointers to matrix whose prediction result need to be cached + */ + RegRankBoostLearner(const std::vector<const DMatrix *> mats){ silent = 0; - this->SetData(train, evals, evname); - } - + obj_ = NULL; + name_obj_ = "reg"; + this->SetCacheData(mats); + } /*! - * \brief associate regression booster with training and evaluating data - * \param train pointer to the training data - * \param evals array of evaluating data - * \param evname name of evaluation data, used print statistics - */ - inline void SetData(const DMatrix *train, - const std::vector<const DMatrix *> &evals, - const std::vector<std::string> &evname){ - this->train_ = train; - this->evals_ = evals; - this->evname_ = evname; + * \brief add internal cache space for mat, this can speedup prediction for matrix, + * please cache prediction for training and eval data + * warning: if the model is loaded from file from some previous training history + * set cache data must be called with exactly SAME + * data matrices to continue training otherwise it will cause error + * \param mats array of pointers to matrix whose prediction result need to be cached + */ + inline void SetCacheData(const std::vector<const DMatrix *> mats){ // estimate feature bound - int num_feature = (int)(train->data.NumCol()); + int num_feature = 0; // assign buffer index -
unsigned buffer_size = static_cast<unsigned>(train->Size()); + unsigned buffer_size = 0; - for (size_t i = 0; i < evals.size(); ++i){ - buffer_size += static_cast<unsigned>(evals[i]->Size()); - num_feature = std::max(num_feature, (int)(evals[i]->data.NumCol())); + utils::Assert( cache_.size() == 0, "can only call cache data once" ); + for( size_t i = 0; i < mats.size(); ++i ){ + cache_.push_back( CacheEntry( mats[i], buffer_size ) ); + buffer_size += static_cast<unsigned>(mats[i]->Size()); + num_feature = std::max(num_feature, (int)(mats[i]->data.NumCol())); } - + char str_temp[25]; if (num_feature > mparam.num_feature){ mparam.num_feature = num_feature; @@ -74,15 +70,13 @@ namespace xgboost{ if (!silent){ printf("buffer_size=%u\n", buffer_size); } - - // set eval_preds tmp sapce - this->eval_preds_.resize(evals.size(), std::vector<float>()); } + /*! - * \brief set parameters from outside - * \param name name of the parameter - * \param val value of the parameter - */ + * \brief set parameters from outside + * \param name name of the parameter + * \param val value of the parameter + */ inline void SetParam(const char *name, const char *val){ if (!strcmp(name, "silent")) silent = atoi(val); if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val); @@ -104,8 +98,8 @@ evaluator_.AddEval( obj_->DefaultEvalMetric() ); } /*!
- * \brief initialize the current data storage for model, if the model is used first time, call this function - */ + * \brief initialize the current data storage for model, if the model is used first time, call this function + */ inline void InitModel(void){ base_gbm.InitModel(); mparam.AdjustBase(); @@ -147,74 +141,66 @@ * \brief update the model for one iteration * \param iteration iteration number */ - inline void UpdateOneIter(int iter){ - this->PredictBuffer(preds_, *train_, 0); - obj_->GetGradient(preds_, train_->info, base_gbm.NumBoosters(), grad_, hess_); + inline void UpdateOneIter(int iter, const DMatrix &train){ + this->PredictRaw(preds_, train); + obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_); + // do boost std::vector<unsigned> root_index; - base_gbm.DoBoost(grad_, hess_, train_->data, root_index); + base_gbm.DoBoost(grad_, hess_, train.data, root_index); } /*! * \brief evaluate the model for specific iteration * \param iter iteration number + * \param evals datas i want to evaluate + * \param evname name of each dataset * \param fo file to output log */ - inline void EvalOneIter(int iter, FILE *fo = stderr){ + inline void EvalOneIter(int iter, + const std::vector<const DMatrix*> &evals, + const std::vector<std::string> &evname, + FILE *fo=stderr ){ fprintf(fo, "[%d]", iter); - int buffer_offset = static_cast<int>(train_->Size()); - - for (size_t i = 0; i < evals_.size(); ++i){ - std::vector<float> &preds = this->eval_preds_[i]; - this->PredictBuffer(preds, *evals_[i], buffer_offset); - obj_->PredTransform(preds); - evaluator_.Eval(fo, evname_[i].c_str(), preds, evals_[i]->info); - buffer_offset += static_cast<int>(evals_[i]->Size()); + for (size_t i = 0; i < evals.size(); ++i){ + this->PredictRaw(preds_, *evals[i]); + obj_->PredTransform(preds_); + evaluator_.Eval(fo, evname[i].c_str(), preds_, evals[i]->info); } fprintf(fo, "\n"); fflush(fo); } /*!
\brief get prediction, without buffering */ inline void Predict(std::vector<float> &preds, const DMatrix &data){ - preds.resize(data.Size()); - const unsigned ndata = static_cast<unsigned>(data.Size()); - #pragma omp parallel for schedule( static ) - for (unsigned j = 0; j < ndata; ++j){ - preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1); - } + this->PredictRaw(preds,data); obj_->PredTransform( preds ); } public: /*! * \brief interactive update * \param action action type + * \param train training data */ - inline void UpdateInteract(std::string action){ - this->InteractPredict(preds_, *train_, 0); - - int buffer_offset = static_cast<int>(train_->Size()); - for (size_t i = 0; i < evals_.size(); ++i){ - std::vector<float> &preds = this->eval_preds_[i]; - this->InteractPredict(preds, *evals_[i], buffer_offset); - buffer_offset += static_cast<int>(evals_[i]->Size()); + inline void UpdateInteract(std::string action, const DMatrix& train){ + for(size_t i = 0; i < cache_.size(); ++i){ + this->InteractPredict(preds_, *cache_[i].mat_); } if (action == "remove"){ base_gbm.DelteBooster(); return; } - obj_->GetGradient(preds_, train_->info, base_gbm.NumBoosters(), grad_, hess_); + obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_); std::vector<unsigned> root_index; - base_gbm.DoBoost(grad_, hess_, train_->data, root_index); + base_gbm.DoBoost(grad_, hess_, train.data, root_index); - this->InteractRePredict(*train_, 0); - buffer_offset = static_cast<int>(train_->Size()); - for (size_t i = 0; i < evals_.size(); ++i){ - this->InteractRePredict(*evals_[i], buffer_offset); - buffer_offset += static_cast<int>(evals_[i]->Size()); + for(size_t i = 0; i < cache_.size(); ++i){ + this->InteractRePredict(*cache_[i].mat_); } } private: /*!
\brief get the transformed predictions, given data */ - inline void InteractPredict(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){ + inline void InteractPredict(std::vector<float> &preds, const DMatrix &data){ + int buffer_offset = this->FindBufferOffset(data); + utils::Assert( buffer_offset >=0, "interact mode must cache training data" ); preds.resize(data.Size()); const unsigned ndata = static_cast<unsigned>(data.Size()); #pragma omp parallel for schedule( static ) @@ -224,7 +210,9 @@ obj_->PredTransform( preds ); } /*! \brief repredict trial */ - inline void InteractRePredict(const DMatrix &data, unsigned buffer_offset){ + inline void InteractRePredict(const DMatrix &data){ + int buffer_offset = this->FindBufferOffset(data); + utils::Assert( buffer_offset >=0, "interact mode must cache training data" ); const unsigned ndata = static_cast<unsigned>(data.Size()); #pragma omp parallel for schedule( static ) for (unsigned j = 0; j < ndata; ++j){ @@ -232,13 +220,24 @@ } } private: - /*! \brief get the transformed predictions, given data */ - inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){ + /*! \brief get un-transformed prediction*/ + inline void PredictRaw(std::vector<float> &preds, const DMatrix &data){ + this->PredictBuffer(preds, data, this->FindBufferOffset(data) ); + } + /*!
\brief get the un-transformed predictions, given data */ + inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, int buffer_offset){ preds.resize(data.Size()); const unsigned ndata = static_cast<unsigned>(data.Size()); - #pragma omp parallel for schedule( static ) - for (unsigned j = 0; j < ndata; ++j){ - preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j); + if( buffer_offset >= 0 ){ + #pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j){ + preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j); + } + }else{ + #pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j){ + preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1); + } + } } } private: @@ -260,10 +259,10 @@ memset(reserved, 0, sizeof(reserved)); } /*! - * \brief set parameters from outside - * \param name name of the parameter - * \param val value of the parameter - */ + * \brief set parameters from outside + * \param name name of the parameter + * \param val value of the parameter + */ inline void SetParam(const char *name, const char *val){ if (!strcmp("base_score", name)) base_score = (float)atof(val); if (!strcmp("loss_type", name)) loss_type = atoi(val); @@ -279,15 +278,28 @@ } } }; + private: + struct CacheEntry{ + const DMatrix *mat_; + int buffer_offset_; + CacheEntry(const DMatrix *mat, int buffer_offset) + :mat_(mat), buffer_offset_(buffer_offset){} + }; + /*!
\brief the entries indicates that we have internal prediction cache */ + std::vector<CacheEntry> cache_; + private: + // find internal bufer offset for certain matrix, if not exist, return -1 + inline int FindBufferOffset(const DMatrix &mat){ + for(size_t i = 0; i < cache_.size(); ++i){ + if( cache_[i].mat_ == &mat ) return cache_[i].buffer_offset_; + } + return -1; + } private: int silent; EvalSet evaluator_; booster::GBMBase base_gbm; - ModelParam mparam; - const DMatrix *train_; - std::vector<const DMatrix *> evals_; - std::vector<std::string> evname_; - std::vector<unsigned> buffer_index_; + ModelParam mparam; // objective fnction IObjFunction *obj_; // name of objective function @@ -295,7 +307,6 @@ std::vector< std::pair<std::string, std::string> > cfg_; private: std::vector<float> grad_, hess_, preds_; - std::vector< std::vector<float> > eval_preds_; }; } }; diff --git a/regrank/xgboost_regrank_data.h b/regrank/xgboost_regrank_data.h index 3653021c5..458ad60f0 100644 --- a/regrank/xgboost_regrank_data.h +++ b/regrank/xgboost_regrank_data.h @@ -116,8 +116,6 @@ namespace xgboost{ } } fs.Close(); - // initialize column support as well - data.InitData(); if (!silent){ printf("%ux%u matrix with %lu entries is loaded from %s\n", diff --git a/regrank/xgboost_regrank_main.cpp b/regrank/xgboost_regrank_main.cpp index 862837c97..165ff2636 100644 --- a/regrank/xgboost_regrank_main.cpp +++ b/regrank/xgboost_regrank_main.cpp @@ -62,6 +62,7 @@ namespace xgboost{ if (!strcmp("seed", name)) random::Seed(atoi(val)); if (!strcmp("num_round", name)) num_round = atoi(val); if (!strcmp("save_period", name)) save_period = atoi(val); + if (!strcmp("eval_train", name)) eval_train = atoi(val); if (!strcmp("task", name)) task = val; if (!strcmp("data", name)) train_path = val; if (!strcmp("test:data", name)) test_path = val; @@ -92,6 +93,7 @@ use_buffer = 1; num_round = 10; save_period = 0; + eval_train = 0; dump_model_stats = 0; task = "train"; model_in = "NULL"; @@ -122,9 +124,22 @@ for (size_t i = 0; i <
eval_data_names.size(); ++i){ deval.push_back(new DMatrix()); deval.back()->CacheLoad(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0); + devalall.push_back(deval.back()); } + std::vector<const DMatrix*> dcache(1, &data); + for( size_t i = 0; i < deval.size(); ++ i){ + dcache.push_back( deval[i] ); + } + // set cache data to be all training and evaluation data + learner.SetCacheData(dcache); + + // add training set to evaluation set if needed + if( eval_train != 0 ){ + devalall.push_back( &data ); + eval_data_names.push_back( std::string("train") ); + } + } - learner.SetData(&data, deval, eval_data_names); } inline void InitLearner(void){ cfg.BeforeFirst(); @@ -148,8 +163,8 @@ for (int i = 0; i < num_round; ++i){ elapsed = (unsigned long)(time(NULL) - start); if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed); - learner.UpdateOneIter(i); - learner.EvalOneIter(i); + learner.UpdateOneIter(i, data); + learner.EvalOneIter(i, devalall, eval_data_names); if (save_period != 0 && (i + 1) % save_period == 0){ this->SaveModel(i); } @@ -169,7 +184,7 @@ } } inline void TaskEval(void){ - learner.EvalOneIter(0); + learner.EvalOneIter(0, devalall, eval_data_names); } inline void TaskInteractive(void){ const time_t start = time(NULL); @@ -179,7 +194,7 @@ cfg_batch.BeforeFirst(); while (cfg_batch.Next()){ if (!strcmp(cfg_batch.name(), "run")){ - learner.UpdateInteract(interact_action); + learner.UpdateInteract(interact_action, data); batch_action += 1; } else{ @@ -188,7 +203,7 @@ } if (batch_action == 0){ - learner.UpdateInteract(interact_action); + learner.UpdateInteract(interact_action, data); } utils::Assert(model_out != "NULL", "interactive mode must specify model_out"); this->SaveModel(model_out.c_str()); @@ -235,6 +250,8 @@ namespace xgboost{ int silent; /* \brief whether use auto binary buffer */ int use_buffer; + /* \brief whether evaluate training statistics */ + int eval_train; /*
\brief number of boosting iterations */ int num_round; /* \brief the period to save the model, 0 means only save the final round model */ @@ -272,6 +289,7 @@ namespace xgboost{ private: DMatrix data; std::vector<DMatrix*> deval; + std::vector<const DMatrix*> devalall; utils::FeatMap fmap; RegRankBoostLearner learner; };