commit 54a78b87dc

.gitignore (vendored, 3 changes)
@@ -55,7 +55,6 @@ train*
rabit
.Rbuildignore
R-package.Rproj

*.cache*
R-package/inst
R-package/src
Makefile (2 changes)
@@ -99,10 +99,8 @@ Rpack:
	cp -r src xgboost/src/src
	mkdir xgboost/src/subtree
	mkdir xgboost/src/subtree/rabit
	mkdir xgboost/src/subtree/rabit/rabit-learn
	cp -r subtree/rabit/include xgboost/src/subtree/rabit/include
	cp -r subtree/rabit/src xgboost/src/subtree/rabit/src
	cp -r subtree/rabit/rabit-learn/io xgboost/src/subtree/rabit/rabit-learn/io
	rm -rf xgboost/src/subtree/rabit/src/*.o
	mkdir xgboost/src/wrapper
	cp wrapper/xgboost_wrapper.h xgboost/src/wrapper
@@ -2,7 +2,7 @@
PKGROOT=../../
# _*_ mode: Makefile; _*_
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
@@ -13,7 +13,7 @@ xgblib:
	cp -r ../../subtree .

PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../..
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
$(OBJECTS) : xgblib
@@ -7,3 +7,5 @@ XGBoost Python Feature Walkthrough
* [Generalized Linear Model](generalized_linear_model.py)
* [Cross validation](cross_validation.py)
* [Predicting leaf indices](predict_leaf_indices.py)
* [Sklearn Wrapper](sklearn_example.py)
* [External Memory](external_memory.py)
demo/guide-python/external_memory.py (new executable file, 25 lines)
@@ -0,0 +1,25 @@
#!/usr/bin/python
import numpy as np
import scipy.sparse
import xgboost as xgb

### simple example for using the external memory version

# this is the only difference: add a '#' followed by a cache prefix name
# several cache files with this prefix will be generated
# currently only conversion from libsvm files is supported
dtrain = xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache')
dtest = xgb.DMatrix('../data/agaricus.txt.test#dtest.cache')

# specify validation set to watch performance
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }

# performance note: set nthread to the number of real CPU cores
# some CPUs offer two threads per core; for example, on a 4-core CPU with 8 threads, set nthread=4
#param['nthread'] = num_real_cpu

watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
doc/README.md (new file, 19 lines)
@@ -0,0 +1,19 @@
XGBoost Documentation
====
This is an ongoing effort to move the [wiki documentation](https://github.com/dmlc/xgboost/wiki) here.

List of Documentation
====
* [Parameters](parameter.md)
* [Using XGBoost in Python](python.md)
* [External Memory Version](external_memory.md)

Highlight Links
====
This section collects blog posts, presentations, and videos discussing how to use xgboost to solve real problems. If you think something belongs here, send a pull request.
* Blogpost by phunther: [Winning solution of Kaggle Higgs competition: what a single model can do](http://no2147483647.wordpress.com/2014/09/17/winning-solution-of-kaggle-higgs-competition-what-a-single-model-can-do/)
* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution)

Contribution
====
Contributions of documentation and use cases are welcome!
doc/external_memory.md (new file, 32 lines)
@@ -0,0 +1,32 @@
Using XGBoost External Memory Version
====
There is little difference between using the external memory version and the in-memory version.
The only difference is the filename format.

The external memory version takes the following filename format
```
filename#cacheprefix
```

The ```filename``` is the normal path to the libsvm file you want to load, and ```cacheprefix``` is a
path to a cache file that xgboost will use for the external memory cache.

The following code was extracted from [../demo/guide-python/external_memory.py](../demo/guide-python/external_memory.py)
```python
dtrain = xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache')
```
Note the additional ```#dtrain.cache``` following the libsvm file; this is the name of the cache file.
For the CLI version, simply use ```"../data/agaricus.txt.train#dtrain.cache"``` as the filename.

Performance Note
====
* the parameter ```nthread``` should be set to the number of ***real*** cores (see the sketch below)
  - Most modern CPUs offer hyperthreading, which means a 4-core CPU may expose 8 threads
  - Set nthread to 4 for maximum performance in such a case
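A minimal sketch of this setting, assuming a machine with 4 physical cores and the agaricus demo data used above:
```python
import xgboost as xgb

param = {'max_depth': 2, 'eta': 1, 'silent': 1,
         'objective': 'binary:logistic',
         'nthread': 4}  # assumed number of physical cores, not hyperthreads
dtrain = xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache')
bst = xgb.train(param, dtrain, 2)
```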

Usage Note
====
* This is an experimental version
  - If you would like to try it, report your results to https://github.com/dmlc/xgboost/issues/244
* Currently only importing from libsvm format is supported
  - Contributions of ingestion from other common external memory data sources are welcome
doc/parameter.md (new file, 111 lines)
@@ -0,0 +1,111 @@
XGBoost Parameters
====
Before running XGBoost, we must set three types of parameters: general parameters, booster parameters and task parameters (a short sketch follows this list):
- General parameters relate to which booster we are using to do boosting, commonly a tree or linear model
- Booster parameters depend on which booster you have chosen
- Task parameters decide the learning scenario; for example, regression tasks may use different parameters than ranking tasks
- In addition to these parameters, there can be console parameters that relate to the behavior of the console version of xgboost (e.g. when to save the model)
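A minimal sketch (with hypothetical values) showing the three types together in a Python parameter dict:
```python
param = {
    # general parameter: which booster to use
    'booster': 'gbtree',
    # booster parameters: control the chosen booster
    'max_depth': 6, 'eta': 0.3,
    # task parameters: learning objective and evaluation metric
    'objective': 'binary:logistic', 'eval_metric': 'auc'
}
```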

### Parameters in R Package
In the R package, you can use a dot (.) in place of an underscore in parameter names; for example, you can use max.depth as max_depth. The underscore parameters are also valid in R.

### General Parameters
* booster [default=gbtree]
  - which booster to use, can be gbtree or gblinear. The details about the different boosters are described [here](https://github.com/dmlc/xgboost/wiki/Boosters).
* silent [default=0]
  - 0 means printing running messages, 1 means silent mode.
* nthread [defaults to the maximum number of threads available if not set]
  - number of parallel threads used to run xgboost
* num_pbuffer [set automatically by xgboost, no need to be set by the user]
  - size of the prediction buffer, normally set to the number of training instances. The buffers are used to save the prediction results of the last boosting step.
* num_feature [set automatically by xgboost, no need to be set by the user]
  - feature dimension used in boosting, set to the maximum dimension of the feature

### Booster Parameters
From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parameters. Parameters with or without the bst: prefix are equivalent (i.e. both bst:eta and eta are valid parameter settings).

#### Parameters for Tree Booster
* eta [default=0.3]
  - step size shrinkage used in updates to prevent overfitting. After each boosting step, we can directly get the weights of new features, and eta shrinks the feature weights to make the boosting process more conservative.
* gamma
  - minimum loss reduction required to make a further partition on a leaf node of the tree. The larger, the more conservative the algorithm will be.
* max_depth [default=6]
  - maximum depth of a tree
* min_child_weight [default=1]
  - minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with a sum of instance weight less than min_child_weight, the building process will give up further partitioning. In linear regression mode, this simply corresponds to the minimum number of instances needed in each node. The larger, the more conservative the algorithm will be.
* max_delta_step [default=0]
  - maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, there is no constraint. If it is set to a positive value, it can help make the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when the classes are extremely imbalanced. Setting it to a value of 1-10 might help control the update.
* subsample [default=1]
  - subsample ratio of the training instances. Setting it to 0.5 means that XGBoost randomly samples half of the data instances to grow trees, which will prevent overfitting.
* colsample_bytree [default=1]
  - subsample ratio of columns when constructing each tree (a combined sketch follows below).
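As a minimal sketch (hypothetical values), each tree-booster parameter below pushes the model toward more conservative updates:
```python
param = {'booster': 'gbtree',
         'eta': 0.1,               # smaller step size shrinkage
         'max_depth': 4,           # shallower trees
         'min_child_weight': 5,    # require more instance weight per child
         'gamma': 1.0,             # require larger loss reduction to split
         'subsample': 0.8,         # row subsampling
         'colsample_bytree': 0.8,  # column subsampling per tree
         'objective': 'binary:logistic'}
```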

#### Parameters for Linear Booster
* lambda [default=0]
  - L2 regularization term on weights
* alpha [default=0]
  - L1 regularization term on weights
* lambda_bias
  - L2 regularization term on bias, default 0 (there is no L1 regularization term on bias because it is not important); see the sketch below
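A minimal sketch (hypothetical values) selecting the linear booster with both regularization terms:
```python
param = {'booster': 'gblinear',
         'lambda': 1.0,  # L2 regularization on weights
         'alpha': 0.1,   # L1 regularization on weights
         'objective': 'reg:linear'}
```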

### Task Parameters
* objective [ default=reg:linear ]
  - specify the learning task and the corresponding learning objective; the objective options are below:
  - "reg:linear" -- linear regression
  - "reg:logistic" -- logistic regression
  - "binary:logistic" -- logistic regression for binary classification, outputs probability
  - "binary:logitraw" -- logistic regression for binary classification, outputs score before logistic transformation
  - "multi:softmax" -- set XGBoost to do multiclass classification using the softmax objective; you also need to set num_class (number of classes)
  - "multi:softprob" -- same as softmax, but outputs a vector of ndata * nclass, which can be further reshaped to an ndata by nclass matrix. The result contains the predicted probability of each data point belonging to each class.
  - "rank:pairwise" -- set XGBoost to do a ranking task by minimizing the pairwise loss
* base_score [ default=0.5 ]
  - the initial prediction score of all instances, global bias
* eval_metric [ default according to objective ]
  - evaluation metrics for validation data; a default metric will be assigned according to the objective (rmse for regression, error for classification, mean average precision for ranking)
  - Users can add multiple evaluation metrics. Python users should pass the metrics as a list of parameter pairs instead of a map, so that a later 'eval_metric' does not override a previous one (see the sketch after the metric list)
  - The choices are listed below:
  - "rmse": [root mean square error](http://en.wikipedia.org/wiki/Root_mean_square_error)
  - "logloss": negative [log-likelihood](http://en.wikipedia.org/wiki/Log-likelihood)
  - "error": binary classification error rate. It is calculated as #(wrong cases)/#(all cases). For the predictions, the evaluation regards instances with a prediction value larger than 0.5 as positive instances, and the others as negative instances.
  - "merror": multiclass classification error rate. It is calculated as #(wrong cases)/#(all cases).
  - "mlogloss": multiclass logloss
  - "auc": [area under the curve](http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve) for ranking evaluation.
  - "ndcg": [normalized discounted cumulative gain](http://en.wikipedia.org/wiki/NDCG)
  - "map": [mean average precision](http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision)
  - "ndcg@n", "map@n": n can be assigned as an integer to cut off the top positions in the lists for evaluation.
  - "ndcg-", "map-", "ndcg@n-", "map@n-": in XGBoost, NDCG and MAP evaluate the score of a list without any positive samples as 1. By adding "-" to the evaluation metric, XGBoost will evaluate these scores as 0, to be consistent under some conditions.
* seed [ default=0 ]
  - random number seed.

### Console Parameters
The following parameters are only used in the console version of xgboost
* use_buffer [ default=1 ]
  - whether to create a binary buffer for text input; this normally speeds up loading when training repeatedly
* num_round
  - the number of rounds for boosting.
* data
  - the path of the training data
* test:data
  - the path of the test data for prediction
* save_period [default=0]
  - the period to save the model; save_period=10 means XGBoost will save the model every 10 rounds, and setting it to 0 means no model is saved during training.
* task [default=train] options: train, pred, eval, dump
  - train: training using data
  - pred: making predictions for test:data
  - eval: evaluating statistics specified by eval[name]=filename
  - dump: dumping the learned model into text format (preliminary)
* model_in [default=NULL]
  - path to the input model, needed for test, eval, dump; if it is specified in training, xgboost will continue training from the input model
* model_out [default=NULL]
  - path to the output model after training finishes; if not specified, xgboost will output a file like 0003.model, where 0003 is the number of boosting rounds.
* model_dir [default=models]
  - the output directory of the saved models during training
* fmap
  - feature map, used for dumping the model
* name_dump [default=dump.txt]
  - name of the model dump file
* name_pred [default=pred.txt]
  - name of the prediction file, used in pred mode
* pred_margin [default=0]
  - predict the margin instead of the transformed probability
doc/python.md (new file, 126 lines)
@@ -0,0 +1,126 @@
XGBoost Python Module
====

This page introduces the XGBoost Python module, including:
* [Building and Import](#building-and-import)
* [Data Interface](#data-interface)
* [Setting Parameters](#setting-parameters)
* [Train Model](#training-model)
* [Early Stopping](#early-stopping)
* [Prediction](#prediction)

A [walk-through Python example](https://github.com/tqchen/xgboost/blob/master/demo/guide-python) for the UCI Mushroom dataset is provided.

#### Install

To install XGBoost, run `make` in the root directory of the project, then in the `wrapper` directory run
```shell
python setup.py install
```
Then import the module in Python as usual
```python
import xgboost as xgb
```

#### Data Interface
The XGBoost Python module is able to load data from libsvm text format files, NumPy 2D arrays, and XGBoost binary buffer files. The data is stored in a ```DMatrix``` object.

* To load a libsvm text format file or an XGBoost binary file into ```DMatrix```:
```python
dtrain = xgb.DMatrix('train.svm.txt')
dtest = xgb.DMatrix('test.svm.buffer')
```
* To load a NumPy array into ```DMatrix```:
```python
data = np.random.rand(5,10) # 5 entities, each contains 10 features
label = np.random.randint(2, size=5) # binary target
dtrain = xgb.DMatrix( data, label=label)
```
* To build ```DMatrix``` from ```scipy.sparse```:
```python
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix( csr )
```
* Saving ```DMatrix``` into an XGBoost binary file makes loading faster next time:
```python
dtrain = xgb.DMatrix('train.svm.txt')
dtrain.save_binary("train.buffer")
```
* To handle missing values in ```DMatrix```, you can initialize the ```DMatrix``` like:
```python
dtrain = xgb.DMatrix( data, label=label, missing = -999.0)
```
* Weights can be set when needed:
```python
w = np.random.rand(5,1)
dtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=w)
```

#### Setting Parameters
XGBoost uses a list of pairs to store [parameters](https://github.com/tqchen/xgboost/wiki/Parameters). E.g.:
* Booster parameters
```python
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
param['nthread'] = 4
plst = param.items()
plst += [('eval_metric', 'auc')] # Multiple evals can be handled in this way
plst += [('eval_metric', 'ams@0')]
```
* Specify a validation set to watch performance
```python
evallist = [(dtest,'eval'), (dtrain,'train')]
```

#### Training Model
With the parameter list and data, you are able to train a model.
* Training
```python
num_round = 10
bst = xgb.train( plst, dtrain, num_round, evallist )
```
* Saving model
After training, you can save the model and dump it out.
```python
bst.save_model('0001.model')
```
* Dump Model and Feature Map
You can dump the model to a text file and review the meaning of the model
```python
# dump model
bst.dump_model('dump.raw.txt')
# dump model with feature map
bst.dump_model('dump.raw.txt','featmap.txt')
```
* Loading model
After you save your model, you can load the model file at any time by using
```python
bst = xgb.Booster({'nthread':4}) # init model
bst.load_model("model.bin") # load data
```

#### Early stopping

If you have a validation set, you can use early stopping to find the optimal number of boosting rounds. Early stopping requires at least one set in `evals`. If there is more than one, it will use the last.

`train(..., evals=evals, early_stopping_rounds=10)`

The model will train until the validation score stops improving. The validation error needs to decrease at least once every `early_stopping_rounds` rounds to continue training.

If early stopping occurs, the model will have two additional fields: `bst.best_score` and `bst.best_iteration`. Note that `train()` will return a model from the last iteration, not the best one.

This works with both metrics to minimize (RMSE, log loss, etc.) and to maximize (MAP, NDCG, AUC).
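A minimal sketch, reusing the `plst`, `dtrain`, and `evallist` objects defined in the sections above:
```python
num_round = 1000  # upper bound; early stopping usually halts sooner
bst = xgb.train(plst, dtrain, num_round, evallist,
                early_stopping_rounds=10)
# these fields exist only if early stopping actually occurred
print(bst.best_score, bst.best_iteration)
```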

#### Prediction
After training or loading a model and preparing the data, you can start prediction.
```python
data = np.random.rand(7,10) # 7 entities, each contains 10 features
dtest = xgb.DMatrix( data, missing = -999.0 )
ypred = bst.predict( dtest )
```
@@ -149,7 +149,7 @@ class IFMatrix {
  virtual size_t NumCol(void) const = 0;
  /*! \brief get number of non-missing entries in column */
  virtual size_t GetColSize(size_t cidx) const = 0;
  /*! \brief get column density */
  /*! \brief get column density */
  virtual float GetColDensity(size_t cidx) const = 0;
  /*! \brief reference of buffered rowset */
  virtual const std::vector<bst_uint> &buffered_rowset(void) const = 0;
@@ -8,45 +8,132 @@

namespace xgboost {
namespace utils {
/*!
 * \brief line split implementation from single FILE
 * simply returns lines of files, used for stdin
 */
class SingleFileSplit : public dmlc::InputSplit {
 public:
  explicit SingleFileSplit(const char *fname)
      : use_stdin_(false) {
  explicit SingleFileSplit(const char *fname)
      : use_stdin_(false),
        chunk_begin_(NULL), chunk_end_(NULL) {
    if (!std::strcmp(fname, "stdin")) {
#ifndef XGBOOST_STRICT_CXX98_
      use_stdin_ = true; fp_ = stdin;
#endif
    }
    if (!use_stdin_) {
      fp_ = utils::FopenCheck(fname, "r");
      fp_ = utils::FopenCheck(fname, "rb");
    }
    end_of_file_ = false;
    buffer_.resize(kBufferSize);
  }
  virtual ~SingleFileSplit(void) {
    if (!use_stdin_) std::fclose(fp_);
  }
  virtual bool ReadRecord(std::string *out_data) {
    if (end_of_file_) return false;
    out_data->clear();
    while (true) {
      char c = std::fgetc(fp_);
      if (c == EOF) {
        end_of_file_ = true;
  virtual size_t Read(void *ptr, size_t size) {
    return std::fread(ptr, 1, size, fp_);
  }
  virtual void Write(const void *ptr, size_t size) {
    utils::Error("cannot do write in inputsplit");
  }
  virtual bool NextRecord(Blob *out_rec) {
    if (chunk_begin_ == chunk_end_) {
      if (!LoadChunk()) return false;
    }
    char *next = FindNextRecord(chunk_begin_,
                                chunk_end_);
    out_rec->dptr = chunk_begin_;
    out_rec->size = next - chunk_begin_;
    chunk_begin_ = next;
    return true;
  }
  virtual bool NextChunk(Blob *out_chunk) {
    if (chunk_begin_ == chunk_end_) {
      if (!LoadChunk()) return false;
    }
    out_chunk->dptr = chunk_begin_;
    out_chunk->size = chunk_end_ - chunk_begin_;
    chunk_begin_ = chunk_end_;
    return true;
  }
  inline bool ReadChunk(void *buf, size_t *size) {
    size_t max_size = *size;
    if (max_size <= overflow_.length()) {
      *size = 0; return true;
    }
    if (overflow_.length() != 0) {
      std::memcpy(buf, BeginPtr(overflow_), overflow_.length());
    }
    size_t olen = overflow_.length();
    overflow_.resize(0);
    size_t nread = this->Read(reinterpret_cast<char*>(buf) + olen,
                              max_size - olen);
    nread += olen;
    if (nread == 0) return false;
    if (nread != max_size) {
      *size = nread;
      return true;
    } else {
      const char *bptr = reinterpret_cast<const char*>(buf);
      // return the last position where a record starts
      const char *bend = this->FindLastRecordBegin(bptr, bptr + max_size);
      *size = bend - bptr;
      overflow_.resize(max_size - *size);
      if (overflow_.length() != 0) {
        std::memcpy(BeginPtr(overflow_), bend, overflow_.length());
      }
      if (c != '\r' && c != '\n' && c != EOF) {
        *out_data += c;
      return true;
    }
  }

 protected:
  inline const char* FindLastRecordBegin(const char *begin,
                                         const char *end) {
    if (begin == end) return begin;
    for (const char *p = end - 1; p != begin; --p) {
      if (*p == '\n' || *p == '\r') return p + 1;
    }
    return begin;
  }
  inline char* FindNextRecord(char *begin, char *end) {
    char *p;
    for (p = begin; p != end; ++p) {
      if (*p == '\n' || *p == '\r') break;
    }
    for (; p != end; ++p) {
      if (*p != '\n' && *p != '\r') return p;
    }
    return end;
  }
  inline bool LoadChunk(void) {
    while (true) {
      size_t size = buffer_.length();
      if (!ReadChunk(BeginPtr(buffer_), &size)) return false;
      if (size == 0) {
        buffer_.resize(buffer_.length() * 2);
      } else {
        if (out_data->length() != 0) return true;
        if (end_of_file_) return false;
        chunk_begin_ = reinterpret_cast<char *>(BeginPtr(buffer_));
        chunk_end_ = chunk_begin_ + size;
        break;
      }
    }
    return false;
  }

    return true;
  }

 private:
  // buffer size
  static const size_t kBufferSize = 1 << 18UL;
  // file
  std::FILE *fp_;
  bool use_stdin_;
  bool end_of_file_;
  // internal overflow
  std::string overflow_;
  // internal buffer
  std::string buffer_;
  // beginning of chunk
  char *chunk_begin_;
  // end of chunk
  char *chunk_end_;
};

class StdFile : public dmlc::Stream {
@@ -105,7 +192,8 @@ class StdFile : public dmlc::Stream {
namespace dmlc {
InputSplit* InputSplit::Create(const char *uri,
                               unsigned part,
                               unsigned nsplit) {
                               unsigned nsplit,
                               const char *type) {
  using namespace xgboost;
  const char *msg = "xgboost is compiled in local mode\n"\
      "to use hdfs, s3 or distributed version, compile with make dmlc=1";
src/io/io.cpp (115 changes)
@@ -6,82 +6,75 @@
#include "../utils/io.h"
#include "../utils/utils.h"
#include "simple_dmatrix-inl.hpp"
#ifndef XGBOOST_STRICT_CXX98_
#include "page_dmatrix-inl.hpp"
#include "page_fmatrix-inl.hpp"
#endif
// implements data loads using dmatrix simple for now

namespace xgboost {
namespace io {
DataMatrix* LoadDataMatrix(const char *fname, bool silent,
                           bool savebuffer, bool loadsplit) {
  if (!std::strcmp(fname, "stdin") ||
      !std::strncmp(fname, "s3://", 5) ||
      !std::strncmp(fname, "hdfs://", 7) ||
      loadsplit) {
    DMatrixSimple *dmat = new DMatrixSimple();
    dmat->LoadText(fname, silent, loadsplit);
    return dmat;
DataMatrix* LoadDataMatrix(const char *fname,
                           bool silent,
                           bool savebuffer,
                           bool loadsplit,
                           const char *cache_file) {
  std::string fname_ = fname;

  const char *dlm = strchr(fname, '#');
  if (dlm != NULL) {
    utils::Check(strchr(dlm + 1, '#') == NULL,
                 "only one `#` is allowed in file path for cachefile specification");
    utils::Check(cache_file == NULL,
                 "can only specify the cachefile with `#` or argument, not both");
    fname_ = std::string(fname, dlm - fname);
    fname = fname_.c_str();
    cache_file = dlm + 1;
  }
  int magic;
  utils::FileStream fs(utils::FopenCheck(fname, "rb"));
  utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
  fs.Seek(0);

  if (magic == DMatrixSimple::kMagic) {
    DMatrixSimple *dmat = new DMatrixSimple();
    dmat->LoadBinary(fs, silent, fname);
  if (cache_file == NULL) {
    if (!std::strcmp(fname, "stdin") ||
        !std::strncmp(fname, "s3://", 5) ||
        !std::strncmp(fname, "hdfs://", 7) ||
        loadsplit) {
      DMatrixSimple *dmat = new DMatrixSimple();
      dmat->LoadText(fname, silent, loadsplit);
      return dmat;
    }
    int magic;
    utils::FileStream fs(utils::FopenCheck(fname, "rb"));
    utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
    fs.Seek(0);
    if (magic == DMatrixSimple::kMagic) {
      DMatrixSimple *dmat = new DMatrixSimple();
      dmat->LoadBinary(fs, silent, fname);
      fs.Close();
      return dmat;
    }
    fs.Close();
    DMatrixSimple *dmat = new DMatrixSimple();
    dmat->CacheLoad(fname, silent, savebuffer);
    return dmat;
  }
#ifndef XGBOOST_STRICT_CXX98_
  std::string tmp_fname;
  const char *fname_ext = NULL;
  if (strchr(fname, ';') != NULL) {
    tmp_fname = fname;
    char *ptr = strchr(&tmp_fname[0], ';');
    ptr[0] = '\0'; fname = &tmp_fname[0];
    fname_ext = ptr + 1;
  }
  if (magic == DMatrixPage::kMagic) {
    if (fname_ext == NULL) {
  } else {
    std::string cache_fname = cache_file;
    if (loadsplit) {
      std::ostringstream os;
      os << cache_file << ".r" << rabit::GetRank();
      cache_fname = os.str();
      cache_file = cache_fname.c_str();
    }
    FILE *fi = fopen64(cache_file, "rb");
    if (fi != NULL) {
      DMatrixPage *dmat = new DMatrixPage();
      dmat->Load(fs, silent, fname);
      utils::FileStream fs(fi);
      dmat->LoadBinary(fs, silent, cache_file);
      fs.Close();
      return dmat;
    } else {
      DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
      dmat->Load(fs, silent, fname, true);
    } else {
      DMatrixPage *dmat = new DMatrixPage();
      dmat->LoadText(fname, cache_file, false, loadsplit);
      return dmat;
    }
  }
  if (magic == DMatrixColPage::kMagic) {
    std::string sfname = fname;
    if (fname_ext == NULL) {
      sfname += ".col"; fname_ext = sfname.c_str();
    }
    DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
    dmat->Load(fs, silent, fname);
    return dmat;
  }
#endif
  fs.Close();
  DMatrixSimple *dmat = new DMatrixSimple();
  dmat->CacheLoad(fname, silent, savebuffer);
  return dmat;
}

void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
#ifndef XGBOOST_STRICT_CXX98_
  if (!strcmp(fname + strlen(fname) - 5, ".page")) {
    DMatrixPage::Save(fname, dmat, silent);
    return;
  }
  if (!strcmp(fname + strlen(fname) - 6, ".cpage")) {
    DMatrixColPage::Save(fname, dmat, silent);
    return;
  }
#endif
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
  if (dmat.magic == DMatrixSimple::kMagic) {
    const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
    p_dmat->SaveBinary(fname, silent);
@@ -21,12 +21,16 @@ typedef learner::DMatrix DataMatrix;
 * \param savebuffer whether to create a temporary buffer for the file if the file is in text format
 * \param loadsplit whether we only load a split of the input files
 *        such that each worker node gets a split of the data
 * \param cache_file name of cache_file, used by external memory version
 *        can be NULL; if cache_file is specified, this will be the temporary
 *        space that can be re-used to store intermediate data
 * \return a loaded DMatrix
 */
DataMatrix* LoadDataMatrix(const char *fname,
                           bool silent,
                           bool savebuffer,
                           bool loadsplit);
                           bool loadsplit,
                           const char *cache_file = NULL);
/*!
 * \brief save DataMatrix into stream,
 *  note: the saved dmatrix format may not be exactly the same as the input
src/io/libsvm_parser.h (new file, 210 lines)
@@ -0,0 +1,210 @@
/*!
 * Copyright (c) 2015 by Contributors
 * \file libsvm_parser.h
 * \brief iterator parser to parse libsvm format
 * \author Tianqi Chen
 */
#ifndef XGBOOST_IO_LIBSVM_PARSER_H_
#define XGBOOST_IO_LIBSVM_PARSER_H_
#define NOMINMAX
#include <vector>
#include <cstring>
#include <cctype>
#include <algorithm>
#include "../utils/omp.h"
#include "../utils/utils.h"
#include "../sync/sync.h"
#include "../utils/thread_buffer.h"
#include "./sparse_batch_page.h"

namespace xgboost {
namespace io {
/*! \brief page returned by libsvm parser */
struct LibSVMPage : public SparsePage {
  std::vector<float> label;
  // overload clear
  inline void Clear() {
    SparsePage::Clear();
    label.clear();
  }
};
/*!
 * \brief libsvm parser that parses the input lines
 *  and returns rows in input data
 *  factory used by the threadbuffer template
 */
class LibSVMPageFactory {
 public:
  explicit LibSVMPageFactory()
      : bytes_read_(0), at_head_(true) {
  }
  inline bool Init(void) {
    return true;
  }
  inline void Setup(dmlc::InputSplit *source,
                    int nthread) {
    source_ = source;
    int maxthread;
    #pragma omp parallel
    {
      maxthread = omp_get_num_threads();
    }
    maxthread = std::max(maxthread / 2, 1);
    nthread_ = std::min(maxthread, nthread);
  }
  inline void SetParam(const char *name, const char *val) {}
  inline bool LoadNext(std::vector<LibSVMPage> *data) {
    return FillData(data);
  }
  inline void FreeSpace(std::vector<LibSVMPage> *a) {
    delete a;
  }
  inline std::vector<LibSVMPage> *Create(void) {
    return new std::vector<LibSVMPage>();
  }
  inline void BeforeFirst(void) {
    utils::Assert(at_head_, "cannot call beforefirst");
  }
  inline void Destroy(void) {
    delete source_;
  }
  inline size_t bytes_read(void) const {
    return bytes_read_;
  }

 protected:
  inline bool FillData(std::vector<LibSVMPage> *data) {
    dmlc::InputSplit::Blob chunk;
    if (!source_->NextChunk(&chunk)) return false;
    int nthread;
    #pragma omp parallel num_threads(nthread_)
    {
      nthread = omp_get_num_threads();
    }
    // reserve space for data
    data->resize(nthread);
    bytes_read_ += chunk.size;
    utils::Assert(chunk.size != 0, "LibSVMParser.FileData");
    char *head = reinterpret_cast<char*>(chunk.dptr);
    #pragma omp parallel num_threads(nthread_)
    {
      // threadid
      int tid = omp_get_thread_num();
      size_t nstep = (chunk.size + nthread - 1) / nthread;
      size_t sbegin = std::min(tid * nstep, chunk.size);
      size_t send = std::min((tid + 1) * nstep, chunk.size);
      char *pbegin = BackFindEndLine(head + sbegin, head);
      char *pend;
      if (tid + 1 == nthread) {
        pend = head + send;
      } else {
        pend = BackFindEndLine(head + send, head);
      }
      ParseBlock(pbegin, pend, &(*data)[tid]);
    }
    return true;
  }
  /*!
   * \brief parse data into out
   * \param begin beginning of buffer
   * \param end end of buffer
   */
  inline void ParseBlock(char *begin,
                         char *end,
                         LibSVMPage *out) {
    out->Clear();
    char *p = begin;
    while (p != end) {
      while (isspace(*p) && p != end) ++p;
      if (p == end) break;
      char *head = p;
      while (isdigit(*p) && p != end) ++p;
      if (*p == ':') {
        out->data.push_back(SparseBatch::Entry(atol(head),
                                               static_cast<bst_float>(atof(p + 1))));
      } else {
        if (out->label.size() != 0) {
          out->offset.push_back(out->data.size());
        }
        out->label.push_back(static_cast<float>(atof(head)));
      }
      while (!isspace(*p) && p != end) ++p;
    }
    if (out->label.size() != 0) {
      out->offset.push_back(out->data.size());
    }
    utils::Check(out->label.size() + 1 == out->offset.size(),
                 "LibSVMParser inconsistent");
  }
  /*!
   * \brief start from bptr, go backward and find the first end-of-line
   * \param bptr end position to go backward from
   * \param begin the beginning position of buffer
   * \return position of first end-of-line going backward
   */
  inline char* BackFindEndLine(char *bptr,
                               char *begin) {
    for (; bptr != begin; --bptr) {
      if (*bptr == '\n' || *bptr == '\r') return bptr;
    }
    return begin;
  }

 private:
  // nthread
  int nthread_;
  // number of bytes read
  size_t bytes_read_;
  // at beginning, at end of stream
  bool at_head_;
  // source split that provides the data
  dmlc::InputSplit *source_;
};

class LibSVMParser : public utils::IIterator<LibSVMPage> {
 public:
  explicit LibSVMParser(dmlc::InputSplit *source,
                        int nthread)
      : at_end_(false), data_ptr_(0), data_(NULL) {
    itr.SetParam("buffer_size", "2");
    itr.get_factory().Setup(source, nthread);
    itr.Init();
  }
  virtual void BeforeFirst(void) {
    itr.BeforeFirst();
  }
  virtual bool Next(void) {
    if (at_end_) return false;
    while (true) {
      if (data_ == NULL || data_ptr_ >= data_->size()) {
        if (!itr.Next(data_)) {
          at_end_ = true; return false;
        } else {
          data_ptr_ = 0;
        }
      }
      while (data_ptr_ < data_->size()) {
        data_ptr_ += 1;
        if ((*data_)[data_ptr_ - 1].Size() != 0) {
          return true;
        }
      }
    }
    return true;
  }
  virtual const LibSVMPage &Value(void) const {
    return (*data_)[data_ptr_ - 1];
  }
  inline size_t bytes_read(void) const {
    return itr.get_factory().bytes_read();
  }

 private:
  bool at_end_;
  size_t data_ptr_;
  std::vector<LibSVMPage> *data_;
  utils::ThreadBuffer<std::vector<LibSVMPage>*, LibSVMPageFactory> itr;
};

}  // namespace io
}  // namespace xgboost
#endif  // XGBOOST_IO_LIBSVM_PARSER_H_
@@ -1,8 +1,8 @@
#ifndef XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
#define XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
/*!
 * \file page_row_iter-inl.hpp
 *  row iterator based on sparse page
 * \file page_dmatrix-inl.hpp
 *  row iterator based on sparse page
 * \author Tianqi Chen
 */
#include <vector>
@@ -10,97 +10,12 @@
#include "../utils/iterator.h"
#include "../utils/thread_buffer.h"
#include "./simple_fmatrix-inl.hpp"
#include "./sparse_batch_page.h"
#include "./page_fmatrix-inl.hpp"
#include "./libsvm_parser.h"

namespace xgboost {
namespace io {
/*! \brief page structure that can be used to store a rowbatch */
struct RowBatchPage {
 public:
  explicit RowBatchPage(size_t page_size) : kPageSize(page_size) {
    data_ = new int[kPageSize];
    utils::Assert(data_ != NULL, "fail to allocate row batch page");
    this->Clear();
  }
  ~RowBatchPage(void) {
    if (data_ != NULL) delete [] data_;
  }
  /*!
   * \brief Push one row into page
   * \param row an instance row
   * \return false or true to push into
   */
  inline bool PushRow(const RowBatch::Inst &row) {
    const size_t dsize = row.length * sizeof(RowBatch::Entry);
    if (FreeBytes() < dsize + sizeof(int)) return false;
    row_ptr(Size() + 1) = row_ptr(Size()) + row.length;
    memcpy(data_ptr(row_ptr(Size())), row.data, dsize);
    ++data_[0];
    return true;
  }
  /*!
   * \brief get a row batch representation from the page
   * \param p_rptr a temporary space that can be used to provide
   *  ind_ptr storage for RowBatch
   * \return a new RowBatch object
   */
  inline RowBatch GetRowBatch(std::vector<size_t> *p_rptr, size_t base_rowid) {
    RowBatch batch;
    batch.base_rowid = base_rowid;
    batch.data_ptr = this->data_ptr(0);
    batch.size = static_cast<size_t>(this->Size());
    std::vector<size_t> &rptr = *p_rptr;
    rptr.resize(this->Size() + 1);
    for (size_t i = 0; i < rptr.size(); ++i) {
      rptr[i] = static_cast<size_t>(this->row_ptr(static_cast<int>(i)));
    }
    batch.ind_ptr = &rptr[0];
    return batch;
  }
  /*! \brief get i-th row from the batch */
  inline RowBatch::Inst operator[](int i) {
    return RowBatch::Inst(data_ptr(0) + row_ptr(i),
                          static_cast<bst_uint>(row_ptr(i+1) - row_ptr(i)));
  }
  /*!
   * \brief clear the page, cleanup the content
   */
  inline void Clear(void) {
    memset(&data_[0], 0, sizeof(int) * kPageSize);
  }
  /*!
   * \brief load one page from instream
   * \return true if loading is successful
   */
  inline bool Load(utils::IStream &fi) {
    return fi.Read(&data_[0], sizeof(int) * kPageSize) != 0;
  }
  /*! \brief save one page into outstream */
  inline void Save(utils::IStream &fo) {
    fo.Write(&data_[0], sizeof(int) * kPageSize);
  }
  /*! \return number of elements */
  inline int Size(void) const {
    return data_[0];
  }

 protected:
  /*! \return number of free bytes */
  inline size_t FreeBytes(void) {
    return (kPageSize - (Size() + 2)) * sizeof(int) -
        row_ptr(Size()) * sizeof(RowBatch::Entry);
  }
  /*! \brief equivalent row pointer at i */
  inline int& row_ptr(int i) {
    return data_[kPageSize - i - 1];
  }
  inline RowBatch::Entry* data_ptr(int i) {
    return (RowBatch::Entry*)(&data_[1]) + i;
  }
  // content of data
  int *data_;
  // page size
  const size_t kPageSize;
};
/*! \brief thread buffer iterator */
class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
 public:
@@ -118,7 +33,10 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
  }
  virtual bool Next(void) {
    if (!itr.Next(page_)) return false;
    out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
    out_.base_rowid = base_rowid_;
    out_.ind_ptr = BeginPtr(page_->offset);
    out_.data_ptr = BeginPtr(page_->data);
    out_.size = page_->offset.size() - 1;
    base_rowid_ += out_.size;
    return true;
  }
@@ -127,76 +45,18 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
  }
  /*! \brief load and initialize the iterator with fi */
  inline void Load(const utils::FileStream &fi) {
    itr.get_factory().SetFile(fi);
    itr.get_factory().SetFile(fi, 0);
    itr.Init();
    this->BeforeFirst();
  }
  /*!
   * \brief save a row iterator to output stream, in row iterator format
   */
  inline static void Save(utils::IIterator<RowBatch> *iter,
                          utils::IStream &fo) {
    RowBatchPage page(kPageSize);
    iter->BeforeFirst();
    while (iter->Next()) {
      const RowBatch &batch = iter->Value();
      for (size_t i = 0; i < batch.size; ++i) {
        if (!page.PushRow(batch[i])) {
          page.Save(fo);
          page.Clear();
          utils::Check(page.PushRow(batch[i]), "row is too big");
        }
      }
    }
    if (page.Size() != 0) page.Save(fo);
  }
  /*! \brief page size 64 MB */
  static const size_t kPageSize = 64 << 18;

 private:
  // base row id
  size_t base_rowid_;
  // temporary ptr
  std::vector<size_t> tmp_ptr_;
  // output data
  RowBatch out_;
  // page pointer type
  typedef RowBatchPage* PagePtr;
  // loader factory for page
  struct Factory {
   public:
    size_t file_begin_;
    utils::FileStream fi;
    Factory(void) {}
    inline void SetFile(const utils::FileStream &fi) {
      this->fi = fi;
      file_begin_ = this->fi.Tell();
    }
    inline bool Init(void) {
      return true;
    }
    inline void SetParam(const char *name, const char *val) {}
    inline bool LoadNext(PagePtr &val) {
      return val->Load(fi);
    }
    inline PagePtr Create(void) {
      PagePtr a = new RowBatchPage(kPageSize);
      return a;
    }
    inline void FreeSpace(PagePtr &a) {
      delete a;
    }
    inline void Destroy(void) {
      fi.Close();
    }
    inline void BeforeFirst(void) {
      fi.Seek(file_begin_);
    }
  };

 protected:
  PagePtr page_;
  utils::ThreadBuffer<PagePtr, Factory> itr;
  SparsePage *page_;
  utils::ThreadBuffer<SparsePage*, SparsePageFactory> itr;
};

/*! \brief data matrix using page */
@@ -211,24 +71,56 @@ class DMatrixPageBase : public DataMatrix {
    // do not delete row iterator, since it is owned by fmat
    // to be cleaned up in a more clear way
  }
  /*! \brief save a DataMatrix as DMatrixPage */
  inline static void Save(const char *fname_, const DataMatrix &mat, bool silent) {
    std::string fname = fname_;
    utils::FileStream fs(utils::FopenCheck(fname.c_str(), "wb"));
    int magic = kMagic;
    fs.Write(&magic, sizeof(magic));
    mat.info.SaveBinary(fs);
    fs.Close();
    fname += ".row.blob";
    utils::IIterator<RowBatch> *iter = mat.fmat()->RowIterator();
    utils::FileStream fbin(utils::FopenCheck(fname.c_str(), "wb"));
    SparsePage page;
    iter->BeforeFirst();
    while (iter->Next()) {
      const RowBatch &batch = iter->Value();
      for (size_t i = 0; i < batch.size; ++i) {
        page.Push(batch[i]);
        if (page.MemCostBytes() >= kPageSize) {
          page.Save(&fbin); page.Clear();
        }
      }
    }
    if (page.data.size() != 0) page.Save(&fbin);
    fbin.Close();
    if (!silent) {
      utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
                    static_cast<unsigned long>(mat.info.num_row()),
                    static_cast<unsigned long>(mat.info.num_col()), fname_);
    }
  }
  /*! \brief load and initialize the iterator with fi */
  inline void Load(utils::FileStream &fi,
                   bool silent = false,
                   const char *fname = NULL,
                   bool skip_magic_check = false) {
  inline void LoadBinary(utils::FileStream &fi,
                         bool silent,
                         const char *fname_) {
    this->set_cache_file(fname_);
    std::string fname = fname_;
    int tmagic;
    utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
    if (!skip_magic_check) {
      utils::Check(tmagic == magic, "invalid format,magic number mismatch");
    }
    utils::Check(tmagic == magic, "invalid format,magic number mismatch");
    this->info.LoadBinary(fi);
    iter_->Load(fi);
    // load in the row data file
    fname += ".row.blob";
    utils::FileStream fs(utils::FopenCheck(fname.c_str(), "rb"));
    iter_->Load(fs);
    if (!silent) {
      utils::Printf("DMatrixPage: %lux%lu matrix is loaded",
                    static_cast<unsigned long>(info.num_row()),
                    static_cast<unsigned long>(info.num_col()));
      if (fname != NULL) {
        utils::Printf(" from %s\n", fname);
      if (fname_ != NULL) {
        utils::Printf(" from %s\n", fname_);
      } else {
        utils::Printf("\n");
      }
@@ -237,24 +129,80 @@ class DMatrixPageBase : public DataMatrix {
    }
  }
  }
  /*! \brief save a DataMatrix as DMatrixPage */
  inline static void Save(const char* fname, const DataMatrix &mat, bool silent) {
    utils::FileStream fs(utils::FopenCheck(fname, "wb"));
  /*! \brief save a LibSVM format file as DMatrixPage */
  inline void LoadText(const char *uri,
                       const char* cache_file,
                       bool silent,
                       bool loadsplit) {
    if (!silent) {
      utils::Printf("start generate text file from %s\n", uri);
    }
    int rank = 0, npart = 1;
    if (loadsplit) {
      rank = rabit::GetRank();
      npart = rabit::GetWorldSize();
    }
    this->set_cache_file(cache_file);
    std::string fname_row = std::string(cache_file) + ".row.blob";
    utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb"));
    SparsePage page;
    size_t bytes_write = 0;
    double tstart = rabit::utils::GetTime();
    LibSVMParser parser(
        dmlc::InputSplit::Create(uri, rank, npart, "text"), 16);
    info.Clear();
    while (parser.Next()) {
      const LibSVMPage &batch = parser.Value();
      size_t nlabel = info.labels.size();
      info.labels.resize(nlabel + batch.label.size());
      if (batch.label.size() != 0) {
        std::memcpy(BeginPtr(info.labels) + nlabel,
                    BeginPtr(batch.label),
                    batch.label.size() * sizeof(float));
      }
      page.Push(batch);
      for (size_t i = 0; i < batch.data.size(); ++i) {
        info.info.num_col = std::max(info.info.num_col,
                                     static_cast<size_t>(batch.data[i].index+1));
      }
      if (page.MemCostBytes() >= kPageSize) {
        bytes_write += page.MemCostBytes();
        page.Save(&fo);
        page.Clear();
        double tdiff = rabit::utils::GetTime() - tstart;
        if (!silent) {
          utils::Printf("Writting to %s in %g MB/s, %lu MB written\n",
                        cache_file, (bytes_write >> 20UL) / tdiff,
                        (bytes_write >> 20UL));
        }
      }
      info.info.num_row += batch.label.size();
    }
    if (page.data.size() != 0) {
      page.Save(&fo);
    }
    fo.Close();
    iter_->Load(utils::FileStream(utils::FopenCheck(fname_row.c_str(), "rb")));
    // save data matrix
    utils::FileStream fs(utils::FopenCheck(cache_file, "wb"));
    int magic = kMagic;
    fs.Write(&magic, sizeof(magic));
    mat.info.SaveBinary(fs);
    ThreadRowPageIterator::Save(mat.fmat()->RowIterator(), fs);
    this->info.SaveBinary(fs);
    fs.Close();
    if (!silent) {
      utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
                    static_cast<unsigned long>(mat.info.num_row()),
                    static_cast<unsigned long>(mat.info.num_col()), fname);
      utils::Printf("DMatrixPage: %lux%lu is parsed from %s\n",
                    static_cast<unsigned long>(info.num_row()),
                    static_cast<unsigned long>(info.num_col()),
                    uri);
    }
  }
  /*! \brief magic number used to identify DMatrix */
  static const int kMagic = TKMagic;
  /*! \brief page size 64 MB */
  static const size_t kPageSize = 64UL << 20UL;

 protected:
  virtual void set_cache_file(const std::string &cache_file) = 0;
  /*! \brief row iterator */
  ThreadRowPageIterator *iter_;
};
@@ -262,7 +210,7 @@ class DMatrixPageBase : public DataMatrix {
class DMatrixPage : public DMatrixPageBase<0xffffab02> {
 public:
  DMatrixPage(void) {
    fmat_ = new FMatrixS(iter_);
    fmat_ = new FMatrixPage(iter_, this->info);
  }
  virtual ~DMatrixPage(void) {
    delete fmat_;
@@ -270,8 +218,11 @@ class DMatrixPage : public DMatrixPageBase<0xffffab02> {
  virtual IFMatrix *fmat(void) const {
    return fmat_;
  }
  virtual void set_cache_file(const std::string &cache_file) {
    fmat_->set_cache_file(cache_file);
  }
  /*! \brief the real fmatrix */
  IFMatrix *fmat_;
  FMatrixPage *fmat_;
};
}  // namespace io
}  // namespace xgboost
@ -2,235 +2,79 @@
|
||||
#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
|
||||
/*!
|
||||
* \file page_fmatrix-inl.hpp
|
||||
* sparse page manager for fmatrix
|
||||
* col iterator based on sparse page
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include "../data.h"
|
||||
#include "../utils/iterator.h"
|
||||
#include "../utils/io.h"
|
||||
#include "../utils/matrix_csr.h"
|
||||
#include "../utils/thread_buffer.h"
|
||||
namespace xgboost {
|
||||
namespace io {
|
||||
class CSCMatrixManager {
|
||||
/*! \brief thread buffer iterator */
|
||||
class ThreadColPageIterator: public utils::IIterator<ColBatch> {
|
||||
public:
|
||||
/*! \brief in memory page */
|
||||
struct Page {
|
||||
public:
|
||||
/*! \brief initialize the page */
|
||||
explicit Page(size_t size) {
|
||||
buffer.resize(size);
|
||||
col_index.reserve(10);
|
||||
col_data.reserve(10);
|
||||
}
|
||||
/*! \brief clear the page */
|
||||
inline void Clear(void) {
|
||||
num_entry = 0;
|
||||
col_index.clear();
|
||||
col_data.clear();
|
||||
}
|
||||
/*! \brief number of used entries */
|
||||
size_t num_entry;
|
||||
/*! \brief column index */
|
||||
std::vector<bst_uint> col_index;
|
||||
/*! \brief column data */
|
||||
std::vector<ColBatch::Inst> col_data;
|
||||
/*! \brief number of free entries */
|
||||
inline size_t NumFreeEntry(void) const {
|
||||
return buffer.size() - num_entry;
|
||||
}
|
||||
  inline ColBatch::Entry* AllocEntry(size_t len) {
    ColBatch::Entry *p_data = &buffer[0] + num_entry;
    num_entry += len;
    return p_data;
  }
  /*! \brief get underlying batch */
  inline ColBatch GetBatch(void) const {
    ColBatch batch;
    batch.size = col_index.size();
    batch.col_index = BeginPtr(col_index);
    batch.col_data = BeginPtr(col_data);
    return batch;
  }

 private:
  /*! \brief buffer space, not to be changed since ready */
  std::vector<ColBatch::Entry> buffer;
};
/*! \brief define type of page pointer */
typedef Page *PagePtr;
// constructor
CSCMatrixManager(void) {
  fi_ = NULL;
}
/*! \brief get column pointer */
inline const std::vector<size_t> &col_ptr(void) const {
  return col_ptr_;
}
inline void SetParam(const char *name, const char *val) {
}
inline PagePtr Create(void) {
  return new Page(page_size_);
}
inline void FreeSpace(PagePtr &a) {
  delete a;
}
inline void Destroy(void) {
}
inline void BeforeFirst(void) {
  col_index_ = col_todo_;
  read_top_ = 0;
}
inline bool LoadNext(PagePtr &val) {
  val->Clear();
  if (read_top_ >= col_index_.size()) return false;
  while (read_top_ < col_index_.size()) {
    if (!this->TryFill(col_index_[read_top_], val)) {
      return true;
    }
    ++read_top_;
  }
  return true;
}
inline bool Init(void) {
  this->BeforeFirst();
  return true;
}
inline void Setup(utils::ISeekStream *fi, double page_ratio) {
  fi_ = fi;
  fi_->Read(&begin_meta_, sizeof(begin_meta_));
  begin_data_ = static_cast<size_t>(fi->Tell());
  fi_->Seek(begin_meta_);
  fi_->Read(&col_ptr_);
  size_t psmax = 0;
  for (size_t i = 0; i < col_ptr_.size() - 1; ++i) {
    psmax = std::max(psmax, col_ptr_[i+1] - col_ptr_[i]);
  }
  utils::Check(page_ratio >= 1.0f, "col_page_ratio must be at least 1");
  page_size_ = std::max(static_cast<size_t>(psmax * page_ratio), psmax);
}
inline void SetColSet(const std::vector<bst_uint> &cset, bool setall) {
  if (!setall) {
    col_todo_.resize(0);
    for (size_t i = 0; i < cset.size(); ++i) {
      if (cset[i] < static_cast<bst_uint>(col_ptr_.size() - 1)) {
        col_todo_.push_back(cset[i]);
      }
    }
    std::sort(col_todo_.begin(), col_todo_.end());
  } else {
    col_todo_.resize(col_ptr_.size()-1);
    for (size_t i = 0; i < col_todo_.size(); ++i) {
      col_todo_[i] = static_cast<bst_uint>(i);
    }
  }
}

 private:
/*! \brief fill a page with the column at index cidx */
inline bool TryFill(size_t cidx, Page *p_page) {
  size_t len = col_ptr_[cidx+1] - col_ptr_[cidx];
  if (p_page->NumFreeEntry() < len) return false;
  ColBatch::Entry *p_data = p_page->AllocEntry(len);
  fi_->Seek(col_ptr_[cidx] * sizeof(ColBatch::Entry) + begin_data_);
  utils::Check(fi_->Read(p_data, sizeof(ColBatch::Entry) * len) != 0,
               "invalid column buffer format");
  p_page->col_data.push_back(ColBatch::Inst(p_data, static_cast<bst_uint>(len)));
  p_page->col_index.push_back(static_cast<bst_uint>(cidx));
  return true;
}
// the following are in memory auxiliary data structure
/*! \brief top of reader position */
size_t read_top_;
/*! \brief size of page */
size_t page_size_;
/*! \brief column index to be loaded */
std::vector<bst_uint> col_index_;
/*! \brief column indices to be loaded after calling BeforeFirst */
std::vector<bst_uint> col_todo_;
// the following are input content
/*! \brief beginning position of data content */
size_t begin_data_;
/*! \brief beginning position of meta data */
size_t begin_meta_;
/*! \brief input stream */
utils::ISeekStream *fi_;
/*! \brief column pointer of CSC format */
std::vector<size_t> col_ptr_;
};
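
// A minimal sketch (not part of this commit) of the column-buffer layout that
// Setup() above expects, and that utils::SparseCSRFileBuilder produces; the
// function name ReadPageLayout is hypothetical:
//
//   [ begin_meta : size_t ][ ColBatch::Entry data ... ][ col_ptr vector ]
//
inline void ReadPageLayout(utils::ISeekStream *fi) {
  size_t begin_meta;
  fi->Read(&begin_meta, sizeof(begin_meta));            // header word
  size_t begin_data = static_cast<size_t>(fi->Tell());  // entry section starts here
  fi->Seek(begin_meta);
  std::vector<size_t> col_ptr;
  fi->Read(&col_ptr);                                   // CSC column pointers
  // column c occupies entries [col_ptr[c], col_ptr[c+1]) relative to begin_data
  (void)begin_data;
}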

class ThreadColPageIterator : public utils::IIterator<ColBatch> {
 public:
  explicit ThreadColPageIterator(utils::ISeekStream *fi,
                                 float page_ratio, bool silent) {
    itr_.SetParam("buffer_size", "2");
    itr_.get_factory().Setup(fi, page_ratio);
    itr_.Init();
    if (!silent) {
      utils::Printf("ThreadColPageIterator: finish initializing, %u columns\n",
                    static_cast<unsigned>(col_ptr().size() - 1));
    }
  }
  virtual ~ThreadColPageIterator(void) {
  }
  ThreadColPageIterator(void) {
    itr.SetParam("buffer_size", "2");
    page_ = NULL;
  }
  virtual ~ThreadColPageIterator(void) {}
  virtual void Init(void) {}
  virtual void BeforeFirst(void) {
    itr_.BeforeFirst();
    itr.BeforeFirst();
  }
  virtual bool Next(void) {
    // page to be loaded
    CSCMatrixManager::PagePtr page;
    if (!itr_.Next(page)) return false;
    out_ = page->GetBatch();
    if (!itr.Next(page_)) return false;
    out_.col_index = BeginPtr(itr.get_factory().index_set());
    col_data_.resize(page_->offset.size() - 1, SparseBatch::Inst(NULL, 0));
    for (size_t i = 0; i < col_data_.size(); ++i) {
      col_data_[i] = SparseBatch::Inst
          (BeginPtr(page_->data) + page_->offset[i],
           static_cast<bst_uint>(page_->offset[i + 1] - page_->offset[i]));
    }
    out_.col_data = BeginPtr(col_data_);
    out_.size = col_data_.size();
    return true;
  }
  virtual const ColBatch &Value(void) const {
    return out_;
  }
  inline const std::vector<size_t> &col_ptr(void) const {
    return itr_.get_factory().col_ptr();
  }
  /*! \brief load and initialize the iterator with fi */
  inline void SetFile(const utils::FileStream &fi) {
    itr.get_factory().SetFile(fi);
    itr.Init();
  }
  inline void SetColSet(const std::vector<bst_uint> &cset,
                        bool setall = false) {
    itr_.get_factory().SetColSet(cset, setall);
  }
  // set index set
  inline void SetIndexSet(const std::vector<bst_uint> &fset, bool load_all) {
    itr.get_factory().SetIndexSet(fset, load_all);
  }

 private:
  // output data
  ColBatch out_;
  // internal iterator
  utils::ThreadBuffer<CSCMatrixManager::PagePtr, CSCMatrixManager> itr_;
  SparsePage *page_;
  std::vector<SparseBatch::Inst> col_data_;
  utils::ThreadBuffer<SparsePage*, SparsePageFactory> itr;
};
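
// A minimal usage sketch for the iterator above (not part of this commit);
// ScanAllColumns is a hypothetical helper, fi an open column-buffer stream:
inline void ScanAllColumns(utils::ISeekStream *fi) {
  ThreadColPageIterator it(fi, 2.0f, true);  // silent
  std::vector<bst_uint> cset;                // empty set + setall=true -> all columns
  it.SetColSet(cset, true);
  it.BeforeFirst();
  while (it.Next()) {
    const ColBatch &batch = it.Value();
    for (size_t i = 0; i < batch.size; ++i) {
      // batch.col_index[i] names the column, batch.col_data[i] holds its entries
    }
  }
}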

/*!
 * \brief sparse matrix that support column access
 * \brief sparse matrix that support column access, CSC
 */
class FMatrixPage : public IFMatrix {
 public:
  typedef SparseBatch::Entry Entry;
  /*! \brief constructor */
  FMatrixPage(utils::IIterator<RowBatch> *iter, std::string fname_buffer)
      : fname_cbuffer_(fname_buffer) {
    this->row_iter_ = iter;
    this->col_iter_ = NULL;
    this->fi_ = NULL;
  }
  FMatrixPage(utils::IIterator<RowBatch> *iter,
              const learner::MetaInfo &info) : info(info) {
    this->iter_ = iter;
  }
  // destructor
  virtual ~FMatrixPage(void) {
    if (row_iter_ != NULL) delete row_iter_;
    if (col_iter_ != NULL) delete col_iter_;
    if (fi_ != NULL) {
      fi_->Close(); delete fi_;
    }
    if (iter_ != NULL) delete iter_;
  }
  /*! \return whether column access is enabled */
  virtual bool HaveColAccess(void) const {
    return col_iter_ != NULL;
  }
  virtual bool HaveColAccess(void) const {
    return col_size_.size() != 0;
  }
  /*! \brief get number of columns */
  virtual size_t NumCol(void) const {
    utils::Check(this->HaveColAccess(), "NumCol:need column access");
    return col_iter_->col_ptr().size() - 1;
    return col_size_.size();
  }
  /*! \brief get number of buffered rows */
  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
@@ -238,145 +82,207 @@ class FMatrixPage : public IFMatrix {
  }
  /*! \brief get column size */
  virtual size_t GetColSize(size_t cidx) const {
    const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
    return col_ptr[cidx+1] - col_ptr[cidx];
    return col_size_[cidx];
  }
  /*! \brief get column density */
  virtual float GetColDensity(size_t cidx) const {
    const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
    size_t nmiss = buffered_rowset_.size() - (col_ptr[cidx+1] - col_ptr[cidx]);
    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
    size_t nmiss = num_buffered_row_ - (col_size_[cidx]);
    return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
  }
  virtual void InitColAccess(const std::vector<bool> &enabled, float pkeep = 1.0f) {
  virtual void InitColAccess(const std::vector<bool> &enabled,
                             float pkeep = 1.0f) {
    if (this->HaveColAccess()) return;
    utils::Printf("start to initialize page col access\n");
    if (this->LoadColData()) {
      utils::Printf("loading previously saved col data\n");
      return;
    }
    this->InitColData(pkeep, fname_cbuffer_.c_str(),
                      1 << 30, 5);
    utils::Check(this->LoadColData(), "fail to read in column data");
    utils::Printf("finish initialize page col access\n");
    if (TryLoadColData()) return;
    this->InitColData(enabled, pkeep);
    utils::Check(TryLoadColData(), "failed on creating col.blob");
  }
  /*!
   * \brief get the row iterator associated with FMatrix
   */
  virtual utils::IIterator<RowBatch>* RowIterator(void) {
    row_iter_->BeforeFirst();
    return row_iter_;
    iter_->BeforeFirst();
    return iter_;
  }
  /*!
   * \brief get the column based iterator
   */
  virtual utils::IIterator<ColBatch>* ColIterator(void) {
    std::vector<bst_uint> cset;
    col_iter_->SetColSet(cset, true);
    col_iter_->BeforeFirst();
    return col_iter_;
    size_t ncol = this->NumCol();
    col_index_.resize(0);
    for (size_t i = 0; i < ncol; ++i) {
      col_index_.push_back(static_cast<bst_uint>(i));
    }
    col_iter_.SetIndexSet(col_index_, false);
    col_iter_.BeforeFirst();
    return &col_iter_;
  }
  /*!
   * \brief column based iterator
   */
  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
    col_iter_->SetColSet(fset, false);
    col_iter_->BeforeFirst();
    return col_iter_;
  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
    size_t ncol = this->NumCol();
    col_index_.resize(0);
    for (size_t i = 0; i < fset.size(); ++i) {
      if (fset[i] < ncol) col_index_.push_back(fset[i]);
    }
    col_iter_.SetIndexSet(col_index_, false);
    col_iter_.BeforeFirst();
    return &col_iter_;
  }
  // set the cache file name
  inline void set_cache_file(const std::string &cache_file) {
    col_data_name_ = std::string(cache_file) + ".col.blob";
    col_meta_name_ = std::string(cache_file) + ".col.meta";
  }

 protected:
  /*!
   * \brief try load column data from file
   */
  inline bool LoadColData(void) {
    FILE *fp = fopen64(fname_cbuffer_.c_str(), "rb");
    if (fp == NULL) return false;
    fi_ = new utils::FileStream(fp);
    static_cast<utils::IStream*>(fi_)->Read(&buffered_rowset_);
    col_iter_ = new ThreadColPageIterator(fi_, 2.0f, false);
    return true;
  }
  inline bool TryLoadColData(void) {
    FILE *fi = fopen64(col_meta_name_.c_str(), "rb");
    if (fi == NULL) return false;
    utils::FileStream fs(fi);
    LoadMeta(&fs);
    fs.Close();
    fi = utils::FopenCheck(col_data_name_.c_str(), "rb");
    if (fi == NULL) return false;
    col_iter_.SetFile(utils::FileStream(fi));
    return true;
  }
  inline void LoadMeta(utils::IStream *fi) {
    utils::Check(fi->Read(&num_buffered_row_, sizeof(num_buffered_row_)) != 0,
                 "invalid col.blob file");
    utils::Check(fi->Read(&buffered_rowset_),
                 "invalid col.blob file");
    utils::Check(fi->Read(&col_size_),
                 "invalid col.blob file");
  }
  inline void SaveMeta(utils::IStream *fo) {
    fo->Write(&num_buffered_row_, sizeof(num_buffered_row_));
    fo->Write(buffered_rowset_);
    fo->Write(col_size_);
  }
  /*!
   * \brief initialize column data
   * \param pkeep probability to keep a row
   */
  inline void InitColData(float pkeep, const char *fname,
                          size_t buffer_size, size_t col_step) {
  inline void InitColData(const std::vector<bool> &enabled, float pkeep) {
    SparsePage prow, pcol;
    size_t btop = 0;
    // clear rowset
    buffered_rowset_.clear();
    utils::FileStream fo(utils::FopenCheck(fname, "wb+"));
    // use 64M buffer
    utils::SparseCSRFileBuilder<ColBatch::Entry> builder(&fo, buffer_size);
    col_size_.resize(info.num_col());
    std::fill(col_size_.begin(), col_size_.end(), 0);
    utils::FileStream fo;
    fo = utils::FileStream(utils::FopenCheck(col_data_name_.c_str(), "wb"));
    size_t bytes_write = 0;
    double tstart = rabit::utils::GetTime();
    // start working
    row_iter_->BeforeFirst();
    while (row_iter_->Next()) {
      const RowBatch &batch = row_iter_->Value();
    iter_->BeforeFirst();
    while (iter_->Next()) {
      const RowBatch &batch = iter_->Value();
      for (size_t i = 0; i < batch.size; ++i) {
        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
          RowBatch::Inst inst = batch[i];
          for (bst_uint j = 0; j < inst.length; ++j) {
            builder.AddBudget(inst[j].index);
          buffered_rowset_.push_back(ridx);
          prow.Push(batch[i]);
          if (prow.MemCostBytes() >= kPageSize) {
            bytes_write += prow.MemCostBytes();
            this->PushColPage(prow, BeginPtr(buffered_rowset_) + btop,
                              enabled, &pcol, &fo);
            btop += prow.Size();
            prow.Clear();

            double tdiff = rabit::utils::GetTime() - tstart;
            utils::Printf("Writing to %s in %g MB/s, %lu MB written\n",
                          col_data_name_.c_str(),
                          (bytes_write >> 20UL) / tdiff,
                          (bytes_write >> 20UL));
          }
        }
      }
    }
    // write buffered rowset
    static_cast<utils::IStream*>(&fo)->Write(buffered_rowset_);
    builder.InitStorage();
    row_iter_->BeforeFirst();
    size_t ktop = 0;
    while (row_iter_->Next()) {
      const RowBatch &batch = row_iter_->Value();
      for (size_t i = 0; i < batch.size; ++i) {
        if (ktop < buffered_rowset_.size() &&
            buffered_rowset_[ktop] == batch.base_rowid + i) {
          ++ktop;
          RowBatch::Inst inst = batch[i];
          for (bst_uint j = 0; j < inst.length; ++j) {
            builder.PushElem(inst[j].index,
                             ColBatch::Entry((bst_uint)(batch.base_rowid+i),
                                             inst[j].fvalue));
          }
          if (ktop % 100000 == 0) {
            utils::Printf("\r \r");
            utils::Printf("InitCol: %lu rows ", static_cast<unsigned long>(ktop));
          }
        }
      }
    }
    if (prow.Size() != 0) {
      this->PushColPage(prow, BeginPtr(buffered_rowset_) + btop,
                        enabled, &pcol, &fo);
    }
    builder.Finalize();
    builder.SortRows(ColBatch::Entry::CmpValue, col_step);
    fo.Close();
    num_buffered_row_ = buffered_rowset_.size();
    fo = utils::FileStream(utils::FopenCheck(col_meta_name_.c_str(), "wb"));
    this->SaveMeta(&fo);
    fo.Close();
  }
  inline void PushColPage(const SparsePage &prow,
                          const bst_uint *ridx,
                          const std::vector<bool> &enabled,
                          SparsePage *pcol,
                          utils::IStream *fo) {
    pcol->Clear();
    int nthread;
    #pragma omp parallel
    {
      nthread = omp_get_num_threads();
    }
    pcol->Clear();
    utils::ParallelGroupBuilder<SparseBatch::Entry>
        builder(&pcol->offset, &pcol->data);
    builder.InitBudget(info.num_col(), nthread);
    bst_omp_uint ndata = static_cast<bst_uint>(prow.Size());
    #pragma omp parallel for schedule(static)
    for (bst_omp_uint i = 0; i < ndata; ++i) {
      int tid = omp_get_thread_num();
      for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
        const SparseBatch::Entry &e = prow.data[j];
        if (enabled[e.index]) {
          builder.AddBudget(e.index, tid);
        }
      }
    }
    builder.InitStorage();
    #pragma omp parallel for schedule(static)
    for (bst_omp_uint i = 0; i < ndata; ++i) {
      int tid = omp_get_thread_num();
      for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
        const SparseBatch::Entry &e = prow.data[j];
        builder.Push(e.index,
                     SparseBatch::Entry(ridx[i], e.fvalue),
                     tid);
      }
    }
    utils::Assert(pcol->Size() == info.num_col(), "inconsistent col data");
    // sort columns
    bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
    #pragma omp parallel for schedule(dynamic, 1)
    for (bst_omp_uint i = 0; i < ncol; ++i) {
      if (pcol->offset[i] < pcol->offset[i + 1]) {
        std::sort(BeginPtr(pcol->data) + pcol->offset[i],
                  BeginPtr(pcol->data) + pcol->offset[i + 1], Entry::CmpValue);
      }
      col_size_[i] += pcol->offset[i + 1] - pcol->offset[i];
    }
    pcol->Save(fo);
  }

 private:
  /*! \brief page size 256 MB */
  static const size_t kPageSize = 256 << 20UL;
  // shared meta info with DMatrix
  const learner::MetaInfo &info;
  // row iterator
  utils::IIterator<RowBatch> *row_iter_;
  // column iterator
  ThreadColPageIterator *col_iter_;
  // file pointer to data
  utils::FileStream *fi_;
  // file name of column buffer
  std::string fname_cbuffer_;
  utils::IIterator<RowBatch> *iter_;
  /*! \brief column based data file name */
  std::string col_data_name_;
  /*! \brief column based meta file name */
  std::string col_meta_name_;
  /*! \brief list of row index that are buffered */
  std::vector<bst_uint> buffered_rowset_;
  // number of buffered rows
  size_t num_buffered_row_;
  // count for column data
  std::vector<size_t> col_size_;
  // internal column index for output
  std::vector<bst_uint> col_index_;
  // internal thread backed col iterator
  ThreadColPageIterator col_iter_;
};
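
// A usage sketch for the page-backed matrix above (not part of this commit);
// BuildColAccess is a hypothetical helper. The cache prefix yields the two
// companion files written by InitColData() and reloaded by TryLoadColData():
inline void BuildColAccess(FMatrixPage *fmat, const learner::MetaInfo &info) {
  fmat->set_cache_file("dtrain.cache");   // -> dtrain.cache.col.blob / .col.meta
  std::vector<bool> enabled(info.num_col(), true);
  fmat->InitColAccess(enabled, 1.0f);     // builds once, reloads on later runs
}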

class DMatrixColPage : public DMatrixPageBase<0xffffab03> {
 public:
  explicit DMatrixColPage(const char *fname) {
    fmat_ = new FMatrixPage(iter_, fname);
  }
  virtual ~DMatrixColPage(void) {
    delete fmat_;
  }
  virtual IFMatrix *fmat(void) const {
    return fmat_;
  }
  /*! \brief the real fmatrix */
  IFMatrix *fmat_;
};

} // namespace io
} // namespace xgboost
#endif  // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_

@@ -19,6 +19,7 @@
#include "./io.h"
#include "./simple_fmatrix-inl.hpp"
#include "../sync/sync.h"
#include "./libsvm_parser.h"

namespace xgboost {
namespace io {
@@ -72,7 +73,8 @@ class DMatrixSimple : public DataMatrix {
inline size_t AddRow(const std::vector<RowBatch::Entry> &feats) {
  for (size_t i = 0; i < feats.size(); ++i) {
    row_data_.push_back(feats[i]);
    info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].index+1));
    info.info.num_col = std::max(info.info.num_col,
                                 static_cast<size_t>(feats[i].index+1));
  }
  row_ptr_.push_back(row_ptr_.back() + feats.size());
  info.info.num_row += 1;
@@ -90,26 +92,35 @@ class DMatrixSimple : public DataMatrix {
  rank = rabit::GetRank();
  npart = rabit::GetWorldSize();
}
dmlc::InputSplit *in =
    dmlc::InputSplit::Create(uri, rank, npart);
LibSVMParser parser(
    dmlc::InputSplit::Create(uri, rank, npart, "text"), 16);
this->Clear();
std::string line;
while (in->ReadRecord(&line)) {
  float label;
  std::istringstream ss(line);
  std::vector<RowBatch::Entry> feats;
  ss >> label;
  while (!ss.eof()) {
    RowBatch::Entry e;
    if (!(ss >> e.index)) break;
    ss.ignore(32, ':');
    if (!(ss >> e.fvalue)) break;
    feats.push_back(e);
while (parser.Next()) {
  const LibSVMPage &batch = parser.Value();
  size_t nlabel = info.labels.size();
  info.labels.resize(nlabel + batch.label.size());
  if (batch.label.size() != 0) {
    std::memcpy(BeginPtr(info.labels) + nlabel,
                BeginPtr(batch.label),
                batch.label.size() * sizeof(float));
  }
  info.labels.push_back(label);
  this->AddRow(feats);
  size_t ndata = row_data_.size();
  row_data_.resize(ndata + batch.data.size());
  if (batch.data.size() != 0) {
    std::memcpy(BeginPtr(row_data_) + ndata,
                BeginPtr(batch.data),
                batch.data.size() * sizeof(RowBatch::Entry));
  }
  row_ptr_.resize(row_ptr_.size() + batch.label.size());
  for (size_t i = 0; i < batch.label.size(); ++i) {
    row_ptr_[nlabel + i + 1] = row_ptr_[nlabel] + batch.offset[i + 1];
  }
  info.info.num_row += batch.Size();
  for (size_t i = 0; i < batch.data.size(); ++i) {
    info.info.num_col = std::max(info.info.num_col,
                                 static_cast<size_t>(batch.data[i].index+1));
  }
}
delete in;
if (!silent) {
  utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n",
                static_cast<unsigned long>(info.num_row()),

@@ -9,13 +9,14 @@
#include "../utils/utils.h"
#include "../utils/random.h"
#include "../utils/omp.h"
#include "../utils/matrix_csr.h"
#include "../utils/group_data.h"

namespace xgboost {
namespace io {
/*!
 * \brief sparse matrix that support column access, CSC
 */
class FMatrixS : public IFMatrix{
class FMatrixS : public IFMatrix {
 public:
  typedef SparseBatch::Entry Entry;
  /*! \brief constructor */
@@ -147,21 +148,41 @@ class FMatrixS : public IFMatrix{
 * \param pkeep probability to keep a row
 */
inline void InitColData(float pkeep, const std::vector<bool> &enabled) {
  // clear rowset
  buffered_rowset_.clear();
  // note: this part of code is serial, todo, parallelize this transformer
  utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
  builder.InitBudget(0);
  // bit map
  int nthread;
  std::vector<bool> bmap;
  #pragma omp parallel
  {
    nthread = omp_get_num_threads();
  }
  // build the column matrix in parallel
  utils::ParallelGroupBuilder<RowBatch::Entry> builder(&col_ptr_, &col_data_);
  builder.InitBudget(0, nthread);
  // start working
  iter_->BeforeFirst();
  while (iter_->Next()) {
    const RowBatch &batch = iter_->Value();
    for (size_t i = 0; i < batch.size; ++i) {
    bmap.resize(bmap.size() + batch.size, true);
    long batch_size = static_cast<long>(batch.size);
    for (long i = 0; i < batch_size; ++i) {
      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
      if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
        buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
        buffered_rowset_.push_back(ridx);
      } else {
        bmap[i] = false;
      }
    }
    #pragma omp parallel for schedule(static)
    for (long i = 0; i < batch_size; ++i) {
      int tid = omp_get_thread_num();
      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
      if (bmap[ridx]) {
        RowBatch::Inst inst = batch[i];
        for (bst_uint j = 0; j < inst.length; ++j) {
          if (enabled[inst[j].index]){
            builder.AddBudget(inst[j].index);
            builder.AddBudget(inst[j].index, tid);
          }
        }
      }
@@ -170,19 +191,19 @@ class FMatrixS : public IFMatrix{
  builder.InitStorage();

  iter_->BeforeFirst();
  size_t ktop = 0;
  while (iter_->Next()) {
    const RowBatch &batch = iter_->Value();
    for (size_t i = 0; i < batch.size; ++i) {
      if (ktop < buffered_rowset_.size() &&
          buffered_rowset_[ktop] == batch.base_rowid+i) {
        ++ktop;
    #pragma omp parallel for schedule(static)
    for (long i = 0; i < static_cast<long>(batch.size); ++i) {
      int tid = omp_get_thread_num();
      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
      if (bmap[ridx]) {
        RowBatch::Inst inst = batch[i];
        for (bst_uint j = 0; j < inst.length; ++j) {
          if (enabled[inst[j].index]) {
            builder.PushElem(inst[j].index,
                             Entry((bst_uint)(batch.base_rowid+i),
                                   inst[j].fvalue));
            builder.Push(inst[j].index,
                         Entry((bst_uint)(batch.base_rowid+i),
                               inst[j].fvalue), tid);
          }
        }
      }
@@ -218,7 +239,7 @@ class FMatrixS : public IFMatrix{
inline void SetBatch(const std::vector<size_t> &ptr,
                     const std::vector<ColBatch::Entry> &data) {
  batch_.size = col_index_.size();
  col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL,0));
  col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
  for (size_t i = 0; i < col_data_.size(); ++i) {
    const bst_uint ridx = col_index_[i];
    col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx],

254
src/io/sparse_batch_page.h
Normal file
@@ -0,0 +1,254 @@
#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_
#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_
/*!
 * \file sparse_batch_page.h
 * \brief content holder of sparse batch that can be saved to disk;
 *   the representation can be effectively used in external memory computation
 * \author Tianqi Chen
 */
#include "../data.h"

namespace xgboost {
namespace io {
/*!
 * \brief storage unit of sparse batch
 */
class SparsePage {
 public:
  /*! \brief offset of the segments */
  std::vector<size_t> offset;
  /*! \brief the data of the segments */
  std::vector<SparseBatch::Entry> data;
  /*! \brief constructor */
  SparsePage() {
    this->Clear();
  }
  /*! \return number of instances in the page */
  inline size_t Size() const {
    return offset.size() - 1;
  }
  /*!
   * \brief load the page by providing a list of interested segments;
   * only the interested segments are loaded
   * \param fi the input stream of the file
   * \param sorted_index_set sorted index of segments we are interested in
   * \return true if the loading was successful, false if end of file was reached
   */
  inline bool Load(utils::ISeekStream *fi,
                   const std::vector<bst_uint> &sorted_index_set) {
    if (!fi->Read(&disk_offset_)) return false;
    // setup the offset
    offset.clear(); offset.push_back(0);
    for (size_t i = 0; i < sorted_index_set.size(); ++i) {
      bst_uint fid = sorted_index_set[i];
      utils::Check(fid + 1 < disk_offset_.size(), "bad col.blob format");
      size_t size = disk_offset_[fid + 1] - disk_offset_[fid];
      offset.push_back(offset.back() + size);
    }
    data.resize(offset.back());
    // read in the data
    size_t begin = fi->Tell();
    size_t curr_offset = 0;
    for (size_t i = 0; i < sorted_index_set.size();) {
      bst_uint fid = sorted_index_set[i];
      if (disk_offset_[fid] != curr_offset) {
        utils::Assert(disk_offset_[fid] > curr_offset, "fset index was not sorted");
        fi->Seek(begin + disk_offset_[fid] * sizeof(SparseBatch::Entry));
        curr_offset = disk_offset_[fid];
      }
      size_t j, size_to_read = 0;
      for (j = i; j < sorted_index_set.size(); ++j) {
        if (disk_offset_[sorted_index_set[j]] == disk_offset_[fid] + size_to_read) {
          size_to_read += offset[j + 1] - offset[j];
        } else {
          break;
        }
      }
      if (size_to_read != 0) {
        utils::Check(fi->Read(BeginPtr(data) + offset[i],
                              size_to_read * sizeof(SparseBatch::Entry)) != 0,
                     "Invalid SparsePage file");
        curr_offset += size_to_read;
      }
      i = j;
    }
    // seek to end of record
    if (curr_offset != disk_offset_.back()) {
      fi->Seek(begin + disk_offset_.back() * sizeof(SparseBatch::Entry));
    }
    return true;
  }
  /*!
   * \brief load all the segments
   * \param fi the input stream of the file
   * \return true if the loading was successful, false if end of file was reached
   */
  inline bool Load(utils::IStream *fi) {
    if (!fi->Read(&offset)) return false;
    utils::Check(offset.size() != 0, "Invalid SparsePage file");
    data.resize(offset.back());
    if (data.size() != 0) {
      utils::Check(fi->Read(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)) != 0,
                   "Invalid SparsePage file");
    }
    return true;
  }
  /*!
   * \brief save the data to fo; when a page is written
   * to disk it must contain all the elements in the page
   * \param fo output stream
   */
  inline void Save(utils::IStream *fo) const {
    utils::Assert(offset.size() != 0 && offset[0] == 0, "bad offset");
    utils::Assert(offset.back() == data.size(), "inconsistent SparsePage");
    fo->Write(offset);
    if (data.size() != 0) {
      fo->Write(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry));
    }
  }
  /*! \return estimation of memory cost of this page */
  inline size_t MemCostBytes(void) const {
    return offset.size() * sizeof(size_t) + data.size() * sizeof(SparseBatch::Entry);
  }
  /*! \brief clear the page */
  inline void Clear(void) {
    offset.clear();
    offset.push_back(0);
    data.clear();
  }
  /*!
   * \brief load all the segments and add them to the existing batch
   * \param fi the input stream of the file
   * \return true if the loading was successful, false if end of file was reached
   */
  inline bool PushLoad(utils::IStream *fi) {
    if (!fi->Read(&disk_offset_)) return false;
    data.resize(offset.back() + disk_offset_.back());
    if (disk_offset_.back() != 0) {
      utils::Check(fi->Read(BeginPtr(data) + offset.back(),
                            disk_offset_.back() * sizeof(SparseBatch::Entry)) != 0,
                   "Invalid SparsePage file");
    }
    size_t top = offset.back();
    size_t begin = offset.size();
    offset.resize(offset.size() + disk_offset_.size());
    for (size_t i = 0; i < disk_offset_.size(); ++i) {
      offset[i + begin] = top + disk_offset_[i];
    }
    return true;
  }
  /*!
   * \brief Push a row batch into the page
   * \param batch the row batch
   */
  inline void Push(const RowBatch &batch) {
    data.resize(offset.back() + batch.ind_ptr[batch.size]);
    std::memcpy(BeginPtr(data) + offset.back(),
                batch.data_ptr + batch.ind_ptr[0],
                sizeof(SparseBatch::Entry) * batch.ind_ptr[batch.size]);
    size_t top = offset.back();
    size_t begin = offset.size();
    offset.resize(offset.size() + batch.size);
    for (size_t i = 0; i < batch.size; ++i) {
      offset[i + begin] = top + batch.ind_ptr[i + 1] - batch.ind_ptr[0];
    }
  }
  /*!
   * \brief Push a sparse page
   * \param batch the row page
   */
  inline void Push(const SparsePage &batch) {
    size_t top = offset.back();
    data.resize(top + batch.data.size());
    std::memcpy(BeginPtr(data) + top,
                BeginPtr(batch.data),
                sizeof(SparseBatch::Entry) * batch.data.size());
    size_t begin = offset.size();
    offset.resize(begin + batch.Size());
    for (size_t i = 0; i < batch.Size(); ++i) {
      offset[i + begin] = top + batch.offset[i + 1];
    }
  }
  /*!
   * \brief Push one instance into the page
   * \param inst an instance row
   */
  inline void Push(const SparseBatch::Inst &inst) {
    offset.push_back(offset.back() + inst.length);
    size_t begin = data.size();
    data.resize(begin + inst.length);
    std::memcpy(BeginPtr(data) + begin, inst.data,
                sizeof(SparseBatch::Entry) * inst.length);
  }

 private:
  /*! \brief external memory column offset */
  std::vector<size_t> disk_offset_;
};
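
// A round-trip sketch for SparsePage (not part of this commit); PageRoundTrip
// is hypothetical, fo/fi stand for any utils::IStream / utils::ISeekStream
// implementation such as utils::FileStream:
inline void PageRoundTrip(const SparseBatch::Inst &inst,
                          utils::IStream *fo, utils::ISeekStream *fi) {
  SparsePage page;
  page.Push(inst);                       // append instances one by one
  page.Save(fo);                         // offset vector first, then raw entries
  SparsePage part;
  std::vector<bst_uint> want;
  want.push_back(3); want.push_back(7);  // must be sorted, as Load() asserts
  part.Load(fi, want);                   // reads only the two requested segments
}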
/*!
 * \brief factory class for SparsePage,
 * used in threadbuffer template
 */
class SparsePageFactory {
 public:
  SparsePageFactory(void)
      : action_load_all_(true), set_load_all_(true) {}
  inline void SetFile(const utils::FileStream &fi,
                      size_t file_begin = 0) {
    fi_ = fi;
    file_begin_ = file_begin;
  }
  inline const std::vector<bst_uint> &index_set(void) const {
    return action_index_set_;
  }
  // set index set; takes effect after the next call to BeforeFirst
  inline void SetIndexSet(const std::vector<bst_uint> &index_set,
                          bool load_all) {
    set_load_all_ = load_all;
    if (!set_load_all_) {
      set_index_set_ = index_set;
      std::sort(set_index_set_.begin(), set_index_set_.end());
    }
  }
  inline bool Init(void) {
    return true;
  }
  inline void SetParam(const char *name, const char *val) {}
  inline bool LoadNext(SparsePage *val) {
    if (!action_load_all_) {
      if (action_index_set_.size() == 0) {
        return false;
      } else {
        return val->Load(&fi_, action_index_set_);
      }
    } else {
      return val->Load(&fi_);
    }
  }
  inline SparsePage *Create(void) {
    return new SparsePage();
  }
  inline void FreeSpace(SparsePage *a) {
    delete a;
  }
  inline void Destroy(void) {
    fi_.Close();
  }
  inline void BeforeFirst(void) {
    fi_.Seek(file_begin_);
    action_load_all_ = set_load_all_;
    if (!set_load_all_) {
      action_index_set_ = set_index_set_;
    }
  }

 private:
  bool action_load_all_, set_load_all_;
  size_t file_begin_;
  utils::FileStream fi_;
  std::vector<bst_uint> action_index_set_;
  std::vector<bst_uint> set_index_set_;
};
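
// How the factory plugs into utils::ThreadBuffer (a sketch, not part of this
// commit); ScanPages is hypothetical, fp an open FILE* for the page file:
inline void ScanPages(FILE *fp) {
  utils::ThreadBuffer<SparsePage*, SparsePageFactory> itr;
  itr.SetParam("buffer_size", "2");                  // read one page ahead
  itr.get_factory().SetFile(utils::FileStream(fp));
  itr.Init();
  SparsePage *page;
  while (itr.Next(page)) {
    // consume *page; ThreadBuffer recycles pages via Create()/FreeSpace()
  }
}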
} // namespace io
} // namespace xgboost
#endif  // XGBOOST_IO_SPARSE_BATCH_PAGE_H_

@@ -130,7 +130,7 @@ struct EvalMClassBase : public IEvaluator {
const float wt = info.GetWeight(i);
int label = static_cast<int>(info.labels[i]);
if (label >= 0 && label < static_cast<int>(nclass)) {
  sum += Derived::EvalRow(info.labels[i],
  sum += Derived::EvalRow(label,
                          BeginPtr(preds) + i * nclass,
                          nclass) * wt;
  wsum += wt;

@@ -69,7 +69,7 @@ class BoostLearner : public rabit::Serializable {
utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
               static_cast<unsigned long>(buffer_size));
this->SetParam("num_pbuffer", str_temp);
this->pred_buffer_size = buffer_size;
}
/*!
 * \brief set parameters from outside
@@ -259,7 +259,12 @@ class BoostLearner : public rabit::Serializable {
int ncol = static_cast<int>(p_train->info.info.num_col);
std::vector<bool> enabled(ncol, true);
// initialize column access
p_train->fmat()->InitColAccess(enabled, prob_buffer_row);
const int kMagicSimple = 0xffffab01;
// check, if it is not DMatrix simple, then use hist maker
if (p_train->magic != kMagicSimple) {
  this->SetParam("updater", "grow_histmaker,prune");
}
}
/*!
 * \brief update the model for one iteration

@@ -7,6 +7,7 @@
 * \author Tianqi Chen
 */
#include "../../subtree/rabit/include/rabit.h"
#include "../../subtree/rabit/include/rabit/timer.h"
#endif  // XGBOOST_SYNC_H_

@@ -50,7 +50,7 @@ class BaseMaker: public IUpdater {
        fminmax[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax[fid * 2 + 1]);
      }
    }
  }
}
rabit::Allreduce<rabit::op::Max>(BeginPtr(fminmax), fminmax.size());
}
// get feature type, 0:empty 1:binary 2:real

@@ -366,7 +366,7 @@ class CQHistMaker: public HistMaker<TStats> {
  } else {
    feat2workindex[fset[i]] = -2;
  }
}
}
this->GetNodeStats(gpair, *p_fmat, tree, info,
                   &thread_stats, &node_stats);
sketchs.resize(this->qexpand.size() * freal_set.size());
@@ -578,7 +578,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector <bst_uint> &fset,
const RegTree &tree) {
// initialize the data structure
int nthread = BaseMaker::get_nthread();
sketchs.resize(this->qexpand.size() * tree.param.num_feature);

@@ -40,7 +40,7 @@ struct ParallelGroupBuilder {
 * \param nkeys number of keys in the matrix, can be smaller than expected
 * \param nthread number of thread that will be used in construction
 */
inline void InitBudget(size_t nkeys = 0, int nthread = 1) {
inline void InitBudget(size_t nkeys, int nthread) {
  thread_rptr.resize(nthread);
  for (size_t i = 0; i < thread_rptr.size(); ++i) {
    thread_rptr[i].resize(nkeys);
@@ -53,7 +53,7 @@ struct ParallelGroupBuilder {
 * \param threadid the id of thread that calls this function
 * \param nelem number of element budget add to this row
 */
inline void AddBudget(size_t key, int threadid = 0, SizeType nelem = 1) {
inline void AddBudget(size_t key, int threadid, SizeType nelem = 1) {
  std::vector<SizeType> &trptr = thread_rptr[threadid];
  if (trptr.size() < key + 1) {
    trptr.resize(key + 1, 0);
@@ -65,7 +65,7 @@ struct ParallelGroupBuilder {
// set rptr to correct size
for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
  if (rptr.size() <= thread_rptr[tid].size()) {
    rptr.resize(thread_rptr[tid].size()+1);
    rptr.resize(thread_rptr[tid].size() + 1);
  }
}
// initialize rptr to be beginning of each segment
@@ -90,7 +90,7 @@ struct ParallelGroupBuilder {
 * \param key the key of the element to push
 * \param threadid the id of thread that calls this function
 */
inline void Push(size_t key, ValueType value, int threadid = 0) {
inline void Push(size_t key, ValueType value, int threadid) {
  SizeType &rp = thread_rptr[threadid][key];
  data[rp++] = value;
}
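
// The builder's three-phase protocol, as now used throughout this commit
// (a condensed sketch; nkeys/nvals/key_of/val_of are placeholders):
//
//   utils::ParallelGroupBuilder<SparseBatch::Entry> builder(&rptr, &data);
//   builder.InitBudget(nkeys, nthread);             // phase 1: per-thread counters
//   #pragma omp parallel for schedule(static)
//   for (long i = 0; i < nvals; ++i) {
//     builder.AddBudget(key_of(i), omp_get_thread_num());        // count only
//   }
//   builder.InitStorage();                          // phase 2: prefix-sum into rptr
//   #pragma omp parallel for schedule(static)
//   for (long i = 0; i < nvals; ++i) {
//     builder.Push(key_of(i), val_of(i), omp_get_thread_num());  // phase 3: fill
//   }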

@@ -1,260 +0,0 @@
#ifndef XGBOOST_UTILS_MATRIX_CSR_H_
#define XGBOOST_UTILS_MATRIX_CSR_H_
/*!
 * \file matrix_csr.h
 * \brief this file defines some easy to use STL based class for in memory sparse CSR matrix
 * \author Tianqi Chen
 */
#include <vector>
#include <utility>
#include <algorithm>
#include "./io.h"
#include "./utils.h"
#include "./omp.h"

namespace xgboost {
namespace utils {
/*!
 * \brief a class used to help construct CSR format matrix,
 * can be used to convert row major CSR to column major CSR
 * \tparam IndexType type of index used to store the index position, usually unsigned or size_t
 * \tparam UseAcList whether to enable the usage of aclist; this option must be enabled manually
 */
template<typename IndexType, bool UseAcList = false, typename SizeType = size_t>
struct SparseCSRMBuilder {
 private:
  /*! \brief dummy variable used in the indicator matrix construction */
  std::vector<size_t> dummy_aclist;
  /*! \brief pointer to each of the row */
  std::vector<SizeType> &rptr;
  /*! \brief index of nonzero entries in each row */
  std::vector<IndexType> &findex;
  /*! \brief a list of active rows, used when many rows are empty */
  std::vector<size_t> &aclist;

 public:
  SparseCSRMBuilder(std::vector<SizeType> &p_rptr,
                    std::vector<IndexType> &p_findex)
      :rptr(p_rptr), findex(p_findex), aclist(dummy_aclist) {
    Assert(!UseAcList, "enabling bug");
  }
  /*! \brief use with caution! rptr must be cleaned before use */
  SparseCSRMBuilder(std::vector<SizeType> &p_rptr,
                    std::vector<IndexType> &p_findex,
                    std::vector<size_t> &p_aclist)
      :rptr(p_rptr), findex(p_findex), aclist(p_aclist) {
    Assert(UseAcList, "must manually enable the option use aclist");
  }

 public:
  /*!
   * \brief step 1: initialize the number of rows in the data, not necessarily exact
   * \param nrows number of rows in the matrix, can be smaller than expected
   */
  inline void InitBudget(size_t nrows = 0) {
    if (!UseAcList) {
      rptr.clear();
      rptr.resize(nrows + 1, 0);
    } else {
      Assert(nrows + 1 == rptr.size(), "rptr must be initialized already");
      this->Cleanup();
    }
  }
  /*!
   * \brief step 2: add budget to each row; this function is called when aclist is used
   * \param row_id the id of the row
   * \param nelem number of elements to add to this row's budget
   */
  inline void AddBudget(size_t row_id, SizeType nelem = 1) {
    if (rptr.size() < row_id + 2) {
      rptr.resize(row_id + 2, 0);
    }
    if (UseAcList) {
      if (rptr[row_id + 1] == 0) aclist.push_back(row_id);
    }
    rptr[row_id + 1] += nelem;
  }
  /*! \brief step 3: initialize the necessary storage */
  inline void InitStorage(void) {
    // initialize rptr to be beginning of each segment
    size_t start = 0;
    if (!UseAcList) {
      for (size_t i = 1; i < rptr.size(); i++) {
        size_t rlen = rptr[i];
        rptr[i] = start;
        start += rlen;
      }
    } else {
      // case with active list
      std::sort(aclist.begin(), aclist.end());
      for (size_t i = 0; i < aclist.size(); i++) {
        size_t ridx = aclist[i];
        size_t rlen = rptr[ridx + 1];
        rptr[ridx + 1] = start;
        // set previous rptr to right position if previous feature is not active
        if (i == 0 || ridx != aclist[i - 1] + 1) rptr[ridx] = start;
        start += rlen;
      }
    }
    findex.resize(start);
  }
  /*!
   * \brief step 4:
   * used in indicator matrix construction, add new
   * element to each row; the number of calls shall be exactly the same as AddBudget
   */
  inline void PushElem(size_t row_id, IndexType col_id) {
    SizeType &rp = rptr[row_id + 1];
    findex[rp++] = col_id;
  }
  /*!
   * \brief step 5: only needed when aclist is used,
   * clean up the rptr for next usage
   */
  inline void Cleanup(void) {
    Assert(UseAcList, "this function can only be called use AcList");
    for (size_t i = 0; i < aclist.size(); i++) {
      const size_t ridx = aclist[i];
      rptr[ridx] = 0; rptr[ridx + 1] = 0;
    }
    aclist.clear();
  }
};
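
// For reference, the removed builder followed the same count-then-fill pattern,
// single-threaded (a sketch; row_ptr/row_data are assumed to form the CSR matrix
// being transposed, with row_data holding column ids):
//
//   utils::SparseCSRMBuilder<unsigned> builder(col_ptr, col_index);
//   builder.InitBudget(num_cols);                       // step 1
//   for (size_t r = 0; r + 1 < row_ptr.size(); ++r)     // step 2: count per column
//     for (size_t j = row_ptr[r]; j < row_ptr[r + 1]; ++j)
//       builder.AddBudget(row_data[j]);
//   builder.InitStorage();                              // step 3
//   for (size_t r = 0; r + 1 < row_ptr.size(); ++r)     // step 4: scatter row ids
//     for (size_t j = row_ptr[r]; j < row_ptr[r + 1]; ++j)
//       builder.PushElem(row_data[j], static_cast<unsigned>(r));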

/*!
 * \brief a class used to help construct CSR format matrix file
 * \tparam IndexType type of index used to store the index position
 * \tparam SizeType type of size used in row pointer
 */
template<typename IndexType, typename SizeType = size_t>
struct SparseCSRFileBuilder {
 public:
  explicit SparseCSRFileBuilder(utils::ISeekStream *fo, size_t buffer_size)
      : fo(fo), buffer_size(buffer_size) {
  }
  /*!
   * \brief step 1: initialize the number of rows in the data, not necessarily exact
   * \param nrows number of rows in the matrix, can be smaller than expected
   */
  inline void InitBudget(size_t nrows = 0) {
    rptr.clear();
    rptr.resize(nrows + 1, 0);
  }
  /*!
   * \brief step 2: add budget to each row
   * \param row_id the id of the row
   * \param nelem number of elements to add to this row's budget
   */
  inline void AddBudget(size_t row_id, SizeType nelem = 1) {
    if (rptr.size() < row_id + 2) {
      rptr.resize(row_id + 2, 0);
    }
    rptr[row_id + 1] += nelem;
  }
  /*! \brief step 3: initialize the necessary storage */
  inline void InitStorage(void) {
    SizeType nelem = 0;
    for (size_t i = 1; i < rptr.size(); i++) {
      nelem += rptr[i];
      rptr[i] = nelem;
    }
    begin_data = static_cast<SizeType>(fo->Tell()) + sizeof(SizeType);
    SizeType begin_meta = begin_data + nelem * sizeof(IndexType);
    fo->Write(&begin_meta, sizeof(begin_meta));
    fo->Seek(begin_meta);
    fo->Write(rptr);
    // setup buffer space
    buffer_rptr.resize(rptr.size());
    buffer_temp.reserve(buffer_size);
    buffer_data.resize(buffer_size);
    saved_offset = rptr;
    saved_offset.resize(rptr.size() - 1);
    this->ClearBuffer();
  }
  /*! \brief step 4: push element into buffer */
  inline void PushElem(SizeType row_id, IndexType col_id) {
    if (buffer_temp.size() == buffer_size) {
      this->WriteBuffer();
      this->ClearBuffer();
    }
    buffer_rptr[row_id + 1] += 1;
    buffer_temp.push_back(std::make_pair(row_id, col_id));
  }
  /*! \brief finalize the construction */
  inline void Finalize(void) {
    this->WriteBuffer();
    for (size_t i = 0; i < saved_offset.size(); ++i) {
      utils::Assert(saved_offset[i] == rptr[i+1], "some block not write out");
    }
  }
  /*! \brief sort the rows in place; the underlying file must be opened in wb+ mode */
  template<typename Comparator>
  inline void SortRows(Comparator comp, size_t step) {
    for (size_t i = 0; i < rptr.size() - 1; i += step) {
      bst_omp_uint begin = static_cast<bst_omp_uint>(i);
      bst_omp_uint end = static_cast<bst_omp_uint>(std::min(rptr.size() - 1, i + step));
      if (rptr[end] != rptr[begin]) {
        fo->Seek(begin_data + rptr[begin] * sizeof(IndexType));
        buffer_data.resize(rptr[end] - rptr[begin]);
        fo->Read(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
        // do parallel sorting
        #pragma omp parallel for schedule(static)
        for (bst_omp_uint j = begin; j < end; ++j) {
          std::sort(&buffer_data[0] + rptr[j] - rptr[begin],
                    &buffer_data[0] + rptr[j+1] - rptr[begin],
                    comp);
        }
        fo->Seek(begin_data + rptr[begin] * sizeof(IndexType));
        fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
      }
    }
  }

 protected:
  inline void WriteBuffer(void) {
    SizeType start = 0;
    for (size_t i = 1; i < buffer_rptr.size(); ++i) {
      size_t rlen = buffer_rptr[i];
      buffer_rptr[i] = start;
      start += rlen;
    }
    for (size_t i = 0; i < buffer_temp.size(); ++i) {
      SizeType &rp = buffer_rptr[buffer_temp[i].first + 1];
      buffer_data[rp++] = buffer_temp[i].second;
    }
    // write out
    for (size_t i = 0; i < buffer_rptr.size() - 1; ++i) {
      size_t nelem = buffer_rptr[i+1] - buffer_rptr[i];
      if (nelem != 0) {
        utils::Assert(saved_offset[i] + nelem <= rptr[i+1], "data exceed bound");
        fo->Seek(saved_offset[i] * sizeof(IndexType) + begin_data);
        fo->Write(&buffer_data[0] + buffer_rptr[i], nelem * sizeof(IndexType));
        saved_offset[i] += nelem;
      }
    }
  }
  inline void ClearBuffer(void) {
    buffer_temp.clear();
    std::fill(buffer_rptr.begin(), buffer_rptr.end(), 0);
  }

 private:
  /*! \brief output file pointer for the data */
  utils::ISeekStream *fo;
  /*! \brief pointer to each of the row */
  std::vector<SizeType> rptr;
  /*! \brief saved top space of each item */
  std::vector<SizeType> saved_offset;
  /*! \brief beginning position of data */
  size_t begin_data;
  // ----- the following are buffer space
  /*! \brief maximum size of content buffer */
  size_t buffer_size;
  /*! \brief store the data content */
  std::vector< std::pair<SizeType, IndexType> > buffer_temp;
  /*! \brief buffer row pointer */
  std::vector<SizeType> buffer_rptr;
  /*! \brief buffer data content */
  std::vector<IndexType> buffer_data;
};
} // namespace utils
} // namespace xgboost
#endif  // XGBOOST_UTILS_MATRIX_CSR_H_

@@ -31,7 +31,7 @@ class ThreadBuffer {
}
/*! \brief set parameter, will also pass the parameter to factory */
inline void SetParam(const char *name, const char *val) {
  if (!strcmp(name, "buffer_size")) buf_size = atoi(val);
  if (!std::strcmp(name, "buffer_size")) buf_size = atoi(val);
  factory.SetParam(name, val);
}
/*!

@@ -174,5 +174,13 @@ inline const T *BeginPtr(const std::vector<T> &vec) {
    return &vec[0];
  }
}
inline char* BeginPtr(std::string &str) {
  if (str.length() == 0) return NULL;
  return &str[0];
}
inline const char* BeginPtr(const std::string &str) {
  if (str.length() == 0) return NULL;
  return &str[0];
}
} // namespace xgboost
#endif  // XGBOOST_UTILS_UTILS_H_
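
// Why the overloads matter (a sketch, not part of this commit): BeginPtr
// returns NULL for empty containers instead of touching element 0, so callers
// can take &v[0]-style pointers without undefined behavior:
//
//   std::string s;
//   char *p = xgboost::BeginPtr(s);        // NULL, no out-of-bounds access
//   std::vector<float> v;
//   const float *q = xgboost::BeginPtr(v); // same guard for vectors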

@@ -11,7 +11,6 @@
#include <istream>
#include <ostream>
#include <streambuf>
#include <cassert>

/*! \brief namespace for dmlc */
namespace dmlc {
@@ -100,32 +99,71 @@ class Serializable {
};

/*!
 * \brief input split header, used to create input split on input dataset;
 * this class can be used to obtain filesystem invariant splits from input files
 * \brief an input split that allows reading of records from a split of data,
 * an independent part that covers all the dataset
 *
 * see InputSplit::Create for definition of record
 */
class InputSplit {
 public:
  /*! \brief a blob of memory region */
  struct Blob {
    /*! \brief points to start of the memory region */
    void *dptr;
    /*! \brief size of the memory region */
    size_t size;
  };
  /*!
   * \brief read next record, store into out_data;
   * the content of the returned record depends on the input data format:
   * if input is text data, each line is returned as a record (\n not included)
   * if input is recordio, each record is returned
   * \param out_data the string that stores the line data, \n is not included
   * \return true if the next line was found, false if we read all the lines
   * \brief get the next record; the returned value
   * is valid until next call to NextRecord or NextChunk,
   * caller can modify the memory content of out_rec
   * \param out_rec used to store the result
   * \return true if we can successfully get next record,
   *         false if we reached end of split
   * \sa InputSplit::Create for definition of record
   */
  virtual bool ReadRecord(std::string *out_data) = 0;
  virtual bool NextRecord(Blob *out_rec) = 0;
  /*!
   * \brief get a chunk of memory that can contain multiple records;
   * the caller needs to parse the content of the resulting chunk,
   * for text file, out_chunk can contain data of multiple lines,
   * for recordio, out_chunk can contain data of multiple records
   *
   * This function ensures there won't be a partial record in the chunk;
   * caller can modify the memory content of out_chunk,
   * the memory is valid until next call to NextRecord or NextChunk
   *
   * Usually NextRecord is sufficient, NextChunk can be used by some
   * multi-threaded parsers to parse the input content
   *
   * \param out_chunk used to store the result
   * \return true if we can successfully get next record,
   *         false if we reached end of split
   * \sa InputSplit::Create for definition of record
   */
  virtual bool NextChunk(Blob *out_chunk) = 0;
  /*! \brief destructor */
  virtual ~InputSplit(void) {}
  /*!
   * \brief factory function:
   * create input split given a uri
   * \param uri the uri of the input, can contain hdfs prefix
   * \param part_index the part id of current input
   * \param num_parts total number of splits
   * \param type type of record
   *   List of possible types: "text", "recordio"
   *   - "text":
   *     text file, each line is treated as a record
   *     input split will split on \n or \r
   *   - "recordio":
   *     binary recordio file, see recordio.h
   * \sa InputSplit::Type
   */
  static InputSplit* Create(const char *uri,
                            unsigned part_index,
                            unsigned num_parts);
                            unsigned num_parts,
                            const char *type);
};
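
// A sketch of the new record-based reading loop (not part of this commit;
// ReadLines is hypothetical, uri/rank/npart as used by the DMatrixSimple
// loader above):
inline void ReadLines(const char *uri, unsigned rank, unsigned npart) {
  InputSplit *in = InputSplit::Create(uri, rank, npart, "text");
  InputSplit::Blob rec;
  while (in->NextRecord(&rec)) {
    // rec.dptr/rec.size hold one line without the trailing \n; the memory is
    // only valid until the next NextRecord/NextChunk call, so copy if needed
    std::string line(static_cast<char*>(rec.dptr), rec.size);
  }
  delete in;
}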

/*!
@@ -172,7 +210,7 @@ class ostream : public std::basic_ostream<char> {
 public:
  explicit OutBuf(size_t buffer_size)
      : stream_(NULL), buffer_(buffer_size) {
    assert(buffer_.size() > 0);
    if (buffer_size == 0) buffer_.resize(2);
  }
  // set stream to the buffer
  inline void set_stream(Stream *stream);
@@ -225,22 +263,32 @@ class istream : public std::basic_istream<char> {
    buf_.set_stream(stream);
    this->rdbuf(&buf_);
  }

  /*! \return how many bytes we read so far */
  inline size_t bytes_read(void) const {
    return buf_.bytes_read();
  }

 private:
  // internal streambuf
  class InBuf : public std::streambuf {
   public:
    explicit InBuf(size_t buffer_size)
        : stream_(NULL), buffer_(buffer_size) {
      assert(buffer_.size() > 0);
        : stream_(NULL), bytes_read_(0),
          buffer_(buffer_size) {
      if (buffer_size == 0) buffer_.resize(2);
    }
    // set stream to the buffer
    inline void set_stream(Stream *stream);

    // return how many bytes read so far
    inline size_t bytes_read(void) const {
      return bytes_read_;
    }
   private:
    /*! \brief internal stream by StreamBuf */
    Stream *stream_;
    /*! \brief internal buffer */
    /*! \brief how many bytes we read so far */
    size_t bytes_read_;
    /*! \brief internal buffer */
    std::vector<char> buffer_;
    // override underflow
    inline int_type underflow();
@@ -297,13 +345,13 @@ inline int ostream::OutBuf::sync(void) {
  if (stream_ == NULL) return -1;
  std::ptrdiff_t n = pptr() - pbase();
  stream_->Write(pbase(), n);
  this->pbump(-n);
  this->pbump(-static_cast<int>(n));
  return 0;
}
inline int ostream::OutBuf::overflow(int c) {
  *(this->pptr()) = c;
  std::ptrdiff_t n = pptr() - pbase();
  this->pbump(-n);
  this->pbump(-static_cast<int>(n));
  if (c == EOF) {
    stream_->Write(pbase(), n);
  } else {
@@ -322,6 +370,7 @@ inline int istream::InBuf::underflow() {
  if (this->gptr() == this->egptr()) {
    size_t sz = stream_->Read(bhead, buffer_.size());
    this->setg(bhead, bhead, bhead + sz);
    bytes_read_ += sz;
  }
  if (this->gptr() == this->egptr()) {
    return traits_type::eof();

@@ -18,7 +18,8 @@ namespace utils {
 * \brief return time in seconds, not cross platform, avoid to use this in most places
 */
inline double GetTime(void) {
  // TODO: use c++11 chrono when c++11 is available
#ifdef __MACH__
  clock_serv_t cclock;
  mach_timespec_t mts;
  host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
@@ -26,9 +27,14 @@ inline double GetTime(void) {
  mach_port_deallocate(mach_task_self(), cclock);
  return static_cast<double>(mts.tv_sec) + static_cast<double>(mts.tv_nsec) * 1e-9;
#else
#if defined(__unix__) || defined(__linux__)
  timespec ts;
  utils::Check(clock_gettime(CLOCK_REALTIME, &ts) == 0, "failed to get time");
  return static_cast<double>(ts.tv_sec) + static_cast<double>(ts.tv_nsec) * 1e-9;
#else
  // TODO: add MSVC macro, and MSVC timer
  return static_cast<double>(time(NULL));
#endif
#endif
}
} // namespace utils
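
// The TODO above can be retired once C++11 is allowed; a portable sketch
// (GetTimeChrono is a hypothetical name, not part of this commit). Since the
// callers in this commit measure elapsed time, a monotonic clock is the
// right choice:
//
//   #include <chrono>
//   inline double GetTimeChrono(void) {
//     using namespace std::chrono;
//     return duration_cast<duration<double> >(
//         steady_clock::now().time_since_epoch()).count();
//   }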

@@ -94,7 +94,8 @@ class DMatrix(object):
Parameters
----------
data : string/numpy array/scipy.sparse
    Data source, string type is the path of svmlight format txt file or xgb buffer.
    Data source, string type is the path of svmlight format txt file,
    xgb buffer or path to cache_file
label : list or numpy 1-D array (optional)
    Label of the training data.
missing : float

@@ -19,7 +19,7 @@ using namespace std;
#include "../src/learner/learner-inl.hpp"
#include "../src/io/io.h"
#include "../src/utils/utils.h"
#include "../src/utils/matrix_csr.h"
#include "../src/utils/group_data.h"
#include "../src/io/simple_dmatrix-inl.hpp"

using namespace xgboost;
@@ -139,20 +139,32 @@ extern "C"{
const float *data,
bst_ulong nindptr,
bst_ulong nelem) {
  int nthread;
  #pragma omp parallel
  {
    nthread = omp_get_num_threads();
  }

  DMatrixSimple *p_mat = new DMatrixSimple();
  DMatrixSimple &mat = *p_mat;
  utils::SparseCSRMBuilder<RowBatch::Entry, false> builder(mat.row_ptr_, mat.row_data_);
  builder.InitBudget();
  bst_ulong ncol = nindptr - 1;
  for (bst_ulong i = 0; i < ncol; ++i) {
  utils::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
  builder.InitBudget(0, nthread);
  long ncol = static_cast<long>(nindptr - 1);
  #pragma omp parallel for schedule(static)
  for (long i = 0; i < ncol; ++i) {
    int tid = omp_get_thread_num();
    for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
      builder.AddBudget(indices[j]);
      builder.AddBudget(indices[j], tid);
    }
  }
  builder.InitStorage();
  for (bst_ulong i = 0; i < ncol; ++i) {
  #pragma omp parallel for schedule(static)
  for (long i = 0; i < ncol; ++i) {
    int tid = omp_get_thread_num();
    for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
      builder.PushElem(indices[j], RowBatch::Entry(static_cast<bst_uint>(i), data[j]));
      builder.Push(indices[j],
                   RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
                   tid);
    }
  }
  mat.info.info.num_row = mat.row_ptr_.size() - 1;

@@ -19,6 +19,8 @@ extern "C" {
#endif
/*!
 * \brief load a data matrix
 * \param fname the name of the file
 * \param silent whether print messages during loading
 * \return a loaded data matrix
 */
XGB_DLL void* XGDMatrixCreateFromFile(const char *fname, int silent);