diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp
index 8bca0de5f..1ca232509 100644
--- a/R-package/src/xgboost_R.cpp
+++ b/R-package/src/xgboost_R.cpp
@@ -62,9 +62,9 @@ extern "C" {
     int ncol = length(indptr) - 1;
     int ndata = length(data);
     // transform into CSR format
-    std::vector<size_t> row_ptr;
+    std::vector<bst_ulong> row_ptr;
     std::vector< std::pair<unsigned, float> > csr_data;
-    utils::SparseCSRMBuilder< std::pair<unsigned,float> > builder(row_ptr, csr_data);
+    utils::SparseCSRMBuilder<std::pair<unsigned,float>, false, bst_ulong> builder(row_ptr, csr_data);
     builder.InitBudget();
     for (int i = 0; i < ncol; ++i) {
       for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
@@ -119,7 +119,7 @@ extern "C" {
     }
   }
   SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
-    size_t olen;
+    bst_ulong olen;
     const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
                                              CHAR(asChar(field)), &olen);
     SEXP ret = PROTECT(allocVector(REALSXP, olen));
@@ -188,7 +188,7 @@ extern "C" {
                                          &vec_dmats[0], &vec_sptr[0], len));
   }
   SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin) {
-    size_t olen;
+    bst_ulong olen;
     const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
                                         R_ExternalPtrAddr(dmat),
                                         asInteger(output_margin),
@@ -207,7 +207,7 @@ extern "C" {
     XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
   }
   void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
-    size_t olen;
+    bst_ulong olen;
     const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
                                           CHAR(asChar(fmap)),
                                           &olen);
diff --git a/demo/binary_classification/mushroom.conf b/demo/binary_classification/mushroom.conf
index d364905f7..d2566f132 100644
--- a/demo/binary_classification/mushroom.conf
+++ b/demo/binary_classification/mushroom.conf
@@ -6,13 +6,13 @@ objective = binary:logistic
 
 # Tree Booster Parameters
 # step size shrinkage
-bst:eta = 1.0 
+eta = 1.0 
 # minimum loss reduction required to make a further partition
-bst:gamma = 1.0 
+gamma = 1.0 
 # minimum sum of instance weight(hessian) needed in a child
-bst:min_child_weight = 1 
+min_child_weight = 1 
 # maximum depth of a tree
-bst:max_depth = 3 
+max_depth = 3 
 
 # Task Parameters
 # the number of round to do boosting
diff --git a/demo/kaggle-higgs/higgs-numpy.py b/demo/kaggle-higgs/higgs-numpy.py
index bd60f074f..1e7448a4c 100755
--- a/demo/kaggle-higgs/higgs-numpy.py
+++ b/demo/kaggle-higgs/higgs-numpy.py
@@ -42,8 +42,8 @@ param = {}
 param['objective'] = 'binary:logitraw'
 # scale weight of positive examples
 param['scale_pos_weight'] = sum_wneg/sum_wpos
-param['bst:eta'] = 0.1 
-param['bst:max_depth'] = 6
+param['eta'] = 0.1 
+param['max_depth'] = 6
 param['eval_metric'] = 'auc'
 param['silent'] = 1
 param['nthread'] = 16
diff --git a/demo/multiclass_classification/train.py b/demo/multiclass_classification/train.py
index 702542a4c..f387de7c0 100755
--- a/demo/multiclass_classification/train.py
+++ b/demo/multiclass_classification/train.py
@@ -25,8 +25,8 @@ param = {}
 # use softmax multi-class classification
 param['objective'] = 'multi:softmax'
 # scale weight of positive examples
-param['bst:eta'] = 0.1
-param['bst:max_depth'] = 6
+param['eta'] = 0.1
+param['max_depth'] = 6
 param['silent'] = 1
 param['nthread'] = 4
 param['num_class'] = 6
diff --git a/demo/rank/mq2008.conf b/demo/rank/mq2008.conf
index 90aadec4e..a19758bb7 100644
--- a/demo/rank/mq2008.conf
+++ b/demo/rank/mq2008.conf
@@ -5,13 +5,13 @@ objective="rank:pairwise"
 
 # Tree Booster Parameters
 # step size shrinkage
-bst:eta = 0.1 
+eta = 0.1 
 # minimum loss reduction required to make a further partition
-bst:gamma = 1.0 
+gamma = 1.0 
 # minimum sum of instance weight(hessian) needed in a child
-bst:min_child_weight = 0.1
+min_child_weight = 0.1
 # maximum depth of a tree
-bst:max_depth = 6
+max_depth = 6
 
 # Task parameters
 # the number of round to do boosting
diff --git a/demo/regression/machine.conf b/demo/regression/machine.conf
index f5a5163a8..8c677a502 100644
--- a/demo/regression/machine.conf
+++ b/demo/regression/machine.conf
@@ -7,13 +7,13 @@ objective = reg:linear
 
 # Tree Booster Parameters
 # step size shrinkage
-bst:eta = 1.0 
+eta = 1.0 
 # minimum loss reduction required to make a further partition
-bst:gamma = 1.0 
+gamma = 1.0 
 # minimum sum of instance weight(hessian) needed in a child
-bst:min_child_weight = 1 
+min_child_weight = 1 
 # maximum depth of a tree
-bst:max_depth = 3 
+max_depth = 3 
 
 # Task parameters
 # the number of round to do boosting
diff --git a/src/data.h b/src/data.h
index 1c9d9a290..4316885b1 100644
--- a/src/data.h
+++ b/src/data.h
@@ -12,6 +12,7 @@
 #include <cstring>
 #include <algorithm>
 #include "utils/io.h"
+#include "utils/omp.h"
 #include "utils/utils.h"
 #include "utils/iterator.h"
 #include "utils/random.h"
@@ -370,9 +371,9 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
     }
 
     // sort columns
-    unsigned ncol = static_cast<unsigned>(this->NumCol());
+    bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
     #pragma omp parallel for schedule(static)
-    for (unsigned i = 0; i < ncol; ++i) {
+    for (bst_omp_uint i = 0; i < ncol; ++i) {
       std::sort(&col_data_[0] + col_ptr_[i],
                 &col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
     }
diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp
index 4f9bd0707..9a7e3d8b6 100644
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear-inl.hpp
@@ -51,9 +51,9 @@ class GBLinear : public IGradBooster<FMatrix> {
     // for all the output group
     for (int gid = 0; gid < ngroup; ++gid) {
       double sum_grad = 0.0, sum_hess = 0.0;
-      const unsigned ndata = static_cast<unsigned>(rowset.size());
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
       #pragma omp parallel for schedule(static) reduction(+: sum_grad, sum_hess)
-      for (unsigned i = 0; i < ndata; ++i) {
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
         bst_gpair &p = gpair[rowset[i] * ngroup + gid];
         if (p.hess >= 0.0f) {
           sum_grad += p.grad; sum_hess += p.hess;
@@ -65,7 +65,7 @@ class GBLinear : public IGradBooster<FMatrix> {
       model.bias()[gid] += dw;
       // update grad value
       #pragma omp parallel for schedule(static)
-      for (unsigned i = 0; i < ndata; ++i) {
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
         bst_gpair &p = gpair[rowset[i] * ngroup + gid];
         if (p.hess >= 0.0f) {
           p.grad += p.hess * dw;
@@ -73,9 +73,9 @@ class GBLinear : public IGradBooster<FMatrix> {
       }
     }
     // number of features
-    const unsigned nfeat = static_cast<unsigned>(feat_index.size());
+    const bst_omp_uint nfeat = static_cast<bst_omp_uint>(feat_index.size());
     #pragma omp parallel for schedule(static)
-    for (unsigned i = 0; i < nfeat; ++i) {
+    for (bst_omp_uint i = 0; i < nfeat; ++i) {
       const bst_uint fid = feat_index[i];
       for (int gid = 0; gid < ngroup; ++gid) {
         double sum_grad = 0.0, sum_hess = 0.0;
@@ -117,9 +117,9 @@ class GBLinear : public IGradBooster<FMatrix> {
       // k is number of group
       preds.resize(preds.size() + batch.size * ngroup);
       // parallel over local batch
-      const unsigned nsize = static_cast<unsigned>(batch.size);
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
       #pragma omp parallel for schedule(static)
-      for (unsigned i = 0; i < nsize; ++i) {
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
         const size_t ridx = batch.base_rowid + i;
         // loop over output groups
         for (int gid = 0; gid < ngroup; ++gid) {
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index bd3adac08..1f34aa55b 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -94,8 +94,9 @@ class GBTree : public IGradBooster<FMatrix> {
                    "must have exactly ngroup*nrow gpairs");
       std::vector<bst_gpair> tmp(gpair.size()/ngroup);
       for (int gid = 0; gid < ngroup; ++gid) {
+        bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
         #pragma omp parallel for schedule(static)
-        for (size_t i = 0; i < tmp.size(); ++i) {
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
           tmp[i] = gpair[i * ngroup + gid];
         }
         this->BoostNewTrees(tmp, fmat, info, gid);
@@ -125,9 +126,9 @@ class GBTree : public IGradBooster<FMatrix> {
     while (iter->Next()) {
       const SparseBatch &batch = iter->Value();
       // parallel over local batch
-      const unsigned nsize = static_cast<unsigned>(batch.size);
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
       #pragma omp parallel for schedule(static)
-      for (unsigned i = 0; i < nsize; ++i) {
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
         const int tid = omp_get_thread_num();
         tree::RegTree::FVec &feats = thread_temp[tid];
         int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp
index 32e7f7443..3e05880ec 100644
--- a/src/learner/evaluation-inl.hpp
+++ b/src/learner/evaluation-inl.hpp
@@ -27,10 +27,12 @@ struct EvalEWiseBase : public IEvaluator {
     utils::Check(info.labels.size() != 0, "label set cannot be empty");
     utils::Check(preds.size() % info.labels.size() == 0,
                  "label and prediction size not match");
-    const unsigned ndata = static_cast<unsigned>(info.labels.size());
+
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
+
     float sum = 0.0, wsum = 0.0;
     #pragma omp parallel for reduction(+: sum, wsum) schedule(static)
-    for (unsigned i = 0; i < ndata; ++i) {
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
       const float wt = info.GetWeight(i);
       sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
       wsum += wt;
@@ -149,12 +151,13 @@ struct EvalAMS : public IEvaluator {
   }
   virtual float Eval(const std::vector<float> &preds,
                      const MetaInfo &info) const {
-    const unsigned ndata = static_cast<unsigned>(info.labels.size());
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
+
     utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
     std::vector< std::pair<float, unsigned> > rec(ndata);
 
     #pragma omp parallel for schedule(static)
-    for (unsigned i = 0; i < ndata; ++i) {
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
       rec[i] = std::make_pair(preds[i], i);
     }
     std::sort(rec.begin(), rec.end(), CmpFirst);
@@ -163,7 +166,7 @@ struct EvalAMS : public IEvaluator {
     const double br = 10.0;
     unsigned thresindex = 0;
     double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
-    for (unsigned i = 0; i < ndata-1 && i < ntop; ++i) {
+    for (unsigned i = 0; i < static_cast<unsigned>(ndata-1) && i < ntop; ++i) {
       const unsigned ridx = rec[i].second;
       const float wt = info.weights[ridx];
       if (info.labels[ridx] > 0.5f) {
@@ -257,7 +260,7 @@ struct EvalAuc : public IEvaluator {
     const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
     utils::Check(gptr.back() == info.labels.size(),
                  "EvalAuc: group structure must match number of prediction");
-    const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
+    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
     // sum statictis
     double sum_auc = 0.0f;
     #pragma omp parallel reduction(+:sum_auc)
@@ -265,7 +268,7 @@ struct EvalAuc : public IEvaluator {
       // each thread takes a local rec
       std::vector< std::pair<float, unsigned> > rec;
       #pragma omp for schedule(static)
-      for (unsigned k = 0; k < ngroup; ++k) {
+      for (bst_omp_uint k = 0; k < ngroup; ++k) {
         rec.clear();
         for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
           rec.push_back(std::make_pair(preds[j], j));
@@ -315,7 +318,7 @@ struct EvalRankList : public IEvaluator {
     utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
     utils::Assert(gptr.back() == preds.size(),
                    "EvalRanklist: group structure must match number of prediction");
-    const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
+    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
     // sum statistics
     double sum_metric = 0.0f;
     #pragma omp parallel reduction(+:sum_metric)
@@ -323,7 +326,7 @@ struct EvalRankList : public IEvaluator {
       // each thread takes a local rec
       std::vector< std::pair<float, unsigned> > rec;
       #pragma omp for schedule(static)
-      for (unsigned k = 0; k < ngroup; ++k) {
+      for (bst_omp_uint k = 0; k < ngroup; ++k) {
         rec.clear();
         for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
           rec.push_back(std::make_pair(preds[j], static_cast<int>(info.labels[j])));
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 7bf8c33ac..387d1a57b 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -79,6 +79,11 @@ class BoostLearner {
    * \param val  value of the parameter
    */
   inline void SetParam(const char *name, const char *val) {
+    // the "bst:" prefix is now optional: a name given without it is also applied as "bst:<name>"
+    if (strncmp(name, "bst:", 4) != 0) {
+      std::string n = "bst:"; n += name;
+      this->SetParam(n.c_str(), val);
+    }
     if (!strcmp(name, "silent")) silent = atoi(val);
     if (!strcmp(name, "prob_buffer_row")) prob_buffer_row = static_cast<float>(atof(val));
     if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
@@ -91,7 +96,7 @@ class BoostLearner {
       if (!strcmp(name, "objective")) name_obj_ = val;
       if (!strcmp(name, "booster")) name_gbm_ = val;
       mparam.SetParam(name, val);
-    }
+    }    
     if (gbm_ != NULL) gbm_->SetParam(name, val);
     if (obj_ != NULL) obj_->SetParam(name, val);
     if (gbm_ == NULL || obj_ == NULL) {
@@ -248,17 +253,17 @@ class BoostLearner {
                   data.info.info, out_preds);
     // add base margin
     std::vector<float> &preds = *out_preds;
-    const unsigned ndata = static_cast<unsigned>(preds.size());
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
     if (data.info.base_margin.size() != 0) {
       utils::Check(preds.size() == data.info.base_margin.size(),
                    "base_margin.size does not match with prediction size");
       #pragma omp parallel for schedule(static)
-      for (unsigned j = 0; j < ndata; ++j) {
+      for (bst_omp_uint j = 0; j < ndata; ++j) {
         preds[j] += data.info.base_margin[j];
       }
     } else {
       #pragma omp parallel for schedule(static)
-      for (unsigned j = 0; j < ndata; ++j) {
+      for (bst_omp_uint j = 0; j < ndata; ++j) {
         preds[j] += mparam.base_score;
       }
     }
diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp
index 4b5b4f014..9e338a6b2 100644
--- a/src/learner/objective-inl.hpp
+++ b/src/learner/objective-inl.hpp
@@ -116,9 +116,9 @@ class RegLossObj : public IObjFunction{
     gpair.resize(preds.size());
     // start calculating gradient
     const unsigned nstep = static_cast<unsigned>(info.labels.size());
-    const unsigned ndata = static_cast<unsigned>(preds.size());
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
     #pragma omp parallel for schedule(static)
-    for (unsigned i = 0; i < ndata; ++i) {
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
       const unsigned j = i % nstep;
       float p = loss.PredTransform(preds[i]);
       float w = info.GetWeight(j);
@@ -132,9 +132,9 @@ class RegLossObj : public IObjFunction{
   }
   virtual void PredTransform(std::vector<float> *io_preds) {
     std::vector<float> &preds = *io_preds;
-    const unsigned ndata = static_cast<unsigned>(preds.size());
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
     #pragma omp parallel for schedule(static)
-    for (unsigned j = 0; j < ndata; ++j) {
+    for (bst_omp_uint j = 0; j < ndata; ++j) {
       preds[j] = loss.PredTransform(preds[j]);
     }
   }
@@ -169,12 +169,12 @@ class SoftmaxMultiClassObj : public IObjFunction {
     std::vector<bst_gpair> &gpair = *out_gpair;
     gpair.resize(preds.size());
     const unsigned nstep = static_cast<unsigned>(info.labels.size() * nclass);
-    const unsigned ndata = static_cast<unsigned>(preds.size() / nclass);
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size() / nclass);
     #pragma omp parallel
     {
       std::vector<float> rec(nclass);
       #pragma omp for schedule(static)
-      for (unsigned i = 0; i < ndata; ++i) {
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
         for (int k = 0; k < nclass; ++k) {
           rec[k] = preds[i * nclass + k];
         }
@@ -210,13 +210,13 @@ class SoftmaxMultiClassObj : public IObjFunction {
     utils::Check(nclass != 0, "must set num_class to use softmax");
     std::vector<float> &preds = *io_preds;
     std::vector<float> tmp;
-    const unsigned ndata = static_cast<unsigned>(preds.size()/nclass);
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size()/nclass);
     if (prob == 0) tmp.resize(ndata);
     #pragma omp parallel
     {
       std::vector<float> rec(nclass);
       #pragma omp for schedule(static)
-      for (unsigned j = 0; j < ndata; ++j) {
+      for (bst_omp_uint j = 0; j < ndata; ++j) {
         for (int k = 0; k < nclass; ++k) {
           rec[k] = preds[j * nclass + k];
         }
@@ -263,7 +263,7 @@ class LambdaRankObj : public IObjFunction {
     const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
     utils::Check(gptr.size() != 0 && gptr.back() == info.labels.size(),
                  "group structure not consistent with #rows");
-    const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
+    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
     #pragma omp parallel
     {
       // parall construct, declare random number generator here, so that each
@@ -273,7 +273,7 @@ class LambdaRankObj : public IObjFunction {
       std::vector<ListEntry>  lst;
       std::vector< std::pair<float, unsigned> > rec;
       #pragma omp for schedule(static)
-      for (unsigned k = 0; k < ngroup; ++k) {
+      for (bst_omp_uint k = 0; k < ngroup; ++k) {
         lst.clear(); pairs.clear();
         for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
           lst.push_back(ListEntry(preds[j], info.labels[j], j));
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index e26f2ada4..e2c0d43e1 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -186,9 +186,9 @@ class ColMaker: public IUpdater<FMatrix> {
       }
       const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
       // setup position
-      const unsigned ndata = static_cast<unsigned>(rowset.size());
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
       #pragma omp parallel for schedule(static)
-      for (unsigned i = 0; i < ndata; ++i) {
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
         const bst_uint ridx = rowset[i];
         const int tid = omp_get_thread_num();
         if (position[ridx] < 0) continue;
@@ -286,12 +286,12 @@ class ColMaker: public IUpdater<FMatrix> {
         feat_set.resize(n);
       }
       // start enumeration
-      const unsigned nsize = static_cast<unsigned>(feat_set.size());
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(feat_set.size());
       #if defined(_OPENMP)
       const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
       #endif
       #pragma omp parallel for schedule(dynamic, batch_size)
-      for (unsigned i = 0; i < nsize; ++i) {
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
         const unsigned fid = feat_set[i];
         const int tid = omp_get_thread_num();
         if (param.need_forward_search(fmat.GetColDensity(fid))) {
@@ -321,9 +321,9 @@ class ColMaker: public IUpdater<FMatrix> {
     inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) {
       const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
       // step 1, set default direct nodes to default, and leaf nodes to -1
-      const unsigned ndata = static_cast<unsigned>(rowset.size());
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
       #pragma omp parallel for schedule(static)
-      for (unsigned i = 0; i < ndata; ++i) {
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
         const bst_uint ridx = rowset[i];
         const int nid = position[ridx];
         if (nid >= 0) {
@@ -344,9 +344,9 @@ class ColMaker: public IUpdater<FMatrix> {
       std::sort(fsplits.begin(), fsplits.end());
       fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
       // start put things into right place
-      const unsigned nfeats = static_cast<unsigned>(fsplits.size());
+      const bst_omp_uint nfeats = static_cast<bst_omp_uint>(fsplits.size());
       #pragma omp parallel for schedule(dynamic, 1)
-      for (unsigned i = 0; i < nfeats; ++i) {
+      for (bst_omp_uint i = 0; i < nfeats; ++i) {
         const unsigned fid = fsplits[i];
         for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
           const bst_uint ridx = it.rindex();
diff --git a/src/tree/updater_refresh-inl.hpp b/src/tree/updater_refresh-inl.hpp
index ff6cf14b0..299f8414a 100644
--- a/src/tree/updater_refresh-inl.hpp
+++ b/src/tree/updater_refresh-inl.hpp
@@ -56,9 +56,9 @@ class TreeRefresher: public IUpdater<FMatrix> {
       const SparseBatch &batch = iter->Value();
       utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
                    "too large batch size ");
-      const unsigned nbatch = static_cast<unsigned>(batch.size);
+      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
       #pragma omp parallel for schedule(static)
-      for (unsigned i = 0; i < nbatch; ++i) {
+      for (bst_omp_uint i = 0; i < nbatch; ++i) {
         SparseBatch::Inst inst = batch[i];
         const int tid = omp_get_thread_num();
         const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
diff --git a/src/utils/matrix_csr.h b/src/utils/matrix_csr.h
index 31022553b..0f3b20a14 100644
--- a/src/utils/matrix_csr.h
+++ b/src/utils/matrix_csr.h
@@ -17,26 +17,26 @@ namespace utils {
  * \tparam IndexType type of index used to store the index position, usually unsigned or size_t
  * \tparam whether enabling the usage of aclist, this option must be enabled manually
  */
-template<typename IndexType, bool UseAcList = false>
+template<typename IndexType, bool UseAcList = false, typename SizeType = size_t>
 struct SparseCSRMBuilder {
  private:
   /*! \brief dummy variable used in the indicator matrix construction */
   std::vector<size_t> dummy_aclist;
   /*! \brief pointer to each of the row */
-  std::vector<size_t> &rptr;
+  std::vector<SizeType> &rptr;
   /*! \brief index of nonzero entries in each row */
   std::vector<IndexType> &findex;
   /*! \brief a list of active rows, used when many rows are empty */
   std::vector<size_t> &aclist;
 
  public:
-  SparseCSRMBuilder(std::vector<size_t> &p_rptr,
+  SparseCSRMBuilder(std::vector<SizeType> &p_rptr,
                     std::vector<IndexType> &p_findex)
       :rptr(p_rptr), findex(p_findex), aclist(dummy_aclist) {
     Assert(!UseAcList, "enabling bug");
   }
   /*! \brief use with caution! rptr must be cleaned before use */
-  SparseCSRMBuilder(std::vector<size_t> &p_rptr,
+  SparseCSRMBuilder(std::vector<SizeType> &p_rptr,
                     std::vector<IndexType> &p_findex,
                     std::vector<size_t> &p_aclist)
       :rptr(p_rptr), findex(p_findex), aclist(p_aclist) {
@@ -62,7 +62,7 @@ struct SparseCSRMBuilder {
    * \param row_id the id of the row
    * \param nelem  number of element budget add to this row
    */
-  inline void AddBudget(size_t row_id, size_t nelem = 1) {
+  inline void AddBudget(size_t row_id, SizeType nelem = 1) {
     if (rptr.size() < row_id + 2) {
       rptr.resize(row_id + 2, 0);
     }
@@ -101,7 +101,7 @@ struct SparseCSRMBuilder {
    * element to each row, the number of calls shall be exactly same as add_budget
    */
   inline void PushElem(size_t row_id, IndexType col_id) {
-    size_t &rp = rptr[row_id + 1];
+    SizeType &rp = rptr[row_id + 1];
     findex[rp++] = col_id;
   }
   /*!
diff --git a/src/utils/omp.h b/src/utils/omp.h
index 8d6531526..0380ebd67 100644
--- a/src/utils/omp.h
+++ b/src/utils/omp.h
@@ -21,4 +21,14 @@ inline int omp_get_thread_num() { return 0; }
 inline int omp_get_num_threads() { return 1; }
 inline void omp_set_num_threads(int nthread) {}
 #endif
+
+// bst_omp_uint: integer type for the index variable of OpenMP parallel-for loops
+namespace xgboost {
+#ifdef _MSC_VER
+typedef int bst_omp_uint;  // NOTE(review): presumably signed because MSVC's OpenMP rejects unsigned loop indices - confirm
+#else
+typedef unsigned bst_omp_uint;  // all other compilers use an unsigned index
+#endif
+}  // namespace xgboost
+
 #endif  // XGBOOST_UTILS_OMP_H_
diff --git a/windows/xgboost/xgboost.vcxproj b/windows/xgboost/xgboost.vcxproj
index 064dd6ee6..8b88dbf2e 100644
--- a/windows/xgboost/xgboost.vcxproj
+++ b/windows/xgboost/xgboost.vcxproj
@@ -99,6 +99,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
+      <OpenMPSupport>true</OpenMPSupport>
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
diff --git a/wrapper/python-example/demo.py b/wrapper/python-example/demo.py
index 52d565456..687b491a4 100755
--- a/wrapper/python-example/demo.py
+++ b/wrapper/python-example/demo.py
@@ -13,7 +13,7 @@ dtrain = xgb.DMatrix('agaricus.txt.train')
 dtest = xgb.DMatrix('agaricus.txt.test')
 
 # specify parameters via map, definition are same as c++ version
-param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
 
 # specify validations set to watch performance
 evallist  = [(dtest,'eval'), (dtrain,'train')]
@@ -75,7 +75,7 @@ print ('start running example to used cutomized objective function')
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
 # you must know what you are doing
-param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1 }
+param = {'max_depth':2, 'eta':1, 'silent':1 }
 
 # user define objective function, given prediction, return gradient and second order gradient
 # this is loglikelihood loss
@@ -107,7 +107,7 @@ bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror)
 #
 print ('start running example to start from a initial prediction')
 # specify parameters via map, definition are same as c++ version
-param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
 # train xgboost for 1 round
 bst = xgb.train( param, dtrain, 1, evallist )
 # Note: we need the margin value instead of transformed prediction in set_base_margin
diff --git a/wrapper/xgboost_R.cpp b/wrapper/xgboost_R.cpp
index 76a1f2840..4be565d1a 100644
--- a/wrapper/xgboost_R.cpp
+++ b/wrapper/xgboost_R.cpp
@@ -62,9 +62,9 @@ extern "C" {
     int ncol = length(indptr) - 1;
     int ndata = length(data);
     // transform into CSR format
-    std::vector<size_t> row_ptr;
+    std::vector<bst_ulong> row_ptr;
     std::vector< std::pair<unsigned, float> > csr_data;
-    utils::SparseCSRMBuilder< std::pair<unsigned,float> > builder(row_ptr, csr_data);
+    utils::SparseCSRMBuilder<std::pair<unsigned,float>, false, bst_ulong> builder(row_ptr, csr_data);
     builder.InitBudget();
     for (int i = 0; i < ncol; ++i) {
       for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
@@ -119,7 +119,7 @@ extern "C" {
     }
   }
   SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
-    uint64_t olen;
+    bst_ulong olen;
     const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
                                              CHAR(asChar(field)), &olen);
     SEXP ret = PROTECT(allocVector(REALSXP, olen));
@@ -188,7 +188,7 @@ extern "C" {
                                          &vec_dmats[0], &vec_sptr[0], len));
   }
   SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin) {
-    uint64_t olen;
+    bst_ulong olen;
     const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
                                         R_ExternalPtrAddr(dmat),
                                         asInteger(output_margin),
@@ -207,13 +207,13 @@ extern "C" {
     XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
   }
   void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
-    uint64_t olen;
+    bst_ulong olen;
     const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
                                           CHAR(asChar(fmap)),
                                           &olen);
     FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
     for (size_t i = 0; i < olen; ++i) {
-      fprintf(fo, "booster[%lu]:\n", i);
+      fprintf(fo, "booster[%u]:\n", static_cast<unsigned>(i));
       fprintf(fo, "%s", res[i]);
     }
     fclose(fo);
diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
index bff900361..ccbb9ffea 100644
--- a/wrapper/xgboost_wrapper.cpp
+++ b/wrapper/xgboost_wrapper.cpp
@@ -23,18 +23,18 @@ class Booster: public learner::BoostLearner<FMatrixS> {
     this->init_model = false;
     this->SetCacheData(mats);
   }
-  const float *Pred(const DataMatrix &dmat, int output_margin, uint64_t *len) {
+  const float *Pred(const DataMatrix &dmat, int output_margin, bst_ulong *len) {
     this->CheckInitModel();
     this->Predict(dmat, output_margin, &this->preds_);
     *len = this->preds_.size();
     return &this->preds_[0];
   }
   inline void BoostOneIter(const DataMatrix &train,
-                           float *grad, float *hess, uint64_t len) {
+                           float *grad, float *hess, bst_ulong len) {
     this->gpair_.resize(len);
-    const unsigned ndata = static_cast<unsigned>(len);
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(len);
     #pragma omp parallel for schedule(static)
-    for (unsigned j = 0; j < ndata; ++j) {
+    for (bst_omp_uint j = 0; j < ndata; ++j) {
       gpair_[j] = bst_gpair(grad[j], hess[j]);
     }
     gbm_->DoBoost(train.fmat, train.info.info, &gpair_);
@@ -48,7 +48,7 @@ class Booster: public learner::BoostLearner<FMatrixS> {
     learner::BoostLearner<FMatrixS>::LoadModel(fname);
     this->init_model = true;
   }
-  inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, uint64_t *len) {
+  inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, bst_ulong *len) {
     model_dump = this->DumpModel(fmap, with_stats);
     model_dump_cptr.resize(model_dump.size());
     for (size_t i = 0; i < model_dump.size(); ++i) {
@@ -76,19 +76,19 @@ extern "C"{
   void* XGDMatrixCreateFromFile(const char *fname, int silent) {
     return LoadDataMatrix(fname, silent, false);
   }
-  void* XGDMatrixCreateFromCSR(const uint64_t *indptr,
+  void* XGDMatrixCreateFromCSR(const bst_ulong *indptr,
                                const unsigned *indices,
                                const float *data,
-                               uint64_t nindptr,
-                               uint64_t nelem) {
+                               bst_ulong nindptr,
+                               bst_ulong nelem) {
     DMatrixSimple *p_mat = new DMatrixSimple();
     DMatrixSimple &mat = *p_mat;
     mat.row_ptr_.resize(nindptr);
-    for (uint64_t i = 0; i < nindptr; ++ i) {
+    for (bst_ulong i = 0; i < nindptr; ++i) {
       mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
     }
     mat.row_data_.resize(nelem);
-    for (uint64_t i = 0; i < nelem; ++i) {
+    for (bst_ulong i = 0; i < nelem; ++i) {
       mat.row_data_[i] = SparseBatch::Entry(indices[i], data[i]);
       mat.info.info.num_col = std::max(mat.info.info.num_col,
                                        static_cast<uint64_t>(indices[i]+1));
@@ -97,16 +97,16 @@ extern "C"{
     return p_mat;
   }
   void* XGDMatrixCreateFromMat(const float *data,
-                               uint64_t nrow,
-                               uint64_t ncol,
+                               bst_ulong nrow,
+                               bst_ulong ncol,
                                float  missing) {
     DMatrixSimple *p_mat = new DMatrixSimple();
     DMatrixSimple &mat = *p_mat;
     mat.info.info.num_row = nrow;
     mat.info.info.num_col = ncol;
-    for (uint64_t i = 0; i < nrow; ++i, data += ncol) {
-      uint64_t nelem = 0;
-      for (uint64_t j = 0; j < ncol; ++j) {
+    for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
+      bst_ulong nelem = 0;
+      for (bst_ulong j = 0; j < ncol; ++j) {
         if (data[j] != missing) {
           mat.row_data_.push_back(SparseBatch::Entry(j, data[j]));
           ++nelem;
@@ -118,7 +118,7 @@ extern "C"{
   }
   void* XGDMatrixSliceDMatrix(void *handle,
                               const int *idxset,
-                              uint64_t len) {
+                              bst_ulong len) {
     DMatrixSimple tmp;
     DataMatrix &dsrc = *static_cast<DataMatrix*>(handle);
     if (dsrc.magic != DMatrixSimple::kMagic) {
@@ -139,10 +139,10 @@ extern "C"{
     iter->BeforeFirst();
     utils::Assert(iter->Next(), "slice");
     const SparseBatch &batch = iter->Value();
-    for (uint64_t i = 0; i < len; ++i) {
+    for (bst_ulong i = 0; i < len; ++i) {
       const int ridx = idxset[i];
       SparseBatch::Inst inst = batch[ridx];
-      utils::Check(static_cast<uint64_t>(ridx) < batch.size, "slice index exceed number of rows");
+      utils::Check(static_cast<bst_ulong>(ridx) < batch.size, "slice index exceed number of rows");
       ret.row_data_.resize(ret.row_data_.size() + inst.length);
       memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
              sizeof(SparseBatch::Entry) * inst.length);
@@ -168,13 +168,13 @@ extern "C"{
   void XGDMatrixSaveBinary(void *handle, const char *fname, int silent) {
     SaveDataMatrix(*static_cast<DataMatrix*>(handle), fname, silent);
   }
-  void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *info, uint64_t len) {
+  void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *info, bst_ulong len) {
     std::vector<float> &vec = 
         static_cast<DataMatrix*>(handle)->info.GetFloatInfo(field);
     vec.resize(len);
     memcpy(&vec[0], info, sizeof(float) * len);
   }
-  void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *info, uint64_t len) {
+  void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *info, bst_ulong len) {
     std::vector<unsigned> &vec =
         static_cast<DataMatrix*>(handle)->info.GetUIntInfo(field);
     vec.resize(len);
@@ -194,20 +194,20 @@ extern "C"{
     *len = vec.size();
     return &vec[0];
   }
-  const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, uint64_t* len) {
+  const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* len) {
     const std::vector<unsigned> &vec =
         static_cast<const DataMatrix*>(handle)->info.GetUIntInfo(field);
     *len = vec.size();
     return &vec[0];
   }
-  uint64_t XGDMatrixNumRow(const void *handle) {
+  bst_ulong XGDMatrixNumRow(const void *handle) {
     return static_cast<const DataMatrix*>(handle)->info.num_row();
   }
 
   // xgboost implementation
-  void *XGBoosterCreate(void *dmats[], uint64_t len) {
+  void *XGBoosterCreate(void *dmats[], bst_ulong len) {
     std::vector<DataMatrix*> mats;
-    for (uint64_t i = 0; i < len; ++i) {
+    for (bst_ulong i = 0; i < len; ++i) {
       DataMatrix *dtr = static_cast<DataMatrix*>(dmats[i]);
       mats.push_back(dtr);
     }
@@ -227,7 +227,7 @@ extern "C"{
     bst->UpdateOneIter(iter, *dtr);
   }
   void XGBoosterBoostOneIter(void *handle, void *dtrain,
-                             float *grad, float *hess, uint64_t len) {
+                             float *grad, float *hess, bst_ulong len) {
     Booster *bst = static_cast<Booster*>(handle);
     DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
     bst->CheckInitModel();
@@ -235,11 +235,11 @@ extern "C"{
     bst->BoostOneIter(*dtr, grad, hess, len);
   }
   const char* XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
-                                   const char *evnames[], uint64_t len) {
+                                   const char *evnames[], bst_ulong len) {
     Booster *bst = static_cast<Booster*>(handle);
     std::vector<std::string> names;
     std::vector<const DataMatrix*> mats;
-    for (uint64_t i = 0; i < len; ++i) {
+    for (bst_ulong i = 0; i < len; ++i) {
       mats.push_back(static_cast<DataMatrix*>(dmats[i]));
       names.push_back(std::string(evnames[i]));
     }
@@ -247,7 +247,7 @@ extern "C"{
     bst->eval_str = bst->EvalOneIter(iter, mats, names);
     return bst->eval_str.c_str();
   }
-  const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, uint64_t *len) {
+  const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, bst_ulong *len) {
     return static_cast<Booster*>(handle)->Pred(*static_cast<DataMatrix*>(dmat), output_margin, len);
   }
   void XGBoosterLoadModel(void *handle, const char *fname) {
@@ -256,7 +256,7 @@ extern "C"{
   void XGBoosterSaveModel(const void *handle, const char *fname) {
     static_cast<const Booster*>(handle)->SaveModel(fname);
   }
-  const char** XGBoosterDumpModel(void *handle, const char *fmap, uint64_t *len){
+  const char** XGBoosterDumpModel(void *handle, const char *fmap, bst_ulong *len){
     utils::FeatMap featmap;
     if (strlen(fmap) != 0) {
       featmap.LoadText(fmap);
diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h
index 994f7ed15..d6d968ec5 100644
--- a/wrapper/xgboost_wrapper.h
+++ b/wrapper/xgboost_wrapper.h
@@ -7,15 +7,17 @@
  *  can be used to create wrapper of other languages
  */
 #include <cstdio>
-// define uint64_t
-typedef unsigned long uint64_t;
+#define XGB_DLL
+// manually define unsigned long
+typedef unsigned long bst_ulong;
+
 
 extern "C" {
   /*!
    * \brief load a data matrix 
    * \return a loaded data matrix
    */
-  void* XGDMatrixCreateFromFile(const char *fname, int silent);
+  XGB_DLL void* XGDMatrixCreateFromFile(const char *fname, int silent);
   /*! 
    * \brief create a matrix content from csr format
    * \param indptr pointer to row headers
@@ -25,11 +27,11 @@ extern "C" {
    * \param nelem number of nonzero elements in the matrix
    * \return created dmatrix
    */
-  void* XGDMatrixCreateFromCSR(const uint64_t *indptr,
-                               const unsigned *indices,
-                               const float *data,
-                               uint64_t nindptr,
-                               uint64_t nelem);
+  XGB_DLL void* XGDMatrixCreateFromCSR(const bst_ulong *indptr,
+                                       const unsigned *indices,
+                                       const float *data,
+                                       bst_ulong nindptr,
+                                       bst_ulong nelem);
   /*!
    * \brief create matrix content from dense matrix
    * \param data pointer to the data space
@@ -38,10 +40,10 @@ extern "C" {
    * \param missing which value to represent missing value
    * \return created dmatrix
    */
-  void* XGDMatrixCreateFromMat(const float *data,
-                               uint64_t nrow,
-                               uint64_t ncol,
-                               float  missing);
+  XGB_DLL void* XGDMatrixCreateFromMat(const float *data,
+                                       bst_ulong nrow,
+                                       bst_ulong ncol,
+                                       float  missing);
   /*!
    * \brief create a new dmatrix from sliced content of existing matrix
    * \param handle instance of data matrix to be sliced
@@ -49,20 +51,20 @@ extern "C" {
    * \param len length of index set
    * \return a sliced new matrix
    */
-  void* XGDMatrixSliceDMatrix(void *handle,
-                              const int *idxset,
-                              uint64_t len);
+  XGB_DLL void* XGDMatrixSliceDMatrix(void *handle,
+                                      const int *idxset,
+                                      bst_ulong len);
   /*!
    * \brief free space in data matrix
    */
-  void XGDMatrixFree(void *handle);
+  XGB_DLL void XGDMatrixFree(void *handle);
   /*!
    * \brief load a data matrix into binary file
    * \param handle a instance of data matrix
    * \param fname file name
    * \param silent print statistics when saving
    */
-  void XGDMatrixSaveBinary(void *handle, const char *fname, int silent);
+  XGB_DLL void XGDMatrixSaveBinary(void *handle, const char *fname, int silent);
   /*!
    * \brief set float vector to a content in info
    * \param handle a instance of data matrix
@@ -70,7 +72,7 @@ extern "C" {
    * \param array pointer to float vector
    * \param len length of array
    */
-  void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *array, uint64_t len);
+  XGB_DLL void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *array, bst_ulong len);
   /*!
    * \brief set uint32 vector to a content in info
    * \param handle a instance of data matrix
@@ -78,14 +80,14 @@ extern "C" {
    * \param array pointer to float vector
    * \param len length of array
    */
-  void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *array, uint64_t len);
+  XGB_DLL void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *array, bst_ulong len);
   /*!
    * \brief set label of the training matrix
    * \param handle a instance of data matrix
    * \param group pointer to group size
    * \param len length of array
    */
-  void XGDMatrixSetGroup(void *handle, const unsigned *group, uint64_t len);
+  XGB_DLL void XGDMatrixSetGroup(void *handle, const unsigned *group, bst_ulong len);
   /*!
    * \brief get float info vector from matrix
    * \param handle a instance of data matrix
@@ -93,7 +95,7 @@ extern "C" {
    * \param out_len used to set result length
    * \return pointer to the result
    */
-  const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, uint64_t* out_len);
+  XGB_DLL const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, bst_ulong* out_len);
   /*!
    * \brief get uint32 info vector from matrix
    * \param handle a instance of data matrix
@@ -101,37 +103,37 @@ extern "C" {
    * \param out_len used to set result length
    * \return pointer to the result
    */
-  const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, uint64_t* out_len);
+  XGB_DLL const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* out_len);
   /*!
    * \brief return number of rows
    */
-  uint64_t XGDMatrixNumRow(const void *handle);
+  XGB_DLL bst_ulong XGDMatrixNumRow(const void *handle);
   // --- start XGBoost class
   /*! 
    * \brief create xgboost learner 
    * \param dmats matrices that are set to be cached
    * \param len length of dmats
    */
-  void *XGBoosterCreate(void* dmats[], uint64_t len);
+  XGB_DLL void *XGBoosterCreate(void* dmats[], bst_ulong len);
   /*! 
    * \brief free obj in handle 
    * \param handle handle to be freed
    */
-  void XGBoosterFree(void* handle);
+  XGB_DLL void XGBoosterFree(void* handle);
   /*! 
    * \brief set parameters 
    * \param handle handle
    * \param name  parameter name
    * \param val value of parameter
    */    
-  void XGBoosterSetParam(void *handle, const char *name, const char *value);
+  XGB_DLL void XGBoosterSetParam(void *handle, const char *name, const char *value);
   /*! 
    * \brief update the model in one round using dtrain
    * \param handle handle
    * \param iter current iteration rounds
    * \param dtrain training data
    */
-  void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain);
+  XGB_DLL void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain);
   /*!
    * \brief update the model, by directly specify gradient and second order gradient,
    *        this can be used to replace UpdateOneIter, to support customized loss function
@@ -141,8 +143,8 @@ extern "C" {
    * \param hess second order gradient statistics
    * \param len length of grad/hess array
    */
-  void XGBoosterBoostOneIter(void *handle, void *dtrain,
-                             float *grad, float *hess, uint64_t len);
+  XGB_DLL void XGBoosterBoostOneIter(void *handle, void *dtrain,
+                                     float *grad, float *hess, bst_ulong len);
   /*!
    * \brief get evaluation statistics for xgboost
    * \param handle handle
@@ -152,8 +154,8 @@ extern "C" {
    * \param len length of dmats
    * \return the string containing evaluation stati
    */
-  const char *XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
-                                   const char *evnames[], uint64_t len);
+  XGB_DLL const char *XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
+                                           const char *evnames[], bst_ulong len);
   /*!
    * \brief make prediction based on dmat
    * \param handle handle
@@ -161,19 +163,19 @@ extern "C" {
    * \param output_margin whether only output raw margin value
    * \param len used to store length of returning result
    */
-  const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, uint64_t *len);
+  XGB_DLL const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, bst_ulong *len);
   /*!
    * \brief load model from existing file
    * \param handle handle
    * \param fname file name
    */
-  void XGBoosterLoadModel(void *handle, const char *fname);
+  XGB_DLL void XGBoosterLoadModel(void *handle, const char *fname);
   /*!
    * \brief save model into existing file
    * \param handle handle
    * \param fname file name
    */
-  void XGBoosterSaveModel(const void *handle, const char *fname);
+  XGB_DLL void XGBoosterSaveModel(const void *handle, const char *fname);
   /*!
    * \brief dump model, return array of strings representing model dump
    * \param handle handle
@@ -181,7 +183,7 @@ extern "C" {
    * \param out_len length of output array
    * \return char *data[], representing dump of each model
    */
-  const char **XGBoosterDumpModel(void *handle, const char *fmap,
-                                  uint64_t *out_len);
+  XGB_DLL const char **XGBoosterDumpModel(void *handle, const char *fmap,
+                                          bst_ulong *out_len);
 };
 #endif  // XGBOOST_WRAPPER_H_