diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h
index 674f46109..867fee6a9 100644
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -68,12 +68,9 @@ class GradientBooster {
    * \param obj The objective function, optional, can be nullptr when use customized version
    * the booster may change content of gpair
    */
-  virtual void DoBoost(DMatrix* p_fmat,
-                       std::vector<bst_gpair>* in_gpair,
-                       ObjFunction* obj = nullptr) = 0;
   virtual void DoBoost(DMatrix* p_fmat,
                        HostDeviceVector<bst_gpair>* in_gpair,
-                       ObjFunction* obj = nullptr);
+                       ObjFunction* obj = nullptr) = 0;
 
   /*!
    * \brief generate predictions for given feature matrix
@@ -82,12 +79,9 @@ class GradientBooster {
    * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
    *    we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
    */
-  virtual void PredictBatch(DMatrix* dmat,
-                            std::vector<bst_float>* out_preds,
-                            unsigned ntree_limit = 0) = 0;
   virtual void PredictBatch(DMatrix* dmat,
                             HostDeviceVector<bst_float>* out_preds,
-                            unsigned ntree_limit = 0);
+                            unsigned ntree_limit = 0) = 0;
   /*!
    * \brief online prediction function, predict score for one instance at a time
    *  NOTE: use the batch prediction interface if possible, batch prediction is usually
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 995a590c9..3981940d2 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -84,7 +84,7 @@ class Learner : public rabit::Serializable {
    */
   virtual void BoostOneIter(int iter,
                             DMatrix* train,
-                            std::vector<bst_gpair>* in_gpair) = 0;
+                            HostDeviceVector<bst_gpair>* in_gpair) = 0;
   /*!
    * \brief evaluate the model for specific iteration using the configured metrics.
    * \param iter iteration number
@@ -109,7 +109,7 @@ class Learner : public rabit::Serializable {
    */
   virtual void Predict(DMatrix* data,
                        bool output_margin,
-                       std::vector<bst_float> *out_preds,
+                       HostDeviceVector<bst_float> *out_preds,
                        unsigned ntree_limit = 0,
                        bool pred_leaf = false,
                        bool pred_contribs = false,
@@ -169,7 +169,7 @@ class Learner : public rabit::Serializable {
    */
   inline void Predict(const SparseBatch::Inst &inst,
                       bool output_margin,
-                      std::vector<bst_float> *out_preds,
+                      HostDeviceVector<bst_float> *out_preds,
                       unsigned ntree_limit = 0) const;
   /*!
    * \brief Create a new instance of learner.
   */
@@ -192,9 +192,9 @@ class Learner : public rabit::Serializable {
 // implementation of inline functions.
 inline void Learner::Predict(const SparseBatch::Inst& inst,
                              bool output_margin,
-                             std::vector<bst_float>* out_preds,
+                             HostDeviceVector<bst_float>* out_preds,
                              unsigned ntree_limit) const {
-  gbm_->PredictInstance(inst, out_preds, ntree_limit);
+  gbm_->PredictInstance(inst, &out_preds->data_h(), ntree_limit);
   if (!output_margin) {
     obj_->PredTransform(out_preds);
   }
diff --git a/include/xgboost/objective.h b/include/xgboost/objective.h
index 3f26db891..63e4c4d14 100644
--- a/include/xgboost/objective.h
+++ b/include/xgboost/objective.h
@@ -44,14 +44,10 @@ class ObjFunction {
    * \param iteration current iteration number.
    * \param out_gpair output of get gradient, saves gradient and second order gradient in
    */
-  virtual void GetGradient(const std::vector<bst_float>& preds,
-                           const MetaInfo& info,
-                           int iteration,
-                           std::vector<bst_gpair>* out_gpair) = 0;
   virtual void GetGradient(HostDeviceVector<bst_float>* preds,
                            const MetaInfo& info,
                            int iteration,
-                           HostDeviceVector<bst_gpair>* out_gpair);
+                           HostDeviceVector<bst_gpair>* out_gpair) = 0;
 
   /*! \return the default evaluation metric for the objective */
   virtual const char* DefaultEvalMetric() const = 0;
@@ -60,17 +56,13 @@ class ObjFunction {
   /*!
    * \brief transform prediction values, this is only called when Prediction is called
    * \param io_preds prediction values, saves to this vector as well
    */
-  virtual void PredTransform(std::vector<bst_float> *io_preds) {}
-  virtual void PredTransform(HostDeviceVector<bst_float> *io_preds);
+  virtual void PredTransform(HostDeviceVector<bst_float> *io_preds) {}
   /*!
    * \brief transform prediction values, this is only called when Eval is called,
    *  usually it redirect to PredTransform
    * \param io_preds prediction values, saves to this vector as well
    */
-  virtual void EvalTransform(std::vector<bst_float> *io_preds) {
-    this->PredTransform(io_preds);
-  }
   virtual void EvalTransform(HostDeviceVector<bst_float> *io_preds) {
     this->PredTransform(io_preds);
   }
diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h
index 2c408740e..c8abd4b69 100644
--- a/include/xgboost/predictor.h
+++ b/include/xgboost/predictor.h
@@ -63,22 +63,6 @@ class Predictor {
    *                          limit trees.
    */
 
-  virtual void PredictBatch(DMatrix* dmat, std::vector<bst_float>* out_preds,
-                            const gbm::GBTreeModel& model, int tree_begin,
-                            unsigned ntree_limit = 0) = 0;
-
-  /**
-   * \brief Generate batch predictions for a given feature matrix. May use
-   *        cached predictions if available instead of calculating from scratch.
-   *
-   * \param [in,out]  dmat        Feature matrix.
-   * \param [in,out]  out_preds   The output preds.
-   * \param           model       The model to predict from.
-   * \param           tree_begin  The tree begin index.
-   * \param           ntree_limit (Optional) The ntree limit. 0 means do not
-   *                          limit trees.
-   */
-
   virtual void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                             const gbm::GBTreeModel& model, int tree_begin,
                             unsigned ntree_limit = 0) = 0;
@@ -186,41 +170,14 @@ class Predictor {
   static Predictor* Create(std::string name);
 
  protected:
-  /**
-   * \fn  bool PredictFromCache(DMatrix* dmat, std::vector<bst_float>*
-   *      out_preds, const gbm::GBTreeModel& model, unsigned ntree_limit = 0)
-   *
-   * \brief Attempt to predict from cache.
-   *
-   * \return  True if it succeeds, false if it fails.
-   */
-  bool PredictFromCache(DMatrix* dmat, std::vector<bst_float>* out_preds,
-                        const gbm::GBTreeModel& model,
-                        unsigned ntree_limit = 0);
-
-  /**
-   * \fn  void Predictor::InitOutPredictions(const MetaInfo& info,
-   *      std::vector<bst_float>* out_preds, const gbm::GBTreeModel& model) const;
-   *
-   * \brief Init out predictions according to base margin.
-   *
-   * \param           info      Dmatrix info possibly containing base margin.
-   * \param [in,out]  out_preds The out preds.
-   * \param           model     The model.
-   */
-  void InitOutPredictions(const MetaInfo& info,
-                          std::vector<bst_float>* out_preds,
-                          const gbm::GBTreeModel& model) const;
-
   /**
    * \struct  PredictionCacheEntry
    *
    * \brief Contains pointer to input matrix and associated cached predictions.
    */
-
   struct PredictionCacheEntry {
     std::shared_ptr<DMatrix> data;
-    std::vector<bst_float> predictions;
+    HostDeviceVector<bst_float> predictions;
   };
 
   /**
diff --git a/include/xgboost/tree_updater.h b/include/xgboost/tree_updater.h
index 8dbfa6cae..07e44a64e 100644
--- a/include/xgboost/tree_updater.h
+++ b/include/xgboost/tree_updater.h
@@ -40,12 +40,9 @@ class TreeUpdater {
    *  but maybe different random seeds, usually one tree is passed in at a time,
    *  there can be multiple trees when we train random forest style model
    */
-  virtual void Update(const std::vector<bst_gpair>& gpair,
-                      DMatrix* data,
-                      const std::vector<RegTree*>& trees) = 0;
   virtual void Update(HostDeviceVector<bst_gpair>* gpair,
                       DMatrix* data,
-                      const std::vector<RegTree*>& trees);
+                      const std::vector<RegTree*>& trees) = 0;
 
   /*!
    * \brief determines whether updater has enough knowledge about a given dataset
@@ -58,11 +55,9 @@ class TreeUpdater {
    *          updated by the time this function returns.
    */
   virtual bool UpdatePredictionCache(const DMatrix* data,
-                                     std::vector<bst_float>* out_preds) {
+                                     HostDeviceVector<bst_float>* out_preds) {
     return false;
   }
-  virtual bool UpdatePredictionCache(const DMatrix* data,
-                                     HostDeviceVector<bst_float>* out_preds);
 
   /*!
    * \brief Create a tree updater given name
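Note (not part of the patch): with the `std::vector` overload removed, every updater must now implement the `HostDeviceVector` signature; a CPU-only updater can keep its existing logic by pulling the host view once via `data_h()`. A minimal sketch under that assumption — `NoopUpdater` is a hypothetical name, not code from this diff:

```cpp
// Hypothetical illustration of the new pure-virtual TreeUpdater::Update
// signature. data_h() lazily syncs device data back to the host if needed.
#include <utility>
#include <vector>
#include <xgboost/tree_updater.h>

namespace xgboost {

class NoopUpdater : public TreeUpdater {
 public:
  void Init(const std::vector<std::pair<std::string, std::string>>& args) override {}

  void Update(HostDeviceVector<bst_gpair>* gpair,
              DMatrix* data,
              const std::vector<RegTree*>& trees) override {
    std::vector<bst_gpair>& gpair_h = gpair->data_h();  // host view
    (void)gpair_h;  // a real updater would grow `trees` using these gradients
  }
};

}  // namespace xgboost
```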
diff --git a/plugin/example/custom_obj.cc b/plugin/example/custom_obj.cc
index 3b7eefa4f..5446ea9b1 100644
--- a/plugin/example/custom_obj.cc
+++ b/plugin/example/custom_obj.cc
@@ -33,30 +33,32 @@ class MyLogistic : public ObjFunction {
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
     param_.InitAllowUnknown(args);
   }
-  void GetGradient(const std::vector<bst_float> &preds,
+  void GetGradient(HostDeviceVector<bst_float> *preds,
                    const MetaInfo &info,
                    int iter,
-                   std::vector<bst_gpair> *out_gpair) override {
-    out_gpair->resize(preds.size());
-    for (size_t i = 0; i < preds.size(); ++i) {
+                   HostDeviceVector<bst_gpair> *out_gpair) override {
+    out_gpair->resize(preds->size());
+    std::vector<bst_float>& preds_h = preds->data_h();
+    std::vector<bst_gpair>& out_gpair_h = out_gpair->data_h();
+    for (size_t i = 0; i < preds_h.size(); ++i) {
       bst_float w = info.GetWeight(i);
       // scale the negative examples!
       if (info.labels[i] == 0.0f) w *= param_.scale_neg_weight;
       // logistic transformation
-      bst_float p = 1.0f / (1.0f + std::exp(-preds[i]));
+      bst_float p = 1.0f / (1.0f + std::exp(-preds_h[i]));
       // this is the gradient
       bst_float grad = (p - info.labels[i]) * w;
       // this is the second order gradient
       bst_float hess = p * (1.0f - p) * w;
-      out_gpair->at(i) = bst_gpair(grad, hess);
+      out_gpair_h.at(i) = bst_gpair(grad, hess);
     }
   }
   const char* DefaultEvalMetric() const override {
     return "error";
   }
-  void PredTransform(std::vector<bst_float> *io_preds) override {
+  void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
     // transform margin value to probability.
-    std::vector<bst_float> &preds = *io_preds;
+    std::vector<bst_float> &preds = io_preds->data_h();
     for (size_t i = 0; i < preds.size(); ++i) {
       preds[i] = 1.0f / (1.0f + std::exp(-preds[i]));
     }
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 796b38f64..f9150156d 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -191,9 +191,9 @@ struct XGBAPIThreadLocalEntry {
   /*! \brief result holder for returning string pointers */
   std::vector<const char *> ret_vec_charp;
   /*! \brief returning float vector. */
-  std::vector<bst_float> ret_vec_float;
+  HostDeviceVector<bst_float> ret_vec_float;
   /*! \brief temp variable of gradient pairs. */
-  std::vector<bst_gpair> tmp_gpair;
+  HostDeviceVector<bst_gpair> tmp_gpair;
 };
 // define the threadlocal store.
@@ -705,14 +705,15 @@ XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle,
                                   bst_float *grad,
                                   bst_float *hess,
                                   xgboost::bst_ulong len) {
-  std::vector<bst_gpair>& tmp_gpair = XGBAPIThreadLocalStore::Get()->tmp_gpair;
+  HostDeviceVector<bst_gpair>& tmp_gpair = XGBAPIThreadLocalStore::Get()->tmp_gpair;
   API_BEGIN();
   Booster* bst = static_cast<Booster*>(handle);
   std::shared_ptr<DMatrix>* dtr =
       static_cast<std::shared_ptr<DMatrix>*>(dtrain);
   tmp_gpair.resize(len);
+  std::vector<bst_gpair>& tmp_gpair_h = tmp_gpair.data_h();
   for (xgboost::bst_ulong i = 0; i < len; ++i) {
-    tmp_gpair[i] = bst_gpair(grad[i], hess[i]);
+    tmp_gpair_h[i] = bst_gpair(grad[i], hess[i]);
   }
 
   bst->LazyInit();
@@ -749,7 +750,8 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
                              unsigned ntree_limit,
                              xgboost::bst_ulong *len,
                              const bst_float **out_result) {
-  std::vector<bst_float>& preds = XGBAPIThreadLocalStore::Get()->ret_vec_float;
+  HostDeviceVector<bst_float>& preds =
+      XGBAPIThreadLocalStore::Get()->ret_vec_float;
   API_BEGIN();
   Booster *bst = static_cast<Booster*>(handle);
   bst->LazyInit();
@@ -761,7 +763,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
     (option_mask & 4) != 0,
     (option_mask & 8) != 0,
     (option_mask & 16) != 0);
-  *out_result = dmlc::BeginPtr(preds);
+  *out_result = dmlc::BeginPtr(preds.data_h());
   *len = static_cast<xgboost::bst_ulong>(preds.size());
   API_END();
 }
diff --git a/src/cli_main.cc b/src/cli_main.cc
index 5c301b626..59eafc581 100644
--- a/src/cli_main.cc
+++ b/src/cli_main.cc
@@ -324,7 +324,7 @@ void CLIPredict(const CLIParam& param) {
   if (param.silent == 0) {
     LOG(CONSOLE) << "start prediction...";
   }
-  std::vector<bst_float> preds;
+  HostDeviceVector<bst_float> preds;
   learner->Predict(dtest.get(), param.pred_margin, &preds, param.ntree_limit);
   if (param.silent == 0) {
    LOG(CONSOLE) << "writing prediction to " << param.name_pred;
@@ -332,7 +332,7 @@ void CLIPredict(const CLIParam& param) {
   std::unique_ptr<dmlc::Stream> fo(
       dmlc::Stream::Create(param.name_pred.c_str(), "w"));
   dmlc::ostream os(fo.get());
-  for (bst_float p : preds) {
+  for (bst_float p : preds.data_h()) {
     os << p << '\n';
   }
   // force flush before fo destruct.
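Note (not part of the patch): at call sites the change is mechanical — construct a `HostDeviceVector` and go through `data_h()` wherever the host needs to iterate, exactly as the `CLIPredict` hunk above does. A minimal caller sketch, assuming an already-configured `learner` and `dtest`:

```cpp
// Mirrors the CLIPredict change above; setup of learner/dtest elided.
HostDeviceVector<bst_float> preds;  // replaces std::vector<bst_float>
learner->Predict(dtest.get(), /*output_margin=*/false, &preds, /*ntree_limit=*/0);
for (bst_float p : preds.data_h()) {  // host view for iteration
  std::cout << p << '\n';
}
```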
diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc
index 154a80cf3..41312d5c7 100644
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -12,13 +12,27 @@ namespace xgboost {
 
 template <typename T>
 struct HostDeviceVectorImpl {
-  explicit HostDeviceVectorImpl(size_t size) : data_h_(size) {}
+  explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
+  explicit HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
+  explicit HostDeviceVectorImpl(const std::vector<T>& init) : data_h_(init) {}
   std::vector<T> data_h_;
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, int device) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(size);
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(size, v);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+  : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+  : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init);
 }
 
 template <typename T>
@@ -41,8 +55,8 @@ template <typename T>
 std::vector<T>& HostDeviceVector<T>::data_h() { return impl_->data_h_; }
 
 template <typename T>
-void HostDeviceVector<T>::resize(size_t new_size, int new_device) {
-  impl_->data_h_.resize(new_size);
+void HostDeviceVector<T>::resize(size_t new_size, T v, int new_device) {
+  impl_->data_h_.resize(new_size, v);
 }
 
 // explicit instantiations are required, as HostDeviceVector isn't header-only
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 4370ef21e..9a2a63020 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -1,6 +1,7 @@
 /*!
  * Copyright 2017 XGBoost contributors
  */
+
 #include "./host_device_vector.h"
 #include "./device_helpers.cuh"
 
@@ -8,13 +9,25 @@ namespace xgboost {
 
 template <typename T>
 struct HostDeviceVectorImpl {
-  HostDeviceVectorImpl(size_t size, int device)
+  HostDeviceVectorImpl(size_t size, T v, int device)
     : device_(device), on_d_(device >= 0) {
     if (on_d_) {
       dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(size);
+      data_d_.resize(size, v);
     } else {
-      data_h_.resize(size);
+      data_h_.resize(size, v);
+    }
+  }
+
+  // Init can be std::vector<T> or std::initializer_list<T>
+  template <class Init>
+  HostDeviceVectorImpl(const Init& init, int device)
+    : device_(device), on_d_(device >= 0) {
+    if (on_d_) {
+      dh::safe_cuda(cudaSetDevice(device_));
+      data_d_.resize(init.size());
+      thrust::copy(init.begin(), init.end(), data_d_.begin());
+    } else {
+      data_h_ = init;
     }
   }
   HostDeviceVectorImpl(const HostDeviceVectorImpl<T>&) = delete;
@@ -41,17 +54,18 @@ struct HostDeviceVectorImpl {
   std::vector<T>& data_h() {
     lazy_sync_host();
     return data_h_;
   }
-  void resize(size_t new_size, int new_device) {
+
+  void resize(size_t new_size, T v, int new_device) {
     if (new_size == this->size() && new_device == device_)
       return;
-    device_ = new_device;
+    if (new_device != -1)
+      device_ = new_device;
     // if !on_d_, but the data size is 0 and the device is set,
     // resize the data on device instead
     if (!on_d_ && (data_h_.size() > 0 || device_ == -1)) {
-      data_h_.resize(new_size);
+      data_h_.resize(new_size, v);
     } else {
       dh::safe_cuda(cudaSetDevice(device_));
-      data_d_.resize(new_size);
+      data_d_.resize(new_size, v);
       on_d_ = true;
     }
   }
@@ -90,8 +104,20 @@ struct HostDeviceVectorImpl {
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, int device) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(size, device);
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(size, v, device);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+  : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init, device);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+  : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init, device);
 }
 
 template <typename T>
@@ -124,8 +150,8 @@ template <typename T>
 std::vector<T>& HostDeviceVector<T>::data_h() { return impl_->data_h(); }
 
 template <typename T>
-void HostDeviceVector<T>::resize(size_t new_size, int new_device) {
-  impl_->resize(new_size, new_device);
+void HostDeviceVector<T>::resize(size_t new_size, T v, int new_device) {
+  impl_->resize(new_size, v, new_device);
 }
 
 // explicit instantiations are required, as HostDeviceVector isn't header-only
diff --git a/src/common/host_device_vector.h b/src/common/host_device_vector.h
index fc0ca0660..3f4cb2b94 100644
--- a/src/common/host_device_vector.h
+++ b/src/common/host_device_vector.h
@@ -5,6 +5,7 @@
 #define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
 
 #include <cstdlib>
+#include <initializer_list>
 #include <vector>
 
 // only include thrust-related files if host_device_vector.h
@@ -61,7 +62,9 @@ template <typename T> struct HostDeviceVectorImpl;
 template <typename T>
 class HostDeviceVector {
  public:
-  explicit HostDeviceVector(size_t size = 0, int device = -1);
+  explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
+  HostDeviceVector(std::initializer_list<T> init, int device = -1);
+  explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
   ~HostDeviceVector();
   HostDeviceVector(const HostDeviceVector<T>&) = delete;
   HostDeviceVector(HostDeviceVector<T>&&) = delete;
@@ -70,6 +73,7 @@ class HostDeviceVector {
   size_t size() const;
   int device() const;
   T* ptr_d(int device);
+  T* ptr_h() { return data_h().data(); }
 
   // only define functions returning device_ptr
   // if HostDeviceVector.h is included from a .cu file
@@ -79,17 +83,9 @@ class HostDeviceVector {
 #endif
 
   std::vector<T>& data_h();
-  void resize(size_t new_size, int new_device);
 
-  // helper functions in case a function needs to be templated
-  // to work for both HostDeviceVector<T> and std::vector<T>
-  static std::vector<T>& data_h(HostDeviceVector<T>* v) {
-    return v->data_h();
-  }
-
-  static std::vector<T>& data_h(std::vector<T>* v) {
-    return *v;
-  }
+  // passing in new_device == -1 keeps the device as is
+  void resize(size_t new_size, T v = T(), int new_device = -1);
 
  private:
   HostDeviceVectorImpl<T>* impl_;
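Note (not part of the patch): summarizing the `HostDeviceVector` API after the header change above — the size constructor gains a fill value, construction from `std::initializer_list` and `std::vector` is added, `resize` gains a fill value and treats `new_device == -1` as "keep the current device", and `ptr_h()` exposes a raw host pointer. A usage sketch:

```cpp
// Usage sketch of the extended API from host_device_vector.h above.
// device == -1 means the data stays on the host.
HostDeviceVector<bst_float> a(100, 0.0f);      // 100 zeros, host-resident
HostDeviceVector<bst_float> b = {1.0f, 2.0f};  // from an initializer list
HostDeviceVector<bst_float> c(std::vector<bst_float>(10, 1.0f));

a.resize(200, -1.0f);      // grow, filling new slots with -1; device kept as is
bst_float* p = a.ptr_h();  // raw host pointer, same as a.data_h().data()
```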
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index adccf6239..dde5231c5 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -76,8 +76,10 @@ class GBLinear : public GradientBooster {
   void Save(dmlc::Stream* fo) const override {
     model.Save(fo);
   }
-  void DoBoost(DMatrix *p_fmat, std::vector<bst_gpair> *in_gpair,
-               ObjFunction *obj) override {
+
+  void DoBoost(DMatrix *p_fmat,
+               HostDeviceVector<bst_gpair> *in_gpair,
+               ObjFunction* obj) override {
     monitor.Start("DoBoost");
 
     if (!p_fmat->HaveColAccess(false)) {
@@ -91,14 +93,15 @@ class GBLinear : public GradientBooster {
     this->LazySumWeights(p_fmat);
 
     if (!this->CheckConvergence()) {
-      updater->Update(in_gpair, p_fmat, &model, sum_instance_weight);
+      updater->Update(&in_gpair->data_h(), p_fmat, &model, sum_instance_weight);
     }
     this->UpdatePredictionCache();
 
     monitor.Stop("DoBoost");
   }
 
-  void PredictBatch(DMatrix *p_fmat, std::vector<bst_float> *out_preds,
+  void PredictBatch(DMatrix *p_fmat,
+                    HostDeviceVector<bst_float> *out_preds,
                     unsigned ntree_limit) override {
     monitor.Start("PredictBatch");
     CHECK_EQ(ntree_limit, 0U)
@@ -109,9 +112,9 @@ class GBLinear : public GradientBooster {
     if (it != cache_.end() && it->second.predictions.size() != 0) {
       std::vector<bst_float> &y = it->second.predictions;
       out_preds->resize(y.size());
-      std::copy(y.begin(), y.end(), out_preds->begin());
+      std::copy(y.begin(), y.end(), out_preds->data_h().begin());
     } else {
-      this->PredictBatchInternal(p_fmat, out_preds);
+      this->PredictBatchInternal(p_fmat, &out_preds->data_h());
     }
     monitor.Stop("PredictBatch");
   }
diff --git a/src/gbm/gbm.cc b/src/gbm/gbm.cc
index 4d7ee0975..0d84abfd0 100644
--- a/src/gbm/gbm.cc
+++ b/src/gbm/gbm.cc
@@ -22,18 +22,6 @@ GradientBooster* GradientBooster::Create(
   return (e->body)(cache_mats, base_margin);
 }
 
-void GradientBooster::DoBoost(DMatrix* p_fmat,
-                              HostDeviceVector<bst_gpair>* in_gpair,
-                              ObjFunction* obj) {
-  DoBoost(p_fmat, &in_gpair->data_h(), obj);
-}
-
-void GradientBooster::PredictBatch(DMatrix* dmat,
-                                   HostDeviceVector<bst_float>* out_preds,
-                                   unsigned ntree_limit) {
-  PredictBatch(dmat, &out_preds->data_h(), ntree_limit);
-}
-
 }  // namespace xgboost
 
 namespace xgboost {
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 7bbc57c1b..3ccf5782a 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -180,22 +180,39 @@ class GBTree : public GradientBooster {
       tparam.updater_seq.find("distcol") != std::string::npos;
   }
 
-  void DoBoost(DMatrix* p_fmat,
-               std::vector<bst_gpair>* in_gpair,
-               ObjFunction* obj) override {
-    DoBoostHelper(p_fmat, in_gpair, obj);
-  }
-
   void DoBoost(DMatrix* p_fmat,
                HostDeviceVector<bst_gpair>* in_gpair,
                ObjFunction* obj) override {
-    DoBoostHelper(p_fmat, in_gpair, obj);
-  }
-
-  void PredictBatch(DMatrix* p_fmat,
-                    std::vector<bst_float>* out_preds,
-                    unsigned ntree_limit) override {
-    predictor->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit);
+    std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
+    const int ngroup = model_.param.num_output_group;
+    monitor.Start("BoostNewTrees");
+    if (ngroup == 1) {
+      std::vector<std::unique_ptr<RegTree> > ret;
+      BoostNewTrees(in_gpair, p_fmat, 0, &ret);
+      new_trees.push_back(std::move(ret));
+    } else {
+      CHECK_EQ(in_gpair->size() % ngroup, 0U)
+          << "must have exactly ngroup*nrow gpairs";
+      // TODO(canonizer): perform this on GPU if HostDeviceVector has device set.
+      HostDeviceVector<bst_gpair> tmp(in_gpair->size() / ngroup,
+                                      bst_gpair(), in_gpair->device());
+      std::vector<bst_gpair>& gpair_h = in_gpair->data_h();
+      bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
+      for (int gid = 0; gid < ngroup; ++gid) {
+        std::vector<bst_gpair>& tmp_h = tmp.data_h();
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          tmp_h[i] = gpair_h[i * ngroup + gid];
+        }
+        std::vector<std::unique_ptr<RegTree> > ret;
+        BoostNewTrees(&tmp, p_fmat, gid, &ret);
+        new_trees.push_back(std::move(ret));
+      }
+    }
+    monitor.Stop("BoostNewTrees");
+    monitor.Start("CommitModel");
+    this->CommitModel(std::move(new_trees));
+    monitor.Stop("CommitModel");
   }
 
   void PredictBatch(DMatrix* p_fmat,
@@ -251,48 +268,11 @@ class GBTree : public GradientBooster {
     }
   }
 
-  // TVec is either std::vector or HostDeviceVector
-  template <typename TVec>
-  void DoBoostHelper(DMatrix* p_fmat,
-                     TVec* in_gpair,
-                     ObjFunction* obj) {
-    std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
-    const int ngroup = model_.param.num_output_group;
-    monitor.Start("BoostNewTrees");
-    if (ngroup == 1) {
-      std::vector<std::unique_ptr<RegTree> > ret;
-      BoostNewTrees(in_gpair, p_fmat, 0, &ret);
-      new_trees.push_back(std::move(ret));
-    } else {
-      CHECK_EQ(in_gpair->size() % ngroup, 0U)
-          << "must have exactly ngroup*nrow gpairs";
-      std::vector<bst_gpair> tmp(in_gpair->size() / ngroup);
-      auto& gpair_h = HostDeviceVector<bst_gpair>::data_h(in_gpair);
-      for (int gid = 0; gid < ngroup; ++gid) {
-        bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
-        #pragma omp parallel for schedule(static)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          tmp[i] = gpair_h[i * ngroup + gid];
-        }
-        std::vector<std::unique_ptr<RegTree> > ret;
-        BoostNewTrees(&tmp, p_fmat, gid, &ret);
-        new_trees.push_back(std::move(ret));
-      }
-    }
-    monitor.Stop("BoostNewTrees");
-    monitor.Start("CommitModel");
-    this->CommitModel(std::move(new_trees));
-    monitor.Stop("CommitModel");
-  }
-
   // do group specific group
-  // TVec is either const std::vector or HostDeviceVector
-  template <typename TVec>
-  inline void
-  BoostNewTrees(TVec* gpair,
-                DMatrix *p_fmat,
-                int bst_group,
-                std::vector<std::unique_ptr<RegTree> >* ret) {
+  inline void BoostNewTrees(HostDeviceVector<bst_gpair>* gpair,
+                            DMatrix *p_fmat,
+                            int bst_group,
+                            std::vector<std::unique_ptr<RegTree> >* ret) {
     this->InitUpdater();
     std::vector<RegTree*> new_trees;
     ret->clear();
@@ -315,23 +295,8 @@ class GBTree : public GradientBooster {
       }
     }
     // update the trees
-    for (auto& up : updaters) {
-      UpdateHelper(up.get(), gpair, p_fmat, new_trees);
-    }
-  }
-
-  void UpdateHelper(TreeUpdater* updater,
-                    std::vector<bst_gpair>* gpair,
-                    DMatrix *p_fmat,
-                    const std::vector<RegTree*>& new_trees) {
-    updater->Update(*gpair, p_fmat, new_trees);
-  }
-
-  void UpdateHelper(TreeUpdater* updater,
-                    HostDeviceVector<bst_gpair>* gpair,
-                    DMatrix *p_fmat,
-                    const std::vector<RegTree*>& new_trees) {
-    updater->Update(gpair, p_fmat, new_trees);
+    for (auto& up : updaters)
+      up->Update(gpair, p_fmat, new_trees);
   }
 
   // commit new trees all at once
@@ -389,10 +354,10 @@ class Dart : public GBTree {
 
   // predict the leaf scores with dropout if ntree_limit = 0
   void PredictBatch(DMatrix* p_fmat,
-                    std::vector<bst_float>* out_preds,
-                    unsigned ntree_limit) override {
+                    HostDeviceVector<bst_float>* out_preds,
+                    unsigned ntree_limit) override {
     DropTrees(ntree_limit);
-    PredLoopInternal(p_fmat, out_preds, 0, ntree_limit, true);
+    PredLoopInternal(p_fmat, &out_preds->data_h(), 0, ntree_limit, true);
   }
 
   void PredictInstance(const SparseBatch::Inst& inst,
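Note (not part of the patch): for multi-class models the gradients arrive interleaved — the pair for row `i` and output group `gid` sits at index `i * ngroup + gid` — which is why `DoBoost` above de-interleaves one group at a time into `tmp` before growing that group's trees. A stand-alone illustration of the layout, with a hypothetical `pair_t` standing in for `bst_gpair`:

```cpp
// Illustration only: gathering one output group from interleaved gradients.
#include <cstddef>
#include <vector>

struct pair_t { float grad, hess; };

std::vector<pair_t> slice_group(const std::vector<pair_t>& gpair,
                                int ngroup, int gid) {
  std::vector<pair_t> tmp(gpair.size() / ngroup);
  for (std::size_t i = 0; i < tmp.size(); ++i) {
    tmp[i] = gpair[i * ngroup + gid];  // row i, group gid
  }
  return tmp;
}

// slice_group(gpair, 3, 1) gathers indices 1, 4, 7, ... : group 1 of 3.
```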
diff --git a/src/learner.cc b/src/learner.cc
index 3f13ffba8..883c7a8e5 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -362,17 +362,17 @@ class LearnerImpl : public Learner {
     }
     this->LazyInitDMatrix(train);
     monitor.Start("PredictRaw");
-    this->PredictRaw(train, &preds2_);
+    this->PredictRaw(train, &preds_);
     monitor.Stop("PredictRaw");
     monitor.Start("GetGradient");
-    obj_->GetGradient(&preds2_, train->info(), iter, &gpair_);
+    obj_->GetGradient(&preds_, train->info(), iter, &gpair_);
     monitor.Stop("GetGradient");
 
     gbm_->DoBoost(train, &gpair_, obj_.get());
     monitor.Stop("UpdateOneIter");
   }
 
   void BoostOneIter(int iter, DMatrix* train,
-                    std::vector<bst_gpair>* in_gpair) override {
+                    HostDeviceVector<bst_gpair>* in_gpair) override {
     monitor.Start("BoostOneIter");
     if (tparam.seed_per_iteration || rabit::IsDistributed()) {
       common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);
@@ -395,7 +395,7 @@ class LearnerImpl : public Learner {
       obj_->EvalTransform(&preds_);
       for (auto& ev : metrics_) {
         os << '\t' << data_names[i] << '-' << ev->Name() << ':'
-           << ev->Eval(preds_, data_sets[i]->info(), tparam.dsplit == 2);
+           << ev->Eval(preds_.data_h(), data_sets[i]->info(), tparam.dsplit == 2);
       }
     }
@@ -438,19 +438,20 @@ class LearnerImpl : public Learner {
     this->PredictRaw(data, &preds_);
     obj_->EvalTransform(&preds_);
     return std::make_pair(metric,
-                          ev->Eval(preds_, data->info(), tparam.dsplit == 2));
+                          ev->Eval(preds_.data_h(), data->info(), tparam.dsplit == 2));
   }
 
   void Predict(DMatrix* data, bool output_margin,
-               std::vector<bst_float>* out_preds, unsigned ntree_limit,
+               HostDeviceVector<bst_float>* out_preds, unsigned ntree_limit,
                bool pred_leaf, bool pred_contribs, bool approx_contribs,
                bool pred_interactions) const override {
     if (pred_contribs) {
-      gbm_->PredictContribution(data, out_preds, ntree_limit, approx_contribs);
+      gbm_->PredictContribution(data, &out_preds->data_h(), ntree_limit, approx_contribs);
     } else if (pred_interactions) {
-      gbm_->PredictInteractionContributions(data, out_preds, ntree_limit, approx_contribs);
+      gbm_->PredictInteractionContributions(data, &out_preds->data_h(), ntree_limit,
+                                            approx_contribs);
     } else if (pred_leaf) {
-      gbm_->PredictLeaf(data, out_preds, ntree_limit);
+      gbm_->PredictLeaf(data, &out_preds->data_h(), ntree_limit);
     } else {
       this->PredictRaw(data, out_preds, ntree_limit);
       if (!output_margin) {
@@ -546,12 +547,6 @@ class LearnerImpl : public Learner {
    * \param ntree_limit limit number of trees used for boosted tree
    *   predictor, when it equals 0, this means we are using all the trees
    */
-  inline void PredictRaw(DMatrix* data, std::vector<bst_float>* out_preds,
-                         unsigned ntree_limit = 0) const {
-    CHECK(gbm_.get() != nullptr)
-        << "Predict must happen after Load or InitModel";
-    gbm_->PredictBatch(data, out_preds, ntree_limit);
-  }
   inline void PredictRaw(DMatrix* data, HostDeviceVector<bst_float>* out_preds,
                          unsigned ntree_limit = 0) const {
     CHECK(gbm_.get() != nullptr)
@@ -572,8 +567,7 @@ class LearnerImpl : public Learner {
   // name of objective function
   std::string name_obj_;
   // temporal storages for prediction
-  std::vector<bst_float> preds_;
-  HostDeviceVector<bst_float> preds2_;
+  HostDeviceVector<bst_float> preds_;
   // gradient pairs
   HostDeviceVector<bst_gpair> gpair_;
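Note (not part of the patch): with `preds2_` removed, a single `HostDeviceVector<bst_float>` member carries predictions through the whole training round; metrics still consume a host `std::vector`, which is why the `Eval` call sites above go through `preds_.data_h()`. The per-iteration flow, sketched:

```cpp
// Sketch of UpdateOneIter's data flow after this change (member names from
// learner.cc above; monitoring and error handling elided).
this->PredictRaw(train, &preds_);                          // HostDeviceVector<bst_float>
obj_->GetGradient(&preds_, train->info(), iter, &gpair_);  // HostDeviceVector<bst_gpair>
gbm_->DoBoost(train, &gpair_, obj_.get());
// No forced device-to-host copy: a GPU objective can read and write the device
// buffers directly; data_h() syncs lazily only when a host consumer asks.
```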
diff --git a/src/objective/multiclass_obj.cc b/src/objective/multiclass_obj.cc
index dad4a3d60..9dcb85686 100644
--- a/src/objective/multiclass_obj.cc
+++ b/src/objective/multiclass_obj.cc
@@ -35,16 +35,18 @@ class SoftmaxMultiClassObj : public ObjFunction {
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
     param_.InitAllowUnknown(args);
   }
-  void GetGradient(const std::vector<bst_float>& preds,
+  void GetGradient(HostDeviceVector<bst_float>* preds,
                    const MetaInfo& info,
                    int iter,
-                   std::vector<bst_gpair>* out_gpair) override {
+                   HostDeviceVector<bst_gpair>* out_gpair) override {
     CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK(preds.size() == (static_cast<size_t>(param_.num_class) * info.labels.size()))
+    CHECK(preds->size() == (static_cast<size_t>(param_.num_class) * info.labels.size()))
         << "SoftmaxMultiClassObj: label size and pred size does not match";
-    out_gpair->resize(preds.size());
+    std::vector<bst_float>& preds_h = preds->data_h();
+    out_gpair->resize(preds_h.size());
+    std::vector<bst_gpair>& gpair = out_gpair->data_h();
     const int nclass = param_.num_class;
-    const omp_ulong ndata = static_cast<omp_ulong>(preds.size() / nclass);
+    const omp_ulong ndata = static_cast<omp_ulong>(preds_h.size() / nclass);
 
     int label_error = 0;
 #pragma omp parallel
@@ -53,7 +55,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
 #pragma omp for schedule(static)
       for (omp_ulong i = 0; i < ndata; ++i) {
         for (int k = 0; k < nclass; ++k) {
-          rec[k] = preds[i * nclass + k];
+          rec[k] = preds_h[i * nclass + k];
         }
         common::Softmax(&rec);
         int label = static_cast<int>(info.labels[i]);
@@ -65,9 +67,9 @@ class SoftmaxMultiClassObj : public ObjFunction {
           bst_float p = rec[k];
           const bst_float h = 2.0f * p * (1.0f - p) * wt;
           if (label == k) {
-            (*out_gpair)[i * nclass + k] = bst_gpair((p - 1.0f) * wt, h);
+            gpair[i * nclass + k] = bst_gpair((p - 1.0f) * wt, h);
           } else {
-            (*out_gpair)[i * nclass + k] = bst_gpair(p* wt, h);
+            gpair[i * nclass + k] = bst_gpair(p* wt, h);
           }
         }
       }
@@ -77,10 +79,10 @@ class SoftmaxMultiClassObj : public ObjFunction {
           << " num_class=" << nclass << " but found " << label_error << " in label.";
   }
-  void PredTransform(std::vector<bst_float>* io_preds) override {
+  void PredTransform(HostDeviceVector<bst_float>* io_preds) override {
     this->Transform(io_preds, output_prob_);
   }
-  void EvalTransform(std::vector<bst_float>* io_preds) override {
+  void EvalTransform(HostDeviceVector<bst_float>* io_preds) override {
     this->Transform(io_preds, true);
   }
   const char* DefaultEvalMetric() const override {
@@ -88,8 +90,8 @@ class SoftmaxMultiClassObj : public ObjFunction {
   }
 
  private:
-  inline void Transform(std::vector<bst_float> *io_preds, bool prob) {
-    std::vector<bst_float> &preds = *io_preds;
+  inline void Transform(HostDeviceVector<bst_float> *io_preds, bool prob) {
+    std::vector<bst_float> &preds = io_preds->data_h();
     std::vector<bst_float> tmp;
     const int nclass = param_.num_class;
     const omp_ulong ndata = static_cast<omp_ulong>(preds.size() / nclass);
diff --git a/src/objective/objective.cc b/src/objective/objective.cc
index 53f52ac9f..bf860a480 100644
--- a/src/objective/objective.cc
+++ b/src/objective/objective.cc
@@ -25,17 +25,6 @@ ObjFunction* ObjFunction::Create(const std::string& name) {
   return (e->body)();
 }
 
-void ObjFunction::GetGradient(HostDeviceVector<bst_float>* preds,
-                              const MetaInfo& info,
-                              int iteration,
-                              HostDeviceVector<bst_gpair>* out_gpair) {
-  GetGradient(preds->data_h(), info, iteration, &out_gpair->data_h());
-}
-
-void ObjFunction::PredTransform(HostDeviceVector<bst_float> *io_preds) {
-  PredTransform(&io_preds->data_h());
-}
-
 }  // namespace xgboost
 
 namespace xgboost {
diff --git a/src/objective/rank_obj.cc b/src/objective/rank_obj.cc
index 76ce3ad72..93559e135 100644
--- a/src/objective/rank_obj.cc
+++ b/src/objective/rank_obj.cc
@@ -37,13 +37,14 @@ class LambdaRankObj : public ObjFunction {
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
     param_.InitAllowUnknown(args);
   }
-  void GetGradient(const std::vector<bst_float>& preds,
+  void GetGradient(HostDeviceVector<bst_float>* preds,
                    const MetaInfo& info,
                    int iter,
-                   std::vector<bst_gpair>* out_gpair) override {
-    CHECK_EQ(preds.size(), info.labels.size()) << "label size predict size not match";
-    std::vector<bst_gpair>& gpair = *out_gpair;
-    gpair.resize(preds.size());
+                   HostDeviceVector<bst_gpair>* out_gpair) override {
+    CHECK_EQ(preds->size(), info.labels.size()) << "label size predict size not match";
+    auto& preds_h = preds->data_h();
+    out_gpair->resize(preds_h.size());
+    std::vector<bst_gpair>& gpair = out_gpair->data_h();
     // quick consistency when group is not available
     std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(info.labels.size());
     const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
@@ -63,7 +64,7 @@ class LambdaRankObj : public ObjFunction {
       for (bst_omp_uint k = 0; k < ngroup; ++k) {
         lst.clear(); pairs.clear();
         for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
-          lst.push_back(ListEntry(preds[j], info.labels[j], j));
+          lst.push_back(ListEntry(preds_h[j], info.labels[j], j));
           gpair[j] = bst_gpair(0.0f, 0.0f);
         }
         std::sort(lst.begin(), lst.end(), ListEntry::CmpPred);
diff --git a/src/objective/regression_obj.cc b/src/objective/regression_obj.cc
index 9fb0cc981..b1f75c221 100644
--- a/src/objective/regression_obj.cc
+++ b/src/objective/regression_obj.cc
@@ -38,18 +38,20 @@ class RegLossObj : public ObjFunction {
       const std::vector<std::pair<std::string, std::string> > &args) override {
     param_.InitAllowUnknown(args);
   }
-  void GetGradient(const std::vector<bst_float> &preds, const MetaInfo &info,
-                   int iter, std::vector<bst_gpair> *out_gpair) override {
+  void GetGradient(HostDeviceVector<bst_float> *preds, const MetaInfo &info,
+                   int iter, HostDeviceVector<bst_gpair> *out_gpair) override {
     CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK_EQ(preds.size(), info.labels.size())
+    CHECK_EQ(preds->size(), info.labels.size())
         << "labels are not correctly provided"
-        << "preds.size=" << preds.size()
+        << "preds.size=" << preds->size()
         << ", label.size=" << info.labels.size();
+    auto& preds_h = preds->data_h();
     this->LazyCheckLabels(info.labels);
-    out_gpair->resize(preds.size());
-    const omp_ulong n = static_cast<omp_ulong>(preds.size());
-    auto gpair_ptr = out_gpair->data();
+    out_gpair->resize(preds_h.size());
+    auto& gpair = out_gpair->data_h();
+    const omp_ulong n = static_cast<omp_ulong>(preds_h.size());
+    auto gpair_ptr = out_gpair->ptr_h();
     avx::Float8 scale(param_.scale_pos_weight);
     const omp_ulong remainder = n % 8;
@@ -58,7 +60,7 @@ class RegLossObj : public ObjFunction {
 #pragma omp parallel for schedule(static) num_threads(std::min(8, nthread))
     for (omp_ulong i = 0; i < n - remainder; i += 8) {
       avx::Float8 y(&info.labels[i]);
-      avx::Float8 p = Loss::PredTransform(avx::Float8(&preds[i]));
+      avx::Float8 p = Loss::PredTransform(avx::Float8(&preds_h[i]));
       avx::Float8 w = info.weights.empty() ?
          avx::Float8(1.0f) : avx::Float8(&info.weights[i]);
       // Adjust weight
@@ -69,11 +71,11 @@ class RegLossObj : public ObjFunction {
     }
     for (omp_ulong i = n - remainder; i < n; ++i) {
       auto y = info.labels[i];
-      bst_float p = Loss::PredTransform(preds[i]);
+      bst_float p = Loss::PredTransform(preds_h[i]);
       bst_float w = info.GetWeight(i);
       w += y * ((param_.scale_pos_weight * w) - w);
-      (*out_gpair)[i] = bst_gpair(Loss::FirstOrderGradient(p, y) * w,
-                                  Loss::SecondOrderGradient(p, y) * w);
+      gpair[i] = bst_gpair(Loss::FirstOrderGradient(p, y) * w,
+                           Loss::SecondOrderGradient(p, y) * w);
     }
 
     // Reset omp max threads
@@ -82,8 +84,8 @@ class RegLossObj : public ObjFunction {
   const char *DefaultEvalMetric() const override {
     return Loss::DefaultEvalMetric();
   }
-  void PredTransform(std::vector<bst_float> *io_preds) override {
-    std::vector<bst_float> &preds = *io_preds;
+  void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
+    std::vector<bst_float> &preds = io_preds->data_h();
     const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
 #pragma omp parallel for schedule(static)
     for (bst_omp_uint j = 0; j < ndata; ++j) {
@@ -143,40 +145,42 @@ class PoissonRegression : public ObjFunction {
     param_.InitAllowUnknown(args);
   }
 
-  void GetGradient(const std::vector<bst_float> &preds,
+  void GetGradient(HostDeviceVector<bst_float> *preds,
                    const MetaInfo &info,
                    int iter,
-                   std::vector<bst_gpair> *out_gpair) override {
+                   HostDeviceVector<bst_gpair> *out_gpair) override {
     CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
-    out_gpair->resize(preds.size());
+    CHECK_EQ(preds->size(), info.labels.size()) << "labels are not correctly provided";
+    auto& preds_h = preds->data_h();
+    out_gpair->resize(preds->size());
+    auto& gpair = out_gpair->data_h();
     // check if label in range
     bool label_correct = true;
     // start calculating gradient
-    const omp_ulong ndata = static_cast<omp_ulong>(preds.size()); // NOLINT(*)
+    const omp_ulong ndata = static_cast<omp_ulong>(preds_h.size()); // NOLINT(*)
     #pragma omp parallel for schedule(static)
     for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
-      bst_float p = preds[i];
+      bst_float p = preds_h[i];
       bst_float w = info.GetWeight(i);
       bst_float y = info.labels[i];
       if (y >= 0.0f) {
-        (*out_gpair)[i] = bst_gpair((std::exp(p) - y) * w,
-                                    std::exp(p + param_.max_delta_step) * w);
+        gpair[i] = bst_gpair((std::exp(p) - y) * w,
+                             std::exp(p + param_.max_delta_step) * w);
       } else {
        label_correct = false;
       }
     }
     CHECK(label_correct) << "PoissonRegression: label must be nonnegative";
   }
-  void PredTransform(std::vector<bst_float> *io_preds) override {
-    std::vector<bst_float> &preds = *io_preds;
+  void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
+    std::vector<bst_float> &preds = io_preds->data_h();
     const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
     #pragma omp parallel for schedule(static)
     for (long j = 0; j < ndata; ++j) {  // NOLINT(*)
       preds[j] = std::exp(preds[j]);
     }
   }
-  void EvalTransform(std::vector<bst_float> *io_preds) override {
+  void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
     PredTransform(io_preds);
   }
   bst_float ProbToMargin(bst_float base_score) const override {
@@ -202,21 +206,23 @@ class CoxRegression : public ObjFunction {
  public:
   // declare functions
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {}
-  void GetGradient(const std::vector<bst_float> &preds,
+  void GetGradient(HostDeviceVector<bst_float> *preds,
                    const MetaInfo &info,
                    int iter,
-                   std::vector<bst_gpair> *out_gpair) override {
+                   HostDeviceVector<bst_gpair> *out_gpair) override {
     CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
"labels are not correctly provided"; - out_gpair->resize(preds.size()); + CHECK_EQ(preds->size(), info.labels.size()) << "labels are not correctly provided"; + auto& preds_h = preds->data_h(); + out_gpair->resize(preds_h.size()); + auto& gpair = out_gpair->data_h(); const std::vector &label_order = info.LabelAbsSort(); - const omp_ulong ndata = static_cast(preds.size()); // NOLINT(*) + const omp_ulong ndata = static_cast(preds_h.size()); // NOLINT(*) // pre-compute a sum double exp_p_sum = 0; // we use double because we might need the precision with large datasets for (omp_ulong i = 0; i < ndata; ++i) { - exp_p_sum += std::exp(preds[label_order[i]]); + exp_p_sum += std::exp(preds_h[label_order[i]]); } // start calculating grad and hess @@ -227,7 +233,7 @@ class CoxRegression : public ObjFunction { double accumulated_sum = 0; for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*) const size_t ind = label_order[i]; - const double p = preds[ind]; + const double p = preds_h[ind]; const double exp_p = std::exp(p); const double w = info.GetWeight(ind); const double y = info.labels[ind]; @@ -251,21 +257,21 @@ class CoxRegression : public ObjFunction { const double grad = exp_p*r_k - static_cast(y > 0); const double hess = exp_p*r_k - exp_p*exp_p * s_k; - out_gpair->at(ind) = bst_gpair(grad * w, hess * w); + gpair.at(ind) = bst_gpair(grad * w, hess * w); last_abs_y = abs_y; last_exp_p = exp_p; } } - void PredTransform(std::vector *io_preds) override { - std::vector &preds = *io_preds; + void PredTransform(HostDeviceVector *io_preds) override { + std::vector &preds = io_preds->data_h(); const long ndata = static_cast(preds.size()); // NOLINT(*) #pragma omp parallel for schedule(static) for (long j = 0; j < ndata; ++j) { // NOLINT(*) preds[j] = std::exp(preds[j]); } } - void EvalTransform(std::vector *io_preds) override { + void EvalTransform(HostDeviceVector *io_preds) override { PredTransform(io_preds); } bst_float ProbToMargin(bst_float base_score) const override { @@ -288,39 +294,41 @@ class GammaRegression : public ObjFunction { void Configure(const std::vector >& args) override { } - void GetGradient(const std::vector &preds, + void GetGradient(HostDeviceVector *preds, const MetaInfo &info, int iter, - std::vector *out_gpair) override { + HostDeviceVector *out_gpair) override { CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty"; - CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided"; - out_gpair->resize(preds.size()); + CHECK_EQ(preds->size(), info.labels.size()) << "labels are not correctly provided"; + auto& preds_h = preds->data_h(); + out_gpair->resize(preds_h.size()); + auto& gpair = out_gpair->data_h(); // check if label in range bool label_correct = true; // start calculating gradient - const omp_ulong ndata = static_cast(preds.size()); // NOLINT(*) + const omp_ulong ndata = static_cast(preds_h.size()); // NOLINT(*) #pragma omp parallel for schedule(static) for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*) - bst_float p = preds[i]; + bst_float p = preds_h[i]; bst_float w = info.GetWeight(i); bst_float y = info.labels[i]; if (y >= 0.0f) { - (*out_gpair)[i] = bst_gpair((1 - y / std::exp(p)) * w, y / std::exp(p) * w); + gpair[i] = bst_gpair((1 - y / std::exp(p)) * w, y / std::exp(p) * w); } else { label_correct = false; } } CHECK(label_correct) << "GammaRegression: label must be positive"; } - void PredTransform(std::vector *io_preds) override { - std::vector &preds = *io_preds; + void PredTransform(HostDeviceVector *io_preds) override { + 
+    std::vector<bst_float> &preds = io_preds->data_h();
     const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
     #pragma omp parallel for schedule(static)
     for (long j = 0; j < ndata; ++j) {  // NOLINT(*)
       preds[j] = std::exp(preds[j]);
     }
   }
-  void EvalTransform(std::vector<bst_float> *io_preds) override {
+  void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
     PredTransform(io_preds);
   }
   bst_float ProbToMargin(bst_float base_score) const override {
@@ -353,20 +361,22 @@ class TweedieRegression : public ObjFunction {
     param_.InitAllowUnknown(args);
   }
 
-  void GetGradient(const std::vector<bst_float> &preds,
+  void GetGradient(HostDeviceVector<bst_float> *preds,
                    const MetaInfo &info,
                    int iter,
-                   std::vector<bst_gpair> *out_gpair) override {
+                   HostDeviceVector<bst_gpair> *out_gpair) override {
     CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
-    out_gpair->resize(preds.size());
+    CHECK_EQ(preds->size(), info.labels.size()) << "labels are not correctly provided";
+    auto& preds_h = preds->data_h();
+    out_gpair->resize(preds->size());
+    auto& gpair = out_gpair->data_h();
     // check if label in range
     bool label_correct = true;
     // start calculating gradient
-    const omp_ulong ndata = static_cast<omp_ulong>(preds.size()); // NOLINT(*)
+    const omp_ulong ndata = static_cast<omp_ulong>(preds->size()); // NOLINT(*)
     #pragma omp parallel for schedule(static)
     for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
-      bst_float p = preds[i];
+      bst_float p = preds_h[i];
       bst_float w = info.GetWeight(i);
       bst_float y = info.labels[i];
       float rho = param_.tweedie_variance_power;
@@ -374,15 +384,15 @@ class TweedieRegression : public ObjFunction {
         bst_float grad = -y * std::exp((1 - rho) * p) + std::exp((2 - rho) * p);
         bst_float hess = -y * (1 - rho) * \
           std::exp((1 - rho) * p) + (2 - rho) * std::exp((2 - rho) * p);
-        (*out_gpair)[i] = bst_gpair(grad * w, hess * w);
+        gpair[i] = bst_gpair(grad * w, hess * w);
       } else {
         label_correct = false;
       }
     }
     CHECK(label_correct) << "TweedieRegression: label must be nonnegative";
   }
-  void PredTransform(std::vector<bst_float> *io_preds) override {
-    std::vector<bst_float> &preds = *io_preds;
+  void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
+    std::vector<bst_float> &preds = io_preds->data_h();
     const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
     #pragma omp parallel for schedule(static)
     for (long j = 0; j < ndata; ++j) {  // NOLINT(*)
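Note (not part of the patch): every CPU objective above follows the same mechanical recipe — take the host views via `data_h()`, leave the math untouched. As a sanity check of one of those formulas, the Poisson hunk uses grad = (e^p − y)·w and hess = e^(p + max_delta_step)·w; a tiny finite-difference check of the gradient, illustration only:

```cpp
// Quick numeric check of the Poisson gradient used above: the loss is
// l(p) = exp(p) - y*p (negative log-likelihood up to a constant), so
// dl/dp = exp(p) - y, matching bst_gpair((std::exp(p) - y) * w, ...).
#include <cmath>
#include <cstdio>

int main() {
  const double y = 3.0, p = 1.0, eps = 1e-6;
  auto loss = [&](double m) { return std::exp(m) - y * m; };
  double numeric  = (loss(p + eps) - loss(p - eps)) / (2 * eps);
  double analytic = std::exp(p) - y;
  std::printf("numeric=%f analytic=%f\n", numeric, analytic);  // both ~ -0.281718
}
```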
diff --git a/src/objective/regression_obj_gpu.cu b/src/objective/regression_obj_gpu.cu
index 7d70a59b0..45270e316 100644
--- a/src/objective/regression_obj_gpu.cu
+++ b/src/objective/regression_obj_gpu.cu
@@ -103,8 +103,8 @@ class GPURegLossObj : public ObjFunction {
     // free the old data and allocate the new data
     ba_.reset(new bulk_allocator());
     data_.reset(new DeviceData(ba_.get(), 0, n));
-    preds_d_.resize(n, param_.gpu_id);
-    out_gpair_d_.resize(n, param_.gpu_id);
+    preds_d_.resize(n, 0.0f, param_.gpu_id);
+    out_gpair_d_.resize(n, bst_gpair(), param_.gpu_id);
   }
 
  public:
@@ -114,23 +114,6 @@ class GPURegLossObj : public ObjFunction {
     param_.InitAllowUnknown(args);
     CHECK(param_.n_gpus != 0) << "Must have at least one device";
   }
-  void GetGradient(const std::vector<bst_float> &preds,
-                   const MetaInfo &info,
-                   int iter,
-                   std::vector<bst_gpair> *out_gpair) override {
-    CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
-    CHECK_EQ(preds.size(), info.labels.size())
-        << "labels are not correctly provided"
-        << "preds.size=" << preds.size() << ", label.size=" << info.labels.size();
-
-    size_t ndata = preds.size();
-    out_gpair->resize(ndata);
-    LazyResize(ndata);
-    thrust::copy(preds.begin(), preds.end(), preds_d_.tbegin(param_.gpu_id));
-    GetGradientDevice(preds_d_.ptr_d(param_.gpu_id), info, iter,
-                      out_gpair_d_.ptr_d(param_.gpu_id), ndata);
-    thrust::copy_n(out_gpair_d_.tbegin(param_.gpu_id), ndata, out_gpair->begin());
-  }
 
   void GetGradient(HostDeviceVector<bst_float>* preds,
                    const MetaInfo &info,
@@ -141,7 +124,7 @@ class GPURegLossObj : public ObjFunction {
         << "labels are not correctly provided"
         << "preds.size=" << preds->size() << ", label.size=" << info.labels.size();
     size_t ndata = preds->size();
-    out_gpair->resize(ndata, param_.gpu_id);
+    out_gpair->resize(ndata, bst_gpair(), param_.gpu_id);
     LazyResize(ndata);
     GetGradientDevice(preds->ptr_d(param_.gpu_id), info, iter,
                       out_gpair->ptr_d(param_.gpu_id), ndata);
@@ -189,13 +172,6 @@ class GPURegLossObj : public ObjFunction {
     return Loss::DefaultEvalMetric();
   }
 
-  void PredTransform(std::vector<bst_float> *io_preds) override {
-    LazyResize(io_preds->size());
-    thrust::copy(io_preds->begin(), io_preds->end(), preds_d_.tbegin(param_.gpu_id));
-    PredTransformDevice(preds_d_.ptr_d(param_.gpu_id), io_preds->size());
-    thrust::copy_n(preds_d_.tbegin(param_.gpu_id), io_preds->size(), io_preds->begin());
-  }
-
   void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
     PredTransformDevice(io_preds->ptr_d(param_.gpu_id), io_preds->size());
   }
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index 04bfd9f7d..9c956b5d9 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -104,14 +104,43 @@ class CPUPredictor : public Predictor {
                        tree_begin, ntree_limit);
   }
 
- public:
-  void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
-                    const gbm::GBTreeModel& model, int tree_begin,
-                    unsigned ntree_limit = 0) override {
-    PredictBatch(dmat, &out_preds->data_h(), model, tree_begin, ntree_limit);
+  bool PredictFromCache(DMatrix* dmat,
+                        HostDeviceVector<bst_float>* out_preds,
+                        const gbm::GBTreeModel& model,
+                        unsigned ntree_limit) {
+    if (ntree_limit == 0 ||
+        ntree_limit * model.param.num_output_group >= model.trees.size()) {
+      auto it = cache_.find(dmat);
+      if (it != cache_.end()) {
+        HostDeviceVector<bst_float>& y = it->second.predictions;
+        if (y.size() != 0) {
+          out_preds->resize(y.size());
+          std::copy(y.data_h().begin(), y.data_h().end(),
+                    out_preds->data_h().begin());
+          return true;
+        }
+      }
+    }
+    return false;
   }
 
-  void PredictBatch(DMatrix* dmat, std::vector<bst_float>* out_preds,
+  void InitOutPredictions(const MetaInfo& info,
+                          HostDeviceVector<bst_float>* out_preds,
+                          const gbm::GBTreeModel& model) const {
+    size_t n = model.param.num_output_group * info.num_row;
+    const std::vector<bst_float>& base_margin = info.base_margin;
+    out_preds->resize(n);
+    std::vector<bst_float>& out_preds_h = out_preds->data_h();
+    if (base_margin.size() != 0) {
+      CHECK_EQ(out_preds->size(), n);
+      std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin());
+    } else {
+      std::fill(out_preds_h.begin(), out_preds_h.end(), model.base_margin);
+    }
+  }
+
+ public:
+  void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                     const gbm::GBTreeModel& model, int tree_begin,
                     unsigned ntree_limit = 0) override {
     if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {
@@ -125,12 +154,14 @@ class CPUPredictor : public Predictor {
       ntree_limit = static_cast<unsigned>(model.trees.size());
     }
 
-    this->PredLoopInternal(dmat, out_preds, model, tree_begin, ntree_limit);
+    this->PredLoopInternal(dmat, &out_preds->data_h(), model,
+                           tree_begin, ntree_limit);
   }
 
-  void UpdatePredictionCache(const gbm::GBTreeModel& model,
-                             std::vector<std::unique_ptr<TreeUpdater>>* updaters,
-                             int num_new_trees) override {
+  void UpdatePredictionCache(
+      const gbm::GBTreeModel& model,
+      std::vector<std::unique_ptr<TreeUpdater>>* updaters,
+      int num_new_trees) override {
     int old_ntree = model.trees.size() - num_new_trees;
     // update cache entry
     for (auto& kv : cache_) {
@@ -138,7 +169,7 @@ class CPUPredictor : public Predictor {
       if (e.predictions.size() == 0) {
         InitOutPredictions(e.data->info(), &(e.predictions), model);
-        PredLoopInternal(e.data.get(), &(e.predictions), model, 0,
+        PredLoopInternal(e.data.get(), &(e.predictions.data_h()), model, 0,
                          model.trees.size());
       } else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
                  num_new_trees == 1 &&
@@ -146,7 +177,7 @@ class CPUPredictor : public Predictor {
                      &(e.predictions))) {
         {}  // do nothing
       } else {
-        PredLoopInternal(e.data.get(), &(e.predictions), model, old_ntree,
+        PredLoopInternal(e.data.get(), &(e.predictions.data_h()), model, old_ntree,
                          model.trees.size());
       }
     }
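Note (not part of the patch): `PredictFromCache` and `InitOutPredictions` above are retyped moves of the base-class helpers removed from `predictor.cc` later in this diff. `InitOutPredictions` seeds the output from the per-row base margin when the `MetaInfo` supplies one, otherwise from the model's scalar `base_margin`. Illustration only, with hypothetical values:

```cpp
// What InitOutPredictions produces for 3 rows, num_output_group == 1,
// no per-row base margin, and model.base_margin == 0.5f (hypothetical).
HostDeviceVector<bst_float> out_preds;
out_preds.resize(3);  // n = num_output_group * num_row
std::fill(out_preds.data_h().begin(), out_preds.data_h().end(), 0.5f);
// => out_preds.data_h() == {0.5f, 0.5f, 0.5f}
```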
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index b0f34529e..ca00e4b14 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -256,8 +256,6 @@ class GPUPredictor : public xgboost::Predictor {
     HostDeviceVector<bst_float> predictions;
   };
 
-  std::unordered_map<DMatrix*, DevicePredictionCacheEntry> device_cache_;
-
  private:
   void DevicePredictInternal(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                              const gbm::GBTreeModel& model, size_t tree_begin,
@@ -337,25 +335,16 @@ class GPUPredictor : public xgboost::Predictor {
  public:
   GPUPredictor() : cpu_predictor(Predictor::Create("cpu_predictor")) {}
 
-  void PredictBatch(DMatrix* dmat, std::vector<bst_float>* out_preds,
-                    const gbm::GBTreeModel& model, int tree_begin,
-                    unsigned ntree_limit = 0) override {
-    HostDeviceVector<bst_float> out_preds_d;
-    PredictBatch(dmat, &out_preds_d, model, tree_begin, ntree_limit);
-    out_preds->resize(out_preds_d.size());
-    thrust::copy(out_preds_d.tbegin(param.gpu_id),
-                 out_preds_d.tend(param.gpu_id), out_preds->begin());
-  }
-
   void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                     const gbm::GBTreeModel& model, int tree_begin,
                     unsigned ntree_limit = 0) override {
-    if (this->PredictFromCacheDevice(dmat, out_preds, model, ntree_limit)) {
+    if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {
       return;
     }
-    this->InitOutPredictionsDevice(dmat->info(), out_preds, model);
+    this->InitOutPredictions(dmat->info(), out_preds, model);
 
     int tree_end = ntree_limit * model.param.num_output_group;
+
     if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
       tree_end = static_cast<unsigned>(model.trees.size());
     }
@@ -363,13 +352,13 @@ class GPUPredictor : public xgboost::Predictor {
     DevicePredictInternal(dmat, out_preds, model, tree_begin, tree_end);
   }
 
-
-  void InitOutPredictionsDevice(const MetaInfo& info,
+ protected:
+  void InitOutPredictions(const MetaInfo& info,
                           HostDeviceVector<bst_float>* out_preds,
                           const gbm::GBTreeModel& model) const {
     size_t n = model.param.num_output_group * info.num_row;
     const std::vector<bst_float>& base_margin = info.base_margin;
-    out_preds->resize(n, param.gpu_id);
+    out_preds->resize(n, 0.0f, param.gpu_id);
     if (base_margin.size() != 0) {
       CHECK_EQ(out_preds->size(), n);
       thrust::copy(base_margin.begin(), base_margin.end(), out_preds->tbegin(param.gpu_id));
@@ -380,29 +369,16 @@ class GPUPredictor : public xgboost::Predictor {
   }
 
   bool PredictFromCache(DMatrix* dmat,
-                        std::vector<bst_float>* out_preds,
+                        HostDeviceVector<bst_float>* out_preds,
                         const gbm::GBTreeModel& model,
                         unsigned ntree_limit) {
-    HostDeviceVector<bst_float> out_preds_d(0, -1);
-    bool result = PredictFromCacheDevice(dmat, &out_preds_d, model, ntree_limit);
-    if (!result) return false;
-    out_preds->resize(out_preds_d.size(), param.gpu_id);
-    thrust::copy(out_preds_d.tbegin(param.gpu_id),
-                 out_preds_d.tend(param.gpu_id), out_preds->begin());
-    return true;
-  }
-
-  bool PredictFromCacheDevice(DMatrix* dmat,
-                              HostDeviceVector<bst_float>* out_preds,
-                              const gbm::GBTreeModel& model,
-                              unsigned ntree_limit) {
     if (ntree_limit == 0 ||
         ntree_limit * model.param.num_output_group >= model.trees.size()) {
-      auto it = device_cache_.find(dmat);
-      if (it != device_cache_.end()) {
+      auto it = cache_.find(dmat);
+      if (it != cache_.end()) {
         HostDeviceVector<bst_float>& y = it->second.predictions;
         if (y.size() != 0) {
-          out_preds->resize(y.size(), param.gpu_id);
+          out_preds->resize(y.size(), 0.0f, param.gpu_id);
           thrust::copy(y.tbegin(param.gpu_id), y.tend(param.gpu_id),
                        out_preds->tbegin(param.gpu_id));
           return true;
@@ -418,15 +394,15 @@ class GPUPredictor : public xgboost::Predictor {
                              int num_new_trees) override {
     auto old_ntree = model.trees.size() - num_new_trees;
     // update cache entry
-    for (auto& kv : device_cache_) {
-      DevicePredictionCacheEntry& e = kv.second;
+    for (auto& kv : cache_) {
+      PredictionCacheEntry& e = kv.second;
       DMatrix* dmat = kv.first;
       HostDeviceVector<bst_float>& predictions = e.predictions;
 
       if (predictions.size() == 0) {
         // ensure that the device in predictions is correct
-        predictions.resize(0, param.gpu_id);
-        cpu_predictor->PredictBatch(dmat, &predictions.data_h(), model, 0,
+        predictions.resize(0, 0.0f, param.gpu_id);
+        cpu_predictor->PredictBatch(dmat, &predictions, model, 0,
                                     static_cast<int>(model.trees.size()));
       } else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
                  num_new_trees == 1 &&
@@ -477,8 +453,6 @@ class GPUPredictor : public xgboost::Predictor {
     Predictor::Init(cfg, cache);
     cpu_predictor->Init(cfg, cache);
     param.InitAllowUnknown(cfg);
-    for (const std::shared_ptr<DMatrix>& d : cache)
-      device_cache_[d.get()].data = d;
     max_shared_memory_bytes = dh::max_shared_memory(param.gpu_id);
   }
diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc
index 7e1ee3312..a4ea6e82c 100644
--- a/src/predictor/predictor.cc
+++ b/src/predictor/predictor.cc
@@ -11,43 +11,8 @@ namespace xgboost {
 void Predictor::Init(
     const std::vector<std::pair<std::string, std::string>>& cfg,
     const std::vector<std::shared_ptr<DMatrix>>& cache) {
-  for (const std::shared_ptr<DMatrix>& d : cache) {
-    PredictionCacheEntry e;
-    e.data = d;
-    cache_[d.get()] = std::move(e);
-  }
-}
-bool Predictor::PredictFromCache(DMatrix* dmat,
-                                 std::vector<bst_float>* out_preds,
-                                 const gbm::GBTreeModel& model,
-                                 unsigned ntree_limit) {
-  if (ntree_limit == 0 ||
-      ntree_limit * model.param.num_output_group >= model.trees.size()) {
-    auto it = cache_.find(dmat);
-    if (it != cache_.end()) {
-      std::vector<bst_float>& y = it->second.predictions;
-      if (y.size() != 0) {
-        out_preds->resize(y.size());
-        std::copy(y.begin(), y.end(), out_preds->begin());
-        return true;
-      }
-    }
-  }
-
-  return false;
-}
-void Predictor::InitOutPredictions(const MetaInfo& info,
-                                   std::vector<bst_float>* out_preds,
-                                   const gbm::GBTreeModel& model) const {
-  size_t n = model.param.num_output_group * info.num_row;
-  const std::vector<bst_float>& base_margin = info.base_margin;
-  out_preds->resize(n);
-  if (base_margin.size() != 0) {
-    CHECK_EQ(out_preds->size(), n);
-    std::copy(base_margin.begin(), base_margin.end(), out_preds->begin());
-  } else {
-    std::fill(out_preds->begin(), out_preds->end(), model.base_margin);
-  }
+  for (const std::shared_ptr<DMatrix>& d : cache)
+    cache_[d.get()].data = d;
 }
 Predictor* Predictor::Create(std::string name) {
   auto* e = ::dmlc::Registry<PredictorReg>::Get()->Find(name);
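Note (not part of the patch): because `PredictionCacheEntry::predictions` is now a `HostDeviceVector`, the CPU and GPU predictors share the single base-class `cache_`, and the GPU-only `device_cache_` plus its host/device copy shims disappear; `Predictor::Init` shrinks to registering the cached `DMatrix` pointers. The lookup pattern both backends now use, sketched:

```cpp
// Sketch of the shared cache lookup (cache_ maps DMatrix* to
// PredictionCacheEntry, as declared in predictor.h earlier in this diff).
auto it = cache_.find(dmat);
if (it != cache_.end()) {
  HostDeviceVector<bst_float>& y = it->second.predictions;
  // CPUPredictor copies via y.data_h(); GPUPredictor copies on-device via
  // y.tbegin(device)/y.tend(device). The entry itself is backend-agnostic.
}
```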
diff --git a/src/tree/tree_updater.cc b/src/tree/tree_updater.cc
index 2ca949e21..66227a78a 100644
--- a/src/tree/tree_updater.cc
+++ b/src/tree/tree_updater.cc
@@ -22,17 +22,6 @@ TreeUpdater* TreeUpdater::Create(const std::string& name) {
   return (e->body)();
 }
 
-void TreeUpdater::Update(HostDeviceVector<bst_gpair>* gpair,
-                         DMatrix* data,
-                         const std::vector<RegTree*>& trees) {
-  Update(gpair->data_h(), data, trees);
-}
-
-bool TreeUpdater::UpdatePredictionCache(const DMatrix* data,
-                                        HostDeviceVector<bst_float>* out_preds) {
-  return UpdatePredictionCache(data, &out_preds->data_h());
-}
-
 }  // namespace xgboost
 
 namespace xgboost {
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 4044c75ab..5d687f2c4 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -26,7 +26,7 @@ class ColMaker: public TreeUpdater {
     param.InitAllowUnknown(args);
   }
 
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix* dmat,
               const std::vector<RegTree*> &trees) override {
     TStats::CheckInfo(dmat->info());
@@ -37,7 +37,7 @@ class ColMaker: public TreeUpdater {
     // build tree
     for (size_t i = 0; i < trees.size(); ++i) {
       Builder builder(param);
-      builder.Update(gpair, dmat, trees[i]);
+      builder.Update(gpair->data_h(), dmat, trees[i]);
     }
     param.learning_rate = lr;
   }
@@ -806,13 +806,13 @@ class DistColMaker : public ColMaker<TStats> {
     param.InitAllowUnknown(args);
     pruner->Init(args);
   }
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix* dmat,
               const std::vector<RegTree*> &trees) override {
     TStats::CheckInfo(dmat->info());
     CHECK_EQ(trees.size(), 1U) << "DistColMaker: only support one tree at a time";
     // build the tree
-    builder.Update(gpair, dmat, trees[0]);
+    builder.Update(gpair->data_h(), dmat, trees[0]);
     //// prune the tree, note that pruner will sync the tree
     pruner->Update(gpair, dmat, trees);
     // update position after the tree is pruned
@@ -967,7 +967,7 @@ class TreeUpdaterSwitch : public TreeUpdater {
     inner_->Init(args);
   }
 
-  void Update(const std::vector<bst_gpair>& gpair,
+  void Update(HostDeviceVector<bst_gpair>* gpair,
               DMatrix* data,
               const std::vector<RegTree*>& trees) override {
     CHECK(inner_ != nullptr);
diff --git a/src/tree/updater_fast_hist.cc b/src/tree/updater_fast_hist.cc
index 70d39b60b..a3cb01a05 100644
--- a/src/tree/updater_fast_hist.cc
+++ b/src/tree/updater_fast_hist.cc
@@ -55,7 +55,7 @@ class FastHistMaker: public TreeUpdater {
     is_gmat_initialized_ = false;
   }
 
-  void Update(const std::vector<bst_gpair>& gpair,
+  void Update(HostDeviceVector<bst_gpair>* gpair,
               DMatrix* dmat,
               const std::vector<RegTree*>& trees) override {
     TStats::CheckInfo(dmat->info());
@@ -82,13 +82,14 @@ class FastHistMaker: public TreeUpdater {
       builder_.reset(new Builder(param, fhparam, std::move(pruner_)));
     }
     for (size_t i = 0; i < trees.size(); ++i) {
-      builder_->Update(gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
+      builder_->Update
+        (gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
     }
     param.learning_rate = lr;
   }
 
   bool UpdatePredictionCache(const DMatrix* data,
-                             std::vector<bst_float>* out_preds) override {
+                             HostDeviceVector<bst_float>* out_preds) override {
     if (!builder_ || param.subsample < 1.0f) {
       return false;
     } else {
@@ -139,7 +140,7 @@ class FastHistMaker: public TreeUpdater {
     virtual void Update(const GHistIndexMatrix& gmat,
                         const GHistIndexBlockMatrix& gmatb,
                         const ColumnMatrix& column_matrix,
-                        const std::vector<bst_gpair>& gpair,
+                        HostDeviceVector<bst_gpair>* gpair,
                         DMatrix* p_fmat,
                         RegTree* p_tree) {
       double gstart = dmlc::GetTime();
@@ -154,8 +155,10 @@ class FastHistMaker: public TreeUpdater {
       double time_evaluate_split = 0;
      double time_apply_split = 0;
 
+      std::vector<bst_gpair>& gpair_h = gpair->data_h();
+
       tstart = dmlc::GetTime();
diff --git a/src/tree/updater_fast_hist.cc b/src/tree/updater_fast_hist.cc
index 70d39b60b..a3cb01a05 100644
--- a/src/tree/updater_fast_hist.cc
+++ b/src/tree/updater_fast_hist.cc
@@ -55,7 +55,7 @@ class FastHistMaker: public TreeUpdater {
     is_gmat_initialized_ = false;
   }

-  void Update(const std::vector<bst_gpair>& gpair,
+  void Update(HostDeviceVector<bst_gpair>* gpair,
               DMatrix* dmat,
               const std::vector<RegTree*>& trees) override {
     TStats::CheckInfo(dmat->info());
@@ -82,13 +82,14 @@ class FastHistMaker: public TreeUpdater {
       builder_.reset(new Builder(param, fhparam, std::move(pruner_)));
     }
     for (size_t i = 0; i < trees.size(); ++i) {
-      builder_->Update(gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
+      builder_->Update
+        (gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
     }
     param.learning_rate = lr;
   }

   bool UpdatePredictionCache(const DMatrix* data,
-                             std::vector<bst_float>* out_preds) override {
+                             HostDeviceVector<bst_float>* out_preds) override {
     if (!builder_ || param.subsample < 1.0f) {
       return false;
     } else {
@@ -139,7 +140,7 @@ class FastHistMaker: public TreeUpdater {
     virtual void Update(const GHistIndexMatrix& gmat,
                         const GHistIndexBlockMatrix& gmatb,
                         const ColumnMatrix& column_matrix,
-                        const std::vector<bst_gpair>& gpair,
+                        HostDeviceVector<bst_gpair>* gpair,
                         DMatrix* p_fmat,
                         RegTree* p_tree) {
       double gstart = dmlc::GetTime();
@@ -154,8 +155,10 @@ class FastHistMaker: public TreeUpdater {
       double time_evaluate_split = 0;
       double time_apply_split = 0;

+      std::vector<bst_gpair>& gpair_h = gpair->data_h();
+
       tstart = dmlc::GetTime();
-      this->InitData(gmat, gpair, *p_fmat, *p_tree);
+      this->InitData(gmat, gpair_h, *p_fmat, *p_tree);
       std::vector<bst_uint> feat_set = feat_index;
       time_init_data = dmlc::GetTime() - tstart;
@@ -165,11 +168,11 @@ class FastHistMaker: public TreeUpdater {
       for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
         tstart = dmlc::GetTime();
         hist_.AddHistRow(nid);
-        BuildHist(gpair, row_set_collection_[nid], gmat, gmatb, feat_set, hist_[nid]);
+        BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, feat_set, hist_[nid]);
         time_build_hist += dmlc::GetTime() - tstart;

         tstart = dmlc::GetTime();
-        this->InitNewNode(nid, gmat, gpair, *p_fmat, *p_tree);
+        this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree);
         time_init_new_node += dmlc::GetTime() - tstart;

         tstart = dmlc::GetTime();
@@ -200,17 +203,17 @@ class FastHistMaker: public TreeUpdater {
           hist_.AddHistRow(cleft);
           hist_.AddHistRow(cright);
           if (row_set_collection_[cleft].size() < row_set_collection_[cright].size()) {
-            BuildHist(gpair, row_set_collection_[cleft], gmat, gmatb, feat_set, hist_[cleft]);
+            BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, feat_set, hist_[cleft]);
             SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
           } else {
-            BuildHist(gpair, row_set_collection_[cright], gmat, gmatb, feat_set, hist_[cright]);
+            BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, feat_set, hist_[cright]);
             SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
           }
           time_build_hist += dmlc::GetTime() - tstart;

           tstart = dmlc::GetTime();
-          this->InitNewNode(cleft, gmat, gpair, *p_fmat, *p_tree);
-          this->InitNewNode(cright, gmat, gpair, *p_fmat, *p_tree);
+          this->InitNewNode(cleft, gmat, gpair_h, *p_fmat, *p_tree);
+          this->InitNewNode(cright, gmat, gpair_h, *p_fmat, *p_tree);
           time_init_new_node += dmlc::GetTime() - tstart;

           tstart = dmlc::GetTime();
@@ -293,8 +296,8 @@ class FastHistMaker: public TreeUpdater {
     }

     inline bool UpdatePredictionCache(const DMatrix* data,
-                                      std::vector<bst_float>* p_out_preds) {
-      std::vector<bst_float>& out_preds = *p_out_preds;
+                                      HostDeviceVector<bst_float>* p_out_preds) {
+      std::vector<bst_float>& out_preds = p_out_preds->data_h();

       // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
       // conjunction with Update().
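FastHistMaker keeps its prediction-cache fast path, but now writes through the host view of the shared `HostDeviceVector<bst_float>` rather than into a caller-owned `std::vector`. A toy version of that write-through; `row_to_leaf` and `leaf_values` are invented inputs standing in for the real tree state:

```cpp
#include <cstddef>
#include <vector>

template <typename T>
class HostDeviceVector {  // host-only stand-in, as in the earlier sketch
 public:
  std::vector<T>& data_h() { return h_; }
 private:
  std::vector<T> h_;
};

// Hypothetical cache refresh: add each row's new leaf value to its cached
// prediction, writing directly into the host copy so no temporary buffer
// or copy-back is required.
bool UpdatePredictionCache(const std::vector<int>& row_to_leaf,
                           const std::vector<float>& leaf_values,
                           HostDeviceVector<float>* p_out_preds) {
  std::vector<float>& out_preds = p_out_preds->data_h();
  if (out_preds.size() != row_to_leaf.size()) return false;  // cache miss
  for (std::size_t row = 0; row < row_to_leaf.size(); ++row) {
    out_preds[row] += leaf_values[row_to_leaf[row]];
  }
  return true;
}
```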
diff --git a/src/tree/updater_gpu.cu b/src/tree/updater_gpu.cu
index cee7c3e88..77eba9d72 100644
--- a/src/tree/updater_gpu.cu
+++ b/src/tree/updater_gpu.cu
@@ -512,7 +512,7 @@ class GPUMaker : public TreeUpdater {
     maxLeaves = 1 << param.max_depth;
   }

-  void Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
+  void Update(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
               const std::vector<RegTree*>& trees) override {
     GradStats::CheckInfo(dmat->info());
     // rescale learning rate according to size of trees
@@ -530,7 +530,7 @@ class GPUMaker : public TreeUpdater {
     param.learning_rate = lr;
   }

   /// @note: Update should be only after Init!!
-  void UpdateTree(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
+  void UpdateTree(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
                   RegTree* hTree) {
     if (!allocated) {
       setupOneTimeData(dmat);
@@ -687,11 +687,11 @@ class GPUMaker : public TreeUpdater {
     assignColIds<<<nCols, 512>>>(colIds.data(), colOffsets.data());
   }

-  void transferGrads(const std::vector<bst_gpair>& gpair) {
+  void transferGrads(HostDeviceVector<bst_gpair>* gpair) {
     // HACK
-    dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]),
+    dh::safe_cuda(cudaMemcpy(gradsInst.data(), gpair->ptr_d(param.gpu_id),
                              sizeof(bst_gpair) * nRows,
-                             cudaMemcpyHostToDevice));
+                             cudaMemcpyDefault));
     // evaluate the full-grad reduction for the root node
     dh::sumReduction(tmp_mem, gradsInst, gradSums, nRows);
   }
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 688d28031..48bd45f09 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -506,27 +506,9 @@ class GPUHistMaker : public TreeUpdater {
     monitor.Init("updater_gpu_hist", param.debug_verbose);
   }

-  void Update(const std::vector<bst_gpair>& gpair, DMatrix* dmat,
-              const std::vector<RegTree*>& trees) override {
-    monitor.Start("Update", dList);
-    // TODO(canonizer): move it into the class if this ever becomes a bottleneck
-    HostDeviceVector<bst_gpair> gpair_d(gpair.size(), param.gpu_id);
-    dh::safe_cuda(cudaSetDevice(param.gpu_id));
-    thrust::copy(gpair.begin(), gpair.end(), gpair_d.tbegin(param.gpu_id));
-    Update(&gpair_d, dmat, trees);
-    monitor.Stop("Update", dList);
-  }
-
   void Update(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
               const std::vector<RegTree*>& trees) override {
     monitor.Start("Update", dList);
-    UpdateHelper(gpair, dmat, trees);
-    monitor.Stop("Update", dList);
-  }
-
- private:
-  void UpdateHelper(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
-                    const std::vector<RegTree*>& trees) {
     GradStats::CheckInfo(dmat->info());
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
@@ -541,9 +523,9 @@ class GPUHistMaker : public TreeUpdater {
       LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
     }
     param.learning_rate = lr;
+    monitor.Stop("Update", dList);
   }

- public:
   void InitDataOnce(DMatrix* dmat) {
     info = &dmat->info();
     monitor.Start("Quantiles", dList);
@@ -876,16 +858,6 @@ class GPUHistMaker : public TreeUpdater {
     omp_set_num_threads(nthread);
   }

-  bool UpdatePredictionCache(const DMatrix* data,
-                             std::vector<bst_float>* p_out_preds) override {
-    return false;
-  }
-
-  bool UpdatePredictionCache(
-      const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) override {
-    return false;
-  }
-
   struct ExpandEntry {
     int nid;
     int depth;
diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc
index d4f011d06..04012f4b8 100644
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@@ -21,7 +21,7 @@ DMLC_REGISTRY_FILE_TAG(updater_histmaker);
 template<typename TStats>
 class HistMaker: public BaseMaker {
  public:
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix *p_fmat,
               const std::vector<RegTree*> &trees) override {
     TStats::CheckInfo(p_fmat->info());
@@ -30,7 +30,7 @@ class HistMaker: public BaseMaker {
     param.learning_rate = lr / trees.size();
     // build tree
     for (size_t i = 0; i < trees.size(); ++i) {
-      this->Update(gpair, p_fmat, trees[i]);
+      this->Update(gpair->data_h(), p_fmat, trees[i]);
     }
     param.learning_rate = lr;
   }
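In `transferGrads`, the gradients now arrive as a device pointer (`gpair->ptr_d(...)`) rather than a host pointer, so the copy flag changes from `cudaMemcpyHostToDevice` to `cudaMemcpyDefault`, which lets the CUDA runtime infer the direction from the addresses themselves (this relies on unified virtual addressing, standard on 64-bit platforms). A minimal illustration of why that flag makes a copy helper direction-agnostic:

```cpp
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdio>

// Copy n floats from src to dst, letting the runtime infer the direction.
// With cudaMemcpyDefault the same helper works for host-to-device,
// device-to-device, or device-to-host pointers.
bool CopyAnyDirection(float* dst, const float* src, std::size_t n) {
  cudaError_t err = cudaMemcpy(dst, src, n * sizeof(float), cudaMemcpyDefault);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
    return false;
  }
  return true;
}
```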
diff --git a/src/tree/updater_prune.cc b/src/tree/updater_prune.cc
index af52f73f4..bbdc155d1 100644
--- a/src/tree/updater_prune.cc
+++ b/src/tree/updater_prune.cc
@@ -29,7 +29,7 @@ class TreePruner: public TreeUpdater {
     syncher->Init(args);
   }
   // update the tree, do pruning
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix *p_fmat,
               const std::vector<RegTree*> &trees) override {
     // rescale learning rate according to size of trees
diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc
index fb4e72caf..e94a92147 100644
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -25,10 +25,11 @@ class TreeRefresher: public TreeUpdater {
     param.InitAllowUnknown(args);
   }
   // update the tree, do pruning
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix *p_fmat,
               const std::vector<RegTree*> &trees) override {
     if (trees.size() == 0) return;
+    std::vector<bst_gpair> &gpair_h = gpair->data_h();
     // number of threads
     // thread temporal space
     std::vector<std::vector<TStats> > stemp;
@@ -71,7 +72,7 @@ class TreeRefresher: public TreeUpdater {
         feats.Fill(inst);
         int offset = 0;
         for (size_t j = 0; j < trees.size(); ++j) {
-          AddStats(*trees[j], feats, gpair, info, ridx,
+          AddStats(*trees[j], feats, gpair_h, info, ridx,
                    dmlc::BeginPtr(stemp[tid]) + offset);
           offset += trees[j]->param.num_nodes;
         }
diff --git a/src/tree/updater_skmaker.cc b/src/tree/updater_skmaker.cc
index 1994cb6d3..688e2026c 100644
--- a/src/tree/updater_skmaker.cc
+++ b/src/tree/updater_skmaker.cc
@@ -22,7 +22,7 @@ DMLC_REGISTRY_FILE_TAG(updater_skmaker);

 class SketchMaker: public BaseMaker {
  public:
-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix *p_fmat,
               const std::vector<RegTree*> &trees) override {
     // rescale learning rate according to size of trees
@@ -30,7 +30,7 @@ class SketchMaker: public BaseMaker {
     param.learning_rate = lr / trees.size();
     // build tree
     for (size_t i = 0; i < trees.size(); ++i) {
-      this->Update(gpair, p_fmat, trees[i]);
+      this->Update(gpair->data_h(), p_fmat, trees[i]);
     }
     param.learning_rate = lr;
   }
diff --git a/src/tree/updater_sync.cc b/src/tree/updater_sync.cc
index bd17968cd..f2a5da48b 100644
--- a/src/tree/updater_sync.cc
+++ b/src/tree/updater_sync.cc
@@ -23,7 +23,7 @@ class TreeSyncher: public TreeUpdater {
  public:
   void Init(const std::vector<std::pair<std::string, std::string> >& args) override {}

-  void Update(const std::vector<bst_gpair> &gpair,
+  void Update(HostDeviceVector<bst_gpair> *gpair,
               DMatrix* dmat,
               const std::vector<RegTree*> &trees) override {
     if (rabit::GetWorldSize() == 1) return;
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 7f46e43b6..3318be60a 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -38,10 +38,13 @@ void CheckObjFunction(xgboost::ObjFunction * obj,
   info.labels = labels;
   info.weights = weights;

-  std::vector<xgboost::bst_gpair> gpair;
-  obj->GetGradient(preds, info, 1, &gpair);
+  xgboost::HostDeviceVector<xgboost::bst_float> in_preds(preds);

-  ASSERT_EQ(gpair.size(), preds.size());
+  xgboost::HostDeviceVector<xgboost::bst_gpair> out_gpair;
+  obj->GetGradient(&in_preds, info, 1, &out_gpair);
+  std::vector<xgboost::bst_gpair>& gpair = out_gpair.data_h();
+
+  ASSERT_EQ(gpair.size(), in_preds.size());
   for (int i = 0; i < static_cast<int>(gpair.size()); ++i) {
     EXPECT_NEAR(gpair[i].GetGrad(), out_grad[i], 0.01)
       << "Unexpected grad for pred=" << preds[i] << " label=" << labels[i]
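The updated `CheckObjFunction` helper demonstrates the round trip most callers now make: wrap host data in a `HostDeviceVector`, pass pointers through the API, then read the result back with `data_h()`. A framework-free sketch of that round trip; `FakeGetGradient` is an invented objective used only to keep the example self-contained:

```cpp
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

struct GradientPair { float grad, hess; };

template <typename T>
class HostDeviceVector {  // host-only stand-in, as in the earlier sketches
 public:
  HostDeviceVector() = default;
  explicit HostDeviceVector(std::vector<T> init) : h_(std::move(init)) {}
  std::size_t size() const { return h_.size(); }
  std::vector<T>& data_h() { return h_; }
 private:
  std::vector<T> h_;
};

// Invented objective: gradient of squared error against a zero label,
// filling out_gpair the way ObjFunction::GetGradient would.
void FakeGetGradient(HostDeviceVector<float>* preds,
                     HostDeviceVector<GradientPair>* out_gpair) {
  const std::vector<float>& p = preds->data_h();
  std::vector<GradientPair>& g = out_gpair->data_h();
  g.resize(p.size());
  for (std::size_t i = 0; i < p.size(); ++i) g[i] = {p[i], 1.0f};
}

int main() {
  HostDeviceVector<float> in_preds(std::vector<float>{0.5f, -1.0f});
  HostDeviceVector<GradientPair> out_gpair;
  FakeGetGradient(&in_preds, &out_gpair);
  assert(out_gpair.size() == in_preds.size());  // read back via data_h()
  return 0;
}
```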
diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc
index 3c679a6a5..8a1d3f6ec 100644
--- a/tests/cpp/objective/test_regression_obj.cc
+++ b/tests/cpp/objective/test_regression_obj.cc
@@ -46,10 +46,11 @@ TEST(Objective, LogisticRegressionBasic) {
     << "Expected error when base_score not in range [0,1f] for LogisticRegression";

   // test PredTransform
-  std::vector<xgboost::bst_float> preds = {0, 0.1f, 0.5f, 0.9f, 1};
+  xgboost::HostDeviceVector<xgboost::bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
   std::vector<xgboost::bst_float> out_preds = {0.5f, 0.524f, 0.622f, 0.710f, 0.731f};
-  obj->PredTransform(&preds);
-  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
+  obj->PredTransform(&io_preds);
+  auto& preds = io_preds.data_h();
+  for (int i = 0; i < static_cast<int>(io_preds.size()); ++i) {
     EXPECT_NEAR(preds[i], out_preds[i], 0.01f);
   }
 }
@@ -94,10 +95,11 @@ TEST(Objective, PoissonRegressionBasic) {
   EXPECT_NEAR(obj->ProbToMargin(0.9f), -0.10f, 0.01f);

   // test PredTransform
-  std::vector<xgboost::bst_float> preds = {0, 0.1f, 0.5f, 0.9f, 1};
+  xgboost::HostDeviceVector<xgboost::bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
   std::vector<xgboost::bst_float> out_preds = {1, 1.10f, 1.64f, 2.45f, 2.71f};
-  obj->PredTransform(&preds);
-  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
+  obj->PredTransform(&io_preds);
+  auto& preds = io_preds.data_h();
+  for (int i = 0; i < static_cast<int>(io_preds.size()); ++i) {
     EXPECT_NEAR(preds[i], out_preds[i], 0.01f);
   }
 }
@@ -129,10 +131,11 @@ TEST(Objective, GammaRegressionBasic) {
   EXPECT_NEAR(obj->ProbToMargin(0.9f), -0.10f, 0.01f);

   // test PredTransform
-  std::vector<xgboost::bst_float> preds = {0, 0.1f, 0.5f, 0.9f, 1};
+  xgboost::HostDeviceVector<xgboost::bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
   std::vector<xgboost::bst_float> out_preds = {1, 1.10f, 1.64f, 2.45f, 2.71f};
-  obj->PredTransform(&preds);
-  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
+  obj->PredTransform(&io_preds);
+  auto& preds = io_preds.data_h();
+  for (int i = 0; i < static_cast<int>(io_preds.size()); ++i) {
     EXPECT_NEAR(preds[i], out_preds[i], 0.01f);
   }
 }
@@ -165,10 +168,11 @@ TEST(Objective, TweedieRegressionBasic) {
   EXPECT_NEAR(obj->ProbToMargin(0.9f), 0.89f, 0.01f);

   // test PredTransform
-  std::vector<xgboost::bst_float> preds = {0, 0.1f, 0.5f, 0.9f, 1};
+  xgboost::HostDeviceVector<xgboost::bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
   std::vector<xgboost::bst_float> out_preds = {1, 1.10f, 1.64f, 2.45f, 2.71f};
-  obj->PredTransform(&preds);
-  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
+  obj->PredTransform(&io_preds);
+  auto& preds = io_preds.data_h();
+  for (int i = 0; i < static_cast<int>(io_preds.size()); ++i) {
     EXPECT_NEAR(preds[i], out_preds[i], 0.01f);
   }
 }
diff --git a/tests/cpp/objective/test_regression_obj_gpu.cu b/tests/cpp/objective/test_regression_obj_gpu.cu
index 0e507dc07..0ea8a8e1e 100644
--- a/tests/cpp/objective/test_regression_obj_gpu.cu
+++ b/tests/cpp/objective/test_regression_obj_gpu.cu
@@ -48,10 +48,11 @@ TEST(Objective, GPULogisticRegressionBasic) {
     << "Expected error when base_score not in range [0,1f] for LogisticRegression";

   // test PredTransform
-  std::vector<xgboost::bst_float> preds = {0, 0.1f, 0.5f, 0.9f, 1};
+  xgboost::HostDeviceVector<xgboost::bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
   std::vector<xgboost::bst_float> out_preds = {0.5f, 0.524f, 0.622f, 0.710f, 0.731f};
-  obj->PredTransform(&preds);
-  for (int i = 0; i < static_cast<int>(preds.size()); ++i) {
+  obj->PredTransform(&io_preds);
+  auto& preds = io_preds.data_h();
+  for (int i = 0; i < static_cast<int>(io_preds.size()); ++i) {
     EXPECT_NEAR(preds[i], out_preds[i], 0.01f);
   }
 }
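Each of the regression-objective tests above exercises the same in-place contract: `PredTransform` mutates the vector it is handed, and the caller reads the transformed values back through `data_h()`. A host-only sketch of a logistic transform written against that contract; this is illustrative, not the library's implementation:

```cpp
#include <cmath>
#include <initializer_list>
#include <vector>

template <typename T>
class HostDeviceVector {  // host-only stand-in, as in the earlier sketches
 public:
  HostDeviceVector(std::initializer_list<T> init) : h_(init) {}
  std::vector<T>& data_h() { return h_; }
 private:
  std::vector<T> h_;
};

// In-place logistic transform over the host view: margin -> probability.
void LogisticPredTransform(HostDeviceVector<float>* io_preds) {
  for (float& p : io_preds->data_h()) {
    p = 1.0f / (1.0f + std::exp(-p));  // e.g. 0 -> 0.5, 1 -> ~0.731
  }
}
```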
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index 025aa11e5..0a9c4c8cf 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -24,10 +24,11 @@ TEST(cpu_predictor, Test) {
   auto dmat = CreateDMatrix(n_row, n_col, 0);

   // Test predict batch
-  std::vector<bst_float> out_predictions;
+  HostDeviceVector<bst_float> out_predictions;
   cpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
+  std::vector<bst_float>& out_predictions_h = out_predictions.data_h();
   for (int i = 0; i < out_predictions.size(); i++) {
-    ASSERT_EQ(out_predictions[i], 1.5);
+    ASSERT_EQ(out_predictions_h[i], 1.5);
   }

   // Test predict instance
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 1a4a48f83..45fad97d6 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -33,13 +33,15 @@ TEST(gpu_predictor, Test) {
   auto dmat = CreateDMatrix(n_row, n_col, 0);

   // Test predict batch
-  std::vector<bst_float> gpu_out_predictions;
-  std::vector<bst_float> cpu_out_predictions;
+  HostDeviceVector<bst_float> gpu_out_predictions;
+  HostDeviceVector<bst_float> cpu_out_predictions;

   gpu_predictor->PredictBatch(dmat.get(), &gpu_out_predictions, model, 0);
   cpu_predictor->PredictBatch(dmat.get(), &cpu_out_predictions, model, 0);

+  std::vector<bst_float>& gpu_out_predictions_h = gpu_out_predictions.data_h();
+  std::vector<bst_float>& cpu_out_predictions_h = cpu_out_predictions.data_h();
   float abs_tolerance = 0.001;
   for (int i = 0; i < gpu_out_predictions.size(); i++) {
-    ASSERT_LT(std::abs(gpu_out_predictions[i] - cpu_out_predictions[i]),
+    ASSERT_LT(std::abs(gpu_out_predictions_h[i] - cpu_out_predictions_h[i]),
               abs_tolerance);
   }
   // Test predict instance