[WIP] Extract prediction into separate interface (#2531)
* [WIP] Extract prediction into separate interface * Add copyright, fix linter errors * Add predictor to amalgamation * Fix documentation * Move prediction cache into predictor, add GBTreeModel * Updated predictor doc comments
This commit is contained in:
parent
00eda28b3c
commit
0e06d1805d
@ -38,6 +38,7 @@ include_directories (
|
|||||||
file(GLOB_RECURSE SOURCES
|
file(GLOB_RECURSE SOURCES
|
||||||
src/*.cc
|
src/*.cc
|
||||||
src/*.h
|
src/*.h
|
||||||
|
include/*.h
|
||||||
)
|
)
|
||||||
# Only add main function for executable target
|
# Only add main function for executable target
|
||||||
list(REMOVE_ITEM SOURCES ${PROJECT_SOURCE_DIR}/src/cli_main.cc)
|
list(REMOVE_ITEM SOURCES ${PROJECT_SOURCE_DIR}/src/cli_main.cc)
|
||||||
|
|||||||
@ -32,6 +32,10 @@
|
|||||||
#include "../src/data/simple_dmatrix.cc"
|
#include "../src/data/simple_dmatrix.cc"
|
||||||
#include "../src/data/sparse_page_raw_format.cc"
|
#include "../src/data/sparse_page_raw_format.cc"
|
||||||
|
|
||||||
|
// prediction
|
||||||
|
#include "../src/predictor/predictor.cc"
|
||||||
|
#include "../src/predictor/cpu_predictor.cc"
|
||||||
|
|
||||||
#if DMLC_ENABLE_STD_THREAD
|
#if DMLC_ENABLE_STD_THREAD
|
||||||
#include "../src/data/sparse_page_source.cc"
|
#include "../src/data/sparse_page_source.cc"
|
||||||
#include "../src/data/sparse_page_dmatrix.cc"
|
#include "../src/data/sparse_page_dmatrix.cc"
|
||||||
|
|||||||
@ -77,7 +77,7 @@ class GradientBooster {
|
|||||||
* \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
|
* \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
|
||||||
* we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
|
* we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
|
||||||
*/
|
*/
|
||||||
virtual void Predict(DMatrix* dmat,
|
virtual void PredictBatch(DMatrix* dmat,
|
||||||
std::vector<bst_float>* out_preds,
|
std::vector<bst_float>* out_preds,
|
||||||
unsigned ntree_limit = 0) = 0;
|
unsigned ntree_limit = 0) = 0;
|
||||||
/*!
|
/*!
|
||||||
@ -92,7 +92,7 @@ class GradientBooster {
|
|||||||
* \param root_index the root index
|
* \param root_index the root index
|
||||||
* \sa Predict
|
* \sa PredictBatch
|
||||||
*/
|
*/
|
||||||
virtual void Predict(const SparseBatch::Inst& inst,
|
virtual void PredictInstance(const SparseBatch::Inst& inst,
|
||||||
std::vector<bst_float>* out_preds,
|
std::vector<bst_float>* out_preds,
|
||||||
unsigned ntree_limit = 0,
|
unsigned ntree_limit = 0,
|
||||||
unsigned root_index = 0) = 0;
|
unsigned root_index = 0) = 0;
|
||||||
|
|||||||
@ -189,7 +189,7 @@ inline void Learner::Predict(const SparseBatch::Inst& inst,
|
|||||||
bool output_margin,
|
bool output_margin,
|
||||||
std::vector<bst_float>* out_preds,
|
std::vector<bst_float>* out_preds,
|
||||||
unsigned ntree_limit) const {
|
unsigned ntree_limit) const {
|
||||||
gbm_->Predict(inst, out_preds, ntree_limit);
|
gbm_->PredictInstance(inst, out_preds, ntree_limit);
|
||||||
if (!output_margin) {
|
if (!output_margin) {
|
||||||
obj_->PredTransform(out_preds);
|
obj_->PredTransform(out_preds);
|
||||||
}
|
}
|
||||||
|
|||||||
172
include/xgboost/predictor.h
Normal file
172
include/xgboost/predictor.h
Normal file
@ -0,0 +1,172 @@
|
|||||||
|
/*!
|
||||||
|
* Copyright by Contributors
|
||||||
|
* \file predictor.h
|
||||||
|
* \brief Interface of predictor,
|
||||||
|
* performs predictions for a gradient booster.
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
#include <xgboost/base.h>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "../../src/gbm/gbtree_model.h"
|
||||||
|
|
||||||
|
// Forward declarations
|
||||||
|
namespace xgboost {
|
||||||
|
class DMatrix;
|
||||||
|
class TreeUpdater;
|
||||||
|
}
|
||||||
|
namespace xgboost {
|
||||||
|
namespace gbm {
|
||||||
|
struct GBTreeModel;
|
||||||
|
}
|
||||||
|
} // namespace xgboost
|
||||||
|
|
||||||
|
namespace xgboost {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \class Predictor
|
||||||
|
*
|
||||||
|
* \brief Performs prediction on individual training instances or batches of instances for GBTree.
|
||||||
|
* The predictor also manages a prediction cache associated with input matrices. If possible,
|
||||||
|
* it will use previously calculated predictions instead of calculating new predictions.
|
||||||
|
* Prediction functions all take a GBTreeModel and a DMatrix as input and output a vector of
|
||||||
|
* predictions. The predictor does not modify any state of the model itself.
|
||||||
|
*/
|
||||||
|
|
||||||
|
class Predictor {
|
||||||
|
public:
|
||||||
|
virtual ~Predictor() {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \fn void Predictor::InitCache(const std::vector<std::shared_ptr<DMatrix> > &cache);
|
||||||
|
*
|
||||||
|
* \brief Register input matrices in prediction cache.
|
||||||
|
*
|
||||||
|
* \param cache Vector of DMatrix's to be used in prediction.
|
||||||
|
*/
|
||||||
|
|
||||||
|
void InitCache(const std::vector<std::shared_ptr<DMatrix> > &cache);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \fn virtual void Predictor::PredictBatch( DMatrix* dmat, std::vector<bst_float>* out_preds, const gbm::GBTreeModel &model, int tree_begin, unsigned ntree_limit = 0) = 0;
|
||||||
|
*
|
||||||
|
* \brief Generate batch predictions for a given feature matrix. May use cached predictions if available instead of calculating from scratch.
|
||||||
|
*
|
||||||
|
* \param [in,out] dmat Feature matrix.
|
||||||
|
* \param [in,out] out_preds The output preds.
|
||||||
|
* \param model The model to predict from.
|
||||||
|
* \param tree_begin The tree begin index.
|
||||||
|
* \param ntree_limit (Optional) The ntree limit. 0 means do not limit trees.
|
||||||
|
*/
|
||||||
|
|
||||||
|
virtual void PredictBatch(
|
||||||
|
DMatrix* dmat, std::vector<bst_float>* out_preds, const gbm::GBTreeModel &model,
|
||||||
|
int tree_begin, unsigned ntree_limit = 0) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \fn virtual void Predictor::UpdatePredictionCache( const gbm::GBTreeModel &model, std::vector<std::unique_ptr<TreeUpdater> >* updaters, int num_new_trees) = 0;
|
||||||
|
*
|
||||||
|
* \brief Update the internal prediction cache using newly added trees. Will use the tree updater
|
||||||
|
* to do this if possible. Should be called as a part of the tree boosting process to facilitate the look up of predictions at a later time.
|
||||||
|
*
|
||||||
|
* \param model The model.
|
||||||
|
* \param [in,out] updaters The updater sequence for gradient boosting.
|
||||||
|
* \param num_new_trees Number of new trees.
|
||||||
|
*/
|
||||||
|
|
||||||
|
virtual void UpdatePredictionCache(
|
||||||
|
const gbm::GBTreeModel &model, std::vector<std::unique_ptr<TreeUpdater> >* updaters,
|
||||||
|
int num_new_trees) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \fn virtual void Predictor::PredictInstance( const SparseBatch::Inst& inst, std::vector<bst_float>* out_preds, const gbm::GBTreeModel& model, unsigned ntree_limit = 0, unsigned root_index = 0) = 0;
|
||||||
|
*
|
||||||
|
* \brief online prediction function, predict score for one instance at a time NOTE: use the batch
|
||||||
|
* prediction interface if possible, batch prediction is usually more efficient than online
|
||||||
|
* prediction This function is NOT threadsafe, make sure you only call from one thread.
|
||||||
|
*
|
||||||
|
* \param inst The instance to predict.
|
||||||
|
* \param [in,out] out_preds The output preds.
|
||||||
|
* \param model The model to predict from
|
||||||
|
* \param ntree_limit (Optional) The ntree limit.
|
||||||
|
* \param root_index (Optional) Zero-based index of the root.
|
||||||
|
*/
|
||||||
|
|
||||||
|
virtual void PredictInstance(
|
||||||
|
const SparseBatch::Inst& inst, std::vector<bst_float>* out_preds,
|
||||||
|
const gbm::GBTreeModel& model, unsigned ntree_limit = 0, unsigned root_index = 0) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \fn virtual void Predictor::PredictLeaf(DMatrix* dmat, std::vector<bst_float>* out_preds, const gbm::GBTreeModel& model, unsigned ntree_limit = 0) = 0;
|
||||||
|
*
|
||||||
|
* \brief predict the leaf index of each tree, the output will be nsample * ntree vector this is
|
||||||
|
* only valid in gbtree predictor.
|
||||||
|
*
|
||||||
|
* \param [in,out] dmat The input feature matrix.
|
||||||
|
* \param [in,out] out_preds The output preds.
|
||||||
|
* \param model Model to make predictions from.
|
||||||
|
* \param ntree_limit (Optional) The ntree limit.
|
||||||
|
*/
|
||||||
|
|
||||||
|
virtual void PredictLeaf(DMatrix* dmat, std::vector<bst_float>* out_preds,
|
||||||
|
const gbm::GBTreeModel& model, unsigned ntree_limit = 0) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \fn virtual void Predictor::PredictContribution( DMatrix* dmat, std::vector<bst_float>* out_contribs, const gbm::GBTreeModel& model, unsigned ntree_limit = 0) = 0;
|
||||||
|
*
|
||||||
|
* \brief feature contributions to individual predictions; the output will be a vector of length
|
||||||
|
* (nfeats + 1) * num_output_group * nsample, arranged in that order.
|
||||||
|
*
|
||||||
|
* \param [in,out] dmat The input feature matrix.
|
||||||
|
* \param [in,out] out_contribs The output feature contribs.
|
||||||
|
* \param model Model to make predictions from.
|
||||||
|
* \param ntree_limit (Optional) The ntree limit.
|
||||||
|
*/
|
||||||
|
|
||||||
|
virtual void PredictContribution(
|
||||||
|
DMatrix* dmat, std::vector<bst_float>* out_contribs,
|
||||||
|
const gbm::GBTreeModel& model, unsigned ntree_limit = 0) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \fn static Predictor* Predictor::Create(std::string name);
|
||||||
|
*
|
||||||
|
* \brief Creates a new Predictor*.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
static Predictor* Create(std::string name);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
/**
|
||||||
|
* \struct PredictionCacheEntry
|
||||||
|
*
|
||||||
|
* \brief Contains pointer to input matrix and associated cached predictions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
struct PredictionCacheEntry {
|
||||||
|
std::shared_ptr<DMatrix> data;
|
||||||
|
std::vector<bst_float> predictions;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Map of matrices and associated cached predictions to facilitate storing and looking up
|
||||||
|
* predictions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
std::unordered_map<DMatrix*, PredictionCacheEntry> cache_;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Registry entry for predictor.
|
||||||
|
*/
|
||||||
|
struct PredictorReg
|
||||||
|
: public dmlc::FunctionRegEntryBase<PredictorReg,
|
||||||
|
std::function<Predictor*()>> {};
|
||||||
|
|
||||||
|
#define XGBOOST_REGISTER_PREDICTOR(UniqueId, Name) \
|
||||||
|
static DMLC_ATTRIBUTE_UNUSED ::xgboost::PredictorReg& \
|
||||||
|
__make_##PredictorReg##_##UniqueId##__ = \
|
||||||
|
::dmlc::Registry<::xgboost::PredictorReg>::Get()->__REGISTER__(Name)
|
||||||
|
} // namespace xgboost
|
||||||
@ -170,7 +170,7 @@ class GBLinear : public GradientBooster {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Predict(DMatrix *p_fmat,
|
void PredictBatch(DMatrix *p_fmat,
|
||||||
std::vector<bst_float> *out_preds,
|
std::vector<bst_float> *out_preds,
|
||||||
unsigned ntree_limit) override {
|
unsigned ntree_limit) override {
|
||||||
if (model.weight.size() == 0) {
|
if (model.weight.size() == 0) {
|
||||||
@ -205,7 +205,7 @@ class GBLinear : public GradientBooster {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// add base margin
|
// add base margin
|
||||||
void Predict(const SparseBatch::Inst &inst,
|
void PredictInstance(const SparseBatch::Inst &inst,
|
||||||
std::vector<bst_float> *out_preds,
|
std::vector<bst_float> *out_preds,
|
||||||
unsigned ntree_limit,
|
unsigned ntree_limit,
|
||||||
unsigned root_index) override {
|
unsigned root_index) override {
|
||||||
|
|||||||
@ -9,18 +9,17 @@
|
|||||||
#include <dmlc/timer.h>
|
#include <dmlc/timer.h>
|
||||||
#include <xgboost/logging.h>
|
#include <xgboost/logging.h>
|
||||||
#include <xgboost/gbm.h>
|
#include <xgboost/gbm.h>
|
||||||
|
#include <xgboost/predictor.h>
|
||||||
#include <xgboost/tree_updater.h>
|
#include <xgboost/tree_updater.h>
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <unordered_map>
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include "../common/common.h"
|
#include "../common/common.h"
|
||||||
|
|
||||||
#include "../common/random.h"
|
#include "../common/random.h"
|
||||||
|
#include "gbtree_model.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace gbm {
|
namespace gbm {
|
||||||
@ -121,47 +120,6 @@ struct DartTrainParam : public dmlc::Parameter<DartTrainParam> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/*! \brief model parameters */
|
|
||||||
struct GBTreeModelParam : public dmlc::Parameter<GBTreeModelParam> {
|
|
||||||
/*! \brief number of trees */
|
|
||||||
int num_trees;
|
|
||||||
/*! \brief number of roots */
|
|
||||||
int num_roots;
|
|
||||||
/*! \brief number of features to be used by trees */
|
|
||||||
int num_feature;
|
|
||||||
/*! \brief pad this space, for backward compatibility reason.*/
|
|
||||||
int pad_32bit;
|
|
||||||
/*! \brief deprecated padding space. */
|
|
||||||
int64_t num_pbuffer_deprecated;
|
|
||||||
/*!
|
|
||||||
* \brief how many output group a single instance can produce
|
|
||||||
* this affects the behavior of number of output we have:
|
|
||||||
* suppose we have n instance and k group, output will be k * n
|
|
||||||
*/
|
|
||||||
int num_output_group;
|
|
||||||
/*! \brief size of leaf vector needed in tree */
|
|
||||||
int size_leaf_vector;
|
|
||||||
/*! \brief reserved parameters */
|
|
||||||
int reserved[32];
|
|
||||||
/*! \brief constructor */
|
|
||||||
GBTreeModelParam() {
|
|
||||||
std::memset(this, 0, sizeof(GBTreeModelParam));
|
|
||||||
static_assert(sizeof(GBTreeModelParam) == (4 + 2 + 2 + 32) * sizeof(int),
|
|
||||||
"64/32 bit compatibility issue");
|
|
||||||
}
|
|
||||||
// declare parameters, only declare those that need to be set.
|
|
||||||
DMLC_DECLARE_PARAMETER(GBTreeModelParam) {
|
|
||||||
DMLC_DECLARE_FIELD(num_output_group).set_lower_bound(1).set_default(1)
|
|
||||||
.describe("Number of output groups to be predicted,"\
|
|
||||||
" used for multi-class classification.");
|
|
||||||
DMLC_DECLARE_FIELD(num_roots).set_lower_bound(1).set_default(1)
|
|
||||||
.describe("Tree updater sequence.");
|
|
||||||
DMLC_DECLARE_FIELD(num_feature).set_lower_bound(0)
|
|
||||||
.describe("Number of features used for training and prediction.");
|
|
||||||
DMLC_DECLARE_FIELD(size_leaf_vector).set_lower_bound(0).set_default(0)
|
|
||||||
.describe("Reserved option for vector tree.");
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// cache entry
|
// cache entry
|
||||||
struct CacheEntry {
|
struct CacheEntry {
|
||||||
@ -172,22 +130,18 @@ struct CacheEntry {
|
|||||||
// gradient boosted trees
|
// gradient boosted trees
|
||||||
class GBTree : public GradientBooster {
|
class GBTree : public GradientBooster {
|
||||||
public:
|
public:
|
||||||
explicit GBTree(bst_float base_margin) : base_margin_(base_margin) {}
|
explicit GBTree(bst_float base_margin)
|
||||||
|
: model_(base_margin),
|
||||||
|
predictor(
|
||||||
|
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor"))) {}
|
||||||
|
|
||||||
void InitCache(const std::vector<std::shared_ptr<DMatrix> > &cache) {
|
void InitCache(const std::vector<std::shared_ptr<DMatrix> > &cache) {
|
||||||
for (const std::shared_ptr<DMatrix>& d : cache) {
|
predictor->InitCache(cache);
|
||||||
CacheEntry e;
|
|
||||||
e.data = d;
|
|
||||||
cache_[d.get()] = std::move(e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
|
void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
|
||||||
this->cfg = cfg;
|
this->cfg = cfg;
|
||||||
// initialize model parameters if not yet been initialized.
|
model_.Configure(cfg);
|
||||||
if (trees.size() == 0) {
|
|
||||||
mparam.InitAllowUnknown(cfg);
|
|
||||||
}
|
|
||||||
// initialize the updaters only when needed.
|
// initialize the updaters only when needed.
|
||||||
std::string updater_seq = tparam.updater_seq;
|
std::string updater_seq = tparam.updater_seq;
|
||||||
tparam.InitAllowUnknown(cfg);
|
tparam.InitAllowUnknown(cfg);
|
||||||
@ -196,48 +150,25 @@ class GBTree : public GradientBooster {
|
|||||||
up->Init(cfg);
|
up->Init(cfg);
|
||||||
}
|
}
|
||||||
// for the 'update' process_type, move trees into trees_to_update
|
// for the 'update' process_type, move trees into trees_to_update
|
||||||
if (tparam.process_type == kUpdate && trees_to_update.size() == 0u) {
|
if (tparam.process_type == kUpdate) {
|
||||||
for (size_t i = 0; i < trees.size(); ++i) {
|
model_.InitTreesToUpdate();
|
||||||
trees_to_update.push_back(std::move(trees[i]));
|
|
||||||
}
|
|
||||||
trees.clear();
|
|
||||||
mparam.num_trees = 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Load(dmlc::Stream* fi) override {
|
void Load(dmlc::Stream* fi) override {
|
||||||
CHECK_EQ(fi->Read(&mparam, sizeof(mparam)), sizeof(mparam))
|
model_.Load(fi);
|
||||||
<< "GBTree: invalid model file";
|
|
||||||
trees.clear();
|
|
||||||
trees_to_update.clear();
|
|
||||||
for (int i = 0; i < mparam.num_trees; ++i) {
|
|
||||||
std::unique_ptr<RegTree> ptr(new RegTree());
|
|
||||||
ptr->Load(fi);
|
|
||||||
trees.push_back(std::move(ptr));
|
|
||||||
}
|
|
||||||
tree_info.resize(mparam.num_trees);
|
|
||||||
if (mparam.num_trees != 0) {
|
|
||||||
CHECK_EQ(fi->Read(dmlc::BeginPtr(tree_info), sizeof(int) * mparam.num_trees),
|
|
||||||
sizeof(int) * mparam.num_trees);
|
|
||||||
}
|
|
||||||
this->cfg.clear();
|
this->cfg.clear();
|
||||||
this->cfg.push_back(std::make_pair(std::string("num_feature"),
|
this->cfg.push_back(std::make_pair(std::string("num_feature"),
|
||||||
common::ToString(mparam.num_feature)));
|
common::ToString(model_.param.num_feature)));
|
||||||
}
|
}
|
||||||
|
|
||||||
void Save(dmlc::Stream* fo) const override {
|
void Save(dmlc::Stream* fo) const override {
|
||||||
CHECK_EQ(mparam.num_trees, static_cast<int>(trees.size()));
|
model_.Save(fo);
|
||||||
fo->Write(&mparam, sizeof(mparam));
|
|
||||||
for (size_t i = 0; i < trees.size(); ++i) {
|
|
||||||
trees[i]->Save(fo);
|
|
||||||
}
|
|
||||||
if (tree_info.size() != 0) {
|
|
||||||
fo->Write(dmlc::BeginPtr(tree_info), sizeof(int) * tree_info.size());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AllowLazyCheckPoint() const override {
|
bool AllowLazyCheckPoint() const override {
|
||||||
return mparam.num_output_group == 1 ||
|
return model_.param.num_output_group == 1 ||
|
||||||
tparam.updater_seq.find("distcol") != std::string::npos;
|
tparam.updater_seq.find("distcol") != std::string::npos;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -246,7 +177,7 @@ class GBTree : public GradientBooster {
|
|||||||
ObjFunction* obj) override {
|
ObjFunction* obj) override {
|
||||||
const std::vector<bst_gpair>& gpair = *in_gpair;
|
const std::vector<bst_gpair>& gpair = *in_gpair;
|
||||||
std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
|
std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
|
||||||
const int ngroup = mparam.num_output_group;
|
const int ngroup = model_.param.num_output_group;
|
||||||
if (ngroup == 1) {
|
if (ngroup == 1) {
|
||||||
std::vector<std::unique_ptr<RegTree> > ret;
|
std::vector<std::unique_ptr<RegTree> > ret;
|
||||||
BoostNewTrees(gpair, p_fmat, 0, &ret);
|
BoostNewTrees(gpair, p_fmat, 0, &ret);
|
||||||
@ -275,72 +206,163 @@ class GBTree : public GradientBooster {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Predict(DMatrix* p_fmat,
|
void PredictBatch(DMatrix* p_fmat,
|
||||||
std::vector<bst_float>* out_preds,
|
std::vector<bst_float>* out_preds,
|
||||||
unsigned ntree_limit) override {
|
unsigned ntree_limit) override {
|
||||||
if (ntree_limit == 0 ||
|
predictor->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit);
|
||||||
ntree_limit * mparam.num_output_group >= trees.size()) {
|
|
||||||
auto it = cache_.find(p_fmat);
|
|
||||||
if (it != cache_.end()) {
|
|
||||||
std::vector<bst_float>& y = it->second.predictions;
|
|
||||||
if (y.size() != 0) {
|
|
||||||
out_preds->resize(y.size());
|
|
||||||
std::copy(y.begin(), y.end(), out_preds->begin());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
PredLoopInternal<GBTree>(p_fmat, out_preds, 0, ntree_limit, true);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Predict(const SparseBatch::Inst& inst,
|
void PredictInstance(const SparseBatch::Inst& inst,
|
||||||
std::vector<bst_float>* out_preds,
|
std::vector<bst_float>* out_preds,
|
||||||
unsigned ntree_limit,
|
unsigned ntree_limit,
|
||||||
unsigned root_index) override {
|
unsigned root_index) override {
|
||||||
if (thread_temp.size() == 0) {
|
predictor->PredictInstance(inst, out_preds, model_,
|
||||||
thread_temp.resize(1, RegTree::FVec());
|
ntree_limit, root_index);
|
||||||
thread_temp[0].Init(mparam.num_feature);
|
|
||||||
}
|
|
||||||
ntree_limit *= mparam.num_output_group;
|
|
||||||
if (ntree_limit == 0 || ntree_limit > trees.size()) {
|
|
||||||
ntree_limit = static_cast<unsigned>(trees.size());
|
|
||||||
}
|
|
||||||
out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
|
|
||||||
// loop over output groups
|
|
||||||
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
|
|
||||||
(*out_preds)[gid] =
|
|
||||||
PredValue(inst, gid, root_index,
|
|
||||||
&thread_temp[0], 0, ntree_limit) + base_margin_;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void PredictLeaf(DMatrix* p_fmat,
|
void PredictLeaf(DMatrix* p_fmat,
|
||||||
std::vector<bst_float>* out_preds,
|
std::vector<bst_float>* out_preds,
|
||||||
unsigned ntree_limit) override {
|
unsigned ntree_limit) override {
|
||||||
const int nthread = omp_get_max_threads();
|
predictor->PredictLeaf(p_fmat, out_preds, model_, ntree_limit);
|
||||||
InitThreadTemp(nthread);
|
|
||||||
this->PredPath(p_fmat, out_preds, ntree_limit);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void PredictContribution(DMatrix* p_fmat,
|
void PredictContribution(DMatrix* p_fmat,
|
||||||
std::vector<bst_float>* out_contribs,
|
std::vector<bst_float>* out_contribs,
|
||||||
unsigned ntree_limit) override {
|
unsigned ntree_limit) override {
|
||||||
const int nthread = omp_get_max_threads();
|
predictor->PredictContribution(p_fmat, out_contribs, model_, ntree_limit);
|
||||||
InitThreadTemp(nthread);
|
|
||||||
this->PredContrib(p_fmat, out_contribs, ntree_limit);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> DumpModel(const FeatureMap& fmap,
|
std::vector<std::string> DumpModel(const FeatureMap& fmap,
|
||||||
bool with_stats,
|
bool with_stats,
|
||||||
std::string format) const override {
|
std::string format) const override {
|
||||||
std::vector<std::string> dump;
|
return model_.DumpModel(fmap, with_stats, format);
|
||||||
for (size_t i = 0; i < trees.size(); i++) {
|
|
||||||
dump.push_back(trees[i]->DumpModel(fmap, with_stats, format));
|
|
||||||
}
|
|
||||||
return dump;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
// initialize updater before using them
|
||||||
|
inline void InitUpdater() {
|
||||||
|
if (updaters.size() != 0) return;
|
||||||
|
std::string tval = tparam.updater_seq;
|
||||||
|
std::vector<std::string> ups = common::Split(tval, ',');
|
||||||
|
for (const std::string& pstr : ups) {
|
||||||
|
std::unique_ptr<TreeUpdater> up(TreeUpdater::Create(pstr.c_str()));
|
||||||
|
up->Init(this->cfg);
|
||||||
|
updaters.push_back(std::move(up));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// do group specific group
|
||||||
|
inline void
|
||||||
|
BoostNewTrees(const std::vector<bst_gpair> &gpair,
|
||||||
|
DMatrix *p_fmat,
|
||||||
|
int bst_group,
|
||||||
|
std::vector<std::unique_ptr<RegTree> >* ret) {
|
||||||
|
this->InitUpdater();
|
||||||
|
std::vector<RegTree*> new_trees;
|
||||||
|
ret->clear();
|
||||||
|
// create the trees
|
||||||
|
for (int i = 0; i < tparam.num_parallel_tree; ++i) {
|
||||||
|
if (tparam.process_type == kDefault) {
|
||||||
|
// create new tree
|
||||||
|
std::unique_ptr<RegTree> ptr(new RegTree());
|
||||||
|
ptr->param.InitAllowUnknown(this->cfg);
|
||||||
|
ptr->InitModel();
|
||||||
|
new_trees.push_back(ptr.get());
|
||||||
|
ret->push_back(std::move(ptr));
|
||||||
|
} else if (tparam.process_type == kUpdate) {
|
||||||
|
CHECK_LT(model_.trees.size(), model_.trees_to_update.size());
|
||||||
|
// move an existing tree from trees_to_update
|
||||||
|
auto t = std::move(model_.trees_to_update[model_.trees.size() +
|
||||||
|
bst_group * tparam.num_parallel_tree + i]);
|
||||||
|
new_trees.push_back(t.get());
|
||||||
|
ret->push_back(std::move(t));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// update the trees
|
||||||
|
for (auto& up : updaters) {
|
||||||
|
up->Update(gpair, p_fmat, new_trees);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// commit new trees all at once
|
||||||
|
virtual void
|
||||||
|
CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
|
||||||
|
int bst_group) {
|
||||||
|
model_.CommitModel(std::move(new_trees), bst_group);
|
||||||
|
|
||||||
|
predictor->UpdatePredictionCache(model_, &updaters, new_trees.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- data structure ---
|
||||||
|
GBTreeModel model_;
|
||||||
|
// training parameter
|
||||||
|
GBTreeTrainParam tparam;
|
||||||
|
// ----training fields----
|
||||||
|
// configurations for tree
|
||||||
|
std::vector<std::pair<std::string, std::string> > cfg;
|
||||||
|
// the updaters that can be applied to each of tree
|
||||||
|
std::vector<std::unique_ptr<TreeUpdater>> updaters;
|
||||||
|
|
||||||
|
std::unique_ptr<Predictor> predictor;
|
||||||
|
};
|
||||||
|
|
||||||
|
// dart
|
||||||
|
class Dart : public GBTree {
|
||||||
|
public:
|
||||||
|
explicit Dart(bst_float base_margin) : GBTree(base_margin) {}
|
||||||
|
|
||||||
|
void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
|
||||||
|
GBTree::Configure(cfg);
|
||||||
|
if (model_.trees.size() == 0) {
|
||||||
|
dparam.InitAllowUnknown(cfg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load the base tree model, then the per-tree drop weights that Save()
// appends after it.
//  fi input stream positioned at the start of the booster
void Load(dmlc::Stream* fi) override {
  GBTree::Load(fi);
  weight_drop.resize(model_.param.num_trees);
  if (model_.param.num_trees != 0) {
    // Fail loudly on a truncated or corrupt stream instead of silently
    // continuing with whatever weight_drop happens to contain.
    CHECK(fi->Read(&weight_drop)) << "Dart: invalid model file";
  }
}
|
||||||
|
|
||||||
|
// Write the base tree model, followed by the drop weights when there are any.
//  fo output stream
void Save(dmlc::Stream* fo) const override {
  GBTree::Save(fo);
  if (weight_drop.empty()) return;
  fo->Write(weight_drop);
}
|
||||||
|
|
||||||
|
// predict the leaf scores with dropout if ntree_limit = 0
|
||||||
|
void PredictBatch(DMatrix* p_fmat,
|
||||||
|
std::vector<bst_float>* out_preds,
|
||||||
|
unsigned ntree_limit) override {
|
||||||
|
DropTrees(ntree_limit);
|
||||||
|
PredLoopInternal<Dart>(p_fmat, out_preds, 0, ntree_limit, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Predict a single instance with dart.
//  inst        the instance to score
//  out_preds   resized to num_output_group and filled with one score per group
//  ntree_limit per-group tree limit; 0 (or a value past the end) means all trees
//  root_index  root index forwarded to PredValue
void PredictInstance(const SparseBatch::Inst& inst,
                     std::vector<bst_float>* out_preds,
                     unsigned ntree_limit,
                     unsigned root_index) override {
  DropTrees(1);
  // set up the single scratch feature vector on first use
  if (thread_temp.empty()) {
    thread_temp.resize(1, RegTree::FVec());
    thread_temp[0].Init(model_.param.num_feature);
  }
  const auto num_group = model_.param.num_output_group;
  out_preds->resize(num_group);
  // turn the per-group limit into a tree count and clamp it to what exists
  ntree_limit *= num_group;
  if (ntree_limit == 0 || ntree_limit > model_.trees.size()) {
    ntree_limit = static_cast<unsigned>(model_.trees.size());
  }
  // one score per output group, each offset by the model's base margin
  std::vector<bst_float>& scores = *out_preds;
  for (int gid = 0; gid < num_group; ++gid) {
    const bst_float tree_sum =
        PredValue(inst, gid, root_index, &thread_temp[0], 0, ntree_limit);
    scores[gid] = tree_sum + model_.base_margin;
  }
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
friend class GBTree;
|
||||||
// internal prediction loop
|
// internal prediction loop
|
||||||
// add predictions to out_preds
|
// add predictions to out_preds
|
||||||
template<typename Derived>
|
template<typename Derived>
|
||||||
@ -350,10 +372,10 @@ class GBTree : public GradientBooster {
|
|||||||
unsigned tree_begin,
|
unsigned tree_begin,
|
||||||
unsigned ntree_limit,
|
unsigned ntree_limit,
|
||||||
bool init_out_preds) {
|
bool init_out_preds) {
|
||||||
int num_group = mparam.num_output_group;
|
int num_group = model_.param.num_output_group;
|
||||||
ntree_limit *= num_group;
|
ntree_limit *= num_group;
|
||||||
if (ntree_limit == 0 || ntree_limit > trees.size()) {
|
if (ntree_limit == 0 || ntree_limit > model_.trees.size()) {
|
||||||
ntree_limit = static_cast<unsigned>(trees.size());
|
ntree_limit = static_cast<unsigned>(model_.trees.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (init_out_preds) {
|
if (init_out_preds) {
|
||||||
@ -364,7 +386,7 @@ class GBTree : public GradientBooster {
|
|||||||
CHECK_EQ(out_preds->size(), n);
|
CHECK_EQ(out_preds->size(), n);
|
||||||
std::copy(base_margin.begin(), base_margin.end(), out_preds->begin());
|
std::copy(base_margin.begin(), base_margin.end(), out_preds->begin());
|
||||||
} else {
|
} else {
|
||||||
std::fill(out_preds->begin(), out_preds->end(), base_margin_);
|
std::fill(out_preds->begin(), out_preds->end(), model_.base_margin);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -386,10 +408,10 @@ class GBTree : public GradientBooster {
|
|||||||
unsigned tree_end) {
|
unsigned tree_end) {
|
||||||
const MetaInfo& info = p_fmat->info();
|
const MetaInfo& info = p_fmat->info();
|
||||||
const int nthread = omp_get_max_threads();
|
const int nthread = omp_get_max_threads();
|
||||||
CHECK_EQ(num_group, mparam.num_output_group);
|
CHECK_EQ(num_group, model_.param.num_output_group);
|
||||||
InitThreadTemp(nthread);
|
InitThreadTemp(nthread);
|
||||||
std::vector<bst_float>& preds = *out_preds;
|
std::vector<bst_float>& preds = *out_preds;
|
||||||
CHECK_EQ(mparam.size_leaf_vector, 0)
|
CHECK_EQ(model_.param.size_leaf_vector, 0)
|
||||||
<< "size_leaf_vector is enforced to 0 so far";
|
<< "size_leaf_vector is enforced to 0 so far";
|
||||||
CHECK_EQ(preds.size(), p_fmat->info().num_row * num_group);
|
CHECK_EQ(preds.size(), p_fmat->info().num_row * num_group);
|
||||||
// start collecting the prediction
|
// start collecting the prediction
|
||||||
@ -436,289 +458,14 @@ class GBTree : public GradientBooster {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// initialize updater before using them
|
|
||||||
inline void InitUpdater() {
|
|
||||||
if (updaters.size() != 0) return;
|
|
||||||
std::string tval = tparam.updater_seq;
|
|
||||||
std::vector<std::string> ups = common::Split(tval, ',');
|
|
||||||
for (const std::string& pstr : ups) {
|
|
||||||
std::unique_ptr<TreeUpdater> up(TreeUpdater::Create(pstr.c_str()));
|
|
||||||
up->Init(this->cfg);
|
|
||||||
updaters.push_back(std::move(up));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// do group specific group
|
|
||||||
inline void
|
|
||||||
BoostNewTrees(const std::vector<bst_gpair> &gpair,
|
|
||||||
DMatrix *p_fmat,
|
|
||||||
int bst_group,
|
|
||||||
std::vector<std::unique_ptr<RegTree> >* ret) {
|
|
||||||
this->InitUpdater();
|
|
||||||
std::vector<RegTree*> new_trees;
|
|
||||||
ret->clear();
|
|
||||||
// create the trees
|
|
||||||
for (int i = 0; i < tparam.num_parallel_tree; ++i) {
|
|
||||||
if (tparam.process_type == kDefault) {
|
|
||||||
// create new tree
|
|
||||||
std::unique_ptr<RegTree> ptr(new RegTree());
|
|
||||||
ptr->param.InitAllowUnknown(this->cfg);
|
|
||||||
ptr->InitModel();
|
|
||||||
new_trees.push_back(ptr.get());
|
|
||||||
ret->push_back(std::move(ptr));
|
|
||||||
} else if (tparam.process_type == kUpdate) {
|
|
||||||
CHECK_LT(trees.size(), trees_to_update.size());
|
|
||||||
// move an existing tree from trees_to_update
|
|
||||||
auto t = std::move(trees_to_update[trees.size() +
|
|
||||||
bst_group * tparam.num_parallel_tree + i]);
|
|
||||||
new_trees.push_back(t.get());
|
|
||||||
ret->push_back(std::move(t));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// update the trees
|
|
||||||
for (auto& up : updaters) {
|
|
||||||
up->Update(gpair, p_fmat, new_trees);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// commit new trees all at once
|
|
||||||
virtual void
|
|
||||||
CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
|
|
||||||
int bst_group) {
|
|
||||||
size_t old_ntree = trees.size();
|
|
||||||
for (size_t i = 0; i < new_trees.size(); ++i) {
|
|
||||||
trees.push_back(std::move(new_trees[i]));
|
|
||||||
tree_info.push_back(bst_group);
|
|
||||||
}
|
|
||||||
mparam.num_trees += static_cast<int>(new_trees.size());
|
|
||||||
|
|
||||||
// update cache entry
|
|
||||||
for (auto &kv : cache_) {
|
|
||||||
CacheEntry& e = kv.second;
|
|
||||||
|
|
||||||
if (e.predictions.size() == 0) {
|
|
||||||
PredLoopInternal<GBTree>(
|
|
||||||
e.data.get(), &(e.predictions),
|
|
||||||
0, trees.size(), true);
|
|
||||||
} else {
|
|
||||||
if (mparam.num_output_group == 1 && updaters.size() > 0 && new_trees.size() == 1
|
|
||||||
&& updaters.back()->UpdatePredictionCache(e.data.get(), &(e.predictions)) ) {
|
|
||||||
{} // do nothing
|
|
||||||
} else {
|
|
||||||
PredLoopInternal<GBTree>(
|
|
||||||
e.data.get(), &(e.predictions),
|
|
||||||
old_ntree, trees.size(), false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// make a prediction for a single instance
|
|
||||||
inline bst_float PredValue(const RowBatch::Inst &inst,
|
|
||||||
int bst_group,
|
|
||||||
unsigned root_index,
|
|
||||||
RegTree::FVec *p_feats,
|
|
||||||
unsigned tree_begin,
|
|
||||||
unsigned tree_end) {
|
|
||||||
bst_float psum = 0.0f;
|
|
||||||
p_feats->Fill(inst);
|
|
||||||
for (size_t i = tree_begin; i < tree_end; ++i) {
|
|
||||||
if (tree_info[i] == bst_group) {
|
|
||||||
int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
|
|
||||||
psum += (*trees[i])[tid].leaf_value();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
p_feats->Drop(inst);
|
|
||||||
return psum;
|
|
||||||
}
|
|
||||||
// predict independent leaf index
|
|
||||||
inline void PredPath(DMatrix *p_fmat,
|
|
||||||
std::vector<bst_float> *out_preds,
|
|
||||||
unsigned ntree_limit) {
|
|
||||||
const MetaInfo& info = p_fmat->info();
|
|
||||||
// number of valid trees
|
|
||||||
ntree_limit *= mparam.num_output_group;
|
|
||||||
if (ntree_limit == 0 || ntree_limit > trees.size()) {
|
|
||||||
ntree_limit = static_cast<unsigned>(trees.size());
|
|
||||||
}
|
|
||||||
std::vector<bst_float>& preds = *out_preds;
|
|
||||||
preds.resize(info.num_row * ntree_limit);
|
|
||||||
// start collecting the prediction
|
|
||||||
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
|
||||||
iter->BeforeFirst();
|
|
||||||
while (iter->Next()) {
|
|
||||||
const RowBatch& batch = iter->Value();
|
|
||||||
// parallel over local batch
|
|
||||||
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
|
||||||
#pragma omp parallel for schedule(static)
|
|
||||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
|
||||||
const int tid = omp_get_thread_num();
|
|
||||||
size_t ridx = static_cast<size_t>(batch.base_rowid + i);
|
|
||||||
RegTree::FVec &feats = thread_temp[tid];
|
|
||||||
feats.Fill(batch[i]);
|
|
||||||
for (unsigned j = 0; j < ntree_limit; ++j) {
|
|
||||||
int tid = trees[j]->GetLeafIndex(feats, info.GetRoot(ridx));
|
|
||||||
preds[ridx * ntree_limit + j] = static_cast<bst_float>(tid);
|
|
||||||
}
|
|
||||||
feats.Drop(batch[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// predict contributions
|
|
||||||
inline void PredContrib(DMatrix *p_fmat,
|
|
||||||
std::vector<bst_float> *out_contribs,
|
|
||||||
unsigned ntree_limit) {
|
|
||||||
const MetaInfo& info = p_fmat->info();
|
|
||||||
// number of valid trees
|
|
||||||
ntree_limit *= mparam.num_output_group;
|
|
||||||
if (ntree_limit == 0 || ntree_limit > trees.size()) {
|
|
||||||
ntree_limit = static_cast<unsigned>(trees.size());
|
|
||||||
}
|
|
||||||
const int ngroup = mparam.num_output_group;
|
|
||||||
size_t ncolumns = mparam.num_feature + 1;
|
|
||||||
// allocate space for (number of features + bias) times the number of rows
|
|
||||||
std::vector<bst_float>& contribs = *out_contribs;
|
|
||||||
contribs.resize(info.num_row * ncolumns * mparam.num_output_group);
|
|
||||||
// make sure contributions is zeroed, we could be reusing a previously allocated one
|
|
||||||
std::fill(contribs.begin(), contribs.end(), 0);
|
|
||||||
// initialize tree node mean values
|
|
||||||
#pragma omp parallel for schedule(static)
|
|
||||||
for (bst_omp_uint i=0; i < ntree_limit; ++i) {
|
|
||||||
trees[i]->FillNodeMeanValues();
|
|
||||||
}
|
|
||||||
// start collecting the contributions
|
|
||||||
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
|
||||||
const std::vector<bst_float>& base_margin = info.base_margin;
|
|
||||||
iter->BeforeFirst();
|
|
||||||
while (iter->Next()) {
|
|
||||||
const RowBatch& batch = iter->Value();
|
|
||||||
// parallel over local batch
|
|
||||||
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
|
||||||
#pragma omp parallel for schedule(static)
|
|
||||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
|
||||||
size_t row_idx = static_cast<size_t>(batch.base_rowid + i);
|
|
||||||
unsigned root_id = info.GetRoot(row_idx);
|
|
||||||
RegTree::FVec &feats = thread_temp[omp_get_thread_num()];
|
|
||||||
// loop over all classes
|
|
||||||
for (int gid = 0; gid < ngroup; ++gid) {
|
|
||||||
bst_float *p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
|
|
||||||
feats.Fill(batch[i]);
|
|
||||||
// calculate contributions
|
|
||||||
for (unsigned j = 0; j < ntree_limit; ++j) {
|
|
||||||
if (tree_info[j] != gid) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
trees[j]->CalculateContributions(feats, root_id, p_contribs);
|
|
||||||
}
|
|
||||||
feats.Drop(batch[i]);
|
|
||||||
// add base margin to BIAS
|
|
||||||
if (base_margin.size() != 0) {
|
|
||||||
p_contribs[ncolumns - 1] += base_margin[row_idx * ngroup + gid];
|
|
||||||
} else {
|
|
||||||
p_contribs[ncolumns - 1] += base_margin_;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// init thread buffers
|
|
||||||
inline void InitThreadTemp(int nthread) {
|
|
||||||
int prev_thread_temp_size = thread_temp.size();
|
|
||||||
if (prev_thread_temp_size < nthread) {
|
|
||||||
thread_temp.resize(nthread, RegTree::FVec());
|
|
||||||
for (int i = prev_thread_temp_size; i < nthread; ++i) {
|
|
||||||
thread_temp[i].Init(mparam.num_feature);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// --- data structure ---
|
|
||||||
// base margin
|
|
||||||
bst_float base_margin_;
|
|
||||||
// training parameter
|
|
||||||
GBTreeTrainParam tparam;
|
|
||||||
// model parameter
|
|
||||||
GBTreeModelParam mparam;
|
|
||||||
/*! \brief vector of trees stored in the model */
|
|
||||||
std::vector<std::unique_ptr<RegTree> > trees;
|
|
||||||
/*! \brief for the update process, a place to keep the initial trees */
|
|
||||||
std::vector<std::unique_ptr<RegTree> > trees_to_update;
|
|
||||||
/*! \brief some information indicator of the tree, reserved */
|
|
||||||
std::vector<int> tree_info;
|
|
||||||
// ----training fields----
|
|
||||||
std::unordered_map<DMatrix*, CacheEntry> cache_;
|
|
||||||
// configurations for tree
|
|
||||||
std::vector<std::pair<std::string, std::string> > cfg;
|
|
||||||
// temporal storage for per thread
|
|
||||||
std::vector<RegTree::FVec> thread_temp;
|
|
||||||
// the updaters that can be applied to each of tree
|
|
||||||
std::vector<std::unique_ptr<TreeUpdater> > updaters;
|
|
||||||
};
|
|
||||||
|
|
||||||
// dart
|
|
||||||
class Dart : public GBTree {
|
|
||||||
public:
|
|
||||||
explicit Dart(bst_float base_margin) : GBTree(base_margin) {}
|
|
||||||
|
|
||||||
void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
|
|
||||||
GBTree::Configure(cfg);
|
|
||||||
if (trees.size() == 0) {
|
|
||||||
dparam.InitAllowUnknown(cfg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void Load(dmlc::Stream* fi) override {
|
|
||||||
GBTree::Load(fi);
|
|
||||||
weight_drop.resize(mparam.num_trees);
|
|
||||||
if (mparam.num_trees != 0) {
|
|
||||||
fi->Read(&weight_drop);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void Save(dmlc::Stream* fo) const override {
|
|
||||||
GBTree::Save(fo);
|
|
||||||
if (weight_drop.size() != 0) {
|
|
||||||
fo->Write(weight_drop);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// predict the leaf scores with dropout if ntree_limit = 0
|
|
||||||
void Predict(DMatrix* p_fmat,
|
|
||||||
std::vector<bst_float>* out_preds,
|
|
||||||
unsigned ntree_limit) override {
|
|
||||||
DropTrees(ntree_limit);
|
|
||||||
PredLoopInternal<Dart>(p_fmat, out_preds, 0, ntree_limit, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
void Predict(const SparseBatch::Inst& inst,
|
|
||||||
std::vector<bst_float>* out_preds,
|
|
||||||
unsigned ntree_limit,
|
|
||||||
unsigned root_index) override {
|
|
||||||
DropTrees(1);
|
|
||||||
if (thread_temp.size() == 0) {
|
|
||||||
thread_temp.resize(1, RegTree::FVec());
|
|
||||||
thread_temp[0].Init(mparam.num_feature);
|
|
||||||
}
|
|
||||||
out_preds->resize(mparam.num_output_group);
|
|
||||||
ntree_limit *= mparam.num_output_group;
|
|
||||||
if (ntree_limit == 0 || ntree_limit > trees.size()) {
|
|
||||||
ntree_limit = static_cast<unsigned>(trees.size());
|
|
||||||
}
|
|
||||||
// loop over output groups
|
|
||||||
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
|
|
||||||
(*out_preds)[gid]
|
|
||||||
= PredValue(inst, gid, root_index,
|
|
||||||
&thread_temp[0], 0, ntree_limit) + base_margin_;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected:
|
|
||||||
friend class GBTree;
|
|
||||||
// commit new trees all at once
|
// commit new trees all at once
|
||||||
void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
|
void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
|
||||||
int bst_group) override {
|
int bst_group) override {
|
||||||
for (size_t i = 0; i < new_trees.size(); ++i) {
|
for (size_t i = 0; i < new_trees.size(); ++i) {
|
||||||
trees.push_back(std::move(new_trees[i]));
|
model_.trees.push_back(std::move(new_trees[i]));
|
||||||
tree_info.push_back(bst_group);
|
model_.tree_info.push_back(bst_group);
|
||||||
}
|
}
|
||||||
mparam.num_trees += static_cast<int>(new_trees.size());
|
model_.param.num_trees += static_cast<int>(new_trees.size());
|
||||||
size_t num_drop = NormalizeTrees(new_trees.size());
|
size_t num_drop = NormalizeTrees(new_trees.size());
|
||||||
if (dparam.silent != 1) {
|
if (dparam.silent != 1) {
|
||||||
LOG(INFO) << "drop " << num_drop << " trees, "
|
LOG(INFO) << "drop " << num_drop << " trees, "
|
||||||
@ -735,11 +482,11 @@ class Dart : public GBTree {
|
|||||||
bst_float psum = 0.0f;
|
bst_float psum = 0.0f;
|
||||||
p_feats->Fill(inst);
|
p_feats->Fill(inst);
|
||||||
for (size_t i = tree_begin; i < tree_end; ++i) {
|
for (size_t i = tree_begin; i < tree_end; ++i) {
|
||||||
if (tree_info[i] == bst_group) {
|
if (model_.tree_info[i] == bst_group) {
|
||||||
bool drop = (std::binary_search(idx_drop.begin(), idx_drop.end(), i));
|
bool drop = (std::binary_search(idx_drop.begin(), idx_drop.end(), i));
|
||||||
if (!drop) {
|
if (!drop) {
|
||||||
int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
|
int tid = model_.trees[i]->GetLeafIndex(*p_feats, root_index);
|
||||||
psum += weight_drop[i] * (*trees[i])[tid].leaf_value();
|
psum += weight_drop[i] * (*model_.trees[i])[tid].leaf_value();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -825,6 +572,17 @@ class Dart : public GBTree {
|
|||||||
return num_drop;
|
return num_drop;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// init thread buffers
|
||||||
|
inline void InitThreadTemp(int nthread) {
|
||||||
|
int prev_thread_temp_size = thread_temp.size();
|
||||||
|
if (prev_thread_temp_size < nthread) {
|
||||||
|
thread_temp.resize(nthread, RegTree::FVec());
|
||||||
|
for (int i = prev_thread_temp_size; i < nthread; ++i) {
|
||||||
|
thread_temp[i].Init(model_.param.num_feature);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// --- data structure ---
|
// --- data structure ---
|
||||||
// training parameter
|
// training parameter
|
||||||
DartTrainParam dparam;
|
DartTrainParam dparam;
|
||||||
@ -832,6 +590,8 @@ class Dart : public GBTree {
|
|||||||
std::vector<bst_float> weight_drop;
|
std::vector<bst_float> weight_drop;
|
||||||
// indexes of dropped trees
|
// indexes of dropped trees
|
||||||
std::vector<size_t> idx_drop;
|
std::vector<size_t> idx_drop;
|
||||||
|
// temporal storage for per thread
|
||||||
|
std::vector<RegTree::FVec> thread_temp;
|
||||||
};
|
};
|
||||||
|
|
||||||
// register the objective functions
|
// register the objective functions
|
||||||
|
|||||||
140
src/gbm/gbtree_model.h
Normal file
140
src/gbm/gbtree_model.h
Normal file
@ -0,0 +1,140 @@
|
|||||||
|
/*!
|
||||||
|
* Copyright by Contributors 2017
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
#include <dmlc/parameter.h>
|
||||||
|
#include <dmlc/io.h>
|
||||||
|
#include <xgboost/tree_model.h>
|
||||||
|
#include <utility>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace xgboost {
|
||||||
|
namespace gbm {
|
||||||
|
/*! \brief model parameters */
|
||||||
|
struct GBTreeModelParam : public dmlc::Parameter<GBTreeModelParam> {
|
||||||
|
/*! \brief number of trees */
|
||||||
|
int num_trees;
|
||||||
|
/*! \brief number of roots */
|
||||||
|
int num_roots;
|
||||||
|
/*! \brief number of features to be used by trees */
|
||||||
|
int num_feature;
|
||||||
|
/*! \brief pad this space, for backward compatibility reason.*/
|
||||||
|
int pad_32bit;
|
||||||
|
/*! \brief deprecated padding space. */
|
||||||
|
int64_t num_pbuffer_deprecated;
|
||||||
|
/*!
|
||||||
|
* \brief how many output group a single instance can produce
|
||||||
|
* this affects the behavior of number of output we have:
|
||||||
|
* suppose we have n instance and k group, output will be k * n
|
||||||
|
*/
|
||||||
|
int num_output_group;
|
||||||
|
/*! \brief size of leaf vector needed in tree */
|
||||||
|
int size_leaf_vector;
|
||||||
|
/*! \brief reserved parameters */
|
||||||
|
int reserved[32];
|
||||||
|
/*! \brief constructor */
|
||||||
|
GBTreeModelParam() {
|
||||||
|
std::memset(this, 0, sizeof(GBTreeModelParam));
|
||||||
|
static_assert(sizeof(GBTreeModelParam) == (4 + 2 + 2 + 32) * sizeof(int),
|
||||||
|
"64/32 bit compatibility issue");
|
||||||
|
}
|
||||||
|
// declare parameters, only declare those that need to be set.
|
||||||
|
DMLC_DECLARE_PARAMETER(GBTreeModelParam) {
|
||||||
|
DMLC_DECLARE_FIELD(num_output_group)
|
||||||
|
.set_lower_bound(1)
|
||||||
|
.set_default(1)
|
||||||
|
.describe(
|
||||||
|
"Number of output groups to be predicted,"
|
||||||
|
" used for multi-class classification.");
|
||||||
|
DMLC_DECLARE_FIELD(num_roots).set_lower_bound(1).set_default(1).describe(
|
||||||
|
"Tree updater sequence.");
|
||||||
|
DMLC_DECLARE_FIELD(num_feature)
|
||||||
|
.set_lower_bound(0)
|
||||||
|
.describe("Number of features used for training and prediction.");
|
||||||
|
DMLC_DECLARE_FIELD(size_leaf_vector)
|
||||||
|
.set_lower_bound(0)
|
||||||
|
.set_default(0)
|
||||||
|
.describe("Reserved option for vector tree.");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct GBTreeModel {
|
||||||
|
explicit GBTreeModel(bst_float base_margin) : base_margin(base_margin) {}
|
||||||
|
void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) {
|
||||||
|
// initialize model parameters if not yet been initialized.
|
||||||
|
if (trees.size() == 0) {
|
||||||
|
param.InitAllowUnknown(cfg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void InitTreesToUpdate() {
|
||||||
|
if (trees_to_update.size() == 0u) {
|
||||||
|
for (size_t i = 0; i < trees.size(); ++i) {
|
||||||
|
trees_to_update.push_back(std::move(trees[i]));
|
||||||
|
}
|
||||||
|
trees.clear();
|
||||||
|
param.num_trees = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Load(dmlc::Stream* fi) {
|
||||||
|
CHECK_EQ(fi->Read(&param, sizeof(param)), sizeof(param))
|
||||||
|
<< "GBTree: invalid model file";
|
||||||
|
trees.clear();
|
||||||
|
trees_to_update.clear();
|
||||||
|
for (int i = 0; i < param.num_trees; ++i) {
|
||||||
|
std::unique_ptr<RegTree> ptr(new RegTree());
|
||||||
|
ptr->Load(fi);
|
||||||
|
trees.push_back(std::move(ptr));
|
||||||
|
}
|
||||||
|
tree_info.resize(param.num_trees);
|
||||||
|
if (param.num_trees != 0) {
|
||||||
|
CHECK_EQ(
|
||||||
|
fi->Read(dmlc::BeginPtr(tree_info), sizeof(int) * param.num_trees),
|
||||||
|
sizeof(int) * param.num_trees);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Save(dmlc::Stream* fo) const {
|
||||||
|
CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
|
||||||
|
fo->Write(&param, sizeof(param));
|
||||||
|
for (size_t i = 0; i < trees.size(); ++i) {
|
||||||
|
trees[i]->Save(fo);
|
||||||
|
}
|
||||||
|
if (tree_info.size() != 0) {
|
||||||
|
fo->Write(dmlc::BeginPtr(tree_info), sizeof(int) * tree_info.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
|
||||||
|
std::string format) const {
|
||||||
|
std::vector<std::string> dump;
|
||||||
|
for (size_t i = 0; i < trees.size(); i++) {
|
||||||
|
dump.push_back(trees[i]->DumpModel(fmap, with_stats, format));
|
||||||
|
}
|
||||||
|
return dump;
|
||||||
|
}
|
||||||
|
void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
|
||||||
|
int bst_group) {
|
||||||
|
size_t old_ntree = trees.size();
|
||||||
|
for (size_t i = 0; i < new_trees.size(); ++i) {
|
||||||
|
trees.push_back(std::move(new_trees[i]));
|
||||||
|
tree_info.push_back(bst_group);
|
||||||
|
}
|
||||||
|
param.num_trees += static_cast<int>(new_trees.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
// base margin
|
||||||
|
bst_float base_margin;
|
||||||
|
// model parameter
|
||||||
|
GBTreeModelParam param;
|
||||||
|
/*! \brief vector of trees stored in the model */
|
||||||
|
std::vector<std::unique_ptr<RegTree> > trees;
|
||||||
|
/*! \brief for the update process, a place to keep the initial trees */
|
||||||
|
std::vector<std::unique_ptr<RegTree> > trees_to_update;
|
||||||
|
/*! \brief some information indicator of the tree, reserved */
|
||||||
|
std::vector<int> tree_info;
|
||||||
|
};
|
||||||
|
} // namespace gbm
|
||||||
|
} // namespace xgboost
|
||||||
@ -517,7 +517,7 @@ class LearnerImpl : public Learner {
|
|||||||
unsigned ntree_limit = 0) const {
|
unsigned ntree_limit = 0) const {
|
||||||
CHECK(gbm_.get() != nullptr)
|
CHECK(gbm_.get() != nullptr)
|
||||||
<< "Predict must happen after Load or InitModel";
|
<< "Predict must happen after Load or InitModel";
|
||||||
gbm_->Predict(data, out_preds, ntree_limit);
|
gbm_->PredictBatch(data, out_preds, ntree_limit);
|
||||||
}
|
}
|
||||||
// model parameter
|
// model parameter
|
||||||
LearnerModelParam mparam;
|
LearnerModelParam mparam;
|
||||||
|
|||||||
314
src/predictor/cpu_predictor.cc
Normal file
314
src/predictor/cpu_predictor.cc
Normal file
@ -0,0 +1,314 @@
|
|||||||
|
/*!
|
||||||
|
* Copyright by Contributors 2017
|
||||||
|
*/
|
||||||
|
#include <xgboost/predictor.h>
|
||||||
|
#include <xgboost/tree_model.h>
|
||||||
|
#include <xgboost/tree_updater.h>
|
||||||
|
#include "dmlc/logging.h"
|
||||||
|
|
||||||
|
namespace xgboost {
|
||||||
|
namespace predictor {
|
||||||
|
|
||||||
|
class CPUPredictor : public Predictor {
|
||||||
|
protected:
|
||||||
|
static bst_float PredValue(const RowBatch::Inst& inst,
|
||||||
|
const std::vector<std::unique_ptr<RegTree>>& trees,
|
||||||
|
const std::vector<int>& tree_info, int bst_group,
|
||||||
|
unsigned root_index, RegTree::FVec* p_feats,
|
||||||
|
unsigned tree_begin, unsigned tree_end) {
|
||||||
|
bst_float psum = 0.0f;
|
||||||
|
p_feats->Fill(inst);
|
||||||
|
for (size_t i = tree_begin; i < tree_end; ++i) {
|
||||||
|
if (tree_info[i] == bst_group) {
|
||||||
|
int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
|
||||||
|
psum += (*trees[i])[tid].leaf_value();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p_feats->Drop(inst);
|
||||||
|
return psum;
|
||||||
|
}
|
||||||
|
|
||||||
|
void InitOutPredictions(const MetaInfo& info,
|
||||||
|
std::vector<bst_float>* out_preds,
|
||||||
|
const gbm::GBTreeModel& model) const {
|
||||||
|
size_t n = model.param.num_output_group * info.num_row;
|
||||||
|
const std::vector<bst_float>& base_margin = info.base_margin;
|
||||||
|
out_preds->resize(n);
|
||||||
|
if (base_margin.size() != 0) {
|
||||||
|
CHECK_EQ(out_preds->size(), n);
|
||||||
|
std::copy(base_margin.begin(), base_margin.end(), out_preds->begin());
|
||||||
|
} else {
|
||||||
|
std::fill(out_preds->begin(), out_preds->end(), model.base_margin);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// init thread buffers
|
||||||
|
inline void InitThreadTemp(int nthread, int num_feature) {
|
||||||
|
int prev_thread_temp_size = thread_temp.size();
|
||||||
|
if (prev_thread_temp_size < nthread) {
|
||||||
|
thread_temp.resize(nthread, RegTree::FVec());
|
||||||
|
for (int i = prev_thread_temp_size; i < nthread; ++i) {
|
||||||
|
thread_temp[i].Init(num_feature);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
inline void PredLoopSpecalize(DMatrix* p_fmat,
|
||||||
|
std::vector<bst_float>* out_preds,
|
||||||
|
const gbm::GBTreeModel& model, int num_group,
|
||||||
|
unsigned tree_begin, unsigned tree_end) {
|
||||||
|
const MetaInfo& info = p_fmat->info();
|
||||||
|
const int nthread = omp_get_max_threads();
|
||||||
|
InitThreadTemp(nthread, model.param.num_feature);
|
||||||
|
std::vector<bst_float>& preds = *out_preds;
|
||||||
|
CHECK_EQ(model.param.size_leaf_vector, 0)
|
||||||
|
<< "size_leaf_vector is enforced to 0 so far";
|
||||||
|
CHECK_EQ(preds.size(), p_fmat->info().num_row * num_group);
|
||||||
|
// start collecting the prediction
|
||||||
|
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
||||||
|
iter->BeforeFirst();
|
||||||
|
while (iter->Next()) {
|
||||||
|
const RowBatch& batch = iter->Value();
|
||||||
|
// parallel over local batch
|
||||||
|
const int K = 8;
|
||||||
|
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
||||||
|
const bst_omp_uint rest = nsize % K;
|
||||||
|
#pragma omp parallel for schedule(static)
|
||||||
|
for (bst_omp_uint i = 0; i < nsize - rest; i += K) {
|
||||||
|
const int tid = omp_get_thread_num();
|
||||||
|
RegTree::FVec& feats = thread_temp[tid];
|
||||||
|
int64_t ridx[K];
|
||||||
|
RowBatch::Inst inst[K];
|
||||||
|
for (int k = 0; k < K; ++k) {
|
||||||
|
ridx[k] = static_cast<int64_t>(batch.base_rowid + i + k);
|
||||||
|
}
|
||||||
|
for (int k = 0; k < K; ++k) {
|
||||||
|
inst[k] = batch[i + k];
|
||||||
|
}
|
||||||
|
for (int k = 0; k < K; ++k) {
|
||||||
|
for (int gid = 0; gid < num_group; ++gid) {
|
||||||
|
const size_t offset = ridx[k] * num_group + gid;
|
||||||
|
preds[offset] += this->PredValue(
|
||||||
|
inst[k], model.trees, model.tree_info, gid,
|
||||||
|
info.GetRoot(ridx[k]), &feats, tree_begin, tree_end);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (bst_omp_uint i = nsize - rest; i < nsize; ++i) {
|
||||||
|
RegTree::FVec& feats = thread_temp[0];
|
||||||
|
const int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
|
||||||
|
const RowBatch::Inst inst = batch[i];
|
||||||
|
for (int gid = 0; gid < num_group; ++gid) {
|
||||||
|
const size_t offset = ridx * num_group + gid;
|
||||||
|
preds[offset] +=
|
||||||
|
this->PredValue(inst, model.trees, model.tree_info, gid,
|
||||||
|
info.GetRoot(ridx), &feats, tree_begin, tree_end);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \fn bool PredictFromCache(DMatrix* dmat, std::vector<bst_float>*
|
||||||
|
* out_preds, const gbm::GBTreeModel& model, unsigned ntree_limit = 0)
|
||||||
|
*
|
||||||
|
* \brief Attempt to predict from cache.
|
||||||
|
*
|
||||||
|
* \return True if it succeeds, false if it fails.
|
||||||
|
*/
|
||||||
|
bool PredictFromCache(DMatrix* dmat, std::vector<bst_float>* out_preds,
|
||||||
|
const gbm::GBTreeModel& model,
|
||||||
|
unsigned ntree_limit = 0) {
|
||||||
|
if (ntree_limit == 0 ||
|
||||||
|
ntree_limit * model.param.num_output_group >= model.trees.size()) {
|
||||||
|
auto it = cache_.find(dmat);
|
||||||
|
if (it != cache_.end()) {
|
||||||
|
std::vector<bst_float>& y = it->second.predictions;
|
||||||
|
if (y.size() != 0) {
|
||||||
|
out_preds->resize(y.size());
|
||||||
|
std::copy(y.begin(), y.end(), out_preds->begin());
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void PredLoopInternal(DMatrix* dmat, std::vector<bst_float>* out_preds,
|
||||||
|
const gbm::GBTreeModel& model, int tree_begin,
|
||||||
|
unsigned ntree_limit) {
|
||||||
|
// TODO(Rory): Check if this specialisation actually improves performance
|
||||||
|
if (model.param.num_output_group == 1) {
|
||||||
|
PredLoopSpecalize(dmat, out_preds, model, 1, tree_begin, ntree_limit);
|
||||||
|
} else {
|
||||||
|
PredLoopSpecalize(dmat, out_preds, model, model.param.num_output_group,
|
||||||
|
tree_begin, ntree_limit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
void PredictBatch(DMatrix* dmat, std::vector<bst_float>* out_preds,
|
||||||
|
const gbm::GBTreeModel& model, int tree_begin,
|
||||||
|
unsigned ntree_limit = 0) override {
|
||||||
|
if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
this->InitOutPredictions(dmat->info(), out_preds, model);
|
||||||
|
|
||||||
|
ntree_limit *= model.param.num_output_group;
|
||||||
|
if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
|
||||||
|
ntree_limit = static_cast<unsigned>(model.trees.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
this->PredLoopInternal(dmat, out_preds, model, tree_begin, ntree_limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
void UpdatePredictionCache(
|
||||||
|
const gbm::GBTreeModel& model,
|
||||||
|
std::vector<std::unique_ptr<TreeUpdater>>* updaters,
|
||||||
|
int num_new_trees) override {
|
||||||
|
int old_ntree = model.trees.size() - num_new_trees;
|
||||||
|
// update cache entry
|
||||||
|
for (auto& kv : cache_) {
|
||||||
|
PredictionCacheEntry& e = kv.second;
|
||||||
|
|
||||||
|
if (e.predictions.size() == 0) {
|
||||||
|
InitOutPredictions(e.data->info(), &(e.predictions), model);
|
||||||
|
PredLoopInternal(e.data.get(), &(e.predictions), model, 0,
|
||||||
|
model.trees.size());
|
||||||
|
} else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
|
||||||
|
num_new_trees == 1 &&
|
||||||
|
updaters->back()->UpdatePredictionCache(e.data.get(),
|
||||||
|
&(e.predictions))) {
|
||||||
|
{} // do nothing
|
||||||
|
} else {
|
||||||
|
PredLoopInternal(e.data.get(), &(e.predictions), model, old_ntree,
|
||||||
|
model.trees.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void PredictInstance(const SparseBatch::Inst& inst,
|
||||||
|
std::vector<bst_float>* out_preds,
|
||||||
|
const gbm::GBTreeModel& model, unsigned ntree_limit,
|
||||||
|
unsigned root_index) override {
|
||||||
|
if (thread_temp.size() == 0) {
|
||||||
|
thread_temp.resize(1, RegTree::FVec());
|
||||||
|
thread_temp[0].Init(model.param.num_feature);
|
||||||
|
}
|
||||||
|
ntree_limit *= model.param.num_output_group;
|
||||||
|
if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
|
||||||
|
ntree_limit = static_cast<unsigned>(model.trees.size());
|
||||||
|
}
|
||||||
|
out_preds->resize(model.param.num_output_group *
|
||||||
|
(model.param.size_leaf_vector + 1));
|
||||||
|
// loop over output groups
|
||||||
|
for (int gid = 0; gid < model.param.num_output_group; ++gid) {
|
||||||
|
(*out_preds)[gid] =
|
||||||
|
PredValue(inst, model.trees, model.tree_info, gid, root_index,
|
||||||
|
&thread_temp[0], 0, ntree_limit) +
|
||||||
|
model.base_margin;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void PredictLeaf(DMatrix* p_fmat, std::vector<bst_float>* out_preds,
|
||||||
|
const gbm::GBTreeModel& model, unsigned ntree_limit) override {
|
||||||
|
const int nthread = omp_get_max_threads();
|
||||||
|
InitThreadTemp(nthread, model.param.num_feature);
|
||||||
|
const MetaInfo& info = p_fmat->info();
|
||||||
|
// number of valid trees
|
||||||
|
ntree_limit *= model.param.num_output_group;
|
||||||
|
if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
|
||||||
|
ntree_limit = static_cast<unsigned>(model.trees.size());
|
||||||
|
}
|
||||||
|
std::vector<bst_float>& preds = *out_preds;
|
||||||
|
preds.resize(info.num_row * ntree_limit);
|
||||||
|
// start collecting the prediction
|
||||||
|
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
||||||
|
iter->BeforeFirst();
|
||||||
|
while (iter->Next()) {
|
||||||
|
const RowBatch& batch = iter->Value();
|
||||||
|
// parallel over local batch
|
||||||
|
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
||||||
|
#pragma omp parallel for schedule(static)
|
||||||
|
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||||
|
const int tid = omp_get_thread_num();
|
||||||
|
size_t ridx = static_cast<size_t>(batch.base_rowid + i);
|
||||||
|
RegTree::FVec& feats = thread_temp[tid];
|
||||||
|
feats.Fill(batch[i]);
|
||||||
|
for (unsigned j = 0; j < ntree_limit; ++j) {
|
||||||
|
int tid = model.trees[j]->GetLeafIndex(feats, info.GetRoot(ridx));
|
||||||
|
preds[ridx * ntree_limit + j] = static_cast<bst_float>(tid);
|
||||||
|
}
|
||||||
|
feats.Drop(batch[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*!
 * \brief Fill out_contribs with contribution values for every row of p_fmat.
 *
 * Each (row, output group) pair owns num_feature + 1 consecutive entries of
 * the output; the last entry of that slice is the bias slot, which also
 * receives the base margin.
 *
 * \param p_fmat matrix whose rows are visited batch by batch
 * \param out_contribs output vector; resized to
 *        num_row * (num_feature + 1) * num_output_group and zeroed here
 * \param model tree model supplying param, trees, tree_info and base_margin
 * \param ntree_limit tree limit, multiplied by num_output_group before use;
 *        0, or a value exceeding the number of trees, selects every tree
 */
void PredictContribution(DMatrix* p_fmat,
                         std::vector<bst_float>* out_contribs,
                         const gbm::GBTreeModel& model, unsigned ntree_limit) override {
  InitThreadTemp(omp_get_max_threads(), model.param.num_feature);
  const MetaInfo& info = p_fmat->info();

  // turn the limit into a tree count and clamp it to the trees the model holds
  ntree_limit *= model.param.num_output_group;
  const bool use_all_trees =
      ntree_limit == 0 || ntree_limit > model.trees.size();
  if (use_all_trees) {
    ntree_limit = static_cast<unsigned>(model.trees.size());
  }

  const int ngroup = model.param.num_output_group;
  const size_t ncolumns = model.param.num_feature + 1;

  // (number of features + bias) entries for every (row, group) pair; assign()
  // also wipes whatever a previously used buffer still contains
  std::vector<bst_float>& contribs = *out_contribs;
  contribs.assign(info.num_row * ncolumns * model.param.num_output_group,
                  bst_float(0));

  // initialize tree node mean values for every tree that will be used
  #pragma omp parallel for schedule(static)
  for (bst_omp_uint tree_idx = 0; tree_idx < ntree_limit; ++tree_idx) {
    model.trees[tree_idx]->FillNodeMeanValues();
  }

  // collect the contributions batch by batch
  dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
  const std::vector<bst_float>& base_margin = info.base_margin;
  for (iter->BeforeFirst(); iter->Next();) {
    const RowBatch& batch = iter->Value();
    const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
    // rows of the local batch are handled in parallel
    #pragma omp parallel for schedule(static)
    for (bst_omp_uint i = 0; i < nsize; ++i) {
      const size_t row_idx = static_cast<size_t>(batch.base_rowid + i);
      const unsigned root_id = info.GetRoot(row_idx);
      // scratch feature vector belonging to the executing thread
      RegTree::FVec& feats = thread_temp[omp_get_thread_num()];
      // loop over all classes
      for (int gid = 0; gid < ngroup; ++gid) {
        bst_float* p_contribs =
            contribs.data() + (row_idx * ngroup + gid) * ncolumns;
        feats.Fill(batch[i]);
        // only trees assigned to this group contribute to its slice
        for (unsigned j = 0; j < ntree_limit; ++j) {
          if (model.tree_info[j] == gid) {
            model.trees[j]->CalculateContributions(feats, root_id, p_contribs);
          }
        }
        feats.Drop(batch[i]);
        // add base margin to BIAS: per-row value when provided, else the model's
        bst_float& bias = p_contribs[ncolumns - 1];
        if (base_margin.empty()) {
          bias += model.base_margin;
        } else {
          bias += base_margin[row_idx * ngroup + gid];
        }
      }
    }
  }
}
|
||||||
|
/*!
 * \brief scratch feature vectors, indexed by omp_get_thread_num() inside the
 *  parallel prediction loops so each thread works on its own entry.
 *  NOTE(review): presumably sized by InitThreadTemp before use - confirm.
 */
std::vector<RegTree::FVec> thread_temp;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Register CPUPredictor under the name "cpu_predictor"; the body is the
// factory that builds a new instance each time it is invoked.
XGBOOST_REGISTER_PREDICTOR(CPUPredictor, "cpu_predictor")
    .describe("Make predictions using CPU.")
    .set_body([]() -> Predictor* { return new CPUPredictor(); });
|
||||||
|
} // namespace predictor
|
||||||
|
} // namespace xgboost
|
||||||
25
src/predictor/predictor.cc
Normal file
25
src/predictor/predictor.cc
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
/*!
|
||||||
|
* Copyright by Contributors 2017
|
||||||
|
*/
|
||||||
|
#include <dmlc/registry.h>
|
||||||
|
#include <xgboost/predictor.h>
|
||||||
|
|
||||||
|
namespace dmlc {
|
||||||
|
DMLC_REGISTRY_ENABLE(::xgboost::PredictorReg);
|
||||||
|
} // namespace dmlc
|
||||||
|
namespace xgboost {
|
||||||
|
void Predictor::InitCache(const std::vector<std::shared_ptr<DMatrix> >& cache) {
|
||||||
|
for (const std::shared_ptr<DMatrix>& d : cache) {
|
||||||
|
PredictionCacheEntry e;
|
||||||
|
e.data = d;
|
||||||
|
cache_[d.get()] = std::move(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*!
 * \brief Build a predictor from the registry entry stored under name.
 *
 * Logs a fatal error naming the requested type when no entry is registered.
 *
 * \param name registered name of the predictor to create
 * \return the object produced by the entry's factory body
 */
Predictor* Predictor::Create(std::string name) {
  auto* registry = ::dmlc::Registry<PredictorReg>::Get();
  auto* entry = registry->Find(name);
  if (entry == nullptr) {
    LOG(FATAL) << "Unknown predictor type " << name;
  }
  Predictor* created = (entry->body)();
  return created;
}
|
||||||
|
} // namespace xgboost
|
||||||
Loading…
x
Reference in New Issue
Block a user