add Dart booster (#1220)
parent e034fdf74c, commit 949d1e3027
@@ -13,7 +13,8 @@ In R-package, you can use .(dot) to replace under score in the parameters, for e
General Parameters
------------------
* booster [default=gbtree]
-  - which booster to use, can be gbtree or gblinear. gbtree uses tree based model while gblinear uses linear function.
+  - which booster to use, can be gbtree, gblinear or dart.
+    gbtree and dart use tree-based models, while gblinear uses a linear function.
* silent [default=0]
  - 0 means printing running messages, 1 means silent mode.
* nthread [default to maximum number of threads available if not set]
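As a quick illustration of the new `booster` option, here is a minimal Python sketch (the data files and settings are placeholders, not part of this commit):

```python
import xgboost as xgb

dtrain = xgb.DMatrix('train.libsvm')  # placeholder data file
dtest = xgb.DMatrix('test.libsvm')    # placeholder data file

# 'gbtree' (default) and 'dart' build tree ensembles; 'gblinear' fits a linear model.
for booster in ('gbtree', 'gblinear', 'dart'):
    params = {'booster': booster, 'objective': 'binary:logistic'}
    bst = xgb.train(params, dtrain, num_boost_round=10,
                    evals=[(dtest, 'eval')])
```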
@@ -74,6 +75,28 @@ Parameters for Tree Booster
* scale_pos_weight, [default=0]
  - Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases). See [Parameters Tuning](how_to/param_tuning.md) for more discussion. Also see Higgs Kaggle competition demo for examples: [R](../demo/kaggle-higgs/higgs-train.R), [py1](../demo/kaggle-higgs/higgs-numpy.py), [py2](../demo/kaggle-higgs/higgs-cv.py), [py3](../demo/guide-python/cross_validation.py)

Additional parameters for Dart Booster
--------------------------------------
* sample_type [default="uniform"]
  - type of sampling algorithm.
  - "uniform": dropped trees are selected uniformly.
  - "weighted": dropped trees are selected in proportion to weight.
* normalize_type [default="tree"]
  - type of normalization algorithm.
  - "tree": new trees have the same weight as each of the dropped trees.
    The weight of new trees is learning_rate / (k + learning_rate);
    dropped trees are scaled by a factor of k / (k + learning_rate), where k is the number of dropped trees.
  - "forest": new trees have the same weight as the sum of the dropped trees (forest).
    The weight of new trees is learning_rate / (1 + learning_rate);
    dropped trees are scaled by a factor of 1 / (1 + learning_rate).
* rate_drop [default=0.0]
  - dropout rate (fraction of previous trees to drop).
  - range: [0.0, 1.0]
* skip_drop [default=0.0]
  - probability of skipping the dropout procedure during a boosting iteration.
    If a dropout is skipped, new trees are added in the same manner as gbtree.
  - range: [0.0, 1.0]

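The Dart parameters above are passed through the same interface as the other booster parameters. A minimal Python sketch (data paths and values are placeholders for illustration, not part of this commit):

```python
import xgboost as xgb

dtrain = xgb.DMatrix('agaricus.txt.train')  # placeholder data
dtest = xgb.DMatrix('agaricus.txt.test')

params = {
    'booster': 'dart',
    'objective': 'binary:logistic',
    'max_depth': 5,
    'learning_rate': 0.1,
    'sample_type': 'weighted',   # or 'uniform' (default)
    'normalize_type': 'forest',  # or 'tree' (default)
    'rate_drop': 0.1,
    'skip_drop': 0.5,
}
num_round = 50
bst = xgb.train(params, dtrain, num_round)

# dart applies dropout when predicting with ntree_limit=0 (the default),
# so pass ntree_limit explicitly to use every committed tree deterministically.
preds = bst.predict(dtest, ntree_limit=num_round)
```

The ntree_limit note follows from the Predict override added in this commit ("predict the leaf scores with dropout if ntree_limit = 0").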
Parameters for Linear Booster
-----------------------------
* lambda [default=0]

@@ -17,6 +17,8 @@
#include <limits>
#include "../common/common.h"
#include "../common/random.h"

namespace xgboost {
namespace gbm {

@@ -47,6 +49,42 @@ struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
  }
};

/*! \brief training parameters for the dart booster */
struct DartTrainParam : public dmlc::Parameter<DartTrainParam> {
  /*! \brief whether to not print info during training */
  bool silent;
  /*! \brief type of sampling algorithm */
  int sample_type;
  /*! \brief type of normalization algorithm */
  int normalize_type;
  /*! \brief fraction of trees to drop during the dropout */
  float rate_drop;
  /*! \brief probability of skipping the dropout */
  float skip_drop;
  /*! \brief learning step size for one iteration */
  float learning_rate;
  // declare parameters
  DMLC_DECLARE_PARAMETER(DartTrainParam) {
    DMLC_DECLARE_FIELD(silent).set_default(false)
        .describe("Do not print information during training.");
    DMLC_DECLARE_FIELD(sample_type).set_default(0)
        .add_enum("uniform", 0)
        .add_enum("weighted", 1)
        .describe("Different types of sampling algorithm.");
    DMLC_DECLARE_FIELD(normalize_type).set_default(0)
        .add_enum("tree", 0)
        .add_enum("forest", 1)
        .describe("Different types of normalization algorithm.");
    DMLC_DECLARE_FIELD(rate_drop).set_range(0.0f, 1.0f).set_default(0.0f)
        .describe("Fraction of trees to drop during the dropout.");
    DMLC_DECLARE_FIELD(skip_drop).set_range(0.0f, 1.0f).set_default(0.0f)
        .describe("Probability of skipping the dropout procedure.");
    DMLC_DECLARE_FIELD(learning_rate).set_lower_bound(0.0f).set_default(0.3f)
        .describe("Learning rate (step size) of update.");
    DMLC_DECLARE_ALIAS(learning_rate, eta);
  }
};

/*! \brief model parameters */
struct GBTreeModelParam : public dmlc::Parameter<GBTreeModelParam> {
  /*! \brief number of trees */
@@ -313,8 +351,9 @@ class GBTree : public GradientBooster {
      }
    }
    // commit new trees all at once
-  inline void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
-                          int bst_group) {
+  virtual void
+  CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
+              int bst_group) {
    for (size_t i = 0; i < new_trees.size(); ++i) {
      trees.push_back(std::move(new_trees[i]));
      tree_info.push_back(bst_group);
@@ -475,14 +514,236 @@ class GBTree : public GradientBooster {
  std::vector<std::unique_ptr<TreeUpdater> > updaters;
};

// dart
class Dart : public GBTree {
 public:
  Dart() {}

  void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
    GBTree::Configure(cfg);
    if (trees.size() == 0) {
      dparam.InitAllowUnknown(cfg);
    }
  }

  void Load(dmlc::Stream* fi) override {
    GBTree::Load(fi);
    weight_drop.resize(mparam.num_trees);
    if (mparam.num_trees != 0) {
      fi->Read(&weight_drop);
    }
  }

  void Save(dmlc::Stream* fo) const override {
    GBTree::Save(fo);
    if (weight_drop.size() != 0) {
      fo->Write(weight_drop);
    }
  }

  // predict the leaf scores with dropout if ntree_limit = 0
  void Predict(DMatrix* p_fmat,
               int64_t buffer_offset,
               std::vector<float>* out_preds,
               unsigned ntree_limit) override {
    DropTrees(ntree_limit);
    const MetaInfo& info = p_fmat->info();
    int nthread;
    #pragma omp parallel
    {
      nthread = omp_get_num_threads();
    }
    InitThreadTemp(nthread);
    std::vector<float> &preds = *out_preds;
    const size_t stride = p_fmat->info().num_row * mparam.num_output_group;
    preds.resize(stride * (mparam.size_leaf_vector+1));
    // start collecting the prediction
    dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();

    iter->BeforeFirst();
    while (iter->Next()) {
      const RowBatch &batch = iter->Value();
      // parallel over local batch
      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nsize; ++i) {
        const int tid = omp_get_thread_num();
        RegTree::FVec &feats = thread_temp[tid];
        int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
        CHECK_LT(static_cast<size_t>(ridx), info.num_row);
        // loop over output groups
        for (int gid = 0; gid < mparam.num_output_group; ++gid) {
          this->Pred(batch[i],
                     buffer_offset < 0 ? -1 : buffer_offset + ridx,
                     gid, info.GetRoot(ridx), &feats,
                     &preds[ridx * mparam.num_output_group + gid], stride,
                     ntree_limit);
        }
      }
    }
  }

  void Predict(const SparseBatch::Inst& inst,
               std::vector<float>* out_preds,
               unsigned ntree_limit,
               unsigned root_index) override {
    DropTrees(1);
    if (thread_temp.size() == 0) {
      thread_temp.resize(1, RegTree::FVec());
      thread_temp[0].Init(mparam.num_feature);
    }
    out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
    // loop over output groups
    for (int gid = 0; gid < mparam.num_output_group; ++gid) {
      this->Pred(inst, -1, gid, root_index, &thread_temp[0],
                 &(*out_preds)[gid], mparam.num_output_group,
                 ntree_limit);
    }
  }

 protected:
  // commit new trees all at once
  virtual void
  CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
              int bst_group) {
    for (size_t i = 0; i < new_trees.size(); ++i) {
      trees.push_back(std::move(new_trees[i]));
      tree_info.push_back(bst_group);
    }
    mparam.num_trees += static_cast<int>(new_trees.size());
    size_t num_drop = NormalizeTrees(new_trees.size());
    if (dparam.silent != 1) {
      LOG(INFO) << "drop " << num_drop << " trees, "
                << "weight = " << weight_drop.back();
    }
  }
  // predict the leaf scores without dropped trees
  inline void Pred(const RowBatch::Inst &inst,
                   int64_t buffer_index,
                   int bst_group,
                   unsigned root_index,
                   RegTree::FVec *p_feats,
                   float *out_pred,
                   size_t stride,
                   unsigned ntree_limit) {
    float psum = 0.0f;
    // sum of leaf vector
    std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
    const int64_t bid = this->BufferOffset(buffer_index, bst_group);
    p_feats->Fill(inst);
    for (size_t i = 0; i < trees.size(); ++i) {
      if (tree_info[i] == bst_group) {
        bool drop = (std::find(idx_drop.begin(), idx_drop.end(), i) != idx_drop.end());
        if (!drop) {
          int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
          psum += weight_drop[i] * (*trees[i])[tid].leaf_value();
          for (int j = 0; j < mparam.size_leaf_vector; ++j) {
            vec_psum[j] += weight_drop[i] * trees[i]->leafvec(tid)[j];
          }
        }
      }
    }
    p_feats->Drop(inst);
    // update the buffered results
    if (bid >= 0 && ntree_limit == 0) {
      pred_counter[bid] = static_cast<unsigned>(trees.size());
      pred_buffer[bid] = psum;
      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
        pred_buffer[bid + i + 1] = vec_psum[i];
      }
    }
    out_pred[0] = psum;
    for (int i = 0; i < mparam.size_leaf_vector; ++i) {
      out_pred[stride * (i + 1)] = vec_psum[i];
    }
  }

  // select dropped trees
  inline void DropTrees(unsigned ntree_limit_drop) {
    std::uniform_real_distribution<> runif(0.0, 1.0);
    auto& rnd = common::GlobalRandom();
    // reset
    idx_drop.clear();
    // sample dropped trees
    bool skip = false;
    if (dparam.skip_drop > 0.0) skip = (runif(rnd) < dparam.skip_drop);
    if (ntree_limit_drop == 0 && !skip) {
      if (dparam.sample_type == 1) {
        // weighted sampling: drop trees in proportion to their weight
        float sum_weight = 0.0;
        for (size_t i = 0; i < weight_drop.size(); ++i) {
          sum_weight += weight_drop[i];
        }
        for (size_t i = 0; i < weight_drop.size(); ++i) {
          if (runif(rnd) < dparam.rate_drop * weight_drop.size() * weight_drop[i] / sum_weight) {
            idx_drop.push_back(i);
          }
        }
      } else {
        // uniform sampling: each tree is dropped with probability rate_drop
        for (size_t i = 0; i < weight_drop.size(); ++i) {
          if (runif(rnd) < dparam.rate_drop) {
            idx_drop.push_back(i);
          }
        }
      }
    }
  }
  // set normalization factors
  inline size_t NormalizeTrees(size_t size_new_trees) {
    float lr = 1.0 * dparam.learning_rate / size_new_trees;
    size_t num_drop = idx_drop.size();
    if (num_drop == 0) {
      for (size_t i = 0; i < size_new_trees; ++i) {
        weight_drop.push_back(1.0);
      }
    } else {
      if (dparam.normalize_type == 1) {
        // normalize_type 1 ("forest"): new trees carry the weight of the dropped forest
        float factor = 1.0 / (1.0 + lr);
        for (size_t i = 0; i < idx_drop.size(); ++i) {
          weight_drop[i] *= factor;
        }
        for (size_t i = 0; i < size_new_trees; ++i) {
          weight_drop.push_back(lr * factor);
        }
      } else {
        // normalize_type 0 ("tree"): new trees carry the weight of a single dropped tree
        float factor = 1.0 * num_drop / (num_drop + lr);
        for (size_t i = 0; i < idx_drop.size(); ++i) {
          weight_drop[i] *= factor;
        }
        for (size_t i = 0; i < size_new_trees; ++i) {
          weight_drop.push_back(1.0 * lr / (num_drop + lr));
        }
      }
    }
    // reset
    idx_drop.clear();
    return num_drop;
  }

  // --- data structure ---
  // training parameter
  DartTrainParam dparam;
  /*! \brief per-tree weights, used for dropout and scaling */
  std::vector<float> weight_drop;
  // indexes of dropped trees
  std::vector<size_t> idx_drop;
};

// register the parameters and gradient boosters
DMLC_REGISTER_PARAMETER(GBTreeModelParam);
DMLC_REGISTER_PARAMETER(GBTreeTrainParam);
DMLC_REGISTER_PARAMETER(DartTrainParam);

XGBOOST_REGISTER_GBM(GBTree, "gbtree")
.describe("Tree booster, gradient boosted trees.")
.set_body([]() {
    return new GBTree();
  });
XGBOOST_REGISTER_GBM(Dart, "dart")
.describe("Tree booster, dart.")
.set_body([]() {
    return new Dart();
  });
} // namespace gbm
} // namespace xgboost

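To make the DropTrees/NormalizeTrees logic above easier to follow, here is a small standalone Python sketch of the dropout selection and the weight-normalization formulas documented in doc/parameter.md. It is an illustration only, mirroring the documented formulas rather than the exact C++ code path, and it is not part of the patch.

```python
import random

def select_dropped_trees(weights, rate_drop, skip_drop, sample_type='uniform',
                         rng=random.random):
    """Pick indices of trees to drop for one boosting round."""
    if skip_drop > 0.0 and rng() < skip_drop:
        return []  # dropout skipped: new trees are added as in plain gbtree
    if sample_type == 'weighted':
        total = sum(weights)
        return [i for i, w in enumerate(weights)
                if rng() < rate_drop * len(weights) * w / total]
    return [i for i in range(len(weights)) if rng() < rate_drop]

def normalize_trees(weights, idx_drop, learning_rate, size_new_trees=1,
                    normalize_type='tree'):
    """Scale dropped trees and append weights for the newly added trees."""
    lr = learning_rate / size_new_trees
    k = len(idx_drop)
    if k == 0:
        return weights + [1.0] * size_new_trees
    if normalize_type == 'forest':
        # new trees have the same weight as the sum of the dropped trees
        factor = 1.0 / (1.0 + lr)
        new_weight = lr * factor
    else:
        # 'tree': new trees have the same weight as each dropped tree
        factor = k / (k + lr)
        new_weight = lr / (k + lr)
    scaled = list(weights)
    for i in idx_drop:
        scaled[i] *= factor
    return scaled + [new_weight] * size_new_trees

# tiny worked example: three trees of weight 1.0, trees 0 and 2 dropped
print(normalize_trees([1.0, 1.0, 1.0], idx_drop=[0, 2], learning_rate=0.3))
# -> [0.869..., 1.0, 0.869..., 0.130...]  (k=2: factor 2/2.3, new weight 0.3/2.3)
```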
@@ -23,6 +23,51 @@ class TestModels(unittest.TestCase):
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.1

    def test_dart(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 5, 'objective': 'binary:logistic', 'booster': 'dart', 'silent': False}
        # specify validation set to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, watchlist)
        # this is prediction
        preds = bst.predict(dtest, ntree_limit=num_round)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1

        # save dmatrix into binary buffer
        dtest.save_binary('dtest.buffer')
        # save model
        bst.save_model('xgb.model.dart')
        # load model and data in
        bst2 = xgb.Booster(params=param, model_file='xgb.model.dart')
        dtest2 = xgb.DMatrix('dtest.buffer')
        preds2 = bst2.predict(dtest2, ntree_limit=num_round)
        # assert they are the same
        assert np.sum(np.abs(preds2 - preds)) == 0

        # check whether sample_type and normalize_type work
        num_round = 50
        param['silent'] = True
        param['learning_rate'] = 0.1
        param['rate_drop'] = 0.1
        preds_list = []
        for p in [[p0, p1] for p0 in ['uniform', 'weighted'] for p1 in ['tree', 'forest']]:
            param['sample_type'] = p[0]
            param['normalize_type'] = p[1]
            bst = xgb.train(param, dtrain, num_round, watchlist)
            preds = bst.predict(dtest, ntree_limit=num_round)
            err = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1
            preds_list.append(preds)

        for ii in range(len(preds_list)):
            for jj in range(ii + 1, len(preds_list)):
                assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0

    def test_eta_decay(self):
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 4