Refactor fast-hist, add tests for some updaters. (#3836)
Add unittest for prune. Add unittest for refresh. Refactor fast_hist:
* Remove fast_hist_param.
* Rename to quantile_hist.
* Add unittests for QuantileHist.
* Refactor QuantileHist into .h and .cc files.
* Remove sync.h.
* Remove MGPU_mock test.
* Rename fast hist method to quantile hist.

parent 2b045aa805
commit 19ee0a3579
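The rename is visible through the tree-updater registry: tree_method=hist now maps to grow_quantile_histmaker instead of grow_fast_histmaker. A minimal sketch of the lookup, assuming the TreeUpdater::Create(name) signature of this era; the main() harness is hypothetical:

// Sketch only: Create() consults the registry populated at link time by
// XGBOOST_REGISTER_TREE_UPDATER(...) in updater_quantile_hist.cc.
#include <xgboost/tree_updater.h>
#include <memory>

int main() {
  std::unique_ptr<xgboost::TreeUpdater> updater(
      xgboost::TreeUpdater::Create("grow_quantile_histmaker"));
  // "grow_fast_histmaker" should now fail to resolve: the entry was renamed.
  return updater != nullptr ? 0 : 1;
}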
@@ -48,7 +48,7 @@
 #include "../src/tree/tree_model.cc"
 #include "../src/tree/tree_updater.cc"
 #include "../src/tree/updater_colmaker.cc"
-#include "../src/tree/updater_fast_hist.cc"
+#include "../src/tree/updater_quantile_hist.cc"
 #include "../src/tree/updater_prune.cc"
 #include "../src/tree/updater_refresh.cc"
 #include "../src/tree/updater_sync.cc"
@@ -19,7 +19,6 @@
 #include <cstdio>
 #include <cstring>
 #include <vector>
-#include "./common/sync.h"
 #include "./common/config.h"


@@ -4,10 +4,11 @@
 * \brief Utilities to store histograms
 * \author Philip Cho, Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <dmlc/omp.h>
 #include <numeric>
 #include <vector>
-#include "./sync.h"
 #include "./random.h"
 #include "./column_matrix.h"
 #include "./hist_util.h"
@@ -216,7 +217,7 @@ FindGroups(const std::vector<unsigned>& feature_list,
            const std::vector<size_t>& feature_nnz,
            const ColumnMatrix& colmat,
            size_t nrow,
-           const FastHistParam& param) {
+           const tree::TrainParam& param) {
   /* Goal: Bundle features together that has little or no "overlap", i.e.
            only a few data points should have nonzero values for
            member features.
@@ -278,7 +279,7 @@ FindGroups(const std::vector<unsigned>& feature_list,
 inline std::vector<std::vector<unsigned>>
 FastFeatureGrouping(const GHistIndexMatrix& gmat,
                     const ColumnMatrix& colmat,
-                    const FastHistParam& param) {
+                    const tree::TrainParam& param) {
   const size_t nrow = gmat.row_ptr.size() - 1;
   const size_t nfeature = gmat.cut.row_ptr.size() - 1;

@@ -332,7 +333,7 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,

 void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
                                  const ColumnMatrix& colmat,
-                                 const FastHistParam& param) {
+                                 const tree::TrainParam& param) {
   cut_ = &gmat.cut;

   const size_t nrow = gmat.row_ptr.size() - 1;
@@ -11,7 +11,6 @@
 #include <limits>
 #include <vector>
 #include "row_set.h"
-#include "../tree/fast_hist_param.h"
 #include "../tree/param.h"
 #include "./quantile.h"

@@ -19,8 +18,6 @@ namespace xgboost {

 namespace common {

-using tree::FastHistParam;
-
 /*! \brief sums of gradient statistics corresponding to a histogram bin */
 struct GHistEntry {
   /*! \brief sum of first-order gradient statistics */
@@ -145,7 +142,7 @@ class GHistIndexBlockMatrix {
  public:
   void Init(const GHistIndexMatrix& gmat,
             const ColumnMatrix& colmat,
-            const FastHistParam& param);
+            const tree::TrainParam& param);

   inline GHistIndexBlock operator[](size_t i) const {
     return {blocks_[i].row_ptr_begin, blocks_[i].index_begin};
@@ -9,9 +9,9 @@
 #define XGBOOST_COMMON_IO_H_

 #include <dmlc/io.h>
+#include <rabit/rabit.h>
 #include <string>
 #include <cstring>
-#include "./sync.h"

 namespace xgboost {
 namespace common {
@@ -1,13 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file sync.h
- * \brief the synchronization module of rabit
- *  redirects to rabit header
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_COMMON_SYNC_H_
-#define XGBOOST_COMMON_SYNC_H_
-
-#include <rabit/rabit.h>
-
-#endif  // XGBOOST_COMMON_SYNC_H_
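Since sync.h was a pure redirect to the rabit header, its removal is mechanical: every former include site now pulls in <rabit/rabit.h> directly, as the hunks before and after this one show. A minimal sketch of the unchanged call pattern, assuming rabit is initialized by the surrounding program:

#include <rabit/rabit.h>
#include <cstddef>

// Sum per-bin gradient totals across distributed workers. Only the include
// line changed in this commit; calls like this one are untouched.
void AllreduceHistogram(double* sum_grad, std::size_t nbins) {
  rabit::Allreduce<rabit::op::Sum>(sum_grad, nbins);
}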
@@ -211,8 +211,8 @@ class LearnerImpl : public Learner {
        break;
      case TreeMethod::kHist:
        LOG(CONSOLE) << "Tree method is selected to be 'hist', which uses a "
-                       "single updater grow_fast_histmaker.";
-        cfg_["updater"] = "grow_fast_histmaker";
+                       "single updater grow_quantile_histmaker.";
+        cfg_["updater"] = "grow_quantile_histmaker";
        break;
      case TreeMethod::kGPUExact:
        this->AssertGPUSupport();
@@ -4,9 +4,9 @@
 * \brief Implementation of loggers.
 * \author Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/logging.h>
 #include <iostream>
-#include "./common/sync.h"

 #if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0
 // Override logging mechanism for non-R interfaces
@@ -4,11 +4,11 @@
 * \brief evaluation metrics for elementwise binary or regression.
 * \author Kailong Chen, Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/metric.h>
 #include <dmlc/registry.h>
 #include <cmath>
 #include "../common/math.h"
-#include "../common/sync.h"

 namespace xgboost {
 namespace metric {
@@ -4,9 +4,9 @@
 * \brief evaluation metrics for multiclass classification.
 * \author Kailong Chen, Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/metric.h>
 #include <cmath>
-#include "../common/sync.h"
 #include "../common/math.h"

 namespace xgboost {
@@ -4,10 +4,10 @@
 * \brief prediction rank based metrics.
 * \author Kailong Chen, Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/metric.h>
 #include <dmlc/registry.h>
 #include <cmath>
-#include "../common/sync.h"
 #include "../common/math.h"

 namespace xgboost {
@@ -1,54 +0,0 @@
-/*!
- * Copyright 2017 by Contributors
- * \file updater_fast_hist.h
- * \brief parameters for histogram-based training
- * \author Philip Cho, Tianqi Chen
- */
-#ifndef XGBOOST_TREE_FAST_HIST_PARAM_H_
-#define XGBOOST_TREE_FAST_HIST_PARAM_H_
-
-namespace xgboost {
-namespace tree {
-
-/*! \brief training parameters for histogram-based training */
-struct FastHistParam : public dmlc::Parameter<FastHistParam> {
-  int colmat_dtype;
-  // percentage threshold for treating a feature as sparse
-  // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
-  double sparse_threshold;
-  // use feature grouping? (default yes)
-  int enable_feature_grouping;
-  // when grouping features, how many "conflicts" to allow.
-  // conflict is when an instance has nonzero values for two or more features
-  // default is 0, meaning features should be strictly complementary
-  double max_conflict_rate;
-  // when grouping features, how much effort to expend to prevent singleton groups
-  // we'll try to insert each feature into existing groups before creating a new group
-  // for that feature; to save time, only up to (max_search_group) of existing groups
-  // will be considered. If set to zero, ALL existing groups will be examined
-  unsigned max_search_group;
-
-  // declare the parameters
-  DMLC_DECLARE_PARAMETER(FastHistParam) {
-    DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
-    .describe("percentage threshold for treating a feature as sparse");
-    DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
-    .describe("if >0, enable feature grouping to ameliorate work imbalance "
-              "among worker threads");
-    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
-    .describe("when grouping features, how many \"conflicts\" to allow."
-              "conflict is when an instance has nonzero values for two or more features."
-              "default is 0, meaning features should be strictly complementary.");
-    DMLC_DECLARE_FIELD(max_search_group).set_lower_bound(0).set_default(100)
-    .describe("when grouping features, how much effort to expend to prevent "
-              "singleton groups. We'll try to insert each feature into existing "
-              "groups before creating a new group for that feature; to save time, "
-              "only up to (max_search_group) of existing groups will be "
-              "considered. If set to zero, ALL existing groups will be examined.");
-  }
-};
-
-}  // namespace tree
-}  // namespace xgboost
-
-#endif  // XGBOOST_TREE_FAST_HIST_PARAM_H_
@@ -81,6 +81,23 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
   int gpu_batch_nrows;
   // the criteria to use for ranking splits
   std::string split_evaluator;

+  // ------ From cpu quantile histogram -------.
+  // percentage threshold for treating a feature as sparse
+  // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
+  double sparse_threshold;
+  // use feature grouping? (default yes)
+  int enable_feature_grouping;
+  // when grouping features, how many "conflicts" to allow.
+  // conflict is when an instance has nonzero values for two or more features
+  // default is 0, meaning features should be strictly complementary
+  double max_conflict_rate;
+  // when grouping features, how much effort to expend to prevent singleton groups
+  // we'll try to insert each feature into existing groups before creating a new group
+  // for that feature; to save time, only up to (max_search_group) of existing groups
+  // will be considered. If set to zero, ALL existing groups will be examined
+  unsigned max_search_group;
+
   // declare the parameters
   DMLC_DECLARE_PARAMETER(TrainParam) {
     DMLC_DECLARE_FIELD(learning_rate)
@@ -196,6 +213,24 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
     DMLC_DECLARE_FIELD(split_evaluator)
       .set_default("elastic_net,monotonic,interaction")
       .describe("The criteria to use for ranking splits");

+    // ------ From cpu quantile histogram -------.
+    DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
+    .describe("percentage threshold for treating a feature as sparse");
+    DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
+    .describe("if >0, enable feature grouping to ameliorate work imbalance "
+              "among worker threads");
+    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
+    .describe("when grouping features, how many \"conflicts\" to allow."
+              "conflict is when an instance has nonzero values for two or more features."
+              "default is 0, meaning features should be strictly complementary.");
+    DMLC_DECLARE_FIELD(max_search_group).set_lower_bound(0).set_default(100)
+    .describe("when grouping features, how much effort to expend to prevent "
+              "singleton groups. We'll try to insert each feature into existing "
+              "groups before creating a new group for that feature; to save time, "
+              "only up to (max_search_group) of existing groups will be "
+              "considered. If set to zero, ALL existing groups will be examined.");
+
     // add alias of parameters
     DMLC_DECLARE_ALIAS(reg_lambda, lambda);
     DMLC_DECLARE_ALIAS(reg_alpha, alpha);
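With the fields merged, the former FastHistParam knobs are configured through the one TrainParam object. A minimal sketch of how dmlc::Parameter picks them up, assuming the usual InitAllowUnknown entry point; the ConfigureSketch harness is hypothetical:

#include <string>
#include <utility>
#include <vector>
#include "param.h"  // xgboost::tree::TrainParam

void ConfigureSketch() {
  xgboost::tree::TrainParam param;
  param.InitAllowUnknown(std::vector<std::pair<std::string, std::string>>{
      {"max_depth", "6"},
      {"sparse_threshold", "0.2"},       // formerly on FastHistParam
      {"enable_feature_grouping", "1"},  // formerly on FastHistParam
  });  // unknown keys are tolerated rather than rejected
}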
@@ -518,7 +553,7 @@ struct SplitEntry {
       this->loss_chg = new_loss_chg;
       if (default_left) {
         split_index |= (1U << 31);
       }
       this->sindex = split_index;
       this->split_value = new_split_value;
       return true;
@@ -31,7 +31,7 @@ DMLC_REGISTRY_LINK_TAG(updater_colmaker);
 DMLC_REGISTRY_LINK_TAG(updater_skmaker);
 DMLC_REGISTRY_LINK_TAG(updater_refresh);
 DMLC_REGISTRY_LINK_TAG(updater_prune);
-DMLC_REGISTRY_LINK_TAG(updater_fast_hist);
+DMLC_REGISTRY_LINK_TAG(updater_quantile_hist);
 DMLC_REGISTRY_LINK_TAG(updater_histmaker);
 DMLC_REGISTRY_LINK_TAG(updater_sync);
 #ifdef XGBOOST_USE_CUDA
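The link tag exists to defeat linker dead-stripping: the updater registers itself via a static initializer in its own translation unit, and tree_updater.cc must reference that unit or the initializer may be dropped. A self-contained sketch of the dmlc tag pair being renamed here (both macros shown in one translation unit for brevity):

#include <dmlc/registry.h>

namespace xgboost {
namespace tree {
// In updater_quantile_hist.cc: define the file tag.
DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);
// In tree_updater.cc (this hunk): reference the tag so the object file,
// and with it the XGBOOST_REGISTER_TREE_UPDATER static registration,
// survives linking.
DMLC_REGISTRY_LINK_TAG(updater_quantile_hist);
}  // namespace tree
}  // namespace xgboost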
@@ -7,6 +7,8 @@
 #ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_H_
 #define XGBOOST_TREE_UPDATER_BASEMAKER_INL_H_

+#include <rabit/rabit.h>
+
 #include <xgboost/base.h>
 #include <xgboost/tree_updater.h>
 #include <vector>
@@ -14,8 +16,8 @@
 #include <string>
 #include <limits>
 #include <utility>

 #include "./param.h"
-#include "../common/sync.h"
 #include "../common/io.h"
 #include "../common/random.h"
 #include "../common/quantile.h"
@@ -4,15 +4,16 @@
 * \brief use columnwise update to construct a tree
 * \author Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/tree_updater.h>
 #include <memory>
 #include <vector>
 #include <cmath>
 #include <algorithm>

 #include "./param.h"
 #include "../common/random.h"
 #include "../common/bitmap.h"
-#include "../common/sync.h"
 #include "split_evaluator.h"

 namespace xgboost {
@@ -1,873 +0,0 @@
-/*!
- * Copyright 2017 by Contributors
- * \file updater_fast_hist.cc
- * \brief use quantized feature values to construct a tree
- * \author Philip Cho, Tianqi Checn
- */
-#include <dmlc/timer.h>
-#include <xgboost/tree_updater.h>
-#include <cmath>
-#include <memory>
-#include <vector>
-#include <algorithm>
-#include <queue>
-#include <iomanip>
-#include <numeric>
-#include "./param.h"
-#include "./fast_hist_param.h"
-#include "./split_evaluator.h"
-#include "../common/random.h"
-#include "../common/bitmap.h"
-#include "../common/sync.h"
-#include "../common/hist_util.h"
-#include "../common/row_set.h"
-#include "../common/column_matrix.h"
-
-namespace xgboost {
-namespace tree {
-
-using xgboost::common::HistCutMatrix;
-using xgboost::common::GHistIndexMatrix;
-using xgboost::common::GHistIndexBlockMatrix;
-using xgboost::common::GHistIndexRow;
-using xgboost::common::GHistEntry;
-using xgboost::common::HistCollection;
-using xgboost::common::RowSetCollection;
-using xgboost::common::GHistRow;
-using xgboost::common::GHistBuilder;
-using xgboost::common::ColumnMatrix;
-using xgboost::common::Column;
-
-DMLC_REGISTRY_FILE_TAG(updater_fast_hist);
-
-DMLC_REGISTER_PARAMETER(FastHistParam);
-
-/*! \brief construct a tree using quantized feature values */
-class FastHistMaker: public TreeUpdater {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
-    // initialize pruner
-    if (!pruner_) {
-      pruner_.reset(TreeUpdater::Create("prune"));
-    }
-    pruner_->Init(args);
-    param_.InitAllowUnknown(args);
-    fhparam_.InitAllowUnknown(args);
-    is_gmat_initialized_ = false;
-
-    // initialise the split evaluator
-    if (!spliteval_) {
-      spliteval_.reset(SplitEvaluator::Create(param_.split_evaluator));
-    }
-
-    spliteval_->Init(args);
-  }
-
-  void Update(HostDeviceVector<GradientPair>* gpair,
-              DMatrix* dmat,
-              const std::vector<RegTree*>& trees) override {
-    GradStats::CheckInfo(dmat->Info());
-    if (is_gmat_initialized_ == false) {
-      double tstart = dmlc::GetTime();
-      gmat_.Init(dmat, static_cast<uint32_t>(param_.max_bin));
-      column_matrix_.Init(gmat_, fhparam_.sparse_threshold);
-      if (fhparam_.enable_feature_grouping > 0) {
-        gmatb_.Init(gmat_, column_matrix_, fhparam_);
-      }
-      is_gmat_initialized_ = true;
-      if (param_.debug_verbose > 0) {
-        LOG(INFO) << "Generating gmat: " << dmlc::GetTime() - tstart << " sec";
-      }
-    }
-    // rescale learning rate according to size of trees
-    float lr = param_.learning_rate;
-    param_.learning_rate = lr / trees.size();
-    // build tree
-    if (!builder_) {
-      builder_.reset(new Builder(
-        param_,
-        fhparam_,
-        std::move(pruner_),
-        std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone())));
-    }
-    for (auto tree : trees) {
-      builder_->Update
-        (gmat_, gmatb_, column_matrix_, gpair, dmat, tree);
-    }
-    param_.learning_rate = lr;
-  }
-
-  bool UpdatePredictionCache(const DMatrix* data,
-                             HostDeviceVector<bst_float>* out_preds) override {
-    if (!builder_ || param_.subsample < 1.0f) {
-      return false;
-    } else {
-      return builder_->UpdatePredictionCache(data, out_preds);
-    }
-  }
-
- protected:
-  // training parameter
-  TrainParam param_;
-  FastHistParam fhparam_;
-  // quantized data matrix
-  GHistIndexMatrix gmat_;
-  // (optional) data matrix with feature grouping
-  GHistIndexBlockMatrix gmatb_;
-  // column accessor
-  ColumnMatrix column_matrix_;
-  bool is_gmat_initialized_;
-
-  // data structure
-  struct NodeEntry {
-    /*! \brief statics for node entry */
-    GradStats stats;
-    /*! \brief loss of this node, without split */
-    bst_float root_gain;
-    /*! \brief weight calculated related to current data */
-    float weight;
-    /*! \brief current best solution */
-    SplitEntry best;
-    // constructor
-    explicit NodeEntry(const TrainParam& param)
-        : stats(param), root_gain(0.0f), weight(0.0f) {
-    }
-  };
-  // actual builder that runs the algorithm
-
-  struct Builder {
-   public:
-    // constructor
-    explicit Builder(const TrainParam& param,
-                     const FastHistParam& fhparam,
-                     std::unique_ptr<TreeUpdater> pruner,
-                     std::unique_ptr<SplitEvaluator> spliteval)
-      : param_(param), fhparam_(fhparam), pruner_(std::move(pruner)),
-        spliteval_(std::move(spliteval)), p_last_tree_(nullptr),
-        p_last_fmat_(nullptr) {}
-    // update one tree, growing
-    virtual void Update(const GHistIndexMatrix& gmat,
-                        const GHistIndexBlockMatrix& gmatb,
-                        const ColumnMatrix& column_matrix,
-                        HostDeviceVector<GradientPair>* gpair,
-                        DMatrix* p_fmat,
-                        RegTree* p_tree) {
-      double gstart = dmlc::GetTime();
-
-      int num_leaves = 0;
-      unsigned timestamp = 0;
-
-      double tstart;
-      double time_init_data = 0;
-      double time_init_new_node = 0;
-      double time_build_hist = 0;
-      double time_evaluate_split = 0;
-      double time_apply_split = 0;
-
-      const std::vector<GradientPair>& gpair_h = gpair->ConstHostVector();
-
-      spliteval_->Reset();
-
-      tstart = dmlc::GetTime();
-      this->InitData(gmat, gpair_h, *p_fmat, *p_tree);
-      time_init_data = dmlc::GetTime() - tstart;
-
-      // FIXME(hcho3): this code is broken when param.num_roots > 1. Please fix it
-      CHECK_EQ(p_tree->param.num_roots, 1)
-        << "tree_method=hist does not support multiple roots at this moment";
-      for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
-        tstart = dmlc::GetTime();
-        hist_.AddHistRow(nid);
-        BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid]);
-        time_build_hist += dmlc::GetTime() - tstart;
-
-        tstart = dmlc::GetTime();
-        this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree);
-        time_init_new_node += dmlc::GetTime() - tstart;
-
-        tstart = dmlc::GetTime();
-        this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree);
-        time_evaluate_split += dmlc::GetTime() - tstart;
-        qexpand_->push(ExpandEntry(nid, p_tree->GetDepth(nid),
-                                   snode_[nid].best.loss_chg,
-                                   timestamp++));
-        ++num_leaves;
-      }
-
-      while (!qexpand_->empty()) {
-        const ExpandEntry candidate = qexpand_->top();
-        const int nid = candidate.nid;
-        qexpand_->pop();
-        if (candidate.loss_chg <= kRtEps
-            || (param_.max_depth > 0 && candidate.depth == param_.max_depth)
-            || (param_.max_leaves > 0 && num_leaves == param_.max_leaves) ) {
-          (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
-        } else {
-          tstart = dmlc::GetTime();
-          this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree);
-          time_apply_split += dmlc::GetTime() - tstart;
-
-          tstart = dmlc::GetTime();
-          const int cleft = (*p_tree)[nid].LeftChild();
-          const int cright = (*p_tree)[nid].RightChild();
-          hist_.AddHistRow(cleft);
-          hist_.AddHistRow(cright);
-          if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) {
-            BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft]);
-            SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
-          } else {
-            BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, hist_[cright]);
-            SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
-          }
-          time_build_hist += dmlc::GetTime() - tstart;
-
-          tstart = dmlc::GetTime();
-          this->InitNewNode(cleft, gmat, gpair_h, *p_fmat, *p_tree);
-          this->InitNewNode(cright, gmat, gpair_h, *p_fmat, *p_tree);
-          bst_uint featureid = snode_[nid].best.SplitIndex();
-          spliteval_->AddSplit(nid, cleft, cright, featureid,
-                               snode_[cleft].weight, snode_[cright].weight);
-          time_init_new_node += dmlc::GetTime() - tstart;
-
-          tstart = dmlc::GetTime();
-          this->EvaluateSplit(cleft, gmat, hist_, *p_fmat, *p_tree);
-          this->EvaluateSplit(cright, gmat, hist_, *p_fmat, *p_tree);
-          time_evaluate_split += dmlc::GetTime() - tstart;
-
-          qexpand_->push(ExpandEntry(cleft, p_tree->GetDepth(cleft),
-                                     snode_[cleft].best.loss_chg,
-                                     timestamp++));
-          qexpand_->push(ExpandEntry(cright, p_tree->GetDepth(cright),
-                                     snode_[cright].best.loss_chg,
-                                     timestamp++));
-
-          ++num_leaves;  // give two and take one, as parent is no longer a leaf
-        }
-      }
-
-      // set all the rest expanding nodes to leaf
-      // This post condition is not needed in current code, but may be necessary
-      // when there are stopping rule that leaves qexpand non-empty
-      while (!qexpand_->empty()) {
-        const int nid = qexpand_->top().nid;
-        qexpand_->pop();
-        (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
-      }
-      // remember auxiliary statistics in the tree node
-      for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
-        p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;
-        p_tree->Stat(nid).base_weight = snode_[nid].weight;
-        p_tree->Stat(nid).sum_hess = static_cast<float>(snode_[nid].stats.sum_hess);
-        snode_[nid].stats.SetLeafVec(param_, p_tree->Leafvec(nid));
-      }
-
-      pruner_->Update(gpair, p_fmat, std::vector<RegTree*>{p_tree});
-
-      if (param_.debug_verbose > 0) {
-        double total_time = dmlc::GetTime() - gstart;
-        LOG(INFO) << "\nInitData: "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << time_init_data
-                  << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-                  << time_init_data / total_time * 100 << "%)\n"
-                  << "InitNewNode: "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << time_init_new_node
-                  << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-                  << time_init_new_node / total_time * 100 << "%)\n"
-                  << "BuildHist: "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << time_build_hist
-                  << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-                  << time_build_hist / total_time * 100 << "%)\n"
-                  << "EvaluateSplit: "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << time_evaluate_split
-                  << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-                  << time_evaluate_split / total_time * 100 << "%)\n"
-                  << "ApplySplit: "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << time_apply_split
-                  << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-                  << time_apply_split / total_time * 100 << "%)\n"
-                  << "========================================\n"
-                  << "Total: "
-                  << std::fixed << std::setw(6) << std::setprecision(4) << total_time;
-      }
-    }
-
-    inline void BuildHist(const std::vector<GradientPair>& gpair,
-                          const RowSetCollection::Elem row_indices,
-                          const GHistIndexMatrix& gmat,
-                          const GHistIndexBlockMatrix& gmatb,
-                          GHistRow hist) {
-      if (fhparam_.enable_feature_grouping > 0) {
-        hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, hist);
-      } else {
-        hist_builder_.BuildHist(gpair, row_indices, gmat, hist);
-      }
-    }
-
-    inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
-      hist_builder_.SubtractionTrick(self, sibling, parent);
-    }
-
-    inline bool UpdatePredictionCache(const DMatrix* data,
-                                      HostDeviceVector<bst_float>* p_out_preds) {
-      std::vector<bst_float>& out_preds = p_out_preds->HostVector();
-
-      // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
-      // conjunction with Update().
-      if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) {
-        return false;
-      }
-
-      if (leaf_value_cache_.empty()) {
-        leaf_value_cache_.resize(p_last_tree_->param.num_nodes,
-                                 std::numeric_limits<float>::infinity());
-      }
-
-      CHECK_GT(out_preds.size(), 0U);
-
-      for (const RowSetCollection::Elem rowset : row_set_collection_) {
-        if (rowset.begin != nullptr && rowset.end != nullptr) {
-          int nid = rowset.node_id;
-          bst_float leaf_value;
-          // if a node is marked as deleted by the pruner, traverse upward to locate
-          // a non-deleted leaf.
-          if ((*p_last_tree_)[nid].IsDeleted()) {
-            while ((*p_last_tree_)[nid].IsDeleted()) {
-              nid = (*p_last_tree_)[nid].Parent();
-            }
-            CHECK((*p_last_tree_)[nid].IsLeaf());
-          }
-          leaf_value = (*p_last_tree_)[nid].LeafValue();
-
-          for (const size_t* it = rowset.begin; it < rowset.end; ++it) {
-            out_preds[*it] += leaf_value;
-          }
-        }
-      }
-
-      return true;
-    }
-
-   protected:
-    // initialize temp data structure
-    inline void InitData(const GHistIndexMatrix& gmat,
-                         const std::vector<GradientPair>& gpair,
-                         const DMatrix& fmat,
-                         const RegTree& tree) {
-      CHECK_EQ(tree.param.num_nodes, tree.param.num_roots)
-        << "ColMakerHist: can only grow new tree";
-      CHECK((param_.max_depth > 0 || param_.max_leaves > 0))
-        << "max_depth or max_leaves cannot be both 0 (unlimited); "
-        << "at least one should be a positive quantity.";
-      if (param_.grow_policy == TrainParam::kDepthWise) {
-        CHECK(param_.max_depth > 0) << "max_depth cannot be 0 (unlimited) "
-          << "when grow_policy is depthwise.";
-      }
-      const auto& info = fmat.Info();
-
-      {
-        // initialize the row set
-        row_set_collection_.Clear();
-        // clear local prediction cache
-        leaf_value_cache_.clear();
-        // initialize histogram collection
-        uint32_t nbins = gmat.cut.row_ptr.back();
-        hist_.Init(nbins);
-
-        // initialize histogram builder
-        #pragma omp parallel
-        {
-          this->nthread_ = omp_get_num_threads();
-        }
-        hist_builder_.Init(this->nthread_, nbins);
-
-        CHECK_EQ(info.root_index_.size(), 0U);
-        std::vector<size_t>& row_indices = row_set_collection_.row_indices_;
-        // mark subsample and build list of member rows
-        if (param_.subsample < 1.0f) {
-          std::bernoulli_distribution coin_flip(param_.subsample);
-          auto& rnd = common::GlobalRandom();
-          for (size_t i = 0; i < info.num_row_; ++i) {
-            if (gpair[i].GetHess() >= 0.0f && coin_flip(rnd)) {
-              row_indices.push_back(i);
-            }
-          }
-        } else {
-          for (size_t i = 0; i < info.num_row_; ++i) {
-            if (gpair[i].GetHess() >= 0.0f) {
-              row_indices.push_back(i);
-            }
-          }
-        }
-        row_set_collection_.Init();
-      }
-
-      {
-        /* determine layout of data */
-        const size_t nrow = info.num_row_;
-        const size_t ncol = info.num_col_;
-        const size_t nnz = info.num_nonzero_;
-        // number of discrete bins for feature 0
-        const uint32_t nbins_f0 = gmat.cut.row_ptr[1] - gmat.cut.row_ptr[0];
-        if (nrow * ncol == nnz) {
-          // dense data with zero-based indexing
-          data_layout_ = kDenseDataZeroBased;
-        } else if (nbins_f0 == 0 && nrow * (ncol - 1) == nnz) {
-          // dense data with one-based indexing
-          data_layout_ = kDenseDataOneBased;
-        } else {
-          // sparse data
-          data_layout_ = kSparseData;
-        }
-      }
-      {
-        // store a pointer to the tree
-        p_last_tree_ = &tree;
-        // store a pointer to training data
-        p_last_fmat_ = &fmat;
-        // initialize feature index
-        if (data_layout_ == kDenseDataOneBased) {
-          column_sampler_.Init(info.num_col_, param_.colsample_bylevel,
-                               param_.colsample_bytree, true);
-        } else {
-          column_sampler_.Init(info.num_col_, param_.colsample_bylevel,
-                               param_.colsample_bytree, false);
-        }
-      }
-      if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
-        /* specialized code for dense data:
-           choose the column that has a least positive number of discrete bins.
-           For dense data (with no missing value),
-           the sum of gradient histogram is equal to snode[nid] */
-        const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;
-        const auto nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
-        uint32_t min_nbins_per_feature = 0;
-        for (bst_uint i = 0; i < nfeature; ++i) {
-          const uint32_t nbins = row_ptr[i + 1] - row_ptr[i];
-          if (nbins > 0) {
-            if (min_nbins_per_feature == 0 || min_nbins_per_feature > nbins) {
-              min_nbins_per_feature = nbins;
-              fid_least_bins_ = i;
-            }
-          }
-        }
-        CHECK_GT(min_nbins_per_feature, 0U);
-      }
-      {
-        snode_.reserve(256);
-        snode_.clear();
-      }
-      {
-        if (param_.grow_policy == TrainParam::kLossGuide) {
-          qexpand_.reset(new ExpandQueue(LossGuide));
-        } else {
-          qexpand_.reset(new ExpandQueue(DepthWise));
-        }
-      }
-    }
-
-    inline void EvaluateSplit(int nid,
-                              const GHistIndexMatrix& gmat,
-                              const HistCollection& hist,
-                              const DMatrix& fmat,
-                              const RegTree& tree) {
-      // start enumeration
-      const MetaInfo& info = fmat.Info();
-      const auto& feature_set = column_sampler_.GetFeatureSet(tree.GetDepth(nid)).HostVector();
-      const auto nfeature = static_cast<bst_uint>(feature_set.size());
-      const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
-      best_split_tloc_.resize(nthread);
-      #pragma omp parallel for schedule(static) num_threads(nthread)
-      for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
-        best_split_tloc_[tid] = snode_[nid].best;
-      }
-      #pragma omp parallel for schedule(dynamic) num_threads(nthread)
-      for (bst_omp_uint i = 0; i < nfeature; ++i) {
-        const bst_uint fid = feature_set[i];
-        const unsigned tid = omp_get_thread_num();
-        this->EnumerateSplit(-1, gmat, hist[nid], snode_[nid], info,
-                             &best_split_tloc_[tid], fid, nid);
-        this->EnumerateSplit(+1, gmat, hist[nid], snode_[nid], info,
-                             &best_split_tloc_[tid], fid, nid);
-      }
-      for (unsigned tid = 0; tid < nthread; ++tid) {
-        snode_[nid].best.Update(best_split_tloc_[tid]);
-      }
-    }
-
-    inline void ApplySplit(int nid,
-                           const GHistIndexMatrix& gmat,
-                           const ColumnMatrix& column_matrix,
-                           const HistCollection& hist,
-                           const DMatrix& fmat,
-                           RegTree* p_tree) {
-      // TODO(hcho3): support feature sampling by levels
-
-      /* 1. Create child nodes */
-      NodeEntry& e = snode_[nid];
-
-      p_tree->AddChilds(nid);
-      (*p_tree)[nid].SetSplit(e.best.SplitIndex(), e.best.split_value, e.best.DefaultLeft());
-      // mark right child as 0, to indicate fresh leaf
-      int cleft = (*p_tree)[nid].LeftChild();
-      int cright = (*p_tree)[nid].RightChild();
-      (*p_tree)[cleft].SetLeaf(0.0f, 0);
-      (*p_tree)[cright].SetLeaf(0.0f, 0);
-
-      /* 2. Categorize member rows */
-      const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
-      row_split_tloc_.resize(nthread);
-      for (bst_omp_uint i = 0; i < nthread; ++i) {
-        row_split_tloc_[i].left.clear();
-        row_split_tloc_[i].right.clear();
-      }
-      const bool default_left = (*p_tree)[nid].DefaultLeft();
-      const bst_uint fid = (*p_tree)[nid].SplitIndex();
-      const bst_float split_pt = (*p_tree)[nid].SplitCond();
-      const uint32_t lower_bound = gmat.cut.row_ptr[fid];
-      const uint32_t upper_bound = gmat.cut.row_ptr[fid + 1];
-      int32_t split_cond = -1;
-      // convert floating-point split_pt into corresponding bin_id
-      // split_cond = -1 indicates that split_pt is less than all known cut points
-      CHECK_LT(upper_bound,
-               static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
-      for (uint32_t i = lower_bound; i < upper_bound; ++i) {
-        if (split_pt == gmat.cut.cut[i]) {
-          split_cond = static_cast<int32_t>(i);
-        }
-      }
-
-      const auto& rowset = row_set_collection_[nid];
-
-      Column column = column_matrix.GetColumn(fid);
-      if (column.GetType() == xgboost::common::kDenseColumn) {
-        ApplySplitDenseData(rowset, gmat, &row_split_tloc_, column, split_cond,
-                            default_left);
-      } else {
-        ApplySplitSparseData(rowset, gmat, &row_split_tloc_, column, lower_bound,
-                             upper_bound, split_cond, default_left);
-      }
-
-      row_set_collection_.AddSplit(
-        nid, row_split_tloc_, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild());
-    }
-
-    inline void ApplySplitDenseData(const RowSetCollection::Elem rowset,
-                                    const GHistIndexMatrix& gmat,
-                                    std::vector<RowSetCollection::Split>* p_row_split_tloc,
-                                    const Column& column,
-                                    bst_int split_cond,
-                                    bool default_left) {
-      std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
-      constexpr int kUnroll = 8;  // loop unrolling factor
-      const size_t nrows = rowset.end - rowset.begin;
-      const size_t rest = nrows % kUnroll;
-
-      #pragma omp parallel for num_threads(nthread_) schedule(static)
-      for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) {
-        const bst_uint tid = omp_get_thread_num();
-        auto& left = row_split_tloc[tid].left;
-        auto& right = row_split_tloc[tid].right;
-        size_t rid[kUnroll];
-        uint32_t rbin[kUnroll];
-        for (int k = 0; k < kUnroll; ++k) {
-          rid[k] = rowset.begin[i + k];
-        }
-        for (int k = 0; k < kUnroll; ++k) {
-          rbin[k] = column.GetFeatureBinIdx(rid[k]);
-        }
-        for (int k = 0; k < kUnroll; ++k) {  // NOLINT
-          if (rbin[k] == std::numeric_limits<uint32_t>::max()) {  // missing value
-            if (default_left) {
-              left.push_back(rid[k]);
-            } else {
-              right.push_back(rid[k]);
-            }
-          } else {
-            if (static_cast<int32_t>(rbin[k] + column.GetBaseIdx()) <= split_cond) {
-              left.push_back(rid[k]);
-            } else {
-              right.push_back(rid[k]);
-            }
-          }
-        }
-      }
-      for (size_t i = nrows - rest; i < nrows; ++i) {
-        auto& left = row_split_tloc[nthread_-1].left;
-        auto& right = row_split_tloc[nthread_-1].right;
-        const size_t rid = rowset.begin[i];
-        const uint32_t rbin = column.GetFeatureBinIdx(rid);
-        if (rbin == std::numeric_limits<uint32_t>::max()) {  // missing value
-          if (default_left) {
-            left.push_back(rid);
-          } else {
-            right.push_back(rid);
-          }
-        } else {
-          if (static_cast<int32_t>(rbin + column.GetBaseIdx()) <= split_cond) {
-            left.push_back(rid);
-          } else {
-            right.push_back(rid);
-          }
-        }
-      }
-    }
-
-    inline void ApplySplitSparseData(const RowSetCollection::Elem rowset,
-                                     const GHistIndexMatrix& gmat,
-                                     std::vector<RowSetCollection::Split>* p_row_split_tloc,
-                                     const Column& column,
-                                     bst_uint lower_bound,
-                                     bst_uint upper_bound,
-                                     bst_int split_cond,
-                                     bool default_left) {
-      std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
-      const size_t nrows = rowset.end - rowset.begin;
-
-      #pragma omp parallel num_threads(nthread_)
-      {
-        const auto tid = static_cast<size_t>(omp_get_thread_num());
-        const size_t ibegin = tid * nrows / nthread_;
-        const size_t iend = (tid + 1) * nrows / nthread_;
-        if (ibegin < iend) {  // ensure that [ibegin, iend) is nonempty range
-          // search first nonzero row with index >= rowset[ibegin]
-          const size_t* p = std::lower_bound(column.GetRowData(),
-                                             column.GetRowData() + column.Size(),
-                                             rowset.begin[ibegin]);
-
-          auto& left = row_split_tloc[tid].left;
-          auto& right = row_split_tloc[tid].right;
-          if (p != column.GetRowData() + column.Size() && *p <= rowset.begin[iend - 1]) {
-            size_t cursor = p - column.GetRowData();
-
-            for (size_t i = ibegin; i < iend; ++i) {
-              const size_t rid = rowset.begin[i];
-              while (cursor < column.Size()
-                     && column.GetRowIdx(cursor) < rid
-                     && column.GetRowIdx(cursor) <= rowset.begin[iend - 1]) {
-                ++cursor;
-              }
-              if (cursor < column.Size() && column.GetRowIdx(cursor) == rid) {
-                const uint32_t rbin = column.GetFeatureBinIdx(cursor);
-                if (static_cast<int32_t>(rbin + column.GetBaseIdx()) <= split_cond) {
-                  left.push_back(rid);
-                } else {
-                  right.push_back(rid);
-                }
-                ++cursor;
-              } else {
-                // missing value
-                if (default_left) {
-                  left.push_back(rid);
-                } else {
-                  right.push_back(rid);
-                }
-              }
-            }
-          } else {  // all rows in [ibegin, iend) have missing values
-            if (default_left) {
-              for (size_t i = ibegin; i < iend; ++i) {
-                const size_t rid = rowset.begin[i];
-                left.push_back(rid);
-              }
-            } else {
-              for (size_t i = ibegin; i < iend; ++i) {
-                const size_t rid = rowset.begin[i];
-                right.push_back(rid);
-              }
-            }
-          }
-        }
-      }
-    }
-
-    inline void InitNewNode(int nid,
-                            const GHistIndexMatrix& gmat,
-                            const std::vector<GradientPair>& gpair,
-                            const DMatrix& fmat,
-                            const RegTree& tree) {
-      {
-        snode_.resize(tree.param.num_nodes, NodeEntry(param_));
-      }
-
-      {
-        auto& stats = snode_[nid].stats;
-        if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
-          /* specialized code for dense data
-             For dense data (with no missing value),
-             the sum of gradient histogram is equal to snode[nid] */
-          GHistRow hist = hist_[nid];
-          const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;
-
-          const uint32_t ibegin = row_ptr[fid_least_bins_];
-          const uint32_t iend = row_ptr[fid_least_bins_ + 1];
-          for (uint32_t i = ibegin; i < iend; ++i) {
-            const GHistEntry et = hist.begin[i];
-            stats.Add(et.sum_grad, et.sum_hess);
-          }
-        } else {
-          const RowSetCollection::Elem e = row_set_collection_[nid];
-          for (const size_t* it = e.begin; it < e.end; ++it) {
-            stats.Add(gpair[*it]);
-          }
-        }
-      }
-
-      // calculating the weights
-      {
-        bst_uint parentid = tree[nid].Parent();
-        snode_[nid].weight = static_cast<float>(
-          spliteval_->ComputeWeight(parentid, snode_[nid].stats));
-        snode_[nid].root_gain = static_cast<float>(
-          spliteval_->ComputeScore(parentid, snode_[nid].stats, snode_[nid].weight));
-      }
-    }
-
-    // enumerate the split values of specific feature
-    inline void EnumerateSplit(int d_step,
-                               const GHistIndexMatrix& gmat,
-                               const GHistRow& hist,
-                               const NodeEntry& snode,
-                               const MetaInfo& info,
-                               SplitEntry* p_best,
-                               bst_uint fid,
-                               bst_uint nodeID) {
-      CHECK(d_step == +1 || d_step == -1);
-
-      // aliases
-      const std::vector<uint32_t>& cut_ptr = gmat.cut.row_ptr;
-      const std::vector<bst_float>& cut_val = gmat.cut.cut;
-
-      // statistics on both sides of split
-      GradStats c(param_);
-      GradStats e(param_);
-      // best split so far
-      SplitEntry best;
-
-      // bin boundaries
-      CHECK_LE(cut_ptr[fid],
-               static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
-      CHECK_LE(cut_ptr[fid + 1],
-               static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
-      // imin: index (offset) of the minimum value for feature fid
-      //       need this for backward enumeration
-      const auto imin = static_cast<int32_t>(cut_ptr[fid]);
-      // ibegin, iend: smallest/largest cut points for feature fid
-      // use int to allow for value -1
-      int32_t ibegin, iend;
-      if (d_step > 0) {
-        ibegin = static_cast<int32_t>(cut_ptr[fid]);
-        iend = static_cast<int32_t>(cut_ptr[fid + 1]);
-      } else {
-        ibegin = static_cast<int32_t>(cut_ptr[fid + 1]) - 1;
-        iend = static_cast<int32_t>(cut_ptr[fid]) - 1;
-      }
-
-      for (int32_t i = ibegin; i != iend; i += d_step) {
-        // start working
-        // try to find a split
-        e.Add(hist.begin[i].sum_grad, hist.begin[i].sum_hess);
-        if (e.sum_hess >= param_.min_child_weight) {
-          c.SetSubstract(snode.stats, e);
-          if (c.sum_hess >= param_.min_child_weight) {
-            bst_float loss_chg;
-            bst_float split_pt;
-            if (d_step > 0) {
-              // forward enumeration: split at right bound of each bin
-              loss_chg = static_cast<bst_float>(
-                spliteval_->ComputeSplitScore(nodeID, fid, e, c) -
-                snode.root_gain);
-              split_pt = cut_val[i];
-            } else {
-              // backward enumeration: split at left bound of each bin
-              loss_chg = static_cast<bst_float>(
-                spliteval_->ComputeSplitScore(nodeID, fid, c, e) -
-                snode.root_gain);
-              if (i == imin) {
-                // for leftmost bin, left bound is the smallest feature value
-                split_pt = gmat.cut.min_val[fid];
-              } else {
-                split_pt = cut_val[i - 1];
-              }
-            }
-            best.Update(loss_chg, fid, split_pt, d_step == -1);
-          }
-        }
-      }
-      p_best->Update(best);
-    }
-
-    /* tree growing policies */
-    struct ExpandEntry {
-      int nid;
-      int depth;
-      bst_float loss_chg;
-      unsigned timestamp;
-      ExpandEntry(int nid, int depth, bst_float loss_chg, unsigned tstmp)
-        : nid(nid), depth(depth), loss_chg(loss_chg), timestamp(tstmp) {}
-    };
-    inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) {
-      if (lhs.depth == rhs.depth) {
-        return lhs.timestamp > rhs.timestamp;  // favor small timestamp
-      } else {
-        return lhs.depth > rhs.depth;  // favor small depth
-      }
-    }
-    inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
-      if (lhs.loss_chg == rhs.loss_chg) {
-        return lhs.timestamp > rhs.timestamp;  // favor small timestamp
-      } else {
-        return lhs.loss_chg < rhs.loss_chg;  // favor large loss_chg
-      }
-    }
-
-    // --data fields--
-    const TrainParam& param_;
-    const FastHistParam& fhparam_;
-    // number of omp thread used during training
-    int nthread_;
-    common::ColumnSampler column_sampler_;
-    // the internal row sets
-    RowSetCollection row_set_collection_;
-    // the temp space for split
-    std::vector<RowSetCollection::Split> row_split_tloc_;
-    std::vector<SplitEntry> best_split_tloc_;
-    /*! \brief TreeNode Data: statistics for each constructed node */
-    std::vector<NodeEntry> snode_;
-    /*! \brief culmulative histogram of gradients. */
-    HistCollection hist_;
-    /*! \brief feature with least # of bins. to be used for dense specialization
-               of InitNewNode() */
-    uint32_t fid_least_bins_;
-    /*! \brief local prediction cache; maps node id to leaf value */
-    std::vector<float> leaf_value_cache_;
-
-    GHistBuilder hist_builder_;
-    std::unique_ptr<TreeUpdater> pruner_;
-    std::unique_ptr<SplitEvaluator> spliteval_;
-
-    // back pointers to tree and data matrix
-    const RegTree* p_last_tree_;
-    const DMatrix* p_last_fmat_;
-
-    using ExpandQueue =
-      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
-                          std::function<bool(ExpandEntry, ExpandEntry)>>;
-    std::unique_ptr<ExpandQueue> qexpand_;
-
-    enum DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData };
-    DataLayout data_layout_;
-  };
-
-  std::unique_ptr<Builder> builder_;
-  std::unique_ptr<TreeUpdater> pruner_;
-  std::unique_ptr<SplitEvaluator> spliteval_;
-};
-
-XGBOOST_REGISTER_TREE_UPDATER(FastHistMaker, "grow_fast_histmaker")
-.describe("Grow tree using quantized histogram.")
-.set_body([]() {
-    return new FastHistMaker();
-  });
-
-}  // namespace tree
-}  // namespace xgboost
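The builder above (re-added nearly verbatim as QuantileHistMaker in the new file below) leans on the histogram subtraction trick: only the smaller child's histogram is accumulated from raw rows, while the sibling's is derived as parent minus child. A minimal sketch of the idea, with plain structs standing in for GHistRow:

#include <cstddef>

struct GradEntry { double sum_grad = 0.0, sum_hess = 0.0; };

// hist(child) = hist(parent) - hist(sibling), bin by bin; this roughly halves
// the row scans needed per split, mirroring hist_builder_.SubtractionTrick.
void SubtractionTrickSketch(GradEntry* self, const GradEntry* sibling,
                            const GradEntry* parent, std::size_t nbins) {
  for (std::size_t i = 0; i < nbins; ++i) {
    self[i].sum_grad = parent[i].sum_grad - sibling[i].sum_grad;
    self[i].sum_hess = parent[i].sum_hess - sibling[i].sum_hess;
  }
}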
@@ -4,11 +4,12 @@
 * \brief use histogram counting to construct a tree
 * \author Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/base.h>
 #include <xgboost/tree_updater.h>
 #include <vector>
 #include <algorithm>
-#include "../common/sync.h"
 #include "../common/quantile.h"
 #include "../common/group_data.h"
 #include "./updater_basemaker-inl.h"
@@ -4,12 +4,13 @@
 * \brief prune a tree given the statistics
 * \author Tianqi Chen
 */
+#include <rabit/rabit.h>
 #include <xgboost/tree_updater.h>

 #include <string>
 #include <memory>

 #include "./param.h"
-#include "../common/sync.h"
 #include "../common/io.h"

 namespace xgboost {
src/tree/updater_quantile_hist.cc (new file, 748 lines)
@@ -0,0 +1,748 @@
|
|||||||
/*!
 * Copyright 2017-2018 by Contributors
 * \file updater_quantile_hist.cc
 * \brief use quantized feature values to construct a tree
 * \author Philip Cho, Tianqi Chen
 */
#include <dmlc/timer.h>
#include <rabit/rabit.h>
#include <xgboost/tree_updater.h>

#include <cmath>
#include <memory>
#include <vector>
#include <algorithm>
#include <queue>
#include <iomanip>
#include <numeric>
#include <string>
#include <utility>

#include "./param.h"
#include "./updater_quantile_hist.h"
#include "./split_evaluator.h"
#include "../common/random.h"
#include "../common/hist_util.h"
#include "../common/row_set.h"
#include "../common/column_matrix.h"

namespace xgboost {
namespace tree {

DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);

void QuantileHistMaker::Init(const std::vector<std::pair<std::string, std::string> >& args) {
  // initialize pruner
  if (!pruner_) {
    pruner_.reset(TreeUpdater::Create("prune"));
  }
  pruner_->Init(args);
  param_.InitAllowUnknown(args);
  is_gmat_initialized_ = false;

  // initialise the split evaluator
  if (!spliteval_) {
    spliteval_.reset(SplitEvaluator::Create(param_.split_evaluator));
  }

  spliteval_->Init(args);
}

void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair,
                               DMatrix *dmat,
                               const std::vector<RegTree *> &trees) {
  GradStats::CheckInfo(dmat->Info());
  if (is_gmat_initialized_ == false) {
    double tstart = dmlc::GetTime();
    gmat_.Init(dmat, static_cast<uint32_t>(param_.max_bin));
    column_matrix_.Init(gmat_, param_.sparse_threshold);
    if (param_.enable_feature_grouping > 0) {
      gmatb_.Init(gmat_, column_matrix_, param_);
    }
    is_gmat_initialized_ = true;
    if (param_.debug_verbose > 0) {
      LOG(INFO) << "Generating gmat: " << dmlc::GetTime() - tstart << " sec";
    }
  }
  // rescale learning rate according to size of trees
  float lr = param_.learning_rate;
  param_.learning_rate = lr / trees.size();
  // build tree
  if (!builder_) {
    builder_.reset(new Builder(
        param_,
        std::move(pruner_),
        std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone())));
  }
  for (auto tree : trees) {
    builder_->Update(gmat_, gmatb_, column_matrix_, gpair, dmat, tree);
  }
  param_.learning_rate = lr;
}

bool QuantileHistMaker::UpdatePredictionCache(
    const DMatrix* data,
    HostDeviceVector<bst_float>* out_preds) {
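  // Note (added): with row subsampling (subsample < 1), the builder's row sets
  // only cover the sampled rows, so the cache could not be filled for every
  // row; in that case fall back to the regular prediction path.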
  if (!builder_ || param_.subsample < 1.0f) {
    return false;
  } else {
    return builder_->UpdatePredictionCache(data, out_preds);
  }
}

void QuantileHistMaker::Builder::Update(const GHistIndexMatrix& gmat,
                                        const GHistIndexBlockMatrix& gmatb,
                                        const ColumnMatrix& column_matrix,
                                        HostDeviceVector<GradientPair>* gpair,
                                        DMatrix* p_fmat,
                                        RegTree* p_tree) {
  double gstart = dmlc::GetTime();

  int num_leaves = 0;
  unsigned timestamp = 0;

  double tstart;
  double time_init_data = 0;
  double time_init_new_node = 0;
  double time_build_hist = 0;
  double time_evaluate_split = 0;
  double time_apply_split = 0;

  const std::vector<GradientPair>& gpair_h = gpair->ConstHostVector();

  spliteval_->Reset();

  tstart = dmlc::GetTime();
  this->InitData(gmat, gpair_h, *p_fmat, *p_tree);
  time_init_data = dmlc::GetTime() - tstart;

  // FIXME(hcho3): this code is broken when param.num_roots > 1. Please fix it
  CHECK_EQ(p_tree->param.num_roots, 1)
      << "tree_method=hist does not support multiple roots at this moment";
  for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
    tstart = dmlc::GetTime();
    hist_.AddHistRow(nid);
    BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid]);
    time_build_hist += dmlc::GetTime() - tstart;

    tstart = dmlc::GetTime();
    this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree);
    time_init_new_node += dmlc::GetTime() - tstart;

    tstart = dmlc::GetTime();
    this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree);
    time_evaluate_split += dmlc::GetTime() - tstart;
    qexpand_->push(ExpandEntry(nid, p_tree->GetDepth(nid),
                               snode_[nid].best.loss_chg,
                               timestamp++));
    ++num_leaves;
  }

  while (!qexpand_->empty()) {
    const ExpandEntry candidate = qexpand_->top();
    const int nid = candidate.nid;
    qexpand_->pop();
    if (candidate.loss_chg <= kRtEps
        || (param_.max_depth > 0 && candidate.depth == param_.max_depth)
        || (param_.max_leaves > 0 && num_leaves == param_.max_leaves)) {
      (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
    } else {
      tstart = dmlc::GetTime();
      this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree);
      time_apply_split += dmlc::GetTime() - tstart;

      tstart = dmlc::GetTime();
      const int cleft = (*p_tree)[nid].LeftChild();
      const int cright = (*p_tree)[nid].RightChild();
      hist_.AddHistRow(cleft);
      hist_.AddHistRow(cright);
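      // Note (added): a parent's histogram is the elementwise sum of its
      // children's, so it suffices to build the histogram for the smaller
      // child and obtain the sibling's by subtraction from the parent.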
      if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) {
        BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft]);
        SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
      } else {
        BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, hist_[cright]);
        SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
      }
      time_build_hist += dmlc::GetTime() - tstart;

      tstart = dmlc::GetTime();
      this->InitNewNode(cleft, gmat, gpair_h, *p_fmat, *p_tree);
      this->InitNewNode(cright, gmat, gpair_h, *p_fmat, *p_tree);
      bst_uint featureid = snode_[nid].best.SplitIndex();
      spliteval_->AddSplit(nid, cleft, cright, featureid,
                           snode_[cleft].weight, snode_[cright].weight);
      time_init_new_node += dmlc::GetTime() - tstart;

      tstart = dmlc::GetTime();
      this->EvaluateSplit(cleft, gmat, hist_, *p_fmat, *p_tree);
      this->EvaluateSplit(cright, gmat, hist_, *p_fmat, *p_tree);
      time_evaluate_split += dmlc::GetTime() - tstart;

      qexpand_->push(ExpandEntry(cleft, p_tree->GetDepth(cleft),
                                 snode_[cleft].best.loss_chg,
                                 timestamp++));
      qexpand_->push(ExpandEntry(cright, p_tree->GetDepth(cright),
                                 snode_[cright].best.loss_chg,
                                 timestamp++));

      ++num_leaves;  // give two and take one, as parent is no longer a leaf
    }
  }

  // set all the rest expanding nodes to leaf
  // This post condition is not needed in current code, but may be necessary
  // when there is a stopping rule that leaves qexpand non-empty
  while (!qexpand_->empty()) {
    const int nid = qexpand_->top().nid;
    qexpand_->pop();
    (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
  }
  // remember auxiliary statistics in the tree node
  for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
    p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;
    p_tree->Stat(nid).base_weight = snode_[nid].weight;
    p_tree->Stat(nid).sum_hess = static_cast<float>(snode_[nid].stats.sum_hess);
    snode_[nid].stats.SetLeafVec(param_, p_tree->Leafvec(nid));
  }

  pruner_->Update(gpair, p_fmat, std::vector<RegTree*>{p_tree});

  if (param_.debug_verbose > 0) {
    double total_time = dmlc::GetTime() - gstart;
    LOG(INFO) << "\nInitData: "
              << std::fixed << std::setw(6) << std::setprecision(4) << time_init_data
              << " (" << std::fixed << std::setw(5) << std::setprecision(2)
              << time_init_data / total_time * 100 << "%)\n"
              << "InitNewNode: "
              << std::fixed << std::setw(6) << std::setprecision(4) << time_init_new_node
              << " (" << std::fixed << std::setw(5) << std::setprecision(2)
              << time_init_new_node / total_time * 100 << "%)\n"
              << "BuildHist: "
              << std::fixed << std::setw(6) << std::setprecision(4) << time_build_hist
              << " (" << std::fixed << std::setw(5) << std::setprecision(2)
              << time_build_hist / total_time * 100 << "%)\n"
              << "EvaluateSplit: "
              << std::fixed << std::setw(6) << std::setprecision(4) << time_evaluate_split
              << " (" << std::fixed << std::setw(5) << std::setprecision(2)
              << time_evaluate_split / total_time * 100 << "%)\n"
              << "ApplySplit: "
              << std::fixed << std::setw(6) << std::setprecision(4) << time_apply_split
              << " (" << std::fixed << std::setw(5) << std::setprecision(2)
              << time_apply_split / total_time * 100 << "%)\n"
              << "========================================\n"
              << "Total: "
              << std::fixed << std::setw(6) << std::setprecision(4) << total_time;
  }
}

bool QuantileHistMaker::Builder::UpdatePredictionCache(
    const DMatrix* data,
    HostDeviceVector<bst_float>* p_out_preds) {
  std::vector<bst_float>& out_preds = p_out_preds->HostVector();

  // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
  // conjunction with Update().
  if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) {
    return false;
  }

  if (leaf_value_cache_.empty()) {
    leaf_value_cache_.resize(p_last_tree_->param.num_nodes,
                             std::numeric_limits<float>::infinity());
  }

  CHECK_GT(out_preds.size(), 0U);

  for (const RowSetCollection::Elem rowset : row_set_collection_) {
    if (rowset.begin != nullptr && rowset.end != nullptr) {
      int nid = rowset.node_id;
      bst_float leaf_value;
      // if a node is marked as deleted by the pruner, traverse upward to locate
      // a non-deleted leaf.
      if ((*p_last_tree_)[nid].IsDeleted()) {
        while ((*p_last_tree_)[nid].IsDeleted()) {
          nid = (*p_last_tree_)[nid].Parent();
        }
        CHECK((*p_last_tree_)[nid].IsLeaf());
      }
      leaf_value = (*p_last_tree_)[nid].LeafValue();

      for (const size_t* it = rowset.begin; it < rowset.end; ++it) {
        out_preds[*it] += leaf_value;
      }
    }
  }

  return true;
}

void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat,
                                          const std::vector<GradientPair>& gpair,
                                          const DMatrix& fmat,
                                          const RegTree& tree) {
  CHECK_EQ(tree.param.num_nodes, tree.param.num_roots)
      << "ColMakerHist: can only grow new tree";
  CHECK((param_.max_depth > 0 || param_.max_leaves > 0))
      << "max_depth or max_leaves cannot be both 0 (unlimited); "
      << "at least one should be a positive quantity.";
  if (param_.grow_policy == TrainParam::kDepthWise) {
    CHECK(param_.max_depth > 0) << "max_depth cannot be 0 (unlimited) "
                                << "when grow_policy is depthwise.";
  }
  const auto& info = fmat.Info();

  {
    // initialize the row set
    row_set_collection_.Clear();
    // clear local prediction cache
    leaf_value_cache_.clear();
    // initialize histogram collection
    uint32_t nbins = gmat.cut.row_ptr.back();
    hist_.Init(nbins);

    // initialize histogram builder
    #pragma omp parallel
    {
      this->nthread_ = omp_get_num_threads();
    }
    hist_builder_.Init(this->nthread_, nbins);

    CHECK_EQ(info.root_index_.size(), 0U);
    std::vector<size_t>& row_indices = row_set_collection_.row_indices_;
    // mark subsample and build list of member rows
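    // (added note: rows with a negative hessian are skipped below; an
    // objective can effectively exclude an instance from tree construction
    // this way)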
    if (param_.subsample < 1.0f) {
      std::bernoulli_distribution coin_flip(param_.subsample);
      auto& rnd = common::GlobalRandom();
      for (size_t i = 0; i < info.num_row_; ++i) {
        if (gpair[i].GetHess() >= 0.0f && coin_flip(rnd)) {
          row_indices.push_back(i);
        }
      }
    } else {
      for (size_t i = 0; i < info.num_row_; ++i) {
        if (gpair[i].GetHess() >= 0.0f) {
          row_indices.push_back(i);
        }
      }
    }
    row_set_collection_.Init();
  }

  {
    /* determine layout of data */
    const size_t nrow = info.num_row_;
    const size_t ncol = info.num_col_;
    const size_t nnz = info.num_nonzero_;
    // number of discrete bins for feature 0
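    // (added note: a dense matrix written with one-based feature indices,
    //  e.g. loaded from a LIBSVM file indexed from 1, leaves feature 0 empty,
    //  hence the nbins_f0 == 0 && nrow * (ncol - 1) == nnz test below)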
    const uint32_t nbins_f0 = gmat.cut.row_ptr[1] - gmat.cut.row_ptr[0];
    if (nrow * ncol == nnz) {
      // dense data with zero-based indexing
      data_layout_ = kDenseDataZeroBased;
    } else if (nbins_f0 == 0 && nrow * (ncol - 1) == nnz) {
      // dense data with one-based indexing
      data_layout_ = kDenseDataOneBased;
    } else {
      // sparse data
      data_layout_ = kSparseData;
    }
  }
  {
    // store a pointer to the tree
    p_last_tree_ = &tree;
    // store a pointer to training data
    p_last_fmat_ = &fmat;
    // initialize feature index
    if (data_layout_ == kDenseDataOneBased) {
      column_sampler_.Init(info.num_col_, param_.colsample_bylevel,
                           param_.colsample_bytree, true);
    } else {
      column_sampler_.Init(info.num_col_, param_.colsample_bylevel,
                           param_.colsample_bytree, false);
    }
  }
  if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
    /* specialized code for dense data:
       choose the column that has a least positive number of discrete bins.
       For dense data (with no missing value),
       the sum of gradient histogram is equal to snode[nid] */
    const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;
    const auto nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
    uint32_t min_nbins_per_feature = 0;
    for (bst_uint i = 0; i < nfeature; ++i) {
      const uint32_t nbins = row_ptr[i + 1] - row_ptr[i];
      if (nbins > 0) {
        if (min_nbins_per_feature == 0 || min_nbins_per_feature > nbins) {
          min_nbins_per_feature = nbins;
          fid_least_bins_ = i;
        }
      }
    }
    CHECK_GT(min_nbins_per_feature, 0U);
  }
  {
    snode_.reserve(256);
    snode_.clear();
  }
  {
    if (param_.grow_policy == TrainParam::kLossGuide) {
      qexpand_.reset(new ExpandQueue(LossGuide));
    } else {
      qexpand_.reset(new ExpandQueue(DepthWise));
    }
  }
}

void QuantileHistMaker::Builder::EvaluateSplit(int nid,
                                               const GHistIndexMatrix& gmat,
                                               const HistCollection& hist,
                                               const DMatrix& fmat,
                                               const RegTree& tree) {
  // start enumeration
  const MetaInfo& info = fmat.Info();
  const auto& feature_set = column_sampler_.GetFeatureSet(
      tree.GetDepth(nid)).HostVector();
  const auto nfeature = static_cast<bst_uint>(feature_set.size());
  const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
  best_split_tloc_.resize(nthread);
  #pragma omp parallel for schedule(static) num_threads(nthread)
  for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
    best_split_tloc_[tid] = snode_[nid].best;
  }
  #pragma omp parallel for schedule(dynamic) num_threads(nthread)
  for (bst_omp_uint i = 0; i < nfeature; ++i) {
    const bst_uint fid = feature_set[i];
    const unsigned tid = omp_get_thread_num();
    this->EnumerateSplit(-1, gmat, hist[nid], snode_[nid], info,
                         &best_split_tloc_[tid], fid, nid);
    this->EnumerateSplit(+1, gmat, hist[nid], snode_[nid], info,
                         &best_split_tloc_[tid], fid, nid);
  }
  for (unsigned tid = 0; tid < nthread; ++tid) {
    snode_[nid].best.Update(best_split_tloc_[tid]);
  }
}

void QuantileHistMaker::Builder::ApplySplit(int nid,
                                            const GHistIndexMatrix& gmat,
                                            const ColumnMatrix& column_matrix,
                                            const HistCollection& hist,
                                            const DMatrix& fmat,
                                            RegTree* p_tree) {
  // TODO(hcho3): support feature sampling by levels

  /* 1. Create child nodes */
  NodeEntry& e = snode_[nid];

  p_tree->AddChilds(nid);
  (*p_tree)[nid].SetSplit(e.best.SplitIndex(), e.best.split_value, e.best.DefaultLeft());
  // mark right child as 0, to indicate fresh leaf
  int cleft = (*p_tree)[nid].LeftChild();
  int cright = (*p_tree)[nid].RightChild();
  (*p_tree)[cleft].SetLeaf(0.0f, 0);
  (*p_tree)[cright].SetLeaf(0.0f, 0);

  /* 2. Categorize member rows */
  const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
  row_split_tloc_.resize(nthread);
  for (bst_omp_uint i = 0; i < nthread; ++i) {
    row_split_tloc_[i].left.clear();
    row_split_tloc_[i].right.clear();
  }
  const bool default_left = (*p_tree)[nid].DefaultLeft();
  const bst_uint fid = (*p_tree)[nid].SplitIndex();
  const bst_float split_pt = (*p_tree)[nid].SplitCond();
  const uint32_t lower_bound = gmat.cut.row_ptr[fid];
  const uint32_t upper_bound = gmat.cut.row_ptr[fid + 1];
  int32_t split_cond = -1;
  // convert floating-point split_pt into corresponding bin_id
  // split_cond = -1 indicates that split_pt is less than all known cut points
  CHECK_LT(upper_bound,
           static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
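  // (added note: a feature's cut points are stored in ascending order in
  //  gmat.cut.cut, so in principle std::lower_bound could replace this scan;
  //  the range has at most max_bin entries, so a linear pass stays cheap)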
  for (uint32_t i = lower_bound; i < upper_bound; ++i) {
    if (split_pt == gmat.cut.cut[i]) {
      split_cond = static_cast<int32_t>(i);
    }
  }

  const auto& rowset = row_set_collection_[nid];

  Column column = column_matrix.GetColumn(fid);
  if (column.GetType() == xgboost::common::kDenseColumn) {
    ApplySplitDenseData(rowset, gmat, &row_split_tloc_, column, split_cond,
                        default_left);
  } else {
    ApplySplitSparseData(rowset, gmat, &row_split_tloc_, column, lower_bound,
                         upper_bound, split_cond, default_left);
  }

  row_set_collection_.AddSplit(
      nid, row_split_tloc_, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild());
}

void QuantileHistMaker::Builder::ApplySplitDenseData(
    const RowSetCollection::Elem rowset,
    const GHistIndexMatrix& gmat,
    std::vector<RowSetCollection::Split>* p_row_split_tloc,
    const Column& column,
    bst_int split_cond,
    bool default_left) {
  std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
  constexpr int kUnroll = 8;  // loop unrolling factor
  const size_t nrows = rowset.end - rowset.begin;
  const size_t rest = nrows % kUnroll;

  #pragma omp parallel for num_threads(nthread_) schedule(static)
  for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) {
    const bst_uint tid = omp_get_thread_num();
    auto& left = row_split_tloc[tid].left;
    auto& right = row_split_tloc[tid].right;
    size_t rid[kUnroll];
    uint32_t rbin[kUnroll];
    for (int k = 0; k < kUnroll; ++k) {
      rid[k] = rowset.begin[i + k];
    }
    for (int k = 0; k < kUnroll; ++k) {
      rbin[k] = column.GetFeatureBinIdx(rid[k]);
    }
    for (int k = 0; k < kUnroll; ++k) {  // NOLINT
      if (rbin[k] == std::numeric_limits<uint32_t>::max()) {  // missing value
        if (default_left) {
          left.push_back(rid[k]);
        } else {
          right.push_back(rid[k]);
        }
      } else {
        if (static_cast<int32_t>(rbin[k] + column.GetBaseIdx()) <= split_cond) {
          left.push_back(rid[k]);
        } else {
          right.push_back(rid[k]);
        }
      }
    }
  }
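  // (added note: the remainder rows, nrows % kUnroll of them, are handled
  //  serially and attributed to the last thread's row lists)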
  for (size_t i = nrows - rest; i < nrows; ++i) {
    auto& left = row_split_tloc[nthread_-1].left;
    auto& right = row_split_tloc[nthread_-1].right;
    const size_t rid = rowset.begin[i];
    const uint32_t rbin = column.GetFeatureBinIdx(rid);
    if (rbin == std::numeric_limits<uint32_t>::max()) {  // missing value
      if (default_left) {
        left.push_back(rid);
      } else {
        right.push_back(rid);
      }
    } else {
      if (static_cast<int32_t>(rbin + column.GetBaseIdx()) <= split_cond) {
        left.push_back(rid);
      } else {
        right.push_back(rid);
      }
    }
  }
}

void QuantileHistMaker::Builder::ApplySplitSparseData(
    const RowSetCollection::Elem rowset,
    const GHistIndexMatrix& gmat,
    std::vector<RowSetCollection::Split>* p_row_split_tloc,
    const Column& column,
    bst_uint lower_bound,
    bst_uint upper_bound,
    bst_int split_cond,
    bool default_left) {
  std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
  const size_t nrows = rowset.end - rowset.begin;
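  // (added note: the node's rows are partitioned evenly by index range among
  //  threads; each thread binary-searches its first candidate in the column's
  //  sorted nonzero row indices, then advances a cursor in lock step)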
  #pragma omp parallel num_threads(nthread_)
  {
    const auto tid = static_cast<size_t>(omp_get_thread_num());
    const size_t ibegin = tid * nrows / nthread_;
    const size_t iend = (tid + 1) * nrows / nthread_;
    if (ibegin < iend) {  // ensure that [ibegin, iend) is nonempty range
      // search first nonzero row with index >= rowset[ibegin]
      const size_t* p = std::lower_bound(column.GetRowData(),
                                         column.GetRowData() + column.Size(),
                                         rowset.begin[ibegin]);

      auto& left = row_split_tloc[tid].left;
      auto& right = row_split_tloc[tid].right;
      if (p != column.GetRowData() + column.Size() && *p <= rowset.begin[iend - 1]) {
        size_t cursor = p - column.GetRowData();

        for (size_t i = ibegin; i < iend; ++i) {
          const size_t rid = rowset.begin[i];
          while (cursor < column.Size()
                 && column.GetRowIdx(cursor) < rid
                 && column.GetRowIdx(cursor) <= rowset.begin[iend - 1]) {
            ++cursor;
          }
          if (cursor < column.Size() && column.GetRowIdx(cursor) == rid) {
            const uint32_t rbin = column.GetFeatureBinIdx(cursor);
            if (static_cast<int32_t>(rbin + column.GetBaseIdx()) <= split_cond) {
              left.push_back(rid);
            } else {
              right.push_back(rid);
            }
            ++cursor;
          } else {
            // missing value
            if (default_left) {
              left.push_back(rid);
            } else {
              right.push_back(rid);
            }
          }
        }
      } else {  // all rows in [ibegin, iend) have missing values
        if (default_left) {
          for (size_t i = ibegin; i < iend; ++i) {
            const size_t rid = rowset.begin[i];
            left.push_back(rid);
          }
        } else {
          for (size_t i = ibegin; i < iend; ++i) {
            const size_t rid = rowset.begin[i];
            right.push_back(rid);
          }
        }
      }
    }
  }
}

void QuantileHistMaker::Builder::InitNewNode(int nid,
                                             const GHistIndexMatrix& gmat,
                                             const std::vector<GradientPair>& gpair,
                                             const DMatrix& fmat,
                                             const RegTree& tree) {
  {
    snode_.resize(tree.param.num_nodes, NodeEntry(param_));
  }

  {
    auto& stats = snode_[nid].stats;
    if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
      /* specialized code for dense data
         For dense data (with no missing value),
         the sum of gradient histogram is equal to snode[nid] */
      GHistRow hist = hist_[nid];
      const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;

      const uint32_t ibegin = row_ptr[fid_least_bins_];
      const uint32_t iend = row_ptr[fid_least_bins_ + 1];
      for (uint32_t i = ibegin; i < iend; ++i) {
        const GHistEntry et = hist.begin[i];
        stats.Add(et.sum_grad, et.sum_hess);
      }
    } else {
      const RowSetCollection::Elem e = row_set_collection_[nid];
      for (const size_t* it = e.begin; it < e.end; ++it) {
        stats.Add(gpair[*it]);
      }
    }
  }

  // calculating the weights
  {
    bst_uint parentid = tree[nid].Parent();
    snode_[nid].weight = static_cast<float>(
        spliteval_->ComputeWeight(parentid, snode_[nid].stats));
    snode_[nid].root_gain = static_cast<float>(
        spliteval_->ComputeScore(parentid, snode_[nid].stats, snode_[nid].weight));
  }
}

// enumerate the split values of specific feature
void QuantileHistMaker::Builder::EnumerateSplit(int d_step,
                                                const GHistIndexMatrix& gmat,
                                                const GHistRow& hist,
                                                const NodeEntry& snode,
                                                const MetaInfo& info,
                                                SplitEntry* p_best,
                                                bst_uint fid,
                                                bst_uint nodeID) {
  CHECK(d_step == +1 || d_step == -1);

  // aliases
  const std::vector<uint32_t>& cut_ptr = gmat.cut.row_ptr;
  const std::vector<bst_float>& cut_val = gmat.cut.cut;

  // statistics on both sides of split
  GradStats c(param_);
  GradStats e(param_);
  // best split so far
  SplitEntry best;

  // bin boundaries
  CHECK_LE(cut_ptr[fid],
           static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
  CHECK_LE(cut_ptr[fid + 1],
           static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
  // imin: index (offset) of the minimum value for feature fid
  // need this for backward enumeration
  const auto imin = static_cast<int32_t>(cut_ptr[fid]);
  // ibegin, iend: smallest/largest cut points for feature fid
  // use int to allow for value -1
  int32_t ibegin, iend;
  if (d_step > 0) {
    ibegin = static_cast<int32_t>(cut_ptr[fid]);
    iend = static_cast<int32_t>(cut_ptr[fid + 1]);
  } else {
    ibegin = static_cast<int32_t>(cut_ptr[fid + 1]) - 1;
    iend = static_cast<int32_t>(cut_ptr[fid]) - 1;
  }

  for (int32_t i = ibegin; i != iend; i += d_step) {
    // start working
    // try to find a split
    e.Add(hist.begin[i].sum_grad, hist.begin[i].sum_hess);
    if (e.sum_hess >= param_.min_child_weight) {
      c.SetSubstract(snode.stats, e);
      if (c.sum_hess >= param_.min_child_weight) {
        bst_float loss_chg;
        bst_float split_pt;
        if (d_step > 0) {
          // forward enumeration: split at right bound of each bin
          loss_chg = static_cast<bst_float>(
              spliteval_->ComputeSplitScore(nodeID, fid, e, c) -
              snode.root_gain);
          split_pt = cut_val[i];
        } else {
          // backward enumeration: split at left bound of each bin
          loss_chg = static_cast<bst_float>(
              spliteval_->ComputeSplitScore(nodeID, fid, c, e) -
              snode.root_gain);
          if (i == imin) {
            // for leftmost bin, left bound is the smallest feature value
            split_pt = gmat.cut.min_val[fid];
          } else {
            split_pt = cut_val[i - 1];
          }
        }
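        // (added note: the final argument records the default direction for
        //  missing values; backward enumeration (d_step == -1) corresponds to
        //  sending missing values to the left child)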
        best.Update(loss_chg, fid, split_pt, d_step == -1);
      }
    }
  }
  p_best->Update(best);
}

XGBOOST_REGISTER_TREE_UPDATER(FastHistMaker, "grow_fast_histmaker")
.describe("(Deprecated, use grow_quantile_histmaker instead.)"
          " Grow tree using quantized histogram.")
.set_body(
    []() {
      LOG(WARNING) << "grow_fast_histmaker is deprecated, "
                   << "use grow_quantile_histmaker instead.";
      return new QuantileHistMaker();
    });

XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker")
.describe("Grow tree using quantized histogram.")
.set_body(
    []() {
      return new QuantileHistMaker();
    });

}  // namespace tree
}  // namespace xgboost
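Both registered names resolve to the same updater; the old name only adds a deprecation warning. As a minimal sketch (assuming nothing beyond the factory calls already used in this commit, with illustrative parameter values), a registered updater is obtained and configured like so:

  // sketch: same factory pattern as TreeUpdater::Create("prune") above
  std::unique_ptr<TreeUpdater> updater(
      TreeUpdater::Create("grow_quantile_histmaker"));
  updater->Init({{"max_depth", "6"}, {"max_bin", "256"}});  // illustrative values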
src/tree/updater_quantile_hist.h (new file, 238 lines)
@ -0,0 +1,238 @@
/*!
 * Copyright 2017-2018 by Contributors
 * \file updater_quantile_hist.h
 * \brief use quantized feature values to construct a tree
 * \author Philip Cho, Tianqi Chen
 */
#ifndef XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
#define XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_

#include <rabit/rabit.h>
#include <xgboost/tree_updater.h>

#include <memory>
#include <vector>
#include <string>
#include <queue>
#include <utility>

#include "./param.h"
#include "./split_evaluator.h"
#include "../common/random.h"
#include "../common/hist_util.h"
#include "../common/row_set.h"
#include "../common/column_matrix.h"

namespace xgboost {
namespace tree {

using xgboost::common::HistCutMatrix;
using xgboost::common::GHistIndexMatrix;
using xgboost::common::GHistIndexBlockMatrix;
using xgboost::common::GHistIndexRow;
using xgboost::common::GHistEntry;
using xgboost::common::HistCollection;
using xgboost::common::RowSetCollection;
using xgboost::common::GHistRow;
using xgboost::common::GHistBuilder;
using xgboost::common::ColumnMatrix;
using xgboost::common::Column;

/*! \brief construct a tree using quantized feature values */
class QuantileHistMaker: public TreeUpdater {
 public:
  void Init(const std::vector<std::pair<std::string, std::string> >& args) override;

  void Update(HostDeviceVector<GradientPair>* gpair,
              DMatrix* dmat,
              const std::vector<RegTree*>& trees) override;

  bool UpdatePredictionCache(const DMatrix* data,
                             HostDeviceVector<bst_float>* out_preds) override;

 protected:
  // training parameter
  TrainParam param_;
  // quantized data matrix
  GHistIndexMatrix gmat_;
  // (optional) data matrix with feature grouping
  GHistIndexBlockMatrix gmatb_;
  // column accessor
  ColumnMatrix column_matrix_;
  bool is_gmat_initialized_;

  // data structure
  struct NodeEntry {
    /*! \brief statistics for node entry */
    GradStats stats;
    /*! \brief loss of this node, without split */
    bst_float root_gain;
    /*! \brief weight calculated related to current data */
    float weight;
    /*! \brief current best solution */
    SplitEntry best;
    // constructor
    explicit NodeEntry(const TrainParam& param)
        : stats(param), root_gain(0.0f), weight(0.0f) {
    }
  };
  // actual builder that runs the algorithm

  struct Builder {
   public:
    // constructor
    explicit Builder(const TrainParam& param,
                     std::unique_ptr<TreeUpdater> pruner,
                     std::unique_ptr<SplitEvaluator> spliteval)
        : param_(param), pruner_(std::move(pruner)),
          spliteval_(std::move(spliteval)), p_last_tree_(nullptr),
          p_last_fmat_(nullptr) {}
    // update one tree, growing
    virtual void Update(const GHistIndexMatrix& gmat,
                        const GHistIndexBlockMatrix& gmatb,
                        const ColumnMatrix& column_matrix,
                        HostDeviceVector<GradientPair>* gpair,
                        DMatrix* p_fmat,
                        RegTree* p_tree);

    inline void BuildHist(const std::vector<GradientPair>& gpair,
                          const RowSetCollection::Elem row_indices,
                          const GHistIndexMatrix& gmat,
                          const GHistIndexBlockMatrix& gmatb,
                          GHistRow hist) {
      if (param_.enable_feature_grouping > 0) {
        hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, hist);
      } else {
        hist_builder_.BuildHist(gpair, row_indices, gmat, hist);
      }
    }

    inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
      hist_builder_.SubtractionTrick(self, sibling, parent);
    }

    bool UpdatePredictionCache(const DMatrix* data,
                               HostDeviceVector<bst_float>* p_out_preds);

   protected:
    // initialize temp data structure
    void InitData(const GHistIndexMatrix& gmat,
                  const std::vector<GradientPair>& gpair,
                  const DMatrix& fmat,
                  const RegTree& tree);

    void EvaluateSplit(int nid,
                       const GHistIndexMatrix& gmat,
                       const HistCollection& hist,
                       const DMatrix& fmat,
                       const RegTree& tree);

    void ApplySplit(int nid,
                    const GHistIndexMatrix& gmat,
                    const ColumnMatrix& column_matrix,
                    const HistCollection& hist,
                    const DMatrix& fmat,
                    RegTree* p_tree);

    void ApplySplitDenseData(const RowSetCollection::Elem rowset,
                             const GHistIndexMatrix& gmat,
                             std::vector<RowSetCollection::Split>* p_row_split_tloc,
                             const Column& column,
                             bst_int split_cond,
                             bool default_left);

    void ApplySplitSparseData(const RowSetCollection::Elem rowset,
                              const GHistIndexMatrix& gmat,
                              std::vector<RowSetCollection::Split>* p_row_split_tloc,
                              const Column& column,
                              bst_uint lower_bound,
                              bst_uint upper_bound,
                              bst_int split_cond,
                              bool default_left);

    void InitNewNode(int nid,
                     const GHistIndexMatrix& gmat,
                     const std::vector<GradientPair>& gpair,
                     const DMatrix& fmat,
                     const RegTree& tree);

    // enumerate the split values of specific feature
    void EnumerateSplit(int d_step,
                        const GHistIndexMatrix& gmat,
                        const GHistRow& hist,
                        const NodeEntry& snode,
                        const MetaInfo& info,
                        SplitEntry* p_best,
                        bst_uint fid,
                        bst_uint nodeID);

    /* tree growing policies */
    struct ExpandEntry {
      int nid;
      int depth;
      bst_float loss_chg;
      unsigned timestamp;
      ExpandEntry(int nid, int depth, bst_float loss_chg, unsigned tstmp)
          : nid(nid), depth(depth), loss_chg(loss_chg), timestamp(tstmp) {}
    };
    inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) {
      if (lhs.depth == rhs.depth) {
        return lhs.timestamp > rhs.timestamp;  // favor small timestamp
      } else {
        return lhs.depth > rhs.depth;  // favor small depth
      }
    }
    inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
      if (lhs.loss_chg == rhs.loss_chg) {
        return lhs.timestamp > rhs.timestamp;  // favor small timestamp
      } else {
        return lhs.loss_chg < rhs.loss_chg;  // favor large loss_chg
      }
    }
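    // (added note: these comparators feed std::priority_queue, whose top() is
    //  the greatest element under the comparator; DepthWise therefore expands
    //  the shallowest (then oldest) node first, LossGuide the node with the
    //  largest loss reduction)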

    // --data fields--
    const TrainParam& param_;
    // number of omp thread used during training
    int nthread_;
    common::ColumnSampler column_sampler_;
    // the internal row sets
    RowSetCollection row_set_collection_;
    // the temp space for split
    std::vector<RowSetCollection::Split> row_split_tloc_;
    std::vector<SplitEntry> best_split_tloc_;
    /*! \brief TreeNode Data: statistics for each constructed node */
    std::vector<NodeEntry> snode_;
    /*! \brief cumulative histogram of gradients. */
    HistCollection hist_;
    /*! \brief feature with least # of bins. to be used for dense specialization
               of InitNewNode() */
    uint32_t fid_least_bins_;
    /*! \brief local prediction cache; maps node id to leaf value */
    std::vector<float> leaf_value_cache_;

    GHistBuilder hist_builder_;
    std::unique_ptr<TreeUpdater> pruner_;
    std::unique_ptr<SplitEvaluator> spliteval_;

    // back pointers to tree and data matrix
    const RegTree* p_last_tree_;
    const DMatrix* p_last_fmat_;

    using ExpandQueue =
        std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
                            std::function<bool(ExpandEntry, ExpandEntry)>>;
    std::unique_ptr<ExpandQueue> qexpand_;

    enum DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData };
    DataLayout data_layout_;
  };

  std::unique_ptr<Builder> builder_;
  std::unique_ptr<TreeUpdater> pruner_;
  std::unique_ptr<SplitEvaluator> spliteval_;
};

}  // namespace tree
}  // namespace xgboost

#endif  // XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
@ -4,12 +4,13 @@
  * \brief refresh the statistics and leaf value on the tree on the dataset
  * \author Tianqi Chen
  */
+#include <rabit/rabit.h>
 #include <xgboost/tree_updater.h>

 #include <vector>
 #include <limits>

 #include "./param.h"
-#include "../common/sync.h"
 #include "../common/io.h"

 namespace xgboost {
@ -5,12 +5,12 @@
    a refresh is needed to make the statistics exactly correct
  * \author Tianqi Chen
  */
+#include <rabit/rabit.h>
 #include <xgboost/base.h>
 #include <xgboost/tree_updater.h>
 #include <vector>
 #include <algorithm>
-#include "../common/sync.h"
 #include "../common/quantile.h"
 #include "../common/group_data.h"
 #include "./updater_basemaker-inl.h"
@ -7,7 +7,6 @@
 #include <vector>
 #include <string>
 #include <limits>
-#include "../common/sync.h"
 #include "../common/io.h"

 namespace xgboost {
@ -32,7 +32,7 @@ TEST(learner, SelectTreeMethod) {
             "grow_colmaker,prune");
   learner->Configure({arg("tree_method", "hist")});
   ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
-            "grow_fast_histmaker");
+            "grow_quantile_histmaker");
 #ifdef XGBOOST_USE_CUDA
   learner->Configure({arg("tree_method", "gpu_exact")});
   ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
@ -328,8 +328,8 @@ TEST(GpuHist, ApplySplit) {
   shard->row_stride = n_cols;
   thrust::sequence(shard->ridx.CurrentDVec().tbegin(),
                    shard->ridx.CurrentDVec().tend());
+  // Free inside DeviceShard
   dh::safe_cuda(cudaMallocHost(&(shard->tmp_pinned), sizeof(int64_t)));

   // Initialize GPUHistMaker
   hist_maker.param_ = param;
   RegTree tree;
@ -390,15 +390,5 @@ TEST(GpuHist, ApplySplit) {
   ASSERT_EQ(shard->ridx_segments[right_nidx].end, 16);
 }

-TEST(GpuHist, MGPU_mock) {
-  // Attempt to choose multiple GPU devices
-  int ngpu;
-  dh::safe_cuda(cudaGetDeviceCount(&ngpu));
-  CHECK_GT(ngpu, 1);
-  for (int i = 0; i < ngpu; ++i) {
-    dh::safe_cuda(cudaSetDevice(i));
-  }
-}
-
 }  // namespace tree
 }  // namespace xgboost
@ -1,13 +1,13 @@
 // Copyright by Contributors
 #include "../../../src/tree/param.h"
-
 #include "../helpers.h"
+#include <gtest/gtest.h>

 TEST(Param, VectorIOStream) {
   std::vector<int> vals = {3, 2, 1};
   std::stringstream ss;
   std::vector<int> vals_in;

   ss << vals;
   EXPECT_EQ(ss.str(), "(3,2,1)");
tests/cpp/tree/test_prune.cc (new file, 72 lines)
@ -0,0 +1,72 @@
/*!
 * Copyright 2018 by Contributors
 */
#include "../helpers.h"
#include "../../../src/common/host_device_vector.h"
#include <xgboost/tree_updater.h>
#include <gtest/gtest.h>
#include <vector>
#include <string>
#include <memory>

namespace xgboost {
namespace tree {

TEST(Updater, Prune) {
  int constexpr n_rows = 32, n_cols = 16;

  std::vector<std::pair<std::string, std::string>> cfg;
  cfg.push_back(std::pair<std::string, std::string>(
      "num_feature", std::to_string(n_cols)));
  cfg.push_back(std::pair<std::string, std::string>(
      "min_split_loss", "10"));
  cfg.push_back(std::pair<std::string, std::string>(
      "silent", "1"));

  // These data are just placeholders.
  HostDeviceVector<GradientPair> gpair =
      { {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f},
        {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f} };
  auto dmat = CreateDMatrix(n_rows, n_cols, 0.4, 3);

  // prepare tree
  RegTree tree = RegTree();
  tree.InitModel();
  tree.param.InitAllowUnknown(cfg);
  std::vector<RegTree*> trees {&tree};
  // prepare pruner
  std::unique_ptr<TreeUpdater> pruner(TreeUpdater::Create("prune"));
  pruner->Init(cfg);
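  // (added note: prune removes a split whose loss_chg is strictly below
  //  min_split_loss, turning the node back into a leaf; NumExtraNodes()
  //  counts nodes other than the root)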

  // loss_chg < min_split_loss;
  tree.AddChilds(0);
  int cleft = tree[0].LeftChild();
  int cright = tree[0].RightChild();
  tree[cleft].SetLeaf(0.3f, 0);
  tree[cright].SetLeaf(0.4f, 0);
  pruner->Update(&gpair, dmat->get(), trees);

  ASSERT_EQ(tree.NumExtraNodes(), 0);

  // loss_chg > min_split_loss;
  tree.AddChilds(0);
  cleft = tree[0].LeftChild();
  cright = tree[0].RightChild();
  tree[cleft].SetLeaf(0.3f, 0);
  tree[cright].SetLeaf(0.4f, 0);
  tree.Stat(0).loss_chg = 11;
  pruner->Update(&gpair, dmat->get(), trees);

  ASSERT_EQ(tree.NumExtraNodes(), 2);

  // loss_chg == min_split_loss;
  tree.Stat(0).loss_chg = 10;
  pruner->Update(&gpair, dmat->get(), trees);

  ASSERT_EQ(tree.NumExtraNodes(), 2);

  delete dmat;
}

}  // namespace tree
}  // namespace xgboost
tests/cpp/tree/test_quantile_hist.cc (new file, 181 lines)
@ -0,0 +1,181 @@
/*!
 * Copyright 2018 by Contributors
 */
#include "../helpers.h"
#include "../../../src/tree/param.h"
#include "../../../src/tree/updater_quantile_hist.h"
#include "../../../src/common/host_device_vector.h"

#include <xgboost/tree_updater.h>
#include <gtest/gtest.h>

#include <vector>
#include <string>

namespace xgboost {
namespace tree {

class QuantileHistMock : public QuantileHistMaker {
  static double constexpr kEps = 1e-6;

  struct BuilderMock : public QuantileHistMaker::Builder {
    using RealImpl = QuantileHistMaker::Builder;

    BuilderMock(const TrainParam& param,
                std::unique_ptr<TreeUpdater> pruner,
                std::unique_ptr<SplitEvaluator> spliteval)
        : RealImpl(param, std::move(pruner), std::move(spliteval)) {}

   public:
    void TestInitData(const GHistIndexMatrix& gmat,
                      const std::vector<GradientPair>& gpair,
                      const DMatrix& fmat,
                      const RegTree& tree) {
      RealImpl::InitData(gmat, gpair, fmat, tree);
      ASSERT_EQ(data_layout_, kSparseData);
    }

    void TestBuildHist(int nid,
                       const GHistIndexMatrix& gmat,
                       const DMatrix& fmat,
                       const RegTree& tree) {
      std::vector<GradientPair> gpair =
          { {0.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {0.27f, 0.28f},
            {0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f} };
      RealImpl::InitData(gmat, gpair, fmat, tree);
      GHistIndexBlockMatrix quantile_index_block;
      hist_.AddHistRow(nid);
      BuildHist(gpair, row_set_collection_[nid],
                gmat, quantile_index_block, hist_[nid]);
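      // (added note: each entry of the expected histogram below is the
      //  (sum_grad, sum_hess) over all rows whose quantized feature value
      //  falls into that bin)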
      std::vector<GradientPairPrecise> solution {
        {0.27, 0.29}, {0.27, 0.29}, {0.47, 0.49},
        {0.27, 0.29}, {0.57, 0.59}, {0.26, 0.27},
        {0.37, 0.39}, {0.23, 0.24}, {0.37, 0.39},
        {0.27, 0.28}, {0.27, 0.29}, {0.37, 0.39},
        {0.26, 0.27}, {0.23, 0.24}, {0.57, 0.59},
        {0.47, 0.49}, {0.47, 0.49}, {0.37, 0.39},
        {0.26, 0.27}, {0.23, 0.24}, {0.27, 0.28},
        {0.57, 0.59}, {0.23, 0.24}, {0.47, 0.49}};

      for (size_t i = 0; i < hist_[nid].size; ++i) {
        GradientPairPrecise sol = solution[i];
        ASSERT_NEAR(sol.GetGrad(), hist_[nid].begin[i].sum_grad, kEps);
        ASSERT_NEAR(sol.GetHess(), hist_[nid].begin[i].sum_hess, kEps);
      }
    }

    void TestEvaluateSplit(const GHistIndexBlockMatrix& quantile_index_block,
                           const RegTree& tree) {
      std::vector<GradientPair> row_gpairs =
          { {0.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {0.27f, 0.28f},
            {0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f} };
      size_t constexpr max_bins = 4;
      auto dmat = CreateDMatrix(n_rows, n_cols, 0, 3);  // dense

      common::GHistIndexMatrix gmat;
      gmat.Init((*dmat).get(), max_bins);

      RealImpl::InitData(gmat, row_gpairs, *(*dmat), tree);
      hist_.AddHistRow(0);

      BuildHist(row_gpairs, row_set_collection_[0],
                gmat, quantile_index_block, hist_[0]);

      RealImpl::InitNewNode(0, gmat, row_gpairs, *(*dmat), tree);
      // Manipulate the root_gain so that I don't have to invent an actual
      // split. Yes, I'm cheating.
      snode_[0].root_gain = 0.8;
      RealImpl::EvaluateSplit(0, gmat, hist_, *(*dmat), tree);

      ASSERT_NEAR(snode_.at(0).best.loss_chg, 0.7128048, kEps);
      ASSERT_EQ(snode_.at(0).best.SplitIndex(), 10);
      ASSERT_NEAR(snode_.at(0).best.split_value, 0.182258, kEps);

      delete dmat;
    }
  };

  int static constexpr n_rows = 8, n_cols = 16;
  std::shared_ptr<xgboost::DMatrix> *dmat;
  const std::vector<std::pair<std::string, std::string> > cfg;
  std::shared_ptr<BuilderMock> builder_;

 public:
  explicit QuantileHistMock(
      const std::vector<std::pair<std::string, std::string> >& args) :
      cfg{args} {
    QuantileHistMaker::Init(args);
    builder_.reset(
        new BuilderMock(
            param_,
            std::move(pruner_),
            std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone())));
    dmat = CreateDMatrix(n_rows, n_cols, 0.8, 3);
  }
  ~QuantileHistMock() { delete dmat; }

  static size_t GetNumColumns() { return n_cols; }

  void TestInitData() {
    size_t constexpr max_bins = 4;
    common::GHistIndexMatrix gmat;
    gmat.Init((*dmat).get(), max_bins);

    RegTree tree = RegTree();
    tree.InitModel();
    tree.param.InitAllowUnknown(cfg);

    std::vector<GradientPair> gpair =
        { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
          {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} };

    builder_->TestInitData(gmat, gpair, *(*dmat), tree);
  }

  void TestBuildHist() {
    RegTree tree = RegTree();
    tree.InitModel();
    tree.param.InitAllowUnknown(cfg);

    size_t constexpr max_bins = 4;
    common::GHistIndexMatrix gmat;
    gmat.Init((*dmat).get(), max_bins);

    builder_->TestBuildHist(0, gmat, *(*dmat).get(), tree);
  }

  void TestEvaluateSplit() {
    RegTree tree = RegTree();
    tree.InitModel();
    tree.param.InitAllowUnknown(cfg);

    builder_->TestEvaluateSplit(gmatb_, tree);
  }
};

TEST(Updater, QuantileHist_InitData) {
  std::vector<std::pair<std::string, std::string>> cfg
      {{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())}};
  QuantileHistMock maker(cfg);
  maker.TestInitData();
}

TEST(Updater, QuantileHist_BuildHist) {
  // Don't enable feature grouping
  std::vector<std::pair<std::string, std::string>> cfg
      {{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())},
       {"enable_feature_grouping", std::to_string(0)}};
  QuantileHistMock maker(cfg);
  maker.TestBuildHist();
}

TEST(Updater, QuantileHist_EvalSplits) {
  std::vector<std::pair<std::string, std::string>> cfg
      {{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())},
       {"split_evaluator", "elastic_net"}};
  QuantileHistMock maker(cfg);
  maker.TestEvaluateSplit();
}

}  // namespace tree
}  // namespace xgboost
tests/cpp/tree/test_refresh.cc (new file, 57 lines)
@ -0,0 +1,57 @@
/*!
 * Copyright 2018 by Contributors
 */
#include "../helpers.h"
#include "../../../src/common/host_device_vector.h"
#include <xgboost/tree_updater.h>
#include <gtest/gtest.h>
#include <vector>
#include <string>
#include <memory>

namespace xgboost {
namespace tree {

TEST(Updater, Refresh) {
  int constexpr n_rows = 8, n_cols = 16;

  HostDeviceVector<GradientPair> gpair =
      { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
        {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} };
  auto dmat = CreateDMatrix(n_rows, n_cols, 0.4, 3);
  std::vector<std::pair<std::string, std::string>> cfg {
      {"reg_alpha", "0.0"},
      {"num_feature", std::to_string(n_cols)},
      {"reg_lambda", "1"}};

  RegTree tree = RegTree();
  tree.InitModel();
  tree.param.InitAllowUnknown(cfg);
  std::vector<RegTree*> trees {&tree};
  std::unique_ptr<TreeUpdater> refresher(TreeUpdater::Create("refresh"));

  tree.AddChilds(0);
  int cleft = tree[0].LeftChild();
  int cright = tree[0].RightChild();
  tree[cleft].SetLeaf(0.2f, 0);
  tree[cright].SetLeaf(0.8f, 0);
  tree[0].SetSplit(2, 0.2f);

  tree.Stat(cleft).base_weight = 1.2;
  tree.Stat(cright).base_weight = 1.3;

  refresher->Init(cfg);
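  // (added note: refresh keeps the tree structure fixed and recomputes node
  //  statistics and leaf values from the supplied gradient pairs)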
  refresher->Update(&gpair, dmat->get(), trees);

  bst_float constexpr kEps = 1e-6;
  ASSERT_NEAR(-0.183392, tree[cright].LeafValue(), kEps);
  ASSERT_NEAR(-0.224489, tree.Stat(0).loss_chg, kEps);
  ASSERT_NEAR(0, tree.Stat(cleft).loss_chg, kEps);
  ASSERT_NEAR(0, tree.Stat(1).loss_chg, kEps);
  ASSERT_NEAR(0, tree.Stat(2).loss_chg, kEps);

  delete dmat;
}

}  // namespace tree
}  // namespace xgboost