Clang-tidy static analysis (#3222)
* Clang-tidy static analysis * Modernise checks * Google coding standard checks * Identifier renaming according to Google style
This commit is contained in:
@@ -13,7 +13,7 @@ namespace tree {
|
||||
/*! \brief training parameters for histogram-based training */
|
||||
struct FastHistParam : public dmlc::Parameter<FastHistParam> {
|
||||
// integral data type to be used with columnar data storage
|
||||
enum class DataType { uint8 = 1, uint16 = 2, uint32 = 4 };
|
||||
enum class DataType { uint8 = 1, uint16 = 2, uint32 = 4 }; // NOLINT
|
||||
int colmat_dtype;
|
||||
// percentage threshold for treating a feature as sparse
|
||||
// e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
|
||||
|
||||
102
src/tree/param.h
102
src/tree/param.h
@@ -190,26 +190,26 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
|
||||
DMLC_DECLARE_ALIAS(learning_rate, eta);
|
||||
}
|
||||
/*! \brief whether a forward (small-to-big) search is needed: default right */
|
||||
inline bool need_forward_search(float col_density, bool indicator) const {
|
||||
inline bool NeedForwardSearch(float col_density, bool indicator) const {
|
||||
return this->default_direction == 2 ||
|
||||
(default_direction == 0 && (col_density < opt_dense_col) &&
|
||||
!indicator);
|
||||
}
|
||||
/*! \brief whether a backward (big-to-small) search is needed: default left */
|
||||
inline bool need_backward_search(float col_density, bool indicator) const {
|
||||
inline bool NeedBackwardSearch(float col_density, bool indicator) const {
|
||||
return this->default_direction != 2;
|
||||
}
|
||||
/*! \brief given the loss change, whether we need to invoke pruning */
|
||||
inline bool need_prune(double loss_chg, int depth) const {
|
||||
inline bool NeedPrune(double loss_chg, int depth) const {
|
||||
return loss_chg < this->min_split_loss;
|
||||
}
|
||||
/*! \brief whether the node cannot be split given the current hessian sum */
|
||||
inline bool cannot_split(double sum_hess, int depth) const {
|
||||
inline bool CannotSplit(double sum_hess, int depth) const {
|
||||
return sum_hess < this->min_child_weight * 2.0;
|
||||
}
|
||||
/*! \brief maximum sketch size */
|
||||
inline unsigned max_sketch_size() const {
|
||||
unsigned ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
|
||||
inline unsigned MaxSketchSize() const {
|
||||
auto ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
|
||||
CHECK_GT(ret, 0U);
|
||||
return ret;
|
||||
}
|
||||
@@ -220,10 +220,12 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
|
||||
// functions for L1 cost
|
||||
template <typename T1, typename T2>
|
||||
XGBOOST_DEVICE inline static T1 ThresholdL1(T1 w, T2 lambda) {
|
||||
if (w > +lambda)
|
||||
if (w > +lambda) {
|
||||
return w - lambda;
|
||||
if (w < -lambda)
|
||||
}
|
||||
if (w < -lambda) {
|
||||
return w + lambda;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
@@ -240,8 +242,9 @@ XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad,
|
||||
// calculate the cost of loss function
|
||||
template <typename TrainingParams, typename T>
|
||||
XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) {
|
||||
if (sum_hess < p.min_child_weight)
|
||||
if (sum_hess < p.min_child_weight) {
|
||||
return T(0.0);
|
||||
}
|
||||
if (p.max_delta_step == 0.0f) {
|
||||
if (p.reg_alpha == 0.0f) {
|
||||
return Sqr(sum_grad) / (sum_hess + p.reg_lambda);
|
||||
@@ -276,8 +279,9 @@ XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess
|
||||
template <typename TrainingParams, typename T>
|
||||
XGBOOST_DEVICE inline T CalcWeight(const TrainingParams &p, T sum_grad,
|
||||
T sum_hess) {
|
||||
if (sum_hess < p.min_child_weight)
|
||||
if (sum_hess < p.min_child_weight) {
|
||||
return 0.0;
|
||||
}
|
||||
T dw;
|
||||
if (p.reg_alpha == 0.0f) {
|
||||
dw = -sum_grad / (sum_hess + p.reg_lambda);
|
||||
@@ -285,16 +289,18 @@ XGBOOST_DEVICE inline T CalcWeight(const TrainingParams &p, T sum_grad,
|
||||
dw = -ThresholdL1(sum_grad, p.reg_alpha) / (sum_hess + p.reg_lambda);
|
||||
}
|
||||
if (p.max_delta_step != 0.0f) {
|
||||
if (dw > p.max_delta_step)
|
||||
if (dw > p.max_delta_step) {
|
||||
dw = p.max_delta_step;
|
||||
if (dw < -p.max_delta_step)
|
||||
}
|
||||
if (dw < -p.max_delta_step) {
|
||||
dw = -p.max_delta_step;
|
||||
}
|
||||
}
|
||||
return dw;
|
||||
}
|
||||
|
||||
template <typename TrainingParams, typename gpair_t>
|
||||
XGBOOST_DEVICE inline float CalcWeight(const TrainingParams &p, gpair_t sum_grad) {
|
||||
template <typename TrainingParams, typename GpairT>
|
||||
XGBOOST_DEVICE inline float CalcWeight(const TrainingParams &p, GpairT sum_grad) {
|
||||
return CalcWeight(p, sum_grad.GetGrad(), sum_grad.GetHess());
|
||||
}
|
||||
|
||||
@@ -312,8 +318,8 @@ struct XGBOOST_ALIGNAS(16) GradStats {
|
||||
/*! \brief constructor, the object must be cleared during construction */
|
||||
explicit GradStats(const TrainParam& param) { this->Clear(); }
|
||||
|
||||
template <typename gpair_t>
|
||||
XGBOOST_DEVICE explicit GradStats(const gpair_t &sum)
|
||||
template <typename GpairT>
|
||||
XGBOOST_DEVICE explicit GradStats(const GpairT &sum)
|
||||
: sum_grad(sum.GetGrad()), sum_hess(sum.GetHess()) {}
|
||||
/*! \brief clear the statistics */
|
||||
inline void Clear() { sum_grad = sum_hess = 0.0f; }
|
||||
@@ -323,26 +329,26 @@ struct XGBOOST_ALIGNAS(16) GradStats {
|
||||
* \brief accumulate statistics
|
||||
* \param p the gradient pair
|
||||
*/
|
||||
inline void Add(bst_gpair p) { this->Add(p.GetGrad(), p.GetHess()); }
|
||||
inline void Add(GradientPair p) { this->Add(p.GetGrad(), p.GetHess()); }
|
||||
/*!
|
||||
* \brief accumulate statistics, more complicated version
|
||||
* \param gpair the vector storing the gradient statistics
|
||||
* \param info the additional information
|
||||
* \param ridx instance index of this instance
|
||||
*/
|
||||
inline void Add(const std::vector<bst_gpair>& gpair, const MetaInfo& info,
|
||||
inline void Add(const std::vector<GradientPair>& gpair, const MetaInfo& info,
|
||||
bst_uint ridx) {
|
||||
const bst_gpair& b = gpair[ridx];
|
||||
const GradientPair& b = gpair[ridx];
|
||||
this->Add(b.GetGrad(), b.GetHess());
|
||||
}
|
||||
/*! \brief calculate leaf weight */
|
||||
template <typename param_t>
|
||||
XGBOOST_DEVICE inline double CalcWeight(const param_t ¶m) const {
|
||||
template <typename ParamT>
|
||||
XGBOOST_DEVICE inline double CalcWeight(const ParamT ¶m) const {
|
||||
return xgboost::tree::CalcWeight(param, sum_grad, sum_hess);
|
||||
}
|
||||
/*! \brief calculate gain of the solution */
|
||||
template <typename param_t>
|
||||
inline double CalcGain(const param_t& param) const {
|
||||
template <typename ParamT>
|
||||
inline double CalcGain(const ParamT& param) const {
|
||||
return xgboost::tree::CalcGain(param, sum_grad, sum_hess);
|
||||
}
|
||||
/*! \brief add statistics to the data */
|
||||
@@ -364,7 +370,7 @@ template <typename param_t>
|
||||
/*! \brief set leaf vector value based on statistics */
|
||||
inline void SetLeafVec(const TrainParam& param, bst_float* vec) const {}
|
||||
// constructor to allow inheritance
|
||||
GradStats() {}
|
||||
GradStats() = default;
|
||||
/*! \brief add statistics to the data */
|
||||
inline void Add(double grad, double hess) {
|
||||
sum_grad += grad;
|
||||
@@ -400,8 +406,8 @@ struct ValueConstraint {
|
||||
inline static void Init(TrainParam *param, unsigned num_feature) {
|
||||
param->monotone_constraints.resize(num_feature, 0);
|
||||
}
|
||||
template <typename param_t>
|
||||
XGBOOST_DEVICE inline double CalcWeight(const param_t ¶m, GradStats stats) const {
|
||||
template <typename ParamT>
|
||||
XGBOOST_DEVICE inline double CalcWeight(const ParamT ¶m, GradStats stats) const {
|
||||
double w = stats.CalcWeight(param);
|
||||
if (w < lower_bound) {
|
||||
return lower_bound;
|
||||
@@ -412,14 +418,14 @@ template <typename param_t>
|
||||
return w;
|
||||
}
|
||||
|
||||
template <typename param_t>
|
||||
XGBOOST_DEVICE inline double CalcGain(const param_t ¶m, GradStats stats) const {
|
||||
template <typename ParamT>
|
||||
XGBOOST_DEVICE inline double CalcGain(const ParamT ¶m, GradStats stats) const {
|
||||
return CalcGainGivenWeight(param, stats.sum_grad, stats.sum_hess,
|
||||
CalcWeight(param, stats));
|
||||
}
|
||||
|
||||
template <typename param_t>
|
||||
XGBOOST_DEVICE inline double CalcSplitGain(const param_t ¶m, int constraint,
|
||||
template <typename ParamT>
|
||||
XGBOOST_DEVICE inline double CalcSplitGain(const ParamT ¶m, int constraint,
|
||||
GradStats left, GradStats right) const {
|
||||
const double negative_infinity = -std::numeric_limits<double>::infinity();
|
||||
double wleft = CalcWeight(param, left);
|
||||
@@ -442,8 +448,9 @@ template <typename param_t>
|
||||
int c = param.monotone_constraints.at(split_index);
|
||||
*cleft = *this;
|
||||
*cright = *this;
|
||||
if (c == 0)
|
||||
if (c == 0) {
|
||||
return;
|
||||
}
|
||||
double wleft = CalcWeight(param, left);
|
||||
double wright = CalcWeight(param, right);
|
||||
double mid = (wleft + wright) / 2;
|
||||
@@ -464,13 +471,13 @@ template <typename param_t>
|
||||
*/
|
||||
struct SplitEntry {
|
||||
/*! \brief loss change after split this node */
|
||||
bst_float loss_chg;
|
||||
bst_float loss_chg{0.0f};
|
||||
/*! \brief split index */
|
||||
unsigned sindex;
|
||||
unsigned sindex{0};
|
||||
/*! \brief split value */
|
||||
bst_float split_value;
|
||||
bst_float split_value{0.0f};
|
||||
/*! \brief constructor */
|
||||
SplitEntry() : loss_chg(0.0f), sindex(0), split_value(0.0f) {}
|
||||
SplitEntry() = default;
|
||||
/*!
|
||||
* \brief decides whether we can replace current entry with the given
|
||||
* statistics
|
||||
@@ -482,7 +489,7 @@ struct SplitEntry {
|
||||
* \param split_index the feature index where the split is on
|
||||
*/
|
||||
inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
|
||||
if (this->split_index() <= split_index) {
|
||||
if (this->SplitIndex() <= split_index) {
|
||||
return new_loss_chg > this->loss_chg;
|
||||
} else {
|
||||
return !(this->loss_chg > new_loss_chg);
|
||||
@@ -494,7 +501,7 @@ struct SplitEntry {
|
||||
* \return whether the proposed split is better and can replace current split
|
||||
*/
|
||||
inline bool Update(const SplitEntry &e) {
|
||||
if (this->NeedReplace(e.loss_chg, e.split_index())) {
|
||||
if (this->NeedReplace(e.loss_chg, e.SplitIndex())) {
|
||||
this->loss_chg = e.loss_chg;
|
||||
this->sindex = e.sindex;
|
||||
this->split_value = e.split_value;
|
||||
@@ -515,8 +522,9 @@ struct SplitEntry {
|
||||
bst_float new_split_value, bool default_left) {
|
||||
if (this->NeedReplace(new_loss_chg, split_index)) {
|
||||
this->loss_chg = new_loss_chg;
|
||||
if (default_left)
|
||||
if (default_left) {
|
||||
split_index |= (1U << 31);
|
||||
}
|
||||
this->sindex = split_index;
|
||||
this->split_value = new_split_value;
|
||||
return true;
|
||||
@@ -530,9 +538,9 @@ struct SplitEntry {
|
||||
dst.Update(src);
|
||||
}
|
||||
/*!\return feature index to split on */
|
||||
inline unsigned split_index() const { return sindex & ((1U << 31) - 1U); }
|
||||
inline unsigned SplitIndex() const { return sindex & ((1U << 31) - 1U); }
|
||||
/*!\return whether missing value goes to left branch */
|
||||
inline bool default_left() const { return (sindex >> 31) != 0; }
|
||||
inline bool DefaultLeft() const { return (sindex >> 31) != 0; }
|
||||
};
|
||||
|
||||
} // namespace tree
|
||||
@@ -542,14 +550,16 @@ struct SplitEntry {
|
||||
namespace std {
|
||||
inline std::ostream &operator<<(std::ostream &os, const std::vector<int> &t) {
|
||||
os << '(';
|
||||
for (std::vector<int>::const_iterator it = t.begin(); it != t.end(); ++it) {
|
||||
if (it != t.begin())
|
||||
for (auto it = t.begin(); it != t.end(); ++it) {
|
||||
if (it != t.begin()) {
|
||||
os << ',';
|
||||
}
|
||||
os << *it;
|
||||
}
|
||||
// python style tuple
|
||||
if (t.size() == 1)
|
||||
if (t.size() == 1) {
|
||||
os << ',';
|
||||
}
|
||||
os << ')';
|
||||
return os;
|
||||
}
|
||||
@@ -566,8 +576,9 @@ inline std::istream &operator>>(std::istream &is, std::vector<int> &t) {
|
||||
return is;
|
||||
}
|
||||
is.get();
|
||||
if (ch == '(')
|
||||
if (ch == '(') {
|
||||
break;
|
||||
}
|
||||
if (!isspace(ch)) {
|
||||
is.setstate(std::ios::failbit);
|
||||
return is;
|
||||
@@ -597,8 +608,9 @@ inline std::istream &operator>>(std::istream &is, std::vector<int> &t) {
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (ch == ')')
|
||||
if (ch == ')') {
|
||||
break;
|
||||
}
|
||||
} else if (ch == ')') {
|
||||
break;
|
||||
} else {
|
||||
|
||||
@@ -21,45 +21,53 @@ void DumpRegTree(std::stringstream& fo, // NOLINT(*)
|
||||
int nid, int depth, int add_comma,
|
||||
bool with_stats, std::string format) {
|
||||
if (format == "json") {
|
||||
if (add_comma) fo << ",";
|
||||
if (depth != 0) fo << std::endl;
|
||||
for (int i = 0; i < depth+1; ++i) fo << " ";
|
||||
if (add_comma) {
|
||||
fo << ",";
|
||||
}
|
||||
if (depth != 0) {
|
||||
fo << std::endl;
|
||||
}
|
||||
for (int i = 0; i < depth + 1; ++i) {
|
||||
fo << " ";
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < depth; ++i) fo << '\t';
|
||||
for (int i = 0; i < depth; ++i) {
|
||||
fo << '\t';
|
||||
}
|
||||
}
|
||||
if (tree[nid].is_leaf()) {
|
||||
if (tree[nid].IsLeaf()) {
|
||||
if (format == "json") {
|
||||
fo << "{ \"nodeid\": " << nid
|
||||
<< ", \"leaf\": " << tree[nid].leaf_value();
|
||||
<< ", \"leaf\": " << tree[nid].LeafValue();
|
||||
if (with_stats) {
|
||||
fo << ", \"cover\": " << tree.stat(nid).sum_hess;
|
||||
fo << ", \"cover\": " << tree.Stat(nid).sum_hess;
|
||||
}
|
||||
fo << " }";
|
||||
} else {
|
||||
fo << nid << ":leaf=" << tree[nid].leaf_value();
|
||||
fo << nid << ":leaf=" << tree[nid].LeafValue();
|
||||
if (with_stats) {
|
||||
fo << ",cover=" << tree.stat(nid).sum_hess;
|
||||
fo << ",cover=" << tree.Stat(nid).sum_hess;
|
||||
}
|
||||
fo << '\n';
|
||||
}
|
||||
} else {
|
||||
// right then left,
|
||||
bst_float cond = tree[nid].split_cond();
|
||||
const unsigned split_index = tree[nid].split_index();
|
||||
if (split_index < fmap.size()) {
|
||||
bst_float cond = tree[nid].SplitCond();
|
||||
const unsigned split_index = tree[nid].SplitIndex();
|
||||
if (split_index < fmap.Size()) {
|
||||
switch (fmap.type(split_index)) {
|
||||
case FeatureMap::kIndicator: {
|
||||
int nyes = tree[nid].default_left() ?
|
||||
tree[nid].cright() : tree[nid].cleft();
|
||||
int nyes = tree[nid].DefaultLeft() ?
|
||||
tree[nid].RightChild() : tree[nid].LeftChild();
|
||||
if (format == "json") {
|
||||
fo << "{ \"nodeid\": " << nid
|
||||
<< ", \"depth\": " << depth
|
||||
<< ", \"split\": \"" << fmap.name(split_index) << "\""
|
||||
<< ", \"split\": \"" << fmap.Name(split_index) << "\""
|
||||
<< ", \"yes\": " << nyes
|
||||
<< ", \"no\": " << tree[nid].cdefault();
|
||||
<< ", \"no\": " << tree[nid].DefaultChild();
|
||||
} else {
|
||||
fo << nid << ":[" << fmap.name(split_index) << "] yes=" << nyes
|
||||
<< ",no=" << tree[nid].cdefault();
|
||||
fo << nid << ":[" << fmap.Name(split_index) << "] yes=" << nyes
|
||||
<< ",no=" << tree[nid].DefaultChild();
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -67,17 +75,17 @@ void DumpRegTree(std::stringstream& fo, // NOLINT(*)
|
||||
if (format == "json") {
|
||||
fo << "{ \"nodeid\": " << nid
|
||||
<< ", \"depth\": " << depth
|
||||
<< ", \"split\": \"" << fmap.name(split_index) << "\""
|
||||
<< ", \"split\": \"" << fmap.Name(split_index) << "\""
|
||||
<< ", \"split_condition\": " << int(cond + 1.0)
|
||||
<< ", \"yes\": " << tree[nid].cleft()
|
||||
<< ", \"no\": " << tree[nid].cright()
|
||||
<< ", \"missing\": " << tree[nid].cdefault();
|
||||
<< ", \"yes\": " << tree[nid].LeftChild()
|
||||
<< ", \"no\": " << tree[nid].RightChild()
|
||||
<< ", \"missing\": " << tree[nid].DefaultChild();
|
||||
} else {
|
||||
fo << nid << ":[" << fmap.name(split_index) << "<"
|
||||
fo << nid << ":[" << fmap.Name(split_index) << "<"
|
||||
<< int(cond + 1.0)
|
||||
<< "] yes=" << tree[nid].cleft()
|
||||
<< ",no=" << tree[nid].cright()
|
||||
<< ",missing=" << tree[nid].cdefault();
|
||||
<< "] yes=" << tree[nid].LeftChild()
|
||||
<< ",no=" << tree[nid].RightChild()
|
||||
<< ",missing=" << tree[nid].DefaultChild();
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -86,16 +94,16 @@ void DumpRegTree(std::stringstream& fo, // NOLINT(*)
|
||||
if (format == "json") {
|
||||
fo << "{ \"nodeid\": " << nid
|
||||
<< ", \"depth\": " << depth
|
||||
<< ", \"split\": \"" << fmap.name(split_index) << "\""
|
||||
<< ", \"split\": \"" << fmap.Name(split_index) << "\""
|
||||
<< ", \"split_condition\": " << cond
|
||||
<< ", \"yes\": " << tree[nid].cleft()
|
||||
<< ", \"no\": " << tree[nid].cright()
|
||||
<< ", \"missing\": " << tree[nid].cdefault();
|
||||
<< ", \"yes\": " << tree[nid].LeftChild()
|
||||
<< ", \"no\": " << tree[nid].RightChild()
|
||||
<< ", \"missing\": " << tree[nid].DefaultChild();
|
||||
} else {
|
||||
fo << nid << ":[" << fmap.name(split_index) << "<" << cond
|
||||
<< "] yes=" << tree[nid].cleft()
|
||||
<< ",no=" << tree[nid].cright()
|
||||
<< ",missing=" << tree[nid].cdefault();
|
||||
fo << nid << ":[" << fmap.Name(split_index) << "<" << cond
|
||||
<< "] yes=" << tree[nid].LeftChild()
|
||||
<< ",no=" << tree[nid].RightChild()
|
||||
<< ",missing=" << tree[nid].DefaultChild();
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -107,22 +115,22 @@ void DumpRegTree(std::stringstream& fo, // NOLINT(*)
|
||||
<< ", \"depth\": " << depth
|
||||
<< ", \"split\": " << split_index
|
||||
<< ", \"split_condition\": " << cond
|
||||
<< ", \"yes\": " << tree[nid].cleft()
|
||||
<< ", \"no\": " << tree[nid].cright()
|
||||
<< ", \"missing\": " << tree[nid].cdefault();
|
||||
<< ", \"yes\": " << tree[nid].LeftChild()
|
||||
<< ", \"no\": " << tree[nid].RightChild()
|
||||
<< ", \"missing\": " << tree[nid].DefaultChild();
|
||||
} else {
|
||||
fo << nid << ":[f" << split_index << "<"<< cond
|
||||
<< "] yes=" << tree[nid].cleft()
|
||||
<< ",no=" << tree[nid].cright()
|
||||
<< ",missing=" << tree[nid].cdefault();
|
||||
<< "] yes=" << tree[nid].LeftChild()
|
||||
<< ",no=" << tree[nid].RightChild()
|
||||
<< ",missing=" << tree[nid].DefaultChild();
|
||||
}
|
||||
}
|
||||
if (with_stats) {
|
||||
if (format == "json") {
|
||||
fo << ", \"gain\": " << tree.stat(nid).loss_chg
|
||||
<< ", \"cover\": " << tree.stat(nid).sum_hess;
|
||||
fo << ", \"gain\": " << tree.Stat(nid).loss_chg
|
||||
<< ", \"cover\": " << tree.Stat(nid).sum_hess;
|
||||
} else {
|
||||
fo << ",gain=" << tree.stat(nid).loss_chg << ",cover=" << tree.stat(nid).sum_hess;
|
||||
fo << ",gain=" << tree.Stat(nid).loss_chg << ",cover=" << tree.Stat(nid).sum_hess;
|
||||
}
|
||||
}
|
||||
if (format == "json") {
|
||||
@@ -130,11 +138,13 @@ void DumpRegTree(std::stringstream& fo, // NOLINT(*)
|
||||
} else {
|
||||
fo << '\n';
|
||||
}
|
||||
DumpRegTree(fo, tree, fmap, tree[nid].cleft(), depth + 1, false, with_stats, format);
|
||||
DumpRegTree(fo, tree, fmap, tree[nid].cright(), depth + 1, true, with_stats, format);
|
||||
DumpRegTree(fo, tree, fmap, tree[nid].LeftChild(), depth + 1, false, with_stats, format);
|
||||
DumpRegTree(fo, tree, fmap, tree[nid].RightChild(), depth + 1, true, with_stats, format);
|
||||
if (format == "json") {
|
||||
fo << std::endl;
|
||||
for (int i = 0; i < depth+1; ++i) fo << " ";
|
||||
for (int i = 0; i < depth + 1; ++i) {
|
||||
fo << " ";
|
||||
}
|
||||
fo << "]}";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,7 +29,7 @@ namespace tree {
|
||||
class BaseMaker: public TreeUpdater {
|
||||
public:
|
||||
void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
|
||||
param.InitAllowUnknown(args);
|
||||
param_.InitAllowUnknown(args);
|
||||
}
|
||||
|
||||
protected:
|
||||
@@ -39,8 +39,8 @@ class BaseMaker: public TreeUpdater {
|
||||
/*! \brief find type of each feature, use column format */
|
||||
inline void InitByCol(DMatrix* p_fmat,
|
||||
const RegTree& tree) {
|
||||
fminmax.resize(tree.param.num_feature * 2);
|
||||
std::fill(fminmax.begin(), fminmax.end(),
|
||||
fminmax_.resize(tree.param.num_feature * 2);
|
||||
std::fill(fminmax_.begin(), fminmax_.end(),
|
||||
-std::numeric_limits<bst_float>::max());
|
||||
// start accumulating statistics
|
||||
dmlc::DataIter<ColBatch>* iter = p_fmat->ColIterator();
|
||||
@@ -51,22 +51,22 @@ class BaseMaker: public TreeUpdater {
|
||||
const bst_uint fid = batch.col_index[i];
|
||||
const ColBatch::Inst& c = batch[i];
|
||||
if (c.length != 0) {
|
||||
fminmax[fid * 2 + 0] = std::max(-c[0].fvalue, fminmax[fid * 2 + 0]);
|
||||
fminmax[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax[fid * 2 + 1]);
|
||||
fminmax_[fid * 2 + 0] = std::max(-c[0].fvalue, fminmax_[fid * 2 + 0]);
|
||||
fminmax_[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax_[fid * 2 + 1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/*! \brief synchronize the information */
|
||||
inline void SyncInfo() {
|
||||
rabit::Allreduce<rabit::op::Max>(dmlc::BeginPtr(fminmax), fminmax.size());
|
||||
rabit::Allreduce<rabit::op::Max>(dmlc::BeginPtr(fminmax_), fminmax_.size());
|
||||
}
|
||||
// get feature type, 0:empty 1:binary 2:real
|
||||
inline int Type(bst_uint fid) const {
|
||||
CHECK_LT(fid * 2 + 1, fminmax.size())
|
||||
CHECK_LT(fid * 2 + 1, fminmax_.size())
|
||||
<< "FeatHelper fid exceed query bound ";
|
||||
bst_float a = fminmax[fid * 2];
|
||||
bst_float b = fminmax[fid * 2 + 1];
|
||||
bst_float a = fminmax_[fid * 2];
|
||||
bst_float b = fminmax_[fid * 2 + 1];
|
||||
if (a == -std::numeric_limits<bst_float>::max()) return 0;
|
||||
if (-a == b) {
|
||||
return 1;
|
||||
@@ -75,16 +75,16 @@ class BaseMaker: public TreeUpdater {
|
||||
}
|
||||
}
|
||||
inline bst_float MaxValue(bst_uint fid) const {
|
||||
return fminmax[fid *2 + 1];
|
||||
return fminmax_[fid *2 + 1];
|
||||
}
|
||||
inline void SampleCol(float p, std::vector<bst_uint> *p_findex) const {
|
||||
std::vector<bst_uint> &findex = *p_findex;
|
||||
findex.clear();
|
||||
for (size_t i = 0; i < fminmax.size(); i += 2) {
|
||||
const bst_uint fid = static_cast<bst_uint>(i / 2);
|
||||
for (size_t i = 0; i < fminmax_.size(); i += 2) {
|
||||
const auto fid = static_cast<bst_uint>(i / 2);
|
||||
if (this->Type(fid) != 0) findex.push_back(fid);
|
||||
}
|
||||
unsigned n = static_cast<unsigned>(p * findex.size());
|
||||
auto n = static_cast<unsigned>(p * findex.size());
|
||||
std::shuffle(findex.begin(), findex.end(), common::GlobalRandom());
|
||||
findex.resize(n);
|
||||
// sync the findex if it is subsample
|
||||
@@ -99,64 +99,64 @@ class BaseMaker: public TreeUpdater {
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<bst_float> fminmax;
|
||||
std::vector<bst_float> fminmax_;
|
||||
};
|
||||
// ------static helper functions ------
|
||||
// helper function to get to next level of the tree
|
||||
/*! \brief this is helper function for row based data*/
|
||||
inline static int NextLevel(const RowBatch::Inst &inst, const RegTree &tree, int nid) {
|
||||
const RegTree::Node &n = tree[nid];
|
||||
bst_uint findex = n.split_index();
|
||||
bst_uint findex = n.SplitIndex();
|
||||
for (unsigned i = 0; i < inst.length; ++i) {
|
||||
if (findex == inst[i].index) {
|
||||
if (inst[i].fvalue < n.split_cond()) {
|
||||
return n.cleft();
|
||||
if (inst[i].fvalue < n.SplitCond()) {
|
||||
return n.LeftChild();
|
||||
} else {
|
||||
return n.cright();
|
||||
return n.RightChild();
|
||||
}
|
||||
}
|
||||
}
|
||||
return n.cdefault();
|
||||
return n.DefaultChild();
|
||||
}
|
||||
// ------class member helpers---------
|
||||
/*! \brief initialize temp data structure */
|
||||
inline void InitData(const std::vector<bst_gpair> &gpair,
|
||||
inline void InitData(const std::vector<GradientPair> &gpair,
|
||||
const DMatrix &fmat,
|
||||
const RegTree &tree) {
|
||||
CHECK_EQ(tree.param.num_nodes, tree.param.num_roots)
|
||||
<< "TreeMaker: can only grow new tree";
|
||||
const std::vector<unsigned> &root_index = fmat.info().root_index;
|
||||
const std::vector<unsigned> &root_index = fmat.Info().root_index_;
|
||||
{
|
||||
// setup position
|
||||
position.resize(gpair.size());
|
||||
position_.resize(gpair.size());
|
||||
if (root_index.size() == 0) {
|
||||
std::fill(position.begin(), position.end(), 0);
|
||||
std::fill(position_.begin(), position_.end(), 0);
|
||||
} else {
|
||||
for (size_t i = 0; i < position.size(); ++i) {
|
||||
position[i] = root_index[i];
|
||||
for (size_t i = 0; i < position_.size(); ++i) {
|
||||
position_[i] = root_index[i];
|
||||
CHECK_LT(root_index[i], (unsigned)tree.param.num_roots)
|
||||
<< "root index exceed setting";
|
||||
}
|
||||
}
|
||||
// mark deleted data (entries with negative hessian) as deleted
|
||||
for (size_t i = 0; i < position.size(); ++i) {
|
||||
if (gpair[i].GetHess() < 0.0f) position[i] = ~position[i];
|
||||
for (size_t i = 0; i < position_.size(); ++i) {
|
||||
if (gpair[i].GetHess() < 0.0f) position_[i] = ~position_[i];
|
||||
}
|
||||
// mark subsample
|
||||
if (param.subsample < 1.0f) {
|
||||
std::bernoulli_distribution coin_flip(param.subsample);
|
||||
if (param_.subsample < 1.0f) {
|
||||
std::bernoulli_distribution coin_flip(param_.subsample);
|
||||
auto& rnd = common::GlobalRandom();
|
||||
for (size_t i = 0; i < position.size(); ++i) {
|
||||
for (size_t i = 0; i < position_.size(); ++i) {
|
||||
if (gpair[i].GetHess() < 0.0f) continue;
|
||||
if (!coin_flip(rnd)) position[i] = ~position[i];
|
||||
if (!coin_flip(rnd)) position_[i] = ~position_[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
// expand query
|
||||
qexpand.reserve(256); qexpand.clear();
|
||||
qexpand_.reserve(256); qexpand_.clear();
|
||||
for (int i = 0; i < tree.param.num_roots; ++i) {
|
||||
qexpand.push_back(i);
|
||||
qexpand_.push_back(i);
|
||||
}
|
||||
this->UpdateNode2WorkIndex(tree);
|
||||
}
|
||||
@@ -164,28 +164,27 @@ class BaseMaker: public TreeUpdater {
|
||||
/*! \brief update queue expand add in new leaves */
|
||||
inline void UpdateQueueExpand(const RegTree &tree) {
|
||||
std::vector<int> newnodes;
|
||||
for (size_t i = 0; i < qexpand.size(); ++i) {
|
||||
const int nid = qexpand[i];
|
||||
if (!tree[nid].is_leaf()) {
|
||||
newnodes.push_back(tree[nid].cleft());
|
||||
newnodes.push_back(tree[nid].cright());
|
||||
for (int nid : qexpand_) {
|
||||
if (!tree[nid].IsLeaf()) {
|
||||
newnodes.push_back(tree[nid].LeftChild());
|
||||
newnodes.push_back(tree[nid].RightChild());
|
||||
}
|
||||
}
|
||||
// use new nodes for qexpand
|
||||
qexpand = newnodes;
|
||||
qexpand_ = newnodes;
|
||||
this->UpdateNode2WorkIndex(tree);
|
||||
}
|
||||
// return decoded position
|
||||
inline int DecodePosition(bst_uint ridx) const {
|
||||
const int pid = position[ridx];
|
||||
const int pid = position_[ridx];
|
||||
return pid < 0 ? ~pid : pid;
|
||||
}
|
||||
// set the position of ridx to nid, keeping its encoded (negated) state
|
||||
inline void SetEncodePosition(bst_uint ridx, int nid) {
|
||||
if (position[ridx] < 0) {
|
||||
position[ridx] = ~nid;
|
||||
if (position_[ridx] < 0) {
|
||||
position_[ridx] = ~nid;
|
||||
} else {
|
||||
position[ridx] = nid;
|
||||
position_[ridx] = nid;
|
||||
}
|
||||
}
|
||||
/*!
|
||||
@@ -211,27 +210,27 @@ class BaseMaker: public TreeUpdater {
|
||||
inline void SetDefaultPostion(DMatrix *p_fmat,
|
||||
const RegTree &tree) {
|
||||
// set rest of instances to default position
|
||||
const RowSet &rowset = p_fmat->buffered_rowset();
|
||||
const RowSet &rowset = p_fmat->BufferedRowset();
|
||||
// set default direct nodes to default
|
||||
// for leaf nodes that are not fresh, mark them as ~nid,
|
||||
// so that they are ignored in future statistics collection
|
||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
|
||||
const auto ndata = static_cast<bst_omp_uint>(rowset.Size());
|
||||
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||
const bst_uint ridx = rowset[i];
|
||||
const int nid = this->DecodePosition(ridx);
|
||||
if (tree[nid].is_leaf()) {
|
||||
if (tree[nid].IsLeaf()) {
|
||||
// mark finish when it is not a fresh leaf
|
||||
if (tree[nid].cright() == -1) {
|
||||
position[ridx] = ~nid;
|
||||
if (tree[nid].RightChild() == -1) {
|
||||
position_[ridx] = ~nid;
|
||||
}
|
||||
} else {
|
||||
// push to default branch
|
||||
if (tree[nid].default_left()) {
|
||||
this->SetEncodePosition(ridx, tree[nid].cleft());
|
||||
if (tree[nid].DefaultLeft()) {
|
||||
this->SetEncodePosition(ridx, tree[nid].LeftChild());
|
||||
} else {
|
||||
this->SetEncodePosition(ridx, tree[nid].cright());
|
||||
this->SetEncodePosition(ridx, tree[nid].RightChild());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -254,21 +253,21 @@ class BaseMaker: public TreeUpdater {
|
||||
auto it = std::lower_bound(sorted_split_set.begin(), sorted_split_set.end(), fid);
|
||||
|
||||
if (it != sorted_split_set.end() && *it == fid) {
|
||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
|
||||
const auto ndata = static_cast<bst_omp_uint>(col.length);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
||||
const bst_uint ridx = col[j].index;
|
||||
const bst_float fvalue = col[j].fvalue;
|
||||
const int nid = this->DecodePosition(ridx);
|
||||
CHECK(tree[nid].is_leaf());
|
||||
int pid = tree[nid].parent();
|
||||
CHECK(tree[nid].IsLeaf());
|
||||
int pid = tree[nid].Parent();
|
||||
|
||||
// go back to parent, correct those who are not default
|
||||
if (!tree[nid].is_root() && tree[pid].split_index() == fid) {
|
||||
if (fvalue < tree[pid].split_cond()) {
|
||||
this->SetEncodePosition(ridx, tree[pid].cleft());
|
||||
if (!tree[nid].IsRoot() && tree[pid].SplitIndex() == fid) {
|
||||
if (fvalue < tree[pid].SplitCond()) {
|
||||
this->SetEncodePosition(ridx, tree[pid].LeftChild());
|
||||
} else {
|
||||
this->SetEncodePosition(ridx, tree[pid].cright());
|
||||
this->SetEncodePosition(ridx, tree[pid].RightChild());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -287,10 +286,9 @@ class BaseMaker: public TreeUpdater {
|
||||
std::vector<unsigned>& fsplits = *out_split_set;
|
||||
fsplits.clear();
|
||||
// step 1, classify the non-default data into right places
|
||||
for (size_t i = 0; i < nodes.size(); ++i) {
|
||||
const int nid = nodes[i];
|
||||
if (!tree[nid].is_leaf()) {
|
||||
fsplits.push_back(tree[nid].split_index());
|
||||
for (int nid : nodes) {
|
||||
if (!tree[nid].IsLeaf()) {
|
||||
fsplits.push_back(tree[nid].SplitIndex());
|
||||
}
|
||||
}
|
||||
std::sort(fsplits.begin(), fsplits.end());
|
||||
@@ -314,18 +312,18 @@ class BaseMaker: public TreeUpdater {
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
ColBatch::Inst col = batch[i];
|
||||
const bst_uint fid = batch.col_index[i];
|
||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
|
||||
const auto ndata = static_cast<bst_omp_uint>(col.length);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
||||
const bst_uint ridx = col[j].index;
|
||||
const bst_float fvalue = col[j].fvalue;
|
||||
const int nid = this->DecodePosition(ridx);
|
||||
// go back to parent, correct those who are not default
|
||||
if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
|
||||
if (fvalue < tree[nid].split_cond()) {
|
||||
this->SetEncodePosition(ridx, tree[nid].cleft());
|
||||
if (!tree[nid].IsLeaf() && tree[nid].SplitIndex() == fid) {
|
||||
if (fvalue < tree[nid].SplitCond()) {
|
||||
this->SetEncodePosition(ridx, tree[nid].LeftChild());
|
||||
} else {
|
||||
this->SetEncodePosition(ridx, tree[nid].cright());
|
||||
this->SetEncodePosition(ridx, tree[nid].RightChild());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -334,39 +332,37 @@ class BaseMaker: public TreeUpdater {
|
||||
}
|
||||
/*! \brief helper function to get statistics from a tree */
|
||||
template<typename TStats>
|
||||
inline void GetNodeStats(const std::vector<bst_gpair> &gpair,
|
||||
inline void GetNodeStats(const std::vector<GradientPair> &gpair,
|
||||
const DMatrix &fmat,
|
||||
const RegTree &tree,
|
||||
std::vector< std::vector<TStats> > *p_thread_temp,
|
||||
std::vector<TStats> *p_node_stats) {
|
||||
std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
|
||||
const MetaInfo &info = fmat.info();
|
||||
const MetaInfo &info = fmat.Info();
|
||||
thread_temp.resize(omp_get_max_threads());
|
||||
p_node_stats->resize(tree.param.num_nodes);
|
||||
#pragma omp parallel
|
||||
{
|
||||
const int tid = omp_get_thread_num();
|
||||
thread_temp[tid].resize(tree.param.num_nodes, TStats(param));
|
||||
for (size_t i = 0; i < qexpand.size(); ++i) {
|
||||
const unsigned nid = qexpand[i];
|
||||
thread_temp[tid].resize(tree.param.num_nodes, TStats(param_));
|
||||
for (unsigned int nid : qexpand_) {
|
||||
thread_temp[tid][nid].Clear();
|
||||
}
|
||||
}
|
||||
const RowSet &rowset = fmat.buffered_rowset();
|
||||
const RowSet &rowset = fmat.BufferedRowset();
|
||||
// setup position
|
||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
|
||||
const auto ndata = static_cast<bst_omp_uint>(rowset.Size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||
const bst_uint ridx = rowset[i];
|
||||
const int nid = position[ridx];
|
||||
const int nid = position_[ridx];
|
||||
const int tid = omp_get_thread_num();
|
||||
if (nid >= 0) {
|
||||
thread_temp[tid][nid].Add(gpair, info, ridx);
|
||||
}
|
||||
}
|
||||
// sum the per thread statistics together
|
||||
for (size_t j = 0; j < qexpand.size(); ++j) {
|
||||
const int nid = qexpand[j];
|
||||
for (int nid : qexpand_) {
|
||||
TStats &s = (*p_node_stats)[nid];
|
||||
s.Clear();
|
||||
for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
|
||||
@@ -461,28 +457,28 @@ class BaseMaker: public TreeUpdater {
|
||||
}
|
||||
};
|
||||
/*! \brief training parameter of tree grower */
|
||||
TrainParam param;
|
||||
TrainParam param_;
|
||||
/*! \brief queue of nodes to be expanded */
|
||||
std::vector<int> qexpand;
|
||||
std::vector<int> qexpand_;
|
||||
/*!
|
||||
* \brief map active node to its working index offset in qexpand,
|
||||
* can be -1, which means the node is not actively expanding
|
||||
*/
|
||||
std::vector<int> node2workindex;
|
||||
std::vector<int> node2workindex_;
|
||||
/*!
|
||||
* \brief position of each instance in the tree
|
||||
* can be negative, which means this position is no longer expanding
|
||||
* see also Decode/EncodePosition
|
||||
*/
|
||||
std::vector<int> position;
|
||||
std::vector<int> position_;
|
||||
|
||||
private:
|
||||
// Rebuilds the node-id -> work-index lookup from the current expand queue:
// every slot already present is reset to -1, the table is resized to
// tree.param.num_nodes, and each node id in the queue is mapped to its
// offset within the queue.
// NOTE: this span is a rendered diff hunk. The statements on the un-suffixed
// members (node2workindex, qexpand) are the removed lines; the `_`-suffixed
// copies that follow are their replacements.
inline void UpdateNode2WorkIndex(const RegTree &tree) {
|
||||
// update the node2workindex
|
||||
std::fill(node2workindex.begin(), node2workindex.end(), -1);
|
||||
node2workindex.resize(tree.param.num_nodes);
|
||||
for (size_t i = 0; i < qexpand.size(); ++i) {
|
||||
node2workindex[qexpand[i]] = static_cast<int>(i);
|
||||
// NOTE(review): the fill runs before the resize, so slots appended by
// resize() are value-initialized to 0 rather than -1 and stay 0 unless the
// loop below assigns them -- confirm every newly added node id is queued.
std::fill(node2workindex_.begin(), node2workindex_.end(), -1);
|
||||
node2workindex_.resize(tree.param.num_nodes);
|
||||
// record, for each queued node id, its position in the expand queue
for (size_t i = 0; i < qexpand_.size(); ++i) {
|
||||
node2workindex_[qexpand_[i]] = static_cast<int>(i);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -7,6 +7,7 @@
|
||||
#include <dmlc/timer.h>
|
||||
#include <xgboost/tree_updater.h>
|
||||
#include <cmath>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <queue>
|
||||
@@ -50,47 +51,47 @@ class FastHistMaker: public TreeUpdater {
|
||||
pruner_.reset(TreeUpdater::Create("prune"));
|
||||
}
|
||||
pruner_->Init(args);
|
||||
param.InitAllowUnknown(args);
|
||||
fhparam.InitAllowUnknown(args);
|
||||
param_.InitAllowUnknown(args);
|
||||
fhparam_.InitAllowUnknown(args);
|
||||
is_gmat_initialized_ = false;
|
||||
}
|
||||
|
||||
// Grows every tree in `trees` with the histogram-based builder.
// Steps, as written below:
//   1. validate the DMatrix meta info via TStats::CheckInfo;
//   2. on the first call only (is_gmat_initialized_ is false), build hmat_,
//      gmat_ and column_matrix_, plus gmatb_ when enable_feature_grouping > 0;
//   3. divide learning_rate by trees.size() for the duration of the call;
//   4. create builder_ on first use, handing it ownership of pruner_;
//   5. call builder_->Update once per tree, then restore learning_rate.
// \param gpair  gradient pairs, forwarded unchanged to the builder
// \param dmat   training data; used for the one-time matrix setup and forwarded
// \param trees  trees to grow; its size is also the learning-rate divisor
// NOTE: this span is a rendered diff hunk. Where a statement appears twice,
// the copy with un-suffixed names (param, fhparam, info(), num_col) is the
// removed line and the copy with Google-style names (param_, fhparam_,
// Info(), num_col_) is its replacement.
void Update(HostDeviceVector<bst_gpair>* gpair,
|
||||
void Update(HostDeviceVector<GradientPair>* gpair,
|
||||
DMatrix* dmat,
|
||||
const std::vector<RegTree*>& trees) override {
|
||||
TStats::CheckInfo(dmat->info());
|
||||
TStats::CheckInfo(dmat->Info());
|
||||
// one-time construction of the cut matrix and the quantized matrices
if (is_gmat_initialized_ == false) {
|
||||
double tstart = dmlc::GetTime();
|
||||
hmat_.Init(dmat, static_cast<uint32_t>(param.max_bin));
|
||||
hmat_.Init(dmat, static_cast<uint32_t>(param_.max_bin));
|
||||
// gmat_ must point at the cuts before gmat_.Init(dmat) is called
gmat_.cut = &hmat_;
|
||||
gmat_.Init(dmat);
|
||||
column_matrix_.Init(gmat_, fhparam);
|
||||
if (fhparam.enable_feature_grouping > 0) {
|
||||
gmatb_.Init(gmat_, column_matrix_, fhparam);
|
||||
column_matrix_.Init(gmat_, fhparam_);
|
||||
if (fhparam_.enable_feature_grouping > 0) {
|
||||
gmatb_.Init(gmat_, column_matrix_, fhparam_);
|
||||
}
|
||||
is_gmat_initialized_ = true;
|
||||
if (param.debug_verbose > 0) {
|
||||
if (param_.debug_verbose > 0) {
|
||||
LOG(INFO) << "Generating gmat: " << dmlc::GetTime() - tstart << " sec";
|
||||
}
|
||||
}
|
||||
// rescale learning rate according to size of trees
|
||||
float lr = param.learning_rate;
|
||||
param.learning_rate = lr / trees.size();
|
||||
// the constraint is initialised from the address of the parameter object
TConstraint::Init(&param, dmat->info().num_col);
|
||||
float lr = param_.learning_rate;
|
||||
param_.learning_rate = lr / trees.size();
|
||||
// the constraint is initialised from the address of the parameter object
TConstraint::Init(&param_, dmat->Info().num_col_);
|
||||
// build tree
|
||||
// Builder declares its param_/fhparam_ members as const references, so it
// observes the temporarily rescaled learning rate set above.
if (!builder_) {
|
||||
builder_.reset(new Builder(param, fhparam, std::move(pruner_)));
|
||||
builder_.reset(new Builder(param_, fhparam_, std::move(pruner_)));
|
||||
}
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
for (auto tree : trees) {
|
||||
builder_->Update
|
||||
(gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
|
||||
(gmat_, gmatb_, column_matrix_, gpair, dmat, tree);
|
||||
}
|
||||
// undo the temporary rescale so the configured rate is kept for later calls
param.learning_rate = lr;
|
||||
param_.learning_rate = lr;
|
||||
}
|
||||
|
||||
bool UpdatePredictionCache(const DMatrix* data,
|
||||
HostDeviceVector<bst_float>* out_preds) override {
|
||||
if (!builder_ || param.subsample < 1.0f) {
|
||||
if (!builder_ || param_.subsample < 1.0f) {
|
||||
return false;
|
||||
} else {
|
||||
return builder_->UpdatePredictionCache(data, out_preds);
|
||||
@@ -99,8 +100,8 @@ class FastHistMaker: public TreeUpdater {
|
||||
|
||||
protected:
|
||||
// training parameter
|
||||
TrainParam param;
|
||||
FastHistParam fhparam;
|
||||
TrainParam param_;
|
||||
FastHistParam fhparam_;
|
||||
// data sketch
|
||||
HistCutMatrix hmat_;
|
||||
// quantized data matrix
|
||||
@@ -134,13 +135,13 @@ class FastHistMaker: public TreeUpdater {
|
||||
// Constructs a Builder bound to the caller's parameter objects.
// \param param    training parameters; the corresponding member is declared
//                 as `const TrainParam&` further down, so no copy is made
// \param fhparam  fast-hist parameters; likewise held as a const reference
// \param pruner   pruning updater; ownership is moved into pruner_
// The last-seen tree and data-matrix pointers start out as nullptr.
// NOTE: rendered diff hunk -- the initializer list appears twice: first the
// removed line (param, fhparam), then its replacement (param_, fhparam_).
explicit Builder(const TrainParam& param,
|
||||
const FastHistParam& fhparam,
|
||||
std::unique_ptr<TreeUpdater> pruner)
|
||||
: param(param), fhparam(fhparam), pruner_(std::move(pruner)),
|
||||
: param_(param), fhparam_(fhparam), pruner_(std::move(pruner)),
|
||||
p_last_tree_(nullptr), p_last_fmat_(nullptr) {}
|
||||
// update one tree, growing
|
||||
virtual void Update(const GHistIndexMatrix& gmat,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
const ColumnMatrix& column_matrix,
|
||||
HostDeviceVector<bst_gpair>* gpair,
|
||||
HostDeviceVector<GradientPair>* gpair,
|
||||
DMatrix* p_fmat,
|
||||
RegTree* p_tree) {
|
||||
double gstart = dmlc::GetTime();
|
||||
@@ -155,11 +156,11 @@ class FastHistMaker: public TreeUpdater {
|
||||
double time_evaluate_split = 0;
|
||||
double time_apply_split = 0;
|
||||
|
||||
std::vector<bst_gpair>& gpair_h = gpair->data_h();
|
||||
std::vector<GradientPair>& gpair_h = gpair->HostVector();
|
||||
|
||||
tstart = dmlc::GetTime();
|
||||
this->InitData(gmat, gpair_h, *p_fmat, *p_tree);
|
||||
std::vector<bst_uint> feat_set = feat_index;
|
||||
std::vector<bst_uint> feat_set = feat_index_;
|
||||
time_init_data = dmlc::GetTime() - tstart;
|
||||
|
||||
// FIXME(hcho3): this code is broken when param.num_roots > 1. Please fix it
|
||||
@@ -179,7 +180,7 @@ class FastHistMaker: public TreeUpdater {
|
||||
this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree, feat_set);
|
||||
time_evaluate_split += dmlc::GetTime() - tstart;
|
||||
qexpand_->push(ExpandEntry(nid, p_tree->GetDepth(nid),
|
||||
snode[nid].best.loss_chg,
|
||||
snode_[nid].best.loss_chg,
|
||||
timestamp++));
|
||||
++num_leaves;
|
||||
}
|
||||
@@ -188,21 +189,21 @@ class FastHistMaker: public TreeUpdater {
|
||||
const ExpandEntry candidate = qexpand_->top();
|
||||
const int nid = candidate.nid;
|
||||
qexpand_->pop();
|
||||
if (candidate.loss_chg <= rt_eps
|
||||
|| (param.max_depth > 0 && candidate.depth == param.max_depth)
|
||||
|| (param.max_leaves > 0 && num_leaves == param.max_leaves) ) {
|
||||
(*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
|
||||
if (candidate.loss_chg <= kRtEps
|
||||
|| (param_.max_depth > 0 && candidate.depth == param_.max_depth)
|
||||
|| (param_.max_leaves > 0 && num_leaves == param_.max_leaves) ) {
|
||||
(*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
|
||||
} else {
|
||||
tstart = dmlc::GetTime();
|
||||
this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree);
|
||||
time_apply_split += dmlc::GetTime() - tstart;
|
||||
|
||||
tstart = dmlc::GetTime();
|
||||
const int cleft = (*p_tree)[nid].cleft();
|
||||
const int cright = (*p_tree)[nid].cright();
|
||||
const int cleft = (*p_tree)[nid].LeftChild();
|
||||
const int cright = (*p_tree)[nid].RightChild();
|
||||
hist_.AddHistRow(cleft);
|
||||
hist_.AddHistRow(cright);
|
||||
if (row_set_collection_[cleft].size() < row_set_collection_[cright].size()) {
|
||||
if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) {
|
||||
BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, feat_set, hist_[cleft]);
|
||||
SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
|
||||
} else {
|
||||
@@ -222,10 +223,10 @@ class FastHistMaker: public TreeUpdater {
|
||||
time_evaluate_split += dmlc::GetTime() - tstart;
|
||||
|
||||
qexpand_->push(ExpandEntry(cleft, p_tree->GetDepth(cleft),
|
||||
snode[cleft].best.loss_chg,
|
||||
snode_[cleft].best.loss_chg,
|
||||
timestamp++));
|
||||
qexpand_->push(ExpandEntry(cright, p_tree->GetDepth(cright),
|
||||
snode[cright].best.loss_chg,
|
||||
snode_[cright].best.loss_chg,
|
||||
timestamp++));
|
||||
|
||||
++num_leaves; // give two and take one, as parent is no longer a leaf
|
||||
@@ -238,19 +239,19 @@ class FastHistMaker: public TreeUpdater {
|
||||
while (!qexpand_->empty()) {
|
||||
const int nid = qexpand_->top().nid;
|
||||
qexpand_->pop();
|
||||
(*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
|
||||
(*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
|
||||
}
|
||||
// remember auxiliary statistics in the tree node
|
||||
for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
|
||||
p_tree->stat(nid).loss_chg = snode[nid].best.loss_chg;
|
||||
p_tree->stat(nid).base_weight = snode[nid].weight;
|
||||
p_tree->stat(nid).sum_hess = static_cast<float>(snode[nid].stats.sum_hess);
|
||||
snode[nid].stats.SetLeafVec(param, p_tree->leafvec(nid));
|
||||
p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;
|
||||
p_tree->Stat(nid).base_weight = snode_[nid].weight;
|
||||
p_tree->Stat(nid).sum_hess = static_cast<float>(snode_[nid].stats.sum_hess);
|
||||
snode_[nid].stats.SetLeafVec(param_, p_tree->Leafvec(nid));
|
||||
}
|
||||
|
||||
pruner_->Update(gpair, p_fmat, std::vector<RegTree*>{p_tree});
|
||||
|
||||
if (param.debug_verbose > 0) {
|
||||
if (param_.debug_verbose > 0) {
|
||||
double total_time = dmlc::GetTime() - gstart;
|
||||
LOG(INFO) << "\nInitData: "
|
||||
<< std::fixed << std::setw(6) << std::setprecision(4) << time_init_data
|
||||
@@ -278,13 +279,13 @@ class FastHistMaker: public TreeUpdater {
|
||||
}
|
||||
}
|
||||
|
||||
inline void BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
inline void BuildHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist) {
|
||||
if (fhparam.enable_feature_grouping > 0) {
|
||||
if (fhparam_.enable_feature_grouping > 0) {
|
||||
hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, feat_set, hist);
|
||||
} else {
|
||||
hist_builder_.BuildHist(gpair, row_indices, gmat, feat_set, hist);
|
||||
@@ -297,7 +298,7 @@ class FastHistMaker: public TreeUpdater {
|
||||
|
||||
inline bool UpdatePredictionCache(const DMatrix* data,
|
||||
HostDeviceVector<bst_float>* p_out_preds) {
|
||||
std::vector<bst_float>& out_preds = p_out_preds->data_h();
|
||||
std::vector<bst_float>& out_preds = p_out_preds->HostVector();
|
||||
|
||||
// p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
|
||||
// conjunction with Update().
|
||||
@@ -318,13 +319,13 @@ class FastHistMaker: public TreeUpdater {
|
||||
bst_float leaf_value;
|
||||
// if a node is marked as deleted by the pruner, traverse upward to locate
|
||||
// a non-deleted leaf.
|
||||
if ((*p_last_tree_)[nid].is_deleted()) {
|
||||
while ((*p_last_tree_)[nid].is_deleted()) {
|
||||
nid = (*p_last_tree_)[nid].parent();
|
||||
if ((*p_last_tree_)[nid].IsDeleted()) {
|
||||
while ((*p_last_tree_)[nid].IsDeleted()) {
|
||||
nid = (*p_last_tree_)[nid].Parent();
|
||||
}
|
||||
CHECK((*p_last_tree_)[nid].is_leaf());
|
||||
CHECK((*p_last_tree_)[nid].IsLeaf());
|
||||
}
|
||||
leaf_value = (*p_last_tree_)[nid].leaf_value();
|
||||
leaf_value = (*p_last_tree_)[nid].LeafValue();
|
||||
|
||||
for (const size_t* it = rowset.begin; it < rowset.end; ++it) {
|
||||
out_preds[*it] += leaf_value;
|
||||
@@ -338,19 +339,19 @@ class FastHistMaker: public TreeUpdater {
|
||||
protected:
|
||||
// initialize temp data structure
|
||||
inline void InitData(const GHistIndexMatrix& gmat,
|
||||
const std::vector<bst_gpair>& gpair,
|
||||
const std::vector<GradientPair>& gpair,
|
||||
const DMatrix& fmat,
|
||||
const RegTree& tree) {
|
||||
CHECK_EQ(tree.param.num_nodes, tree.param.num_roots)
|
||||
<< "ColMakerHist: can only grow new tree";
|
||||
CHECK((param.max_depth > 0 || param.max_leaves > 0))
|
||||
CHECK((param_.max_depth > 0 || param_.max_leaves > 0))
|
||||
<< "max_depth or max_leaves cannot be both 0 (unlimited); "
|
||||
<< "at least one should be a positive quantity.";
|
||||
if (param.grow_policy == TrainParam::kDepthWise) {
|
||||
CHECK(param.max_depth > 0) << "max_depth cannot be 0 (unlimited) "
|
||||
if (param_.grow_policy == TrainParam::kDepthWise) {
|
||||
CHECK(param_.max_depth > 0) << "max_depth cannot be 0 (unlimited) "
|
||||
<< "when grow_policy is depthwise.";
|
||||
}
|
||||
const auto& info = fmat.info();
|
||||
const auto& info = fmat.Info();
|
||||
|
||||
{
|
||||
// initialize the row set
|
||||
@@ -364,23 +365,23 @@ class FastHistMaker: public TreeUpdater {
|
||||
// initialize histogram builder
|
||||
#pragma omp parallel
|
||||
{
|
||||
this->nthread = omp_get_num_threads();
|
||||
this->nthread_ = omp_get_num_threads();
|
||||
}
|
||||
hist_builder_.Init(this->nthread, nbins);
|
||||
hist_builder_.Init(this->nthread_, nbins);
|
||||
|
||||
CHECK_EQ(info.root_index.size(), 0U);
|
||||
CHECK_EQ(info.root_index_.size(), 0U);
|
||||
std::vector<size_t>& row_indices = row_set_collection_.row_indices_;
|
||||
// mark subsample and build list of member rows
|
||||
if (param.subsample < 1.0f) {
|
||||
std::bernoulli_distribution coin_flip(param.subsample);
|
||||
if (param_.subsample < 1.0f) {
|
||||
std::bernoulli_distribution coin_flip(param_.subsample);
|
||||
auto& rnd = common::GlobalRandom();
|
||||
for (size_t i = 0; i < info.num_row; ++i) {
|
||||
for (size_t i = 0; i < info.num_row_; ++i) {
|
||||
if (gpair[i].GetHess() >= 0.0f && coin_flip(rnd)) {
|
||||
row_indices.push_back(i);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < info.num_row; ++i) {
|
||||
for (size_t i = 0; i < info.num_row_; ++i) {
|
||||
if (gpair[i].GetHess() >= 0.0f) {
|
||||
row_indices.push_back(i);
|
||||
}
|
||||
@@ -391,9 +392,9 @@ class FastHistMaker: public TreeUpdater {
|
||||
|
||||
{
|
||||
/* determine layout of data */
|
||||
const size_t nrow = info.num_row;
|
||||
const size_t ncol = info.num_col;
|
||||
const size_t nnz = info.num_nonzero;
|
||||
const size_t nrow = info.num_row_;
|
||||
const size_t ncol = info.num_col_;
|
||||
const size_t nnz = info.num_nonzero_;
|
||||
// number of discrete bins for feature 0
|
||||
const uint32_t nbins_f0 = gmat.cut->row_ptr[1] - gmat.cut->row_ptr[0];
|
||||
if (nrow * ncol == nnz) {
|
||||
@@ -413,23 +414,23 @@ class FastHistMaker: public TreeUpdater {
|
||||
// store a pointer to training data
|
||||
p_last_fmat_ = &fmat;
|
||||
// initialize feature index
|
||||
bst_uint ncol = static_cast<bst_uint>(info.num_col);
|
||||
feat_index.clear();
|
||||
auto ncol = static_cast<bst_uint>(info.num_col_);
|
||||
feat_index_.clear();
|
||||
if (data_layout_ == kDenseDataOneBased) {
|
||||
for (bst_uint i = 1; i < ncol; ++i) {
|
||||
feat_index.push_back(i);
|
||||
feat_index_.push_back(i);
|
||||
}
|
||||
} else {
|
||||
for (bst_uint i = 0; i < ncol; ++i) {
|
||||
feat_index.push_back(i);
|
||||
feat_index_.push_back(i);
|
||||
}
|
||||
}
|
||||
bst_uint n = std::max(static_cast<bst_uint>(1),
|
||||
static_cast<bst_uint>(param.colsample_bytree * feat_index.size()));
|
||||
std::shuffle(feat_index.begin(), feat_index.end(), common::GlobalRandom());
|
||||
CHECK_GT(param.colsample_bytree, 0U)
|
||||
static_cast<bst_uint>(param_.colsample_bytree * feat_index_.size()));
|
||||
std::shuffle(feat_index_.begin(), feat_index_.end(), common::GlobalRandom());
|
||||
CHECK_GT(param_.colsample_bytree, 0U)
|
||||
<< "colsample_bytree cannot be zero.";
|
||||
feat_index.resize(n);
|
||||
feat_index_.resize(n);
|
||||
}
|
||||
if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
|
||||
/* specialized code for dense data:
|
||||
@@ -437,7 +438,7 @@ class FastHistMaker: public TreeUpdater {
|
||||
For dense data (with no missing value),
|
||||
the sum of gradient histogram is equal to snode[nid] */
|
||||
const std::vector<uint32_t>& row_ptr = gmat.cut->row_ptr;
|
||||
const bst_uint nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
|
||||
const auto nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
|
||||
uint32_t min_nbins_per_feature = 0;
|
||||
for (bst_uint i = 0; i < nfeature; ++i) {
|
||||
const uint32_t nbins = row_ptr[i + 1] - row_ptr[i];
|
||||
@@ -451,14 +452,14 @@ class FastHistMaker: public TreeUpdater {
|
||||
CHECK_GT(min_nbins_per_feature, 0U);
|
||||
}
|
||||
{
|
||||
snode.reserve(256);
|
||||
snode.clear();
|
||||
snode_.reserve(256);
|
||||
snode_.clear();
|
||||
}
|
||||
{
|
||||
if (param.grow_policy == TrainParam::kLossGuide) {
|
||||
qexpand_.reset(new ExpandQueue(loss_guide));
|
||||
if (param_.grow_policy == TrainParam::kLossGuide) {
|
||||
qexpand_.reset(new ExpandQueue(LossGuide));
|
||||
} else {
|
||||
qexpand_.reset(new ExpandQueue(depth_wise));
|
||||
qexpand_.reset(new ExpandQueue(DepthWise));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -470,25 +471,25 @@ class FastHistMaker: public TreeUpdater {
|
||||
const RegTree& tree,
|
||||
const std::vector<bst_uint>& feat_set) {
|
||||
// start enumeration
|
||||
const MetaInfo& info = fmat.info();
|
||||
const bst_uint nfeature = static_cast<bst_uint>(feat_set.size());
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread);
|
||||
const MetaInfo& info = fmat.Info();
|
||||
const auto nfeature = static_cast<bst_uint>(feat_set.size());
|
||||
const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
best_split_tloc_.resize(nthread);
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
|
||||
best_split_tloc_[tid] = snode[nid].best;
|
||||
best_split_tloc_[tid] = snode_[nid].best;
|
||||
}
|
||||
#pragma omp parallel for schedule(dynamic) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < nfeature; ++i) {
|
||||
const bst_uint fid = feat_set[i];
|
||||
const unsigned tid = omp_get_thread_num();
|
||||
this->EnumerateSplit(-1, gmat, hist[nid], snode[nid], constraints_[nid], info,
|
||||
this->EnumerateSplit(-1, gmat, hist[nid], snode_[nid], constraints_[nid], info,
|
||||
&best_split_tloc_[tid], fid);
|
||||
this->EnumerateSplit(+1, gmat, hist[nid], snode[nid], constraints_[nid], info,
|
||||
this->EnumerateSplit(+1, gmat, hist[nid], snode_[nid], constraints_[nid], info,
|
||||
&best_split_tloc_[tid], fid);
|
||||
}
|
||||
for (unsigned tid = 0; tid < nthread; ++tid) {
|
||||
snode[nid].best.Update(best_split_tloc_[tid]);
|
||||
snode_[nid].best.Update(best_split_tloc_[tid]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -499,12 +500,13 @@ class FastHistMaker: public TreeUpdater {
|
||||
const DMatrix& fmat,
|
||||
RegTree* p_tree) {
|
||||
XGBOOST_TYPE_SWITCH(column_matrix.dtype, {
|
||||
ApplySplit_<DType>(nid, gmat, column_matrix, hist, fmat, p_tree);
|
||||
ApplySplitSpecialize<DType>(nid, gmat, column_matrix, hist, fmat,
|
||||
p_tree);
|
||||
});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void ApplySplit_(int nid,
|
||||
inline void ApplySplitSpecialize(int nid,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const ColumnMatrix& column_matrix,
|
||||
const HistCollection& hist,
|
||||
@@ -513,26 +515,26 @@ class FastHistMaker: public TreeUpdater {
|
||||
// TODO(hcho3): support feature sampling by levels
|
||||
|
||||
/* 1. Create child nodes */
|
||||
NodeEntry& e = snode[nid];
|
||||
NodeEntry& e = snode_[nid];
|
||||
|
||||
p_tree->AddChilds(nid);
|
||||
(*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
|
||||
(*p_tree)[nid].SetSplit(e.best.SplitIndex(), e.best.split_value, e.best.DefaultLeft());
|
||||
// mark right child as 0, to indicate fresh leaf
|
||||
int cleft = (*p_tree)[nid].cleft();
|
||||
int cright = (*p_tree)[nid].cright();
|
||||
(*p_tree)[cleft].set_leaf(0.0f, 0);
|
||||
(*p_tree)[cright].set_leaf(0.0f, 0);
|
||||
int cleft = (*p_tree)[nid].LeftChild();
|
||||
int cright = (*p_tree)[nid].RightChild();
|
||||
(*p_tree)[cleft].SetLeaf(0.0f, 0);
|
||||
(*p_tree)[cright].SetLeaf(0.0f, 0);
|
||||
|
||||
/* 2. Categorize member rows */
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread);
|
||||
const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
row_split_tloc_.resize(nthread);
|
||||
for (bst_omp_uint i = 0; i < nthread; ++i) {
|
||||
row_split_tloc_[i].left.clear();
|
||||
row_split_tloc_[i].right.clear();
|
||||
}
|
||||
const bool default_left = (*p_tree)[nid].default_left();
|
||||
const bst_uint fid = (*p_tree)[nid].split_index();
|
||||
const bst_float split_pt = (*p_tree)[nid].split_cond();
|
||||
const bool default_left = (*p_tree)[nid].DefaultLeft();
|
||||
const bst_uint fid = (*p_tree)[nid].SplitIndex();
|
||||
const bst_float split_pt = (*p_tree)[nid].SplitCond();
|
||||
const uint32_t lower_bound = gmat.cut->row_ptr[fid];
|
||||
const uint32_t upper_bound = gmat.cut->row_ptr[fid + 1];
|
||||
int32_t split_cond = -1;
|
||||
@@ -558,7 +560,7 @@ class FastHistMaker: public TreeUpdater {
|
||||
}
|
||||
|
||||
row_set_collection_.AddSplit(
|
||||
nid, row_split_tloc_, (*p_tree)[nid].cleft(), (*p_tree)[nid].cright());
|
||||
nid, row_split_tloc_, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild());
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
@@ -569,24 +571,24 @@ class FastHistMaker: public TreeUpdater {
|
||||
bst_int split_cond,
|
||||
bool default_left) {
|
||||
std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
|
||||
const int K = 8; // loop unrolling factor
|
||||
constexpr int kUnroll = 8; // loop unrolling factor
|
||||
const size_t nrows = rowset.end - rowset.begin;
|
||||
const size_t rest = nrows % K;
|
||||
const size_t rest = nrows % kUnroll;
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
|
||||
#pragma omp parallel for num_threads(nthread_) schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) {
|
||||
const bst_uint tid = omp_get_thread_num();
|
||||
auto& left = row_split_tloc[tid].left;
|
||||
auto& right = row_split_tloc[tid].right;
|
||||
size_t rid[K];
|
||||
T rbin[K];
|
||||
for (int k = 0; k < K; ++k) {
|
||||
size_t rid[kUnroll];
|
||||
T rbin[kUnroll];
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
rid[k] = rowset.begin[i + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
rbin[k] = column.index[rid[k]];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) { // NOLINT
|
||||
if (rbin[k] == std::numeric_limits<T>::max()) { // missing value
|
||||
if (default_left) {
|
||||
left.push_back(rid[k]);
|
||||
@@ -605,8 +607,8 @@ class FastHistMaker: public TreeUpdater {
|
||||
}
|
||||
}
|
||||
for (size_t i = nrows - rest; i < nrows; ++i) {
|
||||
auto& left = row_split_tloc[nthread-1].left;
|
||||
auto& right = row_split_tloc[nthread-1].right;
|
||||
auto& left = row_split_tloc[nthread_-1].left;
|
||||
auto& right = row_split_tloc[nthread_-1].right;
|
||||
const size_t rid = rowset.begin[i];
|
||||
const T rbin = column.index[rid];
|
||||
if (rbin == std::numeric_limits<T>::max()) { // missing value
|
||||
@@ -635,27 +637,27 @@ class FastHistMaker: public TreeUpdater {
|
||||
bst_int split_cond,
|
||||
bool default_left) {
|
||||
std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
|
||||
const int K = 8; // loop unrolling factor
|
||||
constexpr int kUnroll = 8; // loop unrolling factor
|
||||
const size_t nrows = rowset.end - rowset.begin;
|
||||
const size_t rest = nrows % K;
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
|
||||
size_t rid[K];
|
||||
GHistIndexRow row[K];
|
||||
const uint32_t* p[K];
|
||||
const size_t rest = nrows % kUnroll;
|
||||
#pragma omp parallel for num_threads(nthread_) schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) {
|
||||
size_t rid[kUnroll];
|
||||
GHistIndexRow row[kUnroll];
|
||||
const uint32_t* p[kUnroll];
|
||||
bst_uint tid = omp_get_thread_num();
|
||||
auto& left = row_split_tloc[tid].left;
|
||||
auto& right = row_split_tloc[tid].right;
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
rid[k] = rowset.begin[i + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
row[k] = gmat[rid[k]];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
p[k] = std::lower_bound(row[k].index, row[k].index + row[k].size, lower_bound);
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
if (p[k] != row[k].index + row[k].size && *p[k] < upper_bound) {
|
||||
CHECK_LT(*p[k],
|
||||
static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
|
||||
@@ -708,11 +710,11 @@ class FastHistMaker: public TreeUpdater {
|
||||
std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
|
||||
const size_t nrows = rowset.end - rowset.begin;
|
||||
|
||||
#pragma omp parallel num_threads(nthread)
|
||||
#pragma omp parallel num_threads(nthread_)
|
||||
{
|
||||
const size_t tid = static_cast<size_t>(omp_get_thread_num());
|
||||
const size_t ibegin = tid * nrows / nthread;
|
||||
const size_t iend = (tid + 1) * nrows / nthread;
|
||||
const auto tid = static_cast<size_t>(omp_get_thread_num());
|
||||
const size_t ibegin = tid * nrows / nthread_;
|
||||
const size_t iend = (tid + 1) * nrows / nthread_;
|
||||
if (ibegin < iend) { // ensure that [ibegin, iend) is nonempty range
|
||||
// search first nonzero row with index >= rowset[ibegin]
|
||||
const size_t* p = std::lower_bound(column.row_ind,
|
||||
@@ -769,17 +771,17 @@ class FastHistMaker: public TreeUpdater {
|
||||
|
||||
inline void InitNewNode(int nid,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const std::vector<bst_gpair>& gpair,
|
||||
const std::vector<GradientPair>& gpair,
|
||||
const DMatrix& fmat,
|
||||
const RegTree& tree) {
|
||||
{
|
||||
snode.resize(tree.param.num_nodes, NodeEntry(param));
|
||||
snode_.resize(tree.param.num_nodes, NodeEntry(param_));
|
||||
constraints_.resize(tree.param.num_nodes);
|
||||
}
|
||||
|
||||
// setup constraints before calculating the weight
|
||||
{
|
||||
auto& stats = snode[nid].stats;
|
||||
auto& stats = snode_[nid].stats;
|
||||
if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
|
||||
/* specialized code for dense data
|
||||
For dense data (with no missing value),
|
||||
@@ -799,22 +801,22 @@ class FastHistMaker: public TreeUpdater {
|
||||
stats.Add(gpair[*it]);
|
||||
}
|
||||
}
|
||||
if (!tree[nid].is_root()) {
|
||||
const int pid = tree[nid].parent();
|
||||
constraints_[pid].SetChild(param, tree[pid].split_index(),
|
||||
snode[tree[pid].cleft()].stats,
|
||||
snode[tree[pid].cright()].stats,
|
||||
&constraints_[tree[pid].cleft()],
|
||||
&constraints_[tree[pid].cright()]);
|
||||
if (!tree[nid].IsRoot()) {
|
||||
const int pid = tree[nid].Parent();
|
||||
constraints_[pid].SetChild(param_, tree[pid].SplitIndex(),
|
||||
snode_[tree[pid].LeftChild()].stats,
|
||||
snode_[tree[pid].RightChild()].stats,
|
||||
&constraints_[tree[pid].LeftChild()],
|
||||
&constraints_[tree[pid].RightChild()]);
|
||||
}
|
||||
}
|
||||
|
||||
// calculating the weights
|
||||
{
|
||||
snode[nid].root_gain = static_cast<float>(
|
||||
constraints_[nid].CalcGain(param, snode[nid].stats));
|
||||
snode[nid].weight = static_cast<float>(
|
||||
constraints_[nid].CalcWeight(param, snode[nid].stats));
|
||||
snode_[nid].root_gain = static_cast<float>(
|
||||
constraints_[nid].CalcGain(param_, snode_[nid].stats));
|
||||
snode_[nid].weight = static_cast<float>(
|
||||
constraints_[nid].CalcWeight(param_, snode_[nid].stats));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -834,8 +836,8 @@ class FastHistMaker: public TreeUpdater {
|
||||
const std::vector<bst_float>& cut_val = gmat.cut->cut;
|
||||
|
||||
// statistics on both sides of split
|
||||
TStats c(param);
|
||||
TStats e(param);
|
||||
TStats c(param_);
|
||||
TStats e(param_);
|
||||
// best split so far
|
||||
SplitEntry best;
|
||||
|
||||
@@ -846,7 +848,7 @@ class FastHistMaker: public TreeUpdater {
|
||||
static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
|
||||
// imin: index (offset) of the minimum value for feature fid
|
||||
// need this for backward enumeration
|
||||
const int32_t imin = static_cast<int32_t>(cut_ptr[fid]);
|
||||
const auto imin = static_cast<int32_t>(cut_ptr[fid]);
|
||||
// ibegin, iend: smallest/largest cut points for feature fid
|
||||
// use int to allow for value -1
|
||||
int32_t ibegin, iend;
|
||||
@@ -862,21 +864,21 @@ class FastHistMaker: public TreeUpdater {
|
||||
// start working
|
||||
// try to find a split
|
||||
e.Add(hist.begin[i].sum_grad, hist.begin[i].sum_hess);
|
||||
if (e.sum_hess >= param.min_child_weight) {
|
||||
if (e.sum_hess >= param_.min_child_weight) {
|
||||
c.SetSubstract(snode.stats, e);
|
||||
if (c.sum_hess >= param.min_child_weight) {
|
||||
if (c.sum_hess >= param_.min_child_weight) {
|
||||
bst_float loss_chg;
|
||||
bst_float split_pt;
|
||||
if (d_step > 0) {
|
||||
// forward enumeration: split at right bound of each bin
|
||||
loss_chg = static_cast<bst_float>(
|
||||
constraint.CalcSplitGain(param, param.monotone_constraints[fid], e, c) -
|
||||
constraint.CalcSplitGain(param_, param_.monotone_constraints[fid], e, c) -
|
||||
snode.root_gain);
|
||||
split_pt = cut_val[i];
|
||||
} else {
|
||||
// backward enumeration: split at left bound of each bin
|
||||
loss_chg = static_cast<bst_float>(
|
||||
constraint.CalcSplitGain(param, param.monotone_constraints[fid], c, e) -
|
||||
constraint.CalcSplitGain(param_, param_.monotone_constraints[fid], c, e) -
|
||||
snode.root_gain);
|
||||
if (i == imin) {
|
||||
// for leftmost bin, left bound is the smallest feature value
|
||||
@@ -901,14 +903,14 @@ class FastHistMaker: public TreeUpdater {
|
||||
ExpandEntry(int nid, int depth, bst_float loss_chg, unsigned tstmp)
|
||||
: nid(nid), depth(depth), loss_chg(loss_chg), timestamp(tstmp) {}
|
||||
};
|
||||
// Ordering predicate over ExpandEntry: compares by depth first and falls back
// to timestamp when the depths are equal. Returns true when lhs has the larger
// depth (or, on a depth tie, the larger timestamp).
// NOTE(review): presumably installed as the comparator of ExpandQueue, where a
// "greater-than" predicate makes the smallest entry surface first - confirm at
// the site that constructs qexpand_.
inline static bool depth_wise(ExpandEntry lhs, ExpandEntry rhs) {
|
||||
inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) {
|
||||
if (lhs.depth == rhs.depth) {
|
||||
return lhs.timestamp > rhs.timestamp;  // favor small timestamp
|
||||
} else {
|
||||
return lhs.depth > rhs.depth;  // favor small depth
|
||||
}
|
||||
}
|
||||
inline static bool loss_guide(ExpandEntry lhs, ExpandEntry rhs) {
|
||||
inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
|
||||
if (lhs.loss_chg == rhs.loss_chg) {
|
||||
return lhs.timestamp > rhs.timestamp; // favor small timestamp
|
||||
} else {
|
||||
@@ -917,19 +919,19 @@ class FastHistMaker: public TreeUpdater {
|
||||
}
|
||||
|
||||
// --data fields--
|
||||
const TrainParam& param;
|
||||
const FastHistParam& fhparam;
|
||||
const TrainParam& param_;
|
||||
const FastHistParam& fhparam_;
|
||||
// number of omp thread used during training
|
||||
int nthread;
|
||||
int nthread_;
|
||||
// Per feature: shuffle index of each feature index
|
||||
std::vector<bst_uint> feat_index;
|
||||
std::vector<bst_uint> feat_index_;
|
||||
// the internal row sets
|
||||
RowSetCollection row_set_collection_;
|
||||
// the temp space for split
|
||||
std::vector<RowSetCollection::Split> row_split_tloc_;
|
||||
std::vector<SplitEntry> best_split_tloc_;
|
||||
/*! \brief TreeNode Data: statistics for each constructed node */
|
||||
std::vector<NodeEntry> snode;
|
||||
std::vector<NodeEntry> snode_;
|
||||
/*! \brief cumulative histogram of gradients. */
|
||||
HistCollection hist_;
|
||||
/*! \brief feature with least # of bins. to be used for dense specialization
|
||||
@@ -948,9 +950,9 @@ class FastHistMaker: public TreeUpdater {
|
||||
// constraint value
|
||||
std::vector<TConstraint> constraints_;
|
||||
|
||||
// Priority queue of ExpandEntry stored in a std::vector. The ordering predicate
// is a std::function, so the comparison to use is supplied when the queue is
// constructed rather than fixed in the type.
typedef std::priority_queue<ExpandEntry,
|
||||
std::vector<ExpandEntry>,
|
||||
std::function<bool(ExpandEntry, ExpandEntry)>> ExpandQueue;
|
||||
using ExpandQueue =
|
||||
std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
|
||||
std::function<bool(ExpandEntry, ExpandEntry)>>;
|
||||
std::unique_ptr<ExpandQueue> qexpand_;
|
||||
|
||||
enum DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData };
|
||||
@@ -964,14 +966,14 @@ class FastHistMaker: public TreeUpdater {
|
||||
// simple switch to defer implementation.
|
||||
class FastHistTreeUpdaterSwitch : public TreeUpdater {
|
||||
public:
|
||||
FastHistTreeUpdaterSwitch() : monotone_(false) {}
|
||||
FastHistTreeUpdaterSwitch() = default;
|
||||
void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
|
||||
for (auto &kv : args) {
|
||||
if (kv.first == "monotone_constraints" && kv.second.length() != 0) {
|
||||
monotone_ = true;
|
||||
}
|
||||
}
|
||||
if (inner_.get() == nullptr) {
|
||||
if (inner_ == nullptr) {
|
||||
if (monotone_) {
|
||||
inner_.reset(new FastHistMaker<GradStats, ValueConstraint>());
|
||||
} else {
|
||||
@@ -982,7 +984,7 @@ class FastHistTreeUpdaterSwitch : public TreeUpdater {
|
||||
inner_->Init(args);
|
||||
}
|
||||
|
||||
void Update(HostDeviceVector<bst_gpair>* gpair,
|
||||
void Update(HostDeviceVector<GradientPair>* gpair,
|
||||
DMatrix* data,
|
||||
const std::vector<RegTree*>& trees) override {
|
||||
CHECK(inner_ != nullptr);
|
||||
@@ -991,7 +993,7 @@ class FastHistTreeUpdaterSwitch : public TreeUpdater {
|
||||
|
||||
private:
|
||||
// monotone constraints
|
||||
bool monotone_;
|
||||
bool monotone_{false};
|
||||
// internal implementation
|
||||
std::unique_ptr<TreeUpdater> inner_;
|
||||
};
|
||||
|
||||
@@ -22,25 +22,25 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu);
|
||||
* @return the uniq key
|
||||
*/
|
||||
|
||||
// Maps element tid to a key that combines its node id (taken relative to
// nodeStart) with its column id, using nKeys as the per-column stride.
static HOST_DEV_INLINE node_id_t abs2uniqKey(int tid, const node_id_t* abs,
|
||||
static HOST_DEV_INLINE NodeIdT abs2uniqKey(int tid, const NodeIdT* abs,
|
||||
const int* colIds,
|
||||
node_id_t nodeStart, int nKeys) {
|
||||
NodeIdT nodeStart, int nKeys) {
|
||||
// node id recorded for this element
|
||||
int a = abs[tid];
|
||||
if (a == UNUSED_NODE) return a;
|
||||
// the unused-node sentinel is passed through unchanged, not remapped
|
||||
if (a == kUnusedNode) return a;
|
||||
// offset of the node from nodeStart, plus colIds[tid] * nKeys
|
||||
return ((a - nodeStart) + (colIds[tid] * nKeys));
|
||||
}
|
||||
|
||||
/**
|
||||
* @struct Pair
|
||||
* @brief Pair used for key based scan operations on bst_gpair
|
||||
* @brief Pair used for key based scan operations on GradientPair
|
||||
*/
|
||||
// Element type handed to cub::BlockScan by the scan-by-key kernels below.
struct Pair {
|
||||
// scan key of the element; cubScanByKeyL1 fills it from abs2uniqKey() or
// with the "none" key for out-of-range threads
int key;
|
||||
bst_gpair value;
|
||||
// gradient value carried alongside the key
GradientPair value;
|
||||
};
|
||||
|
||||
/** define a key that's not used at all in the entire boosting process */
|
||||
static const int NONE_KEY = -100;
|
||||
static const int kNoneKey = -100;
|
||||
|
||||
/**
|
||||
* @brief Allocate temporary buffers needed for scan operations
|
||||
@@ -49,9 +49,9 @@ static const int NONE_KEY = -100;
|
||||
* @param size number of elements that will be scanned
|
||||
*/
|
||||
// Returns the result of dividing `size` by the block dimension BLKDIM_L1L3 via
// dh::DivRoundUp. reduceScanByKey computes its block count with the same
// expression.
// NOTE(review): assumes dh::DivRoundUp is a ceiling division, i.e. the result
// is the number of BLKDIM_L1L3-sized blocks needed to cover `size` - confirm.
template <int BLKDIM_L1L3 = 256>
|
||||
int scanTempBufferSize(int size) {
|
||||
int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
|
||||
return nBlks;
|
||||
int ScanTempBufferSize(int size) {
|
||||
int num_blocks = dh::DivRoundUp(size, BLKDIM_L1L3);
|
||||
return num_blocks;
|
||||
}
|
||||
|
||||
struct AddByKey {
|
||||
@@ -76,21 +76,21 @@ struct AddByKey {
|
||||
* @param instIds instance index buffer
|
||||
* @return the expected gradient value
|
||||
*/
|
||||
// Indirect lookup: `id` is first translated through instIds, and the
// translated index is then used to read from vals.
HOST_DEV_INLINE bst_gpair get(int id, const bst_gpair* vals,
|
||||
HOST_DEV_INLINE GradientPair get(int id, const GradientPair* vals,
|
||||
const int* instIds) {
|
||||
// replace the position with the index stored for it in instIds
id = instIds[id];
|
||||
return vals[id];
|
||||
}
|
||||
|
||||
template <int BLKDIM_L1L3>
|
||||
__global__ void cubScanByKeyL1(bst_gpair* scans, const bst_gpair* vals,
|
||||
const int* instIds, bst_gpair* mScans,
|
||||
int* mKeys, const node_id_t* keys, int nUniqKeys,
|
||||
const int* colIds, node_id_t nodeStart,
|
||||
__global__ void cubScanByKeyL1(GradientPair* scans, const GradientPair* vals,
|
||||
const int* instIds, GradientPair* mScans,
|
||||
int* mKeys, const NodeIdT* keys, int nUniqKeys,
|
||||
const int* colIds, NodeIdT nodeStart,
|
||||
const int size) {
|
||||
Pair rootPair = {NONE_KEY, bst_gpair(0.f, 0.f)};
|
||||
Pair rootPair = {kNoneKey, GradientPair(0.f, 0.f)};
|
||||
int myKey;
|
||||
bst_gpair myValue;
|
||||
GradientPair myValue;
|
||||
typedef cub::BlockScan<Pair, BLKDIM_L1L3> BlockScan;
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
Pair threadData;
|
||||
@@ -99,8 +99,8 @@ __global__ void cubScanByKeyL1(bst_gpair* scans, const bst_gpair* vals,
|
||||
myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
|
||||
myValue = get(tid, vals, instIds);
|
||||
} else {
|
||||
myKey = NONE_KEY;
|
||||
myValue = 0.f;
|
||||
myKey = kNoneKey;
|
||||
myValue = {};
|
||||
}
|
||||
threadData.key = myKey;
|
||||
threadData.value = myValue;
|
||||
@@ -119,14 +119,14 @@ __global__ void cubScanByKeyL1(bst_gpair* scans, const bst_gpair* vals,
|
||||
}
|
||||
if (threadIdx.x == BLKDIM_L1L3 - 1) {
|
||||
threadData.value =
|
||||
(myKey == previousKey) ? threadData.value : bst_gpair(0.0f, 0.0f);
|
||||
(myKey == previousKey) ? threadData.value : GradientPair(0.0f, 0.0f);
|
||||
mKeys[blockIdx.x] = myKey;
|
||||
mScans[blockIdx.x] = threadData.value + myValue;
|
||||
}
|
||||
}
|
||||
|
||||
template <int BLKSIZE>
|
||||
__global__ void cubScanByKeyL2(bst_gpair* mScans, int* mKeys, int mLength) {
|
||||
__global__ void cubScanByKeyL2(GradientPair* mScans, int* mKeys, int mLength) {
|
||||
typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
|
||||
Pair threadData;
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
@@ -140,31 +140,31 @@ __global__ void cubScanByKeyL2(bst_gpair* mScans, int* mKeys, int mLength) {
|
||||
}
|
||||
|
||||
template <int BLKDIM_L1L3>
|
||||
__global__ void cubScanByKeyL3(bst_gpair* sums, bst_gpair* scans,
|
||||
const bst_gpair* vals, const int* instIds,
|
||||
const bst_gpair* mScans, const int* mKeys,
|
||||
const node_id_t* keys, int nUniqKeys,
|
||||
const int* colIds, node_id_t nodeStart,
|
||||
__global__ void cubScanByKeyL3(GradientPair* sums, GradientPair* scans,
|
||||
const GradientPair* vals, const int* instIds,
|
||||
const GradientPair* mScans, const int* mKeys,
|
||||
const NodeIdT* keys, int nUniqKeys,
|
||||
const int* colIds, NodeIdT nodeStart,
|
||||
const int size) {
|
||||
int relId = threadIdx.x;
|
||||
int tid = (blockIdx.x * BLKDIM_L1L3) + relId;
|
||||
// to avoid the following warning from nvcc:
|
||||
// __shared__ memory variable with non-empty constructor or destructor
|
||||
// (potential race between threads)
|
||||
__shared__ char gradBuff[sizeof(bst_gpair)];
|
||||
__shared__ char gradBuff[sizeof(GradientPair)];
|
||||
__shared__ int s_mKeys;
|
||||
bst_gpair* s_mScans = reinterpret_cast<bst_gpair*>(gradBuff);
|
||||
GradientPair* s_mScans = reinterpret_cast<GradientPair*>(gradBuff);
|
||||
if (tid >= size) return;
|
||||
// cache block-wide partial scan info
|
||||
if (relId == 0) {
|
||||
s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : NONE_KEY;
|
||||
s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : bst_gpair();
|
||||
s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : kNoneKey;
|
||||
s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : GradientPair();
|
||||
}
|
||||
int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
|
||||
int previousKey =
|
||||
tid == 0 ? NONE_KEY
|
||||
tid == 0 ? kNoneKey
|
||||
: abs2uniqKey(tid - 1, keys, colIds, nodeStart, nUniqKeys);
|
||||
bst_gpair myValue = scans[tid];
|
||||
GradientPair myValue = scans[tid];
|
||||
__syncthreads();
|
||||
if (blockIdx.x > 0 && s_mKeys == previousKey) {
|
||||
myValue += s_mScans[0];
|
||||
@@ -174,7 +174,7 @@ __global__ void cubScanByKeyL3(bst_gpair* sums, bst_gpair* scans,
|
||||
}
|
||||
if ((previousKey != myKey) && (previousKey >= 0)) {
|
||||
sums[previousKey] = myValue;
|
||||
myValue = bst_gpair(0.0f, 0.0f);
|
||||
myValue = GradientPair(0.0f, 0.0f);
|
||||
}
|
||||
scans[tid] = myValue;
|
||||
}
|
||||
@@ -200,12 +200,12 @@ __global__ void cubScanByKeyL3(bst_gpair* sums, bst_gpair* scans,
|
||||
* @param nodeStart index of the leftmost node in the current level
|
||||
*/
|
||||
template <int BLKDIM_L1L3 = 256, int BLKDIM_L2 = 512>
|
||||
void reduceScanByKey(bst_gpair* sums, bst_gpair* scans, const bst_gpair* vals,
|
||||
const int* instIds, const node_id_t* keys, int size,
|
||||
int nUniqKeys, int nCols, bst_gpair* tmpScans,
|
||||
int* tmpKeys, const int* colIds, node_id_t nodeStart) {
|
||||
int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
|
||||
cudaMemset(sums, 0, nUniqKeys * nCols * sizeof(bst_gpair));
|
||||
void reduceScanByKey(GradientPair* sums, GradientPair* scans, const GradientPair* vals,
|
||||
const int* instIds, const NodeIdT* keys, int size,
|
||||
int nUniqKeys, int nCols, GradientPair* tmpScans,
|
||||
int* tmpKeys, const int* colIds, NodeIdT nodeStart) {
|
||||
int nBlks = dh::DivRoundUp(size, BLKDIM_L1L3);
|
||||
cudaMemset(sums, 0, nUniqKeys * nCols * sizeof(GradientPair));
|
||||
cubScanByKeyL1<BLKDIM_L1L3>
|
||||
<<<nBlks, BLKDIM_L1L3>>>(scans, vals, instIds, tmpScans, tmpKeys, keys,
|
||||
nUniqKeys, colIds, nodeStart, size);
|
||||
@@ -243,13 +243,13 @@ struct ExactSplitCandidate {
|
||||
*/
|
||||
enum ArgMaxByKeyAlgo {
|
||||
/** simplest, use gmem-atomics for all updates */
|
||||
ABK_GMEM = 0,
|
||||
kAbkGmem = 0,
|
||||
/** use smem-atomics for updates (when number of keys are less) */
|
||||
ABK_SMEM
|
||||
kAbkSmem
|
||||
};
|
||||
|
||||
/** max depth until which to use shared mem based atomics for argmax */
|
||||
static const int MAX_ABK_LEVELS = 3;
|
||||
static const int kMaxAbkLevels = 3;
|
||||
|
||||
HOST_DEV_INLINE ExactSplitCandidate maxSplit(ExactSplitCandidate a,
|
||||
ExactSplitCandidate b) {
|
||||
@@ -281,27 +281,27 @@ DEV_INLINE void atomicArgMax(ExactSplitCandidate* address,
|
||||
}
|
||||
|
||||
DEV_INLINE void argMaxWithAtomics(
|
||||
int id, ExactSplitCandidate* nodeSplits, const bst_gpair* gradScans,
|
||||
const bst_gpair* gradSums, const float* vals, const int* colIds,
|
||||
const node_id_t* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len, const GPUTrainingParam& param) {
|
||||
int id, ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
|
||||
const GradientPair* gradSums, const float* vals, const int* colIds,
|
||||
const NodeIdT* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
|
||||
NodeIdT nodeStart, int len, const GPUTrainingParam& param) {
|
||||
int nodeId = nodeAssigns[id];
|
||||
// @todo: this is really a bad check! but will be fixed when we move
|
||||
// to key-based reduction
|
||||
if ((id == 0) ||
|
||||
!((nodeId == nodeAssigns[id - 1]) && (colIds[id] == colIds[id - 1]) &&
|
||||
(vals[id] == vals[id - 1]))) {
|
||||
if (nodeId != UNUSED_NODE) {
|
||||
if (nodeId != kUnusedNode) {
|
||||
int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
|
||||
bst_gpair colSum = gradSums[sumId];
|
||||
GradientPair colSum = gradSums[sumId];
|
||||
int uid = nodeId - nodeStart;
|
||||
DeviceNodeStats n = nodes[nodeId];
|
||||
bst_gpair parentSum = n.sum_gradients;
|
||||
GradientPair parentSum = n.sum_gradients;
|
||||
float parentGain = n.root_gain;
|
||||
bool tmp;
|
||||
ExactSplitCandidate s;
|
||||
bst_gpair missing = parentSum - colSum;
|
||||
s.score = loss_chg_missing(gradScans[id], missing, parentSum, parentGain,
|
||||
GradientPair missing = parentSum - colSum;
|
||||
s.score = LossChangeMissing(gradScans[id], missing, parentSum, parentGain,
|
||||
param, tmp);
|
||||
s.index = id;
|
||||
atomicArgMax(nodeSplits + uid, s);
|
||||
@@ -310,10 +310,10 @@ DEV_INLINE void argMaxWithAtomics(
|
||||
}
|
||||
|
||||
__global__ void atomicArgMaxByKeyGmem(
|
||||
ExactSplitCandidate* nodeSplits, const bst_gpair* gradScans,
|
||||
const bst_gpair* gradSums, const float* vals, const int* colIds,
|
||||
const node_id_t* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len, const TrainParam param) {
|
||||
ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
|
||||
const GradientPair* gradSums, const float* vals, const int* colIds,
|
||||
const NodeIdT* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
|
||||
NodeIdT nodeStart, int len, const TrainParam param) {
|
||||
int id = threadIdx.x + (blockIdx.x * blockDim.x);
|
||||
const int stride = blockDim.x * gridDim.x;
|
||||
for (; id < len; id += stride) {
|
||||
@@ -324,10 +324,10 @@ __global__ void atomicArgMaxByKeyGmem(
|
||||
}
|
||||
|
||||
__global__ void atomicArgMaxByKeySmem(
|
||||
ExactSplitCandidate* nodeSplits, const bst_gpair* gradScans,
|
||||
const bst_gpair* gradSums, const float* vals, const int* colIds,
|
||||
const node_id_t* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len, const TrainParam param) {
|
||||
ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
|
||||
const GradientPair* gradSums, const float* vals, const int* colIds,
|
||||
const NodeIdT* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
|
||||
NodeIdT nodeStart, int len, const GPUTrainingParam param) {
|
||||
extern __shared__ char sArr[];
|
||||
ExactSplitCandidate* sNodeSplits =
|
||||
reinterpret_cast<ExactSplitCandidate*>(sArr);
|
||||
@@ -368,27 +368,27 @@ __global__ void atomicArgMaxByKeySmem(
|
||||
* @param algo which algorithm to use for argmax_by_key
|
||||
*/
|
||||
template <int BLKDIM = 256, int ITEMS_PER_THREAD = 4>
|
||||
void argMaxByKey(ExactSplitCandidate* nodeSplits, const bst_gpair* gradScans,
|
||||
const bst_gpair* gradSums, const float* vals,
|
||||
const int* colIds, const node_id_t* nodeAssigns,
|
||||
void argMaxByKey(ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
|
||||
const GradientPair* gradSums, const float* vals,
|
||||
const int* colIds, const NodeIdT* nodeAssigns,
|
||||
const DeviceNodeStats* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len, const TrainParam param,
|
||||
NodeIdT nodeStart, int len, const TrainParam param,
|
||||
ArgMaxByKeyAlgo algo) {
|
||||
dh::fillConst<ExactSplitCandidate, BLKDIM, ITEMS_PER_THREAD>(
|
||||
dh::get_device_idx(param.gpu_id), nodeSplits, nUniqKeys,
|
||||
dh::FillConst<ExactSplitCandidate, BLKDIM, ITEMS_PER_THREAD>(
|
||||
dh::GetDeviceIdx(param.gpu_id), nodeSplits, nUniqKeys,
|
||||
ExactSplitCandidate());
|
||||
int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD * BLKDIM);
|
||||
int nBlks = dh::DivRoundUp(len, ITEMS_PER_THREAD * BLKDIM);
|
||||
switch (algo) {
|
||||
case ABK_GMEM:
|
||||
case kAbkGmem:
|
||||
atomicArgMaxByKeyGmem<<<nBlks, BLKDIM>>>(
|
||||
nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
|
||||
nUniqKeys, nodeStart, len, param);
|
||||
break;
|
||||
case ABK_SMEM:
|
||||
case kAbkSmem:
|
||||
atomicArgMaxByKeySmem<<<nBlks, BLKDIM,
|
||||
sizeof(ExactSplitCandidate) * nUniqKeys>>>(
|
||||
nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
|
||||
nUniqKeys, nodeStart, len, param);
|
||||
nUniqKeys, nodeStart, len, GPUTrainingParam(param));
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("argMaxByKey: Bad algo passed!");
|
||||
@@ -404,22 +404,22 @@ __global__ void assignColIds(int* colIds, const int* colOffsets) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
|
||||
__global__ void fillDefaultNodeIds(NodeIdT* nodeIdsPerInst,
|
||||
const DeviceNodeStats* nodes, int nRows) {
|
||||
int id = threadIdx.x + (blockIdx.x * blockDim.x);
|
||||
if (id >= nRows) {
|
||||
return;
|
||||
}
|
||||
// if this element belongs to none of the currently active node-id's
|
||||
node_id_t nId = nodeIdsPerInst[id];
|
||||
if (nId == UNUSED_NODE) {
|
||||
NodeIdT nId = nodeIdsPerInst[id];
|
||||
if (nId == kUnusedNode) {
|
||||
return;
|
||||
}
|
||||
const DeviceNodeStats n = nodes[nId];
|
||||
node_id_t result;
|
||||
NodeIdT result;
|
||||
if (n.IsLeaf() || n.IsUnused()) {
|
||||
result = UNUSED_NODE;
|
||||
} else if (n.dir == LeftDir) {
|
||||
result = kUnusedNode;
|
||||
} else if (n.dir == kLeftDir) {
|
||||
result = (2 * n.idx) + 1;
|
||||
} else {
|
||||
result = (2 * n.idx) + 2;
|
||||
@@ -427,8 +427,8 @@ __global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
|
||||
nodeIdsPerInst[id] = result;
|
||||
}
|
||||
|
||||
__global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
|
||||
const node_id_t* nodeIds, const int* instId,
|
||||
__global__ void assignNodeIds(NodeIdT* nodeIdsPerInst, int* nodeLocations,
|
||||
const NodeIdT* nodeIds, const int* instId,
|
||||
const DeviceNodeStats* nodes,
|
||||
const int* colOffsets, const float* vals,
|
||||
int nVals, int nCols) {
|
||||
@@ -441,7 +441,7 @@ __global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
|
||||
// the nodeIdsPerInst with all default assignments
|
||||
int nId = nodeIds[id];
|
||||
// if this element belongs to none of the currently active node-id's
|
||||
if (nId != UNUSED_NODE) {
|
||||
if (nId != kUnusedNode) {
|
||||
const DeviceNodeStats n = nodes[nId];
|
||||
int colId = n.fidx;
|
||||
// printf("nid=%d colId=%d id=%d\n", nId, colId, id);
|
||||
@@ -449,7 +449,7 @@ __global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
|
||||
int end = colOffsets[colId + 1];
|
||||
// @todo: too much wasteful threads!!
|
||||
if ((id >= start) && (id < end) && !(n.IsLeaf() || n.IsUnused())) {
|
||||
node_id_t result = (2 * n.idx) + 1 + (vals[id] >= n.fvalue);
|
||||
NodeIdT result = (2 * n.idx) + 1 + (vals[id] >= n.fvalue);
|
||||
nodeIdsPerInst[instId[id]] = result;
|
||||
}
|
||||
}
|
||||
@@ -475,31 +475,31 @@ class GPUMaker : public TreeUpdater {
|
||||
/** whether we have initialized memory already (so as not to repeat!) */
|
||||
bool allocated;
|
||||
/** feature values stored in column-major compressed format */
|
||||
dh::dvec2<float> vals;
|
||||
dh::dvec<float> vals_cached;
|
||||
dh::DVec2<float> vals;
|
||||
dh::DVec<float> vals_cached;
|
||||
/** corresponding instance id's of these feature values */
|
||||
dh::dvec2<int> instIds;
|
||||
dh::dvec<int> instIds_cached;
|
||||
dh::DVec2<int> instIds;
|
||||
dh::DVec<int> instIds_cached;
|
||||
/** column offsets for these feature values */
|
||||
dh::dvec<int> colOffsets;
|
||||
dh::dvec<bst_gpair> gradsInst;
|
||||
dh::dvec2<node_id_t> nodeAssigns;
|
||||
dh::dvec2<int> nodeLocations;
|
||||
dh::dvec<DeviceNodeStats> nodes;
|
||||
dh::dvec<node_id_t> nodeAssignsPerInst;
|
||||
dh::dvec<bst_gpair> gradSums;
|
||||
dh::dvec<bst_gpair> gradScans;
|
||||
dh::dvec<ExactSplitCandidate> nodeSplits;
|
||||
dh::DVec<int> colOffsets;
|
||||
dh::DVec<GradientPair> gradsInst;
|
||||
dh::DVec2<NodeIdT> nodeAssigns;
|
||||
dh::DVec2<int> nodeLocations;
|
||||
dh::DVec<DeviceNodeStats> nodes;
|
||||
dh::DVec<NodeIdT> nodeAssignsPerInst;
|
||||
dh::DVec<GradientPair> gradSums;
|
||||
dh::DVec<GradientPair> gradScans;
|
||||
dh::DVec<ExactSplitCandidate> nodeSplits;
|
||||
int nVals;
|
||||
int nRows;
|
||||
int nCols;
|
||||
int maxNodes;
|
||||
int maxLeaves;
|
||||
dh::CubMemory tmp_mem;
|
||||
dh::dvec<bst_gpair> tmpScanGradBuff;
|
||||
dh::dvec<int> tmpScanKeyBuff;
|
||||
dh::dvec<int> colIds;
|
||||
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
|
||||
dh::DVec<GradientPair> tmpScanGradBuff;
|
||||
dh::DVec<int> tmpScanKeyBuff;
|
||||
dh::DVec<int> colIds;
|
||||
dh::BulkAllocator<dh::MemoryType::kDevice> ba;
|
||||
|
||||
public:
|
||||
GPUMaker() : allocated(false) {}
|
||||
@@ -512,9 +512,9 @@ class GPUMaker : public TreeUpdater {
|
||||
maxLeaves = 1 << param.max_depth;
|
||||
}
|
||||
|
||||
void Update(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
|
||||
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
|
||||
const std::vector<RegTree*>& trees) override {
|
||||
GradStats::CheckInfo(dmat->info());
|
||||
GradStats::CheckInfo(dmat->Info());
|
||||
// rescale learning rate according to size of trees
|
||||
float lr = param.learning_rate;
|
||||
param.learning_rate = lr / trees.size();
|
||||
@@ -530,7 +530,7 @@ class GPUMaker : public TreeUpdater {
|
||||
param.learning_rate = lr;
|
||||
}
|
||||
/// @note: Update should be only after Init!!
|
||||
void UpdateTree(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
|
||||
RegTree* hTree) {
|
||||
if (!allocated) {
|
||||
setupOneTimeData(dmat);
|
||||
@@ -538,33 +538,33 @@ class GPUMaker : public TreeUpdater {
|
||||
for (int i = 0; i < param.max_depth; ++i) {
|
||||
if (i == 0) {
|
||||
// make sure to start on a fresh tree with sorted values!
|
||||
vals.current_dvec() = vals_cached;
|
||||
instIds.current_dvec() = instIds_cached;
|
||||
vals.CurrentDVec() = vals_cached;
|
||||
instIds.CurrentDVec() = instIds_cached;
|
||||
transferGrads(gpair);
|
||||
}
|
||||
int nNodes = 1 << i;
|
||||
node_id_t nodeStart = nNodes - 1;
|
||||
NodeIdT nodeStart = nNodes - 1;
|
||||
initNodeData(i, nodeStart, nNodes);
|
||||
findSplit(i, nodeStart, nNodes);
|
||||
}
|
||||
// mark all the used nodes with unused children as leaf nodes
|
||||
markLeaves();
|
||||
dense2sparse_tree(hTree, nodes, param);
|
||||
Dense2SparseTree(hTree, nodes, param);
|
||||
}
|
||||
|
||||
void split2node(int nNodes, node_id_t nodeStart) {
|
||||
auto d_nodes = nodes.data();
|
||||
auto d_gradScans = gradScans.data();
|
||||
auto d_gradSums = gradSums.data();
|
||||
auto d_nodeAssigns = nodeAssigns.current();
|
||||
auto d_colIds = colIds.data();
|
||||
auto d_vals = vals.current();
|
||||
auto d_nodeSplits = nodeSplits.data();
|
||||
void split2node(int nNodes, NodeIdT nodeStart) {
|
||||
auto d_nodes = nodes.Data();
|
||||
auto d_gradScans = gradScans.Data();
|
||||
auto d_gradSums = gradSums.Data();
|
||||
auto d_nodeAssigns = nodeAssigns.Current();
|
||||
auto d_colIds = colIds.Data();
|
||||
auto d_vals = vals.Current();
|
||||
auto d_nodeSplits = nodeSplits.Data();
|
||||
int nUniqKeys = nNodes;
|
||||
float min_split_loss = param.min_split_loss;
|
||||
auto gpu_param = GPUTrainingParam(param);
|
||||
|
||||
dh::launch_n(param.gpu_id, nNodes, [=] __device__(int uid) {
|
||||
dh::LaunchN(param.gpu_id, nNodes, [=] __device__(int uid) {
|
||||
int absNodeId = uid + nodeStart;
|
||||
ExactSplitCandidate s = d_nodeSplits[uid];
|
||||
if (s.isSplittable(min_split_loss)) {
|
||||
@@ -573,26 +573,26 @@ class GPUMaker : public TreeUpdater {
|
||||
abs2uniqKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys);
|
||||
bool missingLeft = true;
|
||||
const DeviceNodeStats& n = d_nodes[absNodeId];
|
||||
bst_gpair gradScan = d_gradScans[idx];
|
||||
bst_gpair gradSum = d_gradSums[nodeInstId];
|
||||
GradientPair gradScan = d_gradScans[idx];
|
||||
GradientPair gradSum = d_gradSums[nodeInstId];
|
||||
float thresh = d_vals[idx];
|
||||
int colId = d_colIds[idx];
|
||||
// get the default direction for the current node
|
||||
bst_gpair missing = n.sum_gradients - gradSum;
|
||||
loss_chg_missing(gradScan, missing, n.sum_gradients, n.root_gain,
|
||||
GradientPair missing = n.sum_gradients - gradSum;
|
||||
LossChangeMissing(gradScan, missing, n.sum_gradients, n.root_gain,
|
||||
gpu_param, missingLeft);
|
||||
// get the score/weight/id/gradSum for left and right child nodes
|
||||
bst_gpair lGradSum = missingLeft ? gradScan + missing : gradScan;
|
||||
bst_gpair rGradSum = n.sum_gradients - lGradSum;
|
||||
GradientPair lGradSum = missingLeft ? gradScan + missing : gradScan;
|
||||
GradientPair rGradSum = n.sum_gradients - lGradSum;
|
||||
|
||||
// Create children
|
||||
d_nodes[left_child_nidx(absNodeId)] =
|
||||
DeviceNodeStats(lGradSum, left_child_nidx(absNodeId), gpu_param);
|
||||
d_nodes[right_child_nidx(absNodeId)] =
|
||||
DeviceNodeStats(rGradSum, right_child_nidx(absNodeId), gpu_param);
|
||||
d_nodes[LeftChildNodeIdx(absNodeId)] =
|
||||
DeviceNodeStats(lGradSum, LeftChildNodeIdx(absNodeId), gpu_param);
|
||||
d_nodes[RightChildNodeIdx(absNodeId)] =
|
||||
DeviceNodeStats(rGradSum, RightChildNodeIdx(absNodeId), gpu_param);
|
||||
// Set split for parent
|
||||
d_nodes[absNodeId].SetSplit(thresh, colId,
|
||||
missingLeft ? LeftDir : RightDir, lGradSum,
|
||||
missingLeft ? kLeftDir : kRightDir, lGradSum,
|
||||
rGradSum);
|
||||
} else {
|
||||
// cannot be split further, so this node is a leaf!
|
||||
@@ -601,21 +601,21 @@ class GPUMaker : public TreeUpdater {
|
||||
});
|
||||
}
|
||||
|
||||
// Runs the split search for one tree level in three steps:
// reduceScanByKey -> argMaxByKey -> split2node.
//   level     - current depth; only used to choose the argmax algorithm
//   nodeStart - index of the leftmost node in the current level
//   nNodes    - node count for this level, passed as the number of unique keys
void findSplit(int level, node_id_t nodeStart, int nNodes) {
|
||||
reduceScanByKey(gradSums.data(), gradScans.data(), gradsInst.data(),
|
||||
instIds.current(), nodeAssigns.current(), nVals, nNodes,
|
||||
nCols, tmpScanGradBuff.data(), tmpScanKeyBuff.data(),
|
||||
colIds.data(), nodeStart);
|
||||
argMaxByKey(nodeSplits.data(), gradScans.data(), gradSums.data(),
|
||||
vals.current(), colIds.data(), nodeAssigns.current(),
|
||||
nodes.data(), nNodes, nodeStart, nVals, param,
|
||||
level <= MAX_ABK_LEVELS ? ABK_SMEM : ABK_GMEM);
|
||||
void findSplit(int level, NodeIdT nodeStart, int nNodes) {
|
||||
// 1) fill gradScans / gradSums from the per-instance gradients, keyed by
//    (node, column)
reduceScanByKey(gradSums.Data(), gradScans.Data(), gradsInst.Data(),
|
||||
instIds.Current(), nodeAssigns.Current(), nVals, nNodes,
|
||||
nCols, tmpScanGradBuff.Data(), tmpScanKeyBuff.Data(),
|
||||
colIds.Data(), nodeStart);
|
||||
// 2) select the best split candidate per node into nodeSplits; levels up to
//    kMaxAbkLevels use the shared-memory atomics variant, deeper levels the
//    global-memory one
argMaxByKey(nodeSplits.Data(), gradScans.Data(), gradSums.Data(),
|
||||
vals.Current(), colIds.Data(), nodeAssigns.Current(),
|
||||
nodes.Data(), nNodes, nodeStart, nVals, param,
|
||||
level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem);
|
||||
// 3) apply the candidates: create children or mark the node as a leaf
split2node(nNodes, nodeStart);
|
||||
}
|
||||
|
||||
void allocateAllData(int offsetSize) {
|
||||
int tmpBuffSize = scanTempBufferSize(nVals);
|
||||
ba.allocate(dh::get_device_idx(param.gpu_id), param.silent, &vals, nVals,
|
||||
int tmpBuffSize = ScanTempBufferSize(nVals);
|
||||
ba.Allocate(dh::GetDeviceIdx(param.gpu_id), param.silent, &vals, nVals,
|
||||
&vals_cached, nVals, &instIds, nVals, &instIds_cached, nVals,
|
||||
&colOffsets, offsetSize, &gradsInst, nRows, &nodeAssigns, nVals,
|
||||
&nodeLocations, nVals, &nodes, maxNodes, &nodeAssignsPerInst,
|
||||
@@ -625,7 +625,7 @@ class GPUMaker : public TreeUpdater {
|
||||
}
|
||||
|
||||
void setupOneTimeData(DMatrix* dmat) {
|
||||
size_t free_memory = dh::available_memory(dh::get_device_idx(param.gpu_id));
|
||||
size_t free_memory = dh::AvailableMemory(dh::GetDeviceIdx(param.gpu_id));
|
||||
if (!dmat->SingleColBlock()) {
|
||||
throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
|
||||
}
|
||||
@@ -640,11 +640,11 @@ class GPUMaker : public TreeUpdater {
|
||||
|
||||
void convertToCsc(DMatrix* dmat, std::vector<float>* fval,
|
||||
std::vector<int>* fId, std::vector<size_t>* offset) {
|
||||
MetaInfo info = dmat->info();
|
||||
CHECK(info.num_col < std::numeric_limits<int>::max());
|
||||
CHECK(info.num_row < std::numeric_limits<int>::max());
|
||||
nRows = static_cast<int>(info.num_row);
|
||||
nCols = static_cast<int>(info.num_col);
|
||||
MetaInfo info = dmat->Info();
|
||||
CHECK(info.num_col_ < std::numeric_limits<int>::max());
|
||||
CHECK(info.num_row_ < std::numeric_limits<int>::max());
|
||||
nRows = static_cast<int>(info.num_row_);
|
||||
nCols = static_cast<int>(info.num_col_);
|
||||
offset->reserve(nCols + 1);
|
||||
offset->push_back(0);
|
||||
fval->reserve(nCols * nRows);
|
||||
@@ -677,56 +677,56 @@ class GPUMaker : public TreeUpdater {
|
||||
// Loads the host-side column data into the device buffers, sorts it, keeps a
// copy of the sorted result and fills colIds.
//   fval   - feature values, assigned to the current buffer of vals
//   fId    - instance ids matching fval, assigned to the current buffer of instIds
//   offset - column offsets, assigned to colOffsets
void transferAndSortData(const std::vector<float>& fval,
|
||||
const std::vector<int>& fId,
|
||||
const std::vector<size_t>& offset) {
|
||||
vals.current_dvec() = fval;
|
||||
instIds.current_dvec() = fId;
|
||||
vals.CurrentDVec() = fval;
|
||||
instIds.CurrentDVec() = fId;
|
||||
colOffsets = offset;
|
||||
dh::segmentedSort<float, int>(&tmp_mem, &vals, &instIds, nVals, nCols,
|
||||
// NOTE(review): presumably sorts the values within each of the nCols column
// segments delimited by colOffsets, moving instIds along with them - confirm
// against dh::SegmentedSort.
dh::SegmentedSort<float, int>(&tmp_mem, &vals, &instIds, nVals, nCols,
|
||||
colOffsets);
|
||||
vals_cached = vals.current_dvec();
|
||||
instIds_cached = instIds.current_dvec();
|
||||
assignColIds<<<nCols, 512>>>(colIds.data(), colOffsets.data());
|
||||
// keep the sorted data; UpdateTree copies these back into vals / instIds at
// the first level of every tree
vals_cached = vals.CurrentDVec();
|
||||
instIds_cached = instIds.CurrentDVec();
|
||||
// one block of 512 threads per column
assignColIds<<<nCols, 512>>>(colIds.Data(), colOffsets.Data());
|
||||
}
|
||||
|
||||
// Copies nRows gradient pairs from gpair's device pointer for param.gpu_id
// into gradsInst, then reduces gradsInst into gradSums.
void transferGrads(HostDeviceVector<bst_gpair>* gpair) {
|
||||
void transferGrads(HostDeviceVector<GradientPair>* gpair) {
|
||||
// HACK
|
||||
dh::safe_cuda(cudaMemcpy(gradsInst.data(), gpair->ptr_d(param.gpu_id),
|
||||
sizeof(bst_gpair) * nRows,
|
||||
// cudaMemcpyDefault: the copy direction is inferred from the pointer values
dh::safe_cuda(cudaMemcpy(gradsInst.Data(), gpair->DevicePointer(param.gpu_id),
|
||||
sizeof(GradientPair) * nRows,
|
||||
cudaMemcpyDefault));
|
||||
// evaluate the full-grad reduction for the root node
|
||||
dh::sumReduction<bst_gpair>(tmp_mem, gradsInst, gradSums, nRows);
|
||||
dh::SumReduction<GradientPair>(tmp_mem, gradsInst, gradSums, nRows);
|
||||
}
|
||||
|
||||
// Prepares node and node-assignment state before the split search of `level`.
//   level == 0: resets nodes and both assignment buffers, then builds the root
//               node from gradSums[0].
//   level  > 0: recomputes the per-instance node assignment
//               (fillDefaultNodeIds, then assignNodeIds), gathers it back to
//               per-value order and calls sortKeys(level).
// NOTE(review): nodeStart and nNodes are not read anywhere in this body.
void initNodeData(int level, node_id_t nodeStart, int nNodes) {
|
||||
void initNodeData(int level, NodeIdT nodeStart, int nNodes) {
|
||||
// all instances belong to root node at the beginning!
|
||||
if (level == 0) {
|
||||
nodes.fill(DeviceNodeStats());
|
||||
nodeAssigns.current_dvec().fill(0);
|
||||
nodeAssignsPerInst.fill(0);
|
||||
nodes.Fill(DeviceNodeStats());
|
||||
nodeAssigns.CurrentDVec().Fill(0);
|
||||
nodeAssignsPerInst.Fill(0);
|
||||
// for root node, just update the gradient/score/weight/id info
|
||||
// before splitting it! Currently all data is on GPU, hence this
|
||||
// stupid little kernel
|
||||
auto d_nodes = nodes.data();
|
||||
auto d_sums = gradSums.data();
|
||||
auto d_nodes = nodes.Data();
|
||||
auto d_sums = gradSums.Data();
|
||||
auto gpu_params = GPUTrainingParam(param);
|
||||
dh::launch_n(param.gpu_id, 1, [=] __device__(int idx) {
|
||||
// launched over a single element; the lambda ignores idx and writes node 0
dh::LaunchN(param.gpu_id, 1, [=] __device__(int idx) {
|
||||
d_nodes[0] = DeviceNodeStats(d_sums[0], 0, gpu_params);
|
||||
});
|
||||
} else {
|
||||
const int BlkDim = 256;
|
||||
const int ItemsPerThread = 4;
|
||||
// assign default node ids first
|
||||
int nBlks = dh::div_round_up(nRows, BlkDim);
|
||||
fillDefaultNodeIds<<<nBlks, BlkDim>>>(nodeAssignsPerInst.data(),
|
||||
nodes.data(), nRows);
|
||||
int nBlks = dh::DivRoundUp(nRows, BlkDim);
|
||||
fillDefaultNodeIds<<<nBlks, BlkDim>>>(nodeAssignsPerInst.Data(),
|
||||
nodes.Data(), nRows);
|
||||
// evaluate the correct child indices of non-missing values next
|
||||
nBlks = dh::div_round_up(nVals, BlkDim * ItemsPerThread);
|
||||
// NOTE(review): the grid is sized for ItemsPerThread values per thread, so
// assignNodeIds is presumably expected to loop over several values per
// thread - confirm against the kernel body.
nBlks = dh::DivRoundUp(nVals, BlkDim * ItemsPerThread);
|
||||
assignNodeIds<<<nBlks, BlkDim>>>(
|
||||
nodeAssignsPerInst.data(), nodeLocations.current(),
|
||||
nodeAssigns.current(), instIds.current(), nodes.data(),
|
||||
colOffsets.data(), vals.current(), nVals, nCols);
|
||||
nodeAssignsPerInst.Data(), nodeLocations.Current(),
|
||||
nodeAssigns.Current(), instIds.Current(), nodes.Data(),
|
||||
colOffsets.Data(), vals.Current(), nVals, nCols);
|
||||
// gather the node assignments across all other columns too
|
||||
dh::gather(dh::get_device_idx(param.gpu_id), nodeAssigns.current(),
|
||||
nodeAssignsPerInst.data(), instIds.current(), nVals);
|
||||
dh::Gather(dh::GetDeviceIdx(param.gpu_id), nodeAssigns.Current(),
|
||||
nodeAssignsPerInst.Data(), instIds.Current(), nVals);
|
||||
sortKeys(level);
|
||||
}
|
||||
}
|
||||
@@ -734,19 +734,19 @@ class GPUMaker : public TreeUpdater {
|
||||
void sortKeys(int level) {
|
||||
// segmented-sort the arrays based on node-id's
|
||||
// but we don't need more than level+1 bits for sorting!
|
||||
segmentedSort(&tmp_mem, &nodeAssigns, &nodeLocations, nVals, nCols,
|
||||
SegmentedSort(&tmp_mem, &nodeAssigns, &nodeLocations, nVals, nCols,
|
||||
colOffsets, 0, level + 1);
|
||||
dh::gather<float, int>(dh::get_device_idx(param.gpu_id), vals.other(),
|
||||
vals.current(), instIds.other(), instIds.current(),
|
||||
nodeLocations.current(), nVals);
|
||||
dh::Gather<float, int>(dh::GetDeviceIdx(param.gpu_id), vals.other(),
|
||||
vals.Current(), instIds.other(), instIds.Current(),
|
||||
nodeLocations.Current(), nVals);
|
||||
vals.buff().selector ^= 1;
|
||||
instIds.buff().selector ^= 1;
|
||||
}
|
||||
|
||||
void markLeaves() {
|
||||
const int BlkDim = 128;
|
||||
int nBlks = dh::div_round_up(maxNodes, BlkDim);
|
||||
markLeavesKernel<<<nBlks, BlkDim>>>(nodes.data(), maxNodes);
|
||||
int nBlks = dh::DivRoundUp(maxNodes, BlkDim);
|
||||
markLeavesKernel<<<nBlks, BlkDim>>>(nodes.Data(), maxNodes);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
|
||||
|
||||
#else
|
||||
__device__ __forceinline__ double atomicAdd(double* address, double val) {
|
||||
XGBOOST_DEVICE __forceinline__ double atomicAdd(double* address, double val) {
|
||||
unsigned long long int* address_as_ull =
|
||||
(unsigned long long int*)address; // NOLINT
|
||||
unsigned long long int old = *address_as_ull, assumed; // NOLINT
|
||||
@@ -37,8 +37,8 @@ namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
// Atomic add function for double precision gradients
|
||||
__device__ __forceinline__ void AtomicAddGpair(bst_gpair_precise* dest,
|
||||
const bst_gpair& gpair) {
|
||||
__device__ __forceinline__ void AtomicAddGpair(GradientPairPrecise* dest,
|
||||
const GradientPair& gpair) {
|
||||
auto dst_ptr = reinterpret_cast<double*>(dest);
|
||||
|
||||
atomicAdd(dst_ptr, static_cast<double>(gpair.GetGrad()));
|
||||
@@ -46,11 +46,11 @@ __device__ __forceinline__ void AtomicAddGpair(bst_gpair_precise* dest,
|
||||
}
|
||||
|
||||
// For integer gradients
|
||||
__device__ __forceinline__ void AtomicAddGpair(bst_gpair_integer* dest,
|
||||
const bst_gpair& gpair) {
|
||||
__device__ __forceinline__ void AtomicAddGpair(GradientPairInteger* dest,
|
||||
const GradientPair& gpair) {
|
||||
auto dst_ptr = reinterpret_cast<unsigned long long int*>(dest); // NOLINT
|
||||
bst_gpair_integer tmp(gpair.GetGrad(), gpair.GetHess());
|
||||
auto src_ptr = reinterpret_cast<bst_gpair_integer::value_t*>(&tmp);
|
||||
GradientPairInteger tmp(gpair.GetGrad(), gpair.GetHess());
|
||||
auto src_ptr = reinterpret_cast<GradientPairInteger::ValueT*>(&tmp);
|
||||
|
||||
atomicAdd(dst_ptr,
|
||||
static_cast<unsigned long long int>(*src_ptr)); // NOLINT
|
||||
@@ -59,13 +59,11 @@ __device__ __forceinline__ void AtomicAddGpair(bst_gpair_integer* dest,
|
||||
}
|
||||
|
||||
/**
|
||||
* \fn void CheckGradientMax(const dh::dvec<bst_gpair>& gpair)
|
||||
*
|
||||
* \brief Check maximum gradient value is below 2^16. This is to prevent
|
||||
* overflow when using integer gradient summation.
|
||||
*/
|
||||
|
||||
inline void CheckGradientMax(const std::vector<bst_gpair>& gpair) {
|
||||
inline void CheckGradientMax(const std::vector<GradientPair>& gpair) {
|
||||
auto* ptr = reinterpret_cast<const float*>(gpair.data());
|
||||
float abs_max =
|
||||
std::accumulate(ptr, ptr + (gpair.size() * 2), 0.f,
|
||||
@@ -87,19 +85,19 @@ struct GPUTrainingParam {
|
||||
// default=0 means no constraint on weight delta
|
||||
float max_delta_step;
|
||||
|
||||
__host__ __device__ GPUTrainingParam() {}
|
||||
GPUTrainingParam() = default;
|
||||
|
||||
__host__ __device__ GPUTrainingParam(const TrainParam& param)
|
||||
XGBOOST_DEVICE explicit GPUTrainingParam(const TrainParam& param)
|
||||
: min_child_weight(param.min_child_weight),
|
||||
reg_lambda(param.reg_lambda),
|
||||
reg_alpha(param.reg_alpha),
|
||||
max_delta_step(param.max_delta_step) {}
|
||||
};
|
||||
|
||||
typedef int node_id_t;
|
||||
using NodeIdT = int;
|
||||
|
||||
/** used to assign default id to a Node */
|
||||
static const int UNUSED_NODE = -1;
|
||||
static const int kUnusedNode = -1;
|
||||
|
||||
/**
|
||||
* @enum DefaultDirection node.cuh
|
||||
@@ -107,9 +105,9 @@ static const int UNUSED_NODE = -1;
|
||||
*/
|
||||
enum DefaultDirection {
|
||||
/** move to left child */
|
||||
LeftDir = 0,
|
||||
kLeftDir = 0,
|
||||
/** move to right child */
|
||||
RightDir
|
||||
kRightDir
|
||||
};
|
||||
|
||||
struct DeviceSplitCandidate {
|
||||
@@ -117,15 +115,15 @@ struct DeviceSplitCandidate {
|
||||
DefaultDirection dir;
|
||||
float fvalue;
|
||||
int findex;
|
||||
bst_gpair_integer left_sum;
|
||||
bst_gpair_integer right_sum;
|
||||
GradientPair left_sum;
|
||||
GradientPair right_sum;
|
||||
|
||||
__host__ __device__ DeviceSplitCandidate()
|
||||
: loss_chg(-FLT_MAX), dir(LeftDir), fvalue(0), findex(-1) {}
|
||||
XGBOOST_DEVICE DeviceSplitCandidate()
|
||||
: loss_chg(-FLT_MAX), dir(kLeftDir), fvalue(0), findex(-1) {}
|
||||
|
||||
template <typename param_t>
|
||||
__host__ __device__ void Update(const DeviceSplitCandidate& other,
|
||||
const param_t& param) {
|
||||
template <typename ParamT>
|
||||
XGBOOST_DEVICE void Update(const DeviceSplitCandidate& other,
|
||||
const ParamT& param) {
|
||||
if (other.loss_chg > loss_chg &&
|
||||
other.left_sum.GetHess() >= param.min_child_weight &&
|
||||
other.right_sum.GetHess() >= param.min_child_weight) {
|
||||
@@ -133,10 +131,10 @@ struct DeviceSplitCandidate {
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void Update(float loss_chg_in, DefaultDirection dir_in,
|
||||
XGBOOST_DEVICE void Update(float loss_chg_in, DefaultDirection dir_in,
|
||||
float fvalue_in, int findex_in,
|
||||
bst_gpair_integer left_sum_in,
|
||||
bst_gpair_integer right_sum_in,
|
||||
GradientPair left_sum_in,
|
||||
GradientPair right_sum_in,
|
||||
const GPUTrainingParam& param) {
|
||||
if (loss_chg_in > loss_chg &&
|
||||
left_sum_in.GetHess() >= param.min_child_weight &&
|
||||
@@ -149,11 +147,11 @@ struct DeviceSplitCandidate {
|
||||
findex = findex_in;
|
||||
}
|
||||
}
|
||||
__device__ bool IsValid() const { return loss_chg > 0.0f; }
|
||||
XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
|
||||
};
|
||||
|
||||
struct DeviceNodeStats {
|
||||
bst_gpair sum_gradients;
|
||||
GradientPair sum_gradients;
|
||||
float root_gain;
|
||||
float weight;
|
||||
|
||||
@@ -161,31 +159,31 @@ struct DeviceNodeStats {
|
||||
DefaultDirection dir;
|
||||
/** threshold value for comparison */
|
||||
float fvalue;
|
||||
bst_gpair left_sum;
|
||||
bst_gpair right_sum;
|
||||
GradientPair left_sum;
|
||||
GradientPair right_sum;
|
||||
/** \brief The feature index. */
|
||||
int fidx;
|
||||
/** node id (used as key for reduce/scan) */
|
||||
node_id_t idx;
|
||||
NodeIdT idx;
|
||||
|
||||
HOST_DEV_INLINE DeviceNodeStats()
|
||||
: sum_gradients(),
|
||||
root_gain(-FLT_MAX),
|
||||
weight(-FLT_MAX),
|
||||
dir(LeftDir),
|
||||
dir(kLeftDir),
|
||||
fvalue(0.f),
|
||||
left_sum(),
|
||||
right_sum(),
|
||||
fidx(UNUSED_NODE),
|
||||
idx(UNUSED_NODE) {}
|
||||
fidx(kUnusedNode),
|
||||
idx(kUnusedNode) {}
|
||||
|
||||
template <typename param_t>
|
||||
HOST_DEV_INLINE DeviceNodeStats(bst_gpair sum_gradients, node_id_t nidx,
|
||||
const param_t& param)
|
||||
template <typename ParamT>
|
||||
HOST_DEV_INLINE DeviceNodeStats(GradientPair sum_gradients, NodeIdT nidx,
|
||||
const ParamT& param)
|
||||
: sum_gradients(sum_gradients),
|
||||
dir(LeftDir),
|
||||
dir(kLeftDir),
|
||||
fvalue(0.f),
|
||||
fidx(UNUSED_NODE),
|
||||
fidx(kUnusedNode),
|
||||
idx(nidx) {
|
||||
this->root_gain =
|
||||
CalcGain(param, sum_gradients.GetGrad(), sum_gradients.GetHess());
|
||||
@@ -194,7 +192,7 @@ struct DeviceNodeStats {
|
||||
}
|
||||
|
||||
HOST_DEV_INLINE void SetSplit(float fvalue, int fidx, DefaultDirection dir,
|
||||
bst_gpair left_sum, bst_gpair right_sum) {
|
||||
GradientPair left_sum, GradientPair right_sum) {
|
||||
this->fvalue = fvalue;
|
||||
this->fidx = fidx;
|
||||
this->dir = dir;
|
||||
@@ -208,11 +206,11 @@ struct DeviceNodeStats {
|
||||
}
|
||||
|
||||
/** Tells whether this node is part of the decision tree */
|
||||
HOST_DEV_INLINE bool IsUnused() const { return (idx == UNUSED_NODE); }
|
||||
HOST_DEV_INLINE bool IsUnused() const { return (idx == kUnusedNode); }
|
||||
|
||||
/** Tells whether this node is a leaf of the decision tree */
|
||||
HOST_DEV_INLINE bool IsLeaf() const {
|
||||
return (!IsUnused() && (fidx == UNUSED_NODE));
|
||||
return (!IsUnused() && (fidx == kUnusedNode));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -221,37 +219,37 @@ struct SumCallbackOp {
|
||||
// Running prefix
|
||||
T running_total;
|
||||
// Constructor
|
||||
__device__ SumCallbackOp() : running_total(T()) {}
|
||||
__device__ T operator()(T block_aggregate) {
|
||||
XGBOOST_DEVICE SumCallbackOp() : running_total(T()) {}
|
||||
XGBOOST_DEVICE T operator()(T block_aggregate) {
|
||||
T old_prefix = running_total;
|
||||
running_total += block_aggregate;
|
||||
return old_prefix;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename gpair_t>
|
||||
__device__ inline float device_calc_loss_chg(const GPUTrainingParam& param,
|
||||
const gpair_t& left,
|
||||
const gpair_t& parent_sum,
|
||||
template <typename GradientPairT>
|
||||
XGBOOST_DEVICE inline float DeviceCalcLossChange(const GPUTrainingParam& param,
|
||||
const GradientPairT& left,
|
||||
const GradientPairT& parent_sum,
|
||||
const float& parent_gain) {
|
||||
gpair_t right = parent_sum - left;
|
||||
GradientPairT right = parent_sum - left;
|
||||
float left_gain = CalcGain(param, left.GetGrad(), left.GetHess());
|
||||
float right_gain = CalcGain(param, right.GetGrad(), right.GetHess());
|
||||
return left_gain + right_gain - parent_gain;
|
||||
}
|
||||
|
||||
// Without constraints
|
||||
template <typename gpair_t>
|
||||
__device__ float inline loss_chg_missing(const gpair_t& scan,
|
||||
const gpair_t& missing,
|
||||
const gpair_t& parent_sum,
|
||||
template <typename GradientPairT>
|
||||
XGBOOST_DEVICE float inline LossChangeMissing(const GradientPairT& scan,
|
||||
const GradientPairT& missing,
|
||||
const GradientPairT& parent_sum,
|
||||
const float& parent_gain,
|
||||
const GPUTrainingParam& param,
|
||||
bool& missing_left_out) { // NOLINT
|
||||
float missing_left_loss =
|
||||
device_calc_loss_chg(param, scan + missing, parent_sum, parent_gain);
|
||||
DeviceCalcLossChange(param, scan + missing, parent_sum, parent_gain);
|
||||
float missing_right_loss =
|
||||
device_calc_loss_chg(param, scan, parent_sum, parent_gain);
|
||||
DeviceCalcLossChange(param, scan, parent_sum, parent_gain);
|
||||
|
||||
if (missing_left_loss >= missing_right_loss) {
|
||||
missing_left_out = true;
|
||||
@@ -263,9 +261,9 @@ __device__ float inline loss_chg_missing(const gpair_t& scan,
|
||||
}
|
||||
|
||||
// With constraints
|
||||
template <typename gpair_t>
|
||||
__device__ float inline loss_chg_missing(
|
||||
const gpair_t& scan, const gpair_t& missing, const gpair_t& parent_sum,
|
||||
template <typename GradientPairT>
|
||||
XGBOOST_DEVICE float inline LossChangeMissing(
|
||||
const GradientPairT& scan, const GradientPairT& missing, const GradientPairT& parent_sum,
|
||||
const float& parent_gain, const GPUTrainingParam& param, int constraint,
|
||||
const ValueConstraint& value_constraint,
|
||||
bool& missing_left_out) { // NOLINT
|
||||
@@ -285,54 +283,54 @@ __device__ float inline loss_chg_missing(
|
||||
}
|
||||
|
||||
// Total number of nodes in tree, given depth
|
||||
__host__ __device__ inline int n_nodes(int depth) {
|
||||
XGBOOST_DEVICE inline int MaxNodesDepth(int depth) {
|
||||
return (1 << (depth + 1)) - 1;
|
||||
}
|
||||
|
||||
// Number of nodes at this level of the tree
|
||||
__host__ __device__ inline int n_nodes_level(int depth) { return 1 << depth; }
|
||||
XGBOOST_DEVICE inline int MaxNodesLevel(int depth) { return 1 << depth; }
|
||||
|
||||
// Whether a node is currently being processed at current depth
|
||||
__host__ __device__ inline bool is_active(int nidx, int depth) {
|
||||
return nidx >= n_nodes(depth - 1);
|
||||
XGBOOST_DEVICE inline bool IsNodeActive(int nidx, int depth) {
|
||||
return nidx >= MaxNodesDepth(depth - 1);
|
||||
}
|
||||
|
||||
__host__ __device__ inline int parent_nidx(int nidx) { return (nidx - 1) / 2; }
|
||||
XGBOOST_DEVICE inline int ParentNodeIdx(int nidx) { return (nidx - 1) / 2; }
|
||||
|
||||
__host__ __device__ inline int left_child_nidx(int nidx) {
|
||||
XGBOOST_DEVICE inline int LeftChildNodeIdx(int nidx) {
|
||||
return nidx * 2 + 1;
|
||||
}
|
||||
|
||||
__host__ __device__ inline int right_child_nidx(int nidx) {
|
||||
XGBOOST_DEVICE inline int RightChildNodeIdx(int nidx) {
|
||||
return nidx * 2 + 2;
|
||||
}
|
||||
|
||||
__host__ __device__ inline bool is_left_child(int nidx) {
|
||||
XGBOOST_DEVICE inline bool IsLeftChild(int nidx) {
|
||||
return nidx % 2 == 1;
|
||||
}
|
||||
|
||||
// Copy gpu dense representation of tree to xgboost sparse representation
|
||||
inline void dense2sparse_tree(RegTree* p_tree,
|
||||
const dh::dvec<DeviceNodeStats>& nodes,
|
||||
inline void Dense2SparseTree(RegTree* p_tree,
|
||||
const dh::DVec<DeviceNodeStats>& nodes,
|
||||
const TrainParam& param) {
|
||||
RegTree& tree = *p_tree;
|
||||
std::vector<DeviceNodeStats> h_nodes = nodes.as_vector();
|
||||
std::vector<DeviceNodeStats> h_nodes = nodes.AsVector();
|
||||
|
||||
int nid = 0;
|
||||
for (int gpu_nid = 0; gpu_nid < h_nodes.size(); gpu_nid++) {
|
||||
const DeviceNodeStats& n = h_nodes[gpu_nid];
|
||||
if (!n.IsUnused() && !n.IsLeaf()) {
|
||||
tree.AddChilds(nid);
|
||||
tree[nid].set_split(n.fidx, n.fvalue, n.dir == LeftDir);
|
||||
tree.stat(nid).loss_chg = n.root_gain;
|
||||
tree.stat(nid).base_weight = n.weight;
|
||||
tree.stat(nid).sum_hess = n.sum_gradients.GetHess();
|
||||
tree[tree[nid].cleft()].set_leaf(0);
|
||||
tree[tree[nid].cright()].set_leaf(0);
|
||||
tree[nid].SetSplit(n.fidx, n.fvalue, n.dir == kLeftDir);
|
||||
tree.Stat(nid).loss_chg = n.root_gain;
|
||||
tree.Stat(nid).base_weight = n.weight;
|
||||
tree.Stat(nid).sum_hess = n.sum_gradients.GetHess();
|
||||
tree[tree[nid].LeftChild()].SetLeaf(0);
|
||||
tree[tree[nid].RightChild()].SetLeaf(0);
|
||||
nid++;
|
||||
} else if (n.IsLeaf()) {
|
||||
tree[nid].set_leaf(n.weight * param.learning_rate);
|
||||
tree.stat(nid).sum_hess = n.sum_gradients.GetHess();
|
||||
tree[nid].SetLeaf(n.weight * param.learning_rate);
|
||||
tree.Stat(nid).sum_hess = n.sum_gradients.GetHess();
|
||||
nid++;
|
||||
}
|
||||
}
|
||||
@@ -346,11 +344,11 @@ struct BernoulliRng {
|
||||
float p;
|
||||
uint32_t seed;
|
||||
|
||||
__host__ __device__ BernoulliRng(float p, size_t seed_) : p(p) {
|
||||
XGBOOST_DEVICE BernoulliRng(float p, size_t seed_) : p(p) {
|
||||
seed = static_cast<uint32_t>(seed_);
|
||||
}
|
||||
|
||||
__host__ __device__ bool operator()(const int i) const {
|
||||
XGBOOST_DEVICE bool operator()(const int i) const {
|
||||
thrust::default_random_engine rng(seed);
|
||||
thrust::uniform_real_distribution<float> dist;
|
||||
rng.discard(i);
|
||||
@@ -359,25 +357,25 @@ struct BernoulliRng {
|
||||
};
|
||||
|
||||
// Set gradient pair to 0 with p = 1 - subsample
|
||||
inline void subsample_gpair(dh::dvec<bst_gpair>* p_gpair, float subsample,
|
||||
inline void SubsampleGradientPair(dh::DVec<GradientPair>* p_gpair, float subsample,
|
||||
int offset = 0) {
|
||||
if (subsample == 1.0) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::dvec<bst_gpair>& gpair = *p_gpair;
|
||||
dh::DVec<GradientPair>& gpair = *p_gpair;
|
||||
|
||||
auto d_gpair = gpair.data();
|
||||
auto d_gpair = gpair.Data();
|
||||
BernoulliRng rng(subsample, common::GlobalRandom()());
|
||||
|
||||
dh::launch_n(gpair.device_idx(), gpair.size(), [=] __device__(int i) {
|
||||
dh::LaunchN(gpair.DeviceIdx(), gpair.Size(), [=] XGBOOST_DEVICE(int i) {
|
||||
if (!rng(i + offset)) {
|
||||
d_gpair[i] = bst_gpair();
|
||||
d_gpair[i] = GradientPair();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
inline std::vector<int> col_sample(std::vector<int> features, float colsample) {
|
||||
inline std::vector<int> ColSample(std::vector<int> features, float colsample) {
|
||||
CHECK_GT(features.size(), 0);
|
||||
int n = std::max(1, static_cast<int>(colsample * features.size()));
|
||||
|
||||
@@ -397,9 +395,9 @@ inline std::vector<int> col_sample(std::vector<int> features, float colsample) {
|
||||
*/
|
||||
|
||||
class ColumnSampler {
|
||||
std::vector<int> feature_set_tree;
|
||||
std::map<int, std::vector<int>> feature_set_level;
|
||||
TrainParam param;
|
||||
std::vector<int> feature_set_tree_;
|
||||
std::map<int, std::vector<int>> feature_set_level_;
|
||||
TrainParam param_;
|
||||
|
||||
public:
|
||||
/**
|
||||
@@ -413,10 +411,10 @@ class ColumnSampler {
|
||||
|
||||
void Init(int64_t num_col, const TrainParam& param) {
|
||||
this->Reset();
|
||||
this->param = param;
|
||||
feature_set_tree.resize(num_col);
|
||||
std::iota(feature_set_tree.begin(), feature_set_tree.end(), 0);
|
||||
feature_set_tree = col_sample(feature_set_tree, param.colsample_bytree);
|
||||
this->param_ = param;
|
||||
feature_set_tree_.resize(num_col);
|
||||
std::iota(feature_set_tree_.begin(), feature_set_tree_.end(), 0);
|
||||
feature_set_tree_ = ColSample(feature_set_tree_, param.colsample_bytree);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -426,8 +424,8 @@ class ColumnSampler {
|
||||
*/
|
||||
|
||||
void Reset() {
|
||||
feature_set_tree.clear();
|
||||
feature_set_level.clear();
|
||||
feature_set_tree_.clear();
|
||||
feature_set_level_.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -442,13 +440,13 @@ class ColumnSampler {
|
||||
*/
|
||||
|
||||
bool ColumnUsed(int column, int depth) {
|
||||
if (feature_set_level.count(depth) == 0) {
|
||||
feature_set_level[depth] =
|
||||
col_sample(feature_set_tree, param.colsample_bylevel);
|
||||
if (feature_set_level_.count(depth) == 0) {
|
||||
feature_set_level_[depth] =
|
||||
ColSample(feature_set_tree_, param_.colsample_bylevel);
|
||||
}
|
||||
|
||||
return std::binary_search(feature_set_level[depth].begin(),
|
||||
feature_set_level[depth].end(), column);
|
||||
return std::binary_search(feature_set_level_[depth].begin(),
|
||||
feature_set_level_[depth].end(), column);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -23,22 +23,22 @@ namespace tree {
|
||||
|
||||
DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
|
||||
|
||||
typedef bst_gpair_precise gpair_sum_t;
|
||||
using GradientPairSumT = GradientPairPrecise;
|
||||
|
||||
template <int BLOCK_THREADS, typename reduce_t, typename temp_storage_t>
|
||||
__device__ gpair_sum_t ReduceFeature(const gpair_sum_t* begin,
|
||||
const gpair_sum_t* end,
|
||||
temp_storage_t* temp_storage) {
|
||||
__shared__ cub::Uninitialized<gpair_sum_t> uninitialized_sum;
|
||||
gpair_sum_t& shared_sum = uninitialized_sum.Alias();
|
||||
template <int BLOCK_THREADS, typename ReduceT, typename TempStorageT>
|
||||
__device__ GradientPairSumT ReduceFeature(const GradientPairSumT* begin,
|
||||
const GradientPairSumT* end,
|
||||
TempStorageT* temp_storage) {
|
||||
__shared__ cub::Uninitialized<GradientPairSumT> uninitialized_sum;
|
||||
GradientPairSumT& shared_sum = uninitialized_sum.Alias();
|
||||
|
||||
gpair_sum_t local_sum = gpair_sum_t();
|
||||
GradientPairSumT local_sum = GradientPairSumT();
|
||||
for (auto itr = begin; itr < end; itr += BLOCK_THREADS) {
|
||||
bool thread_active = itr + threadIdx.x < end;
|
||||
// Scan histogram
|
||||
gpair_sum_t bin = thread_active ? *(itr + threadIdx.x) : gpair_sum_t();
|
||||
GradientPairSumT bin = thread_active ? *(itr + threadIdx.x) : GradientPairSumT();
|
||||
|
||||
local_sum += reduce_t(temp_storage->sum_reduce).Reduce(bin, cub::Sum());
|
||||
local_sum += ReduceT(temp_storage->sum_reduce).Reduce(bin, cub::Sum());
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
@@ -49,41 +49,41 @@ __device__ gpair_sum_t ReduceFeature(const gpair_sum_t* begin,
|
||||
return shared_sum;
|
||||
}
|
||||
|
||||
template <int BLOCK_THREADS, typename reduce_t, typename scan_t,
|
||||
typename max_reduce_t, typename temp_storage_t>
|
||||
__device__ void EvaluateFeature(int fidx, const gpair_sum_t* hist,
|
||||
template <int BLOCK_THREADS, typename ReduceT, typename scan_t,
|
||||
typename max_ReduceT, typename TempStorageT>
|
||||
__device__ void EvaluateFeature(int fidx, const GradientPairSumT* hist,
|
||||
const int* feature_segments, float min_fvalue,
|
||||
const float* gidx_fvalue_map,
|
||||
DeviceSplitCandidate* best_split,
|
||||
const DeviceNodeStats& node,
|
||||
const GPUTrainingParam& param,
|
||||
temp_storage_t* temp_storage, int constraint,
|
||||
TempStorageT* temp_storage, int constraint,
|
||||
const ValueConstraint& value_constraint) {
|
||||
int gidx_begin = feature_segments[fidx];
|
||||
int gidx_end = feature_segments[fidx + 1];
|
||||
|
||||
gpair_sum_t feature_sum = ReduceFeature<BLOCK_THREADS, reduce_t>(
|
||||
GradientPairSumT feature_sum = ReduceFeature<BLOCK_THREADS, ReduceT>(
|
||||
hist + gidx_begin, hist + gidx_end, temp_storage);
|
||||
|
||||
auto prefix_op = SumCallbackOp<gpair_sum_t>();
|
||||
auto prefix_op = SumCallbackOp<GradientPairSumT>();
|
||||
for (int scan_begin = gidx_begin; scan_begin < gidx_end;
|
||||
scan_begin += BLOCK_THREADS) {
|
||||
bool thread_active = scan_begin + threadIdx.x < gidx_end;
|
||||
|
||||
gpair_sum_t bin =
|
||||
thread_active ? hist[scan_begin + threadIdx.x] : gpair_sum_t();
|
||||
GradientPairSumT bin =
|
||||
thread_active ? hist[scan_begin + threadIdx.x] : GradientPairSumT();
|
||||
scan_t(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op);
|
||||
|
||||
// Calculate gain
|
||||
gpair_sum_t parent_sum = gpair_sum_t(node.sum_gradients);
|
||||
GradientPairSumT parent_sum = GradientPairSumT(node.sum_gradients);
|
||||
|
||||
gpair_sum_t missing = parent_sum - feature_sum;
|
||||
GradientPairSumT missing = parent_sum - feature_sum;
|
||||
|
||||
bool missing_left = true;
|
||||
const float null_gain = -FLT_MAX;
|
||||
float gain = null_gain;
|
||||
if (thread_active) {
|
||||
gain = loss_chg_missing(bin, missing, parent_sum, node.root_gain, param,
|
||||
gain = LossChangeMissing(bin, missing, parent_sum, node.root_gain, param,
|
||||
constraint, value_constraint, missing_left);
|
||||
}
|
||||
|
||||
@@ -92,7 +92,7 @@ __device__ void EvaluateFeature(int fidx, const gpair_sum_t* hist,
|
||||
// Find thread with best gain
|
||||
cub::KeyValuePair<int, float> tuple(threadIdx.x, gain);
|
||||
cub::KeyValuePair<int, float> best =
|
||||
max_reduce_t(temp_storage->max_reduce).Reduce(tuple, cub::ArgMax());
|
||||
max_ReduceT(temp_storage->max_reduce).Reduce(tuple, cub::ArgMax());
|
||||
|
||||
__shared__ cub::KeyValuePair<int, float> block_max;
|
||||
if (threadIdx.x == 0) {
|
||||
@@ -107,11 +107,11 @@ __device__ void EvaluateFeature(int fidx, const gpair_sum_t* hist,
|
||||
float fvalue =
|
||||
gidx == gidx_begin ? min_fvalue : gidx_fvalue_map[gidx - 1];
|
||||
|
||||
gpair_sum_t left = missing_left ? bin + missing : bin;
|
||||
gpair_sum_t right = parent_sum - left;
|
||||
GradientPairSumT left = missing_left ? bin + missing : bin;
|
||||
GradientPairSumT right = parent_sum - left;
|
||||
|
||||
best_split->Update(gain, missing_left ? LeftDir : RightDir, fvalue, fidx,
|
||||
left, right, param);
|
||||
best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx,
|
||||
GradientPair(left), GradientPair(right), param);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
@@ -119,17 +119,17 @@ __device__ void EvaluateFeature(int fidx, const gpair_sum_t* hist,
|
||||
|
||||
template <int BLOCK_THREADS>
|
||||
__global__ void evaluate_split_kernel(
|
||||
const gpair_sum_t* d_hist, int nidx, uint64_t n_features,
|
||||
const GradientPairSumT* d_hist, int nidx, uint64_t n_features,
|
||||
DeviceNodeStats nodes, const int* d_feature_segments,
|
||||
const float* d_fidx_min_map, const float* d_gidx_fvalue_map,
|
||||
GPUTrainingParam gpu_param, DeviceSplitCandidate* d_split,
|
||||
ValueConstraint value_constraint, int* d_monotonic_constraints) {
|
||||
typedef cub::KeyValuePair<int, float> ArgMaxT;
|
||||
typedef cub::BlockScan<gpair_sum_t, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>
|
||||
typedef cub::BlockScan<GradientPairSumT, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>
|
||||
BlockScanT;
|
||||
typedef cub::BlockReduce<ArgMaxT, BLOCK_THREADS> MaxReduceT;
|
||||
|
||||
typedef cub::BlockReduce<gpair_sum_t, BLOCK_THREADS> SumReduceT;
|
||||
typedef cub::BlockReduce<GradientPairSumT, BLOCK_THREADS> SumReduceT;
|
||||
|
||||
union TempStorage {
|
||||
typename BlockScanT::TempStorage scan;
|
||||
@@ -163,8 +163,8 @@ __global__ void evaluate_split_kernel(
|
||||
}
|
||||
|
||||
// Find a gidx value for a given feature otherwise return -1 if not found
|
||||
template <typename gidx_iter_t>
|
||||
__device__ int BinarySearchRow(bst_uint begin, bst_uint end, gidx_iter_t data,
|
||||
template <typename GidxIterT>
|
||||
__device__ int BinarySearchRow(bst_uint begin, bst_uint end, GidxIterT data,
|
||||
int fidx_begin, int fidx_end) {
|
||||
bst_uint previous_middle = UINT32_MAX;
|
||||
while (end != begin) {
|
||||
@@ -189,19 +189,19 @@ __device__ int BinarySearchRow(bst_uint begin, bst_uint end, gidx_iter_t data,
|
||||
}
|
||||
|
||||
struct DeviceHistogram {
|
||||
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
|
||||
dh::dvec<gpair_sum_t> data;
|
||||
dh::BulkAllocator<dh::MemoryType::kDevice> ba;
|
||||
dh::DVec<GradientPairSumT> data;
|
||||
int n_bins;
|
||||
void Init(int device_idx, int max_nodes, int n_bins, bool silent) {
|
||||
this->n_bins = n_bins;
|
||||
ba.allocate(device_idx, silent, &data, size_t(max_nodes) * size_t(n_bins));
|
||||
ba.Allocate(device_idx, silent, &data, size_t(max_nodes) * size_t(n_bins));
|
||||
}
|
||||
|
||||
void Reset() { data.fill(gpair_sum_t()); }
|
||||
gpair_sum_t* GetHistPtr(int nidx) { return data.data() + nidx * n_bins; }
|
||||
void Reset() { data.Fill(GradientPairSumT()); }
|
||||
GradientPairSumT* GetHistPtr(int nidx) { return data.Data() + nidx * n_bins; }
|
||||
|
||||
void PrintNidx(int nidx) const {
|
||||
auto h_data = data.as_vector();
|
||||
auto h_data = data.AsVector();
|
||||
std::cout << "nidx " << nidx << ":\n";
|
||||
for (int i = n_bins * nidx; i < n_bins * (nidx + 1); i++) {
|
||||
std::cout << h_data[i] << " ";
|
||||
@@ -216,7 +216,7 @@ struct CalcWeightTrainParam {
|
||||
float reg_lambda;
|
||||
float max_delta_step;
|
||||
float learning_rate;
|
||||
__host__ __device__ CalcWeightTrainParam(const TrainParam& p)
|
||||
XGBOOST_DEVICE explicit CalcWeightTrainParam(const TrainParam& p)
|
||||
: min_child_weight(p.min_child_weight),
|
||||
reg_alpha(p.reg_alpha),
|
||||
reg_lambda(p.reg_lambda),
|
||||
@@ -240,19 +240,19 @@ struct DeviceShard {
|
||||
|
||||
int device_idx;
|
||||
int normalised_device_idx; // Device index counting from param.gpu_id
|
||||
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
|
||||
dh::dvec<common::compressed_byte_t> gidx_buffer;
|
||||
dh::dvec<bst_gpair> gpair;
|
||||
dh::dvec2<bst_uint> ridx; // Row index relative to this shard
|
||||
dh::dvec2<int> position;
|
||||
dh::BulkAllocator<dh::MemoryType::kDevice> ba;
|
||||
dh::DVec<common::CompressedByteT> gidx_buffer;
|
||||
dh::DVec<GradientPair> gpair;
|
||||
dh::DVec2<bst_uint> ridx; // Row index relative to this shard
|
||||
dh::DVec2<int> position;
|
||||
std::vector<Segment> ridx_segments;
|
||||
dh::dvec<int> feature_segments;
|
||||
dh::dvec<float> gidx_fvalue_map;
|
||||
dh::dvec<float> min_fvalue;
|
||||
dh::dvec<int> monotone_constraints;
|
||||
dh::dvec<bst_float> prediction_cache;
|
||||
std::vector<bst_gpair> node_sum_gradients;
|
||||
dh::dvec<bst_gpair> node_sum_gradients_d;
|
||||
dh::DVec<int> feature_segments;
|
||||
dh::DVec<float> gidx_fvalue_map;
|
||||
dh::DVec<float> min_fvalue;
|
||||
dh::DVec<int> monotone_constraints;
|
||||
dh::DVec<bst_float> prediction_cache;
|
||||
std::vector<GradientPair> node_sum_gradients;
|
||||
dh::DVec<GradientPair> node_sum_gradients_d;
|
||||
common::CompressedIterator<uint32_t> gidx;
|
||||
int row_stride;
|
||||
bst_uint row_begin_idx; // The row offset for this shard
|
||||
@@ -311,8 +311,8 @@ struct DeviceShard {
|
||||
<< "Max leaves and max depth cannot both be unconstrained for "
|
||||
"gpu_hist.";
|
||||
int max_nodes =
|
||||
param.max_leaves > 0 ? param.max_leaves * 2 : n_nodes(param.max_depth);
|
||||
ba.allocate(device_idx, param.silent, &gidx_buffer, compressed_size_bytes,
|
||||
param.max_leaves > 0 ? param.max_leaves * 2 : MaxNodesDepth(param.max_depth);
|
||||
ba.Allocate(device_idx, param.silent, &gidx_buffer, compressed_size_bytes,
|
||||
&gpair, n_rows, &ridx, n_rows, &position, n_rows,
|
||||
&prediction_cache, n_rows, &node_sum_gradients_d, max_nodes,
|
||||
&feature_segments, gmat.cut->row_ptr.size(), &gidx_fvalue_map,
|
||||
@@ -328,11 +328,11 @@ struct DeviceShard {
|
||||
|
||||
// Compress gidx
|
||||
common::CompressedBufferWriter cbw(num_symbols);
|
||||
std::vector<common::compressed_byte_t> host_buffer(gidx_buffer.size());
|
||||
std::vector<common::CompressedByteT> host_buffer(gidx_buffer.Size());
|
||||
cbw.Write(host_buffer.data(), ellpack_matrix.begin(), ellpack_matrix.end());
|
||||
gidx_buffer = host_buffer;
|
||||
gidx =
|
||||
common::CompressedIterator<uint32_t>(gidx_buffer.data(), num_symbols);
|
||||
common::CompressedIterator<uint32_t>(gidx_buffer.Data(), num_symbols);
|
||||
|
||||
common::CompressedIterator<uint32_t> ci_host(host_buffer.data(),
|
||||
num_symbols);
|
||||
@@ -369,19 +369,19 @@ struct DeviceShard {
|
||||
}
|
||||
|
||||
// Reset values for each update iteration
|
||||
void Reset(HostDeviceVector<bst_gpair>* dh_gpair, int device) {
|
||||
void Reset(HostDeviceVector<GradientPair>* dh_gpair, int device) {
|
||||
auto begin = dh_gpair->tbegin(device);
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
position.current_dvec().fill(0);
|
||||
position.CurrentDVec().Fill(0);
|
||||
std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
|
||||
bst_gpair());
|
||||
GradientPair());
|
||||
|
||||
thrust::sequence(ridx.current_dvec().tbegin(), ridx.current_dvec().tend());
|
||||
thrust::sequence(ridx.CurrentDVec().tbegin(), ridx.CurrentDVec().tend());
|
||||
|
||||
std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0));
|
||||
ridx_segments.front() = Segment(0, ridx.size());
|
||||
ridx_segments.front() = Segment(0, ridx.Size());
|
||||
this->gpair.copy(begin + row_begin_idx, begin + row_end_idx);
|
||||
subsample_gpair(&gpair, param.subsample, row_begin_idx);
|
||||
SubsampleGradientPair(&gpair, param.subsample, row_begin_idx);
|
||||
hist.Reset();
|
||||
}
|
||||
|
||||
@@ -389,13 +389,13 @@ struct DeviceShard {
|
||||
auto segment = ridx_segments[nidx];
|
||||
auto d_node_hist = hist.GetHistPtr(nidx);
|
||||
auto d_gidx = gidx;
|
||||
auto d_ridx = ridx.current();
|
||||
auto d_gpair = gpair.data();
|
||||
auto d_ridx = ridx.Current();
|
||||
auto d_gpair = gpair.Data();
|
||||
auto row_stride = this->row_stride;
|
||||
auto null_gidx_value = this->null_gidx_value;
|
||||
auto n_elements = segment.Size() * row_stride;
|
||||
|
||||
dh::launch_n(device_idx, n_elements, [=] __device__(size_t idx) {
|
||||
dh::LaunchN(device_idx, n_elements, [=] __device__(size_t idx) {
|
||||
int ridx = d_ridx[(idx / row_stride) + segment.begin];
|
||||
int gidx = d_gidx[ridx * row_stride + idx % row_stride];
|
||||
|
||||
@@ -410,7 +410,7 @@ struct DeviceShard {
|
||||
auto d_node_hist_histogram = hist.GetHistPtr(nidx_histogram);
|
||||
auto d_node_hist_subtraction = hist.GetHistPtr(nidx_subtraction);
|
||||
|
||||
dh::launch_n(device_idx, hist.n_bins, [=] __device__(size_t idx) {
|
||||
dh::LaunchN(device_idx, hist.n_bins, [=] __device__(size_t idx) {
|
||||
d_node_hist_subtraction[idx] =
|
||||
d_node_hist_parent[idx] - d_node_hist_histogram[idx];
|
||||
});
|
||||
@@ -432,11 +432,11 @@ struct DeviceShard {
|
||||
auto d_left_count = temp_memory.Pointer<int64_t>();
|
||||
dh::safe_cuda(cudaMemset(d_left_count, 0, sizeof(int64_t)));
|
||||
auto segment = ridx_segments[nidx];
|
||||
auto d_ridx = ridx.current();
|
||||
auto d_position = position.current();
|
||||
auto d_ridx = ridx.Current();
|
||||
auto d_position = position.Current();
|
||||
auto d_gidx = gidx;
|
||||
auto row_stride = this->row_stride;
|
||||
dh::launch_n<1, 512>(
|
||||
dh::LaunchN<1, 512>(
|
||||
device_idx, segment.Size(), [=] __device__(bst_uint idx) {
|
||||
idx += segment.begin;
|
||||
auto ridx = d_ridx[idx];
|
||||
@@ -482,22 +482,22 @@ struct DeviceShard {
|
||||
|
||||
size_t temp_storage_bytes = 0;
|
||||
cub::DeviceRadixSort::SortPairs(
|
||||
nullptr, temp_storage_bytes, position.current() + segment.begin,
|
||||
position.other() + segment.begin, ridx.current() + segment.begin,
|
||||
nullptr, temp_storage_bytes, position.Current() + segment.begin,
|
||||
position.other() + segment.begin, ridx.Current() + segment.begin,
|
||||
ridx.other() + segment.begin, segment.Size(), min_bits, max_bits);
|
||||
|
||||
temp_memory.LazyAllocate(temp_storage_bytes);
|
||||
|
||||
cub::DeviceRadixSort::SortPairs(
|
||||
temp_memory.d_temp_storage, temp_memory.temp_storage_bytes,
|
||||
position.current() + segment.begin, position.other() + segment.begin,
|
||||
ridx.current() + segment.begin, ridx.other() + segment.begin,
|
||||
position.Current() + segment.begin, position.other() + segment.begin,
|
||||
ridx.Current() + segment.begin, ridx.other() + segment.begin,
|
||||
segment.Size(), min_bits, max_bits);
|
||||
dh::safe_cuda(cudaMemcpy(
|
||||
position.current() + segment.begin, position.other() + segment.begin,
|
||||
position.Current() + segment.begin, position.other() + segment.begin,
|
||||
segment.Size() * sizeof(int), cudaMemcpyDeviceToDevice));
|
||||
dh::safe_cuda(cudaMemcpy(
|
||||
ridx.current() + segment.begin, ridx.other() + segment.begin,
|
||||
ridx.Current() + segment.begin, ridx.other() + segment.begin,
|
||||
segment.Size() * sizeof(bst_uint), cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
|
||||
@@ -505,8 +505,8 @@ struct DeviceShard {
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
if (!prediction_cache_initialised) {
|
||||
dh::safe_cuda(cudaMemcpy(
|
||||
prediction_cache.data(), &out_preds_d[row_begin_idx],
|
||||
prediction_cache.size() * sizeof(bst_float), cudaMemcpyDefault));
|
||||
prediction_cache.Data(), &out_preds_d[row_begin_idx],
|
||||
prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault));
|
||||
}
|
||||
prediction_cache_initialised = true;
|
||||
|
||||
@@ -514,13 +514,13 @@ struct DeviceShard {
|
||||
|
||||
thrust::copy(node_sum_gradients.begin(), node_sum_gradients.end(),
|
||||
node_sum_gradients_d.tbegin());
|
||||
auto d_position = position.current();
|
||||
auto d_ridx = ridx.current();
|
||||
auto d_node_sum_gradients = node_sum_gradients_d.data();
|
||||
auto d_prediction_cache = prediction_cache.data();
|
||||
auto d_position = position.Current();
|
||||
auto d_ridx = ridx.Current();
|
||||
auto d_node_sum_gradients = node_sum_gradients_d.Data();
|
||||
auto d_prediction_cache = prediction_cache.Data();
|
||||
|
||||
dh::launch_n(
|
||||
device_idx, prediction_cache.size(), [=] __device__(int local_idx) {
|
||||
dh::LaunchN(
|
||||
device_idx, prediction_cache.Size(), [=] __device__(int local_idx) {
|
||||
int pos = d_position[local_idx];
|
||||
bst_float weight = CalcWeight(param_d, d_node_sum_gradients[pos]);
|
||||
d_prediction_cache[d_ridx[local_idx]] +=
|
||||
@@ -528,8 +528,8 @@ struct DeviceShard {
|
||||
});
|
||||
|
||||
dh::safe_cuda(cudaMemcpy(
|
||||
&out_preds_d[row_begin_idx], prediction_cache.data(),
|
||||
prediction_cache.size() * sizeof(bst_float), cudaMemcpyDefault));
|
||||
&out_preds_d[row_begin_idx], prediction_cache.Data(),
|
||||
prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -537,33 +537,32 @@ class GPUHistMaker : public TreeUpdater {
|
||||
public:
|
||||
struct ExpandEntry;
|
||||
|
||||
GPUHistMaker() : initialised(false), p_last_fmat_(nullptr) {}
|
||||
~GPUHistMaker() {}
|
||||
GPUHistMaker() : initialised_(false), p_last_fmat_(nullptr) {}
|
||||
void Init(
|
||||
const std::vector<std::pair<std::string, std::string>>& args) override {
|
||||
param.InitAllowUnknown(args);
|
||||
CHECK(param.n_gpus != 0) << "Must have at least one device";
|
||||
n_devices = param.n_gpus;
|
||||
param_.InitAllowUnknown(args);
|
||||
CHECK(param_.n_gpus != 0) << "Must have at least one device";
|
||||
n_devices_ = param_.n_gpus;
|
||||
|
||||
dh::check_compute_capability();
|
||||
dh::CheckComputeCapability();
|
||||
|
||||
if (param.grow_policy == TrainParam::kLossGuide) {
|
||||
qexpand_.reset(new ExpandQueue(loss_guide));
|
||||
if (param_.grow_policy == TrainParam::kLossGuide) {
|
||||
qexpand_.reset(new ExpandQueue(LossGuide));
|
||||
} else {
|
||||
qexpand_.reset(new ExpandQueue(depth_wise));
|
||||
qexpand_.reset(new ExpandQueue(DepthWise));
|
||||
}
|
||||
|
||||
monitor.Init("updater_gpu_hist", param.debug_verbose);
|
||||
monitor_.Init("updater_gpu_hist", param_.debug_verbose);
|
||||
}
|
||||
|
||||
void Update(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
|
||||
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
|
||||
const std::vector<RegTree*>& trees) override {
|
||||
monitor.Start("Update", dList);
|
||||
GradStats::CheckInfo(dmat->info());
|
||||
monitor_.Start("Update", device_list_);
|
||||
GradStats::CheckInfo(dmat->Info());
|
||||
// rescale learning rate according to size of trees
|
||||
float lr = param.learning_rate;
|
||||
param.learning_rate = lr / trees.size();
|
||||
ValueConstraint::Init(&param, dmat->info().num_col);
|
||||
float lr = param_.learning_rate;
|
||||
param_.learning_rate = lr / trees.size();
|
||||
ValueConstraint::Init(&param_, dmat->Info().num_col_);
|
||||
// build tree
|
||||
try {
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
@@ -572,97 +571,97 @@ class GPUHistMaker : public TreeUpdater {
|
||||
} catch (const std::exception& e) {
|
||||
LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
|
||||
}
|
||||
param.learning_rate = lr;
|
||||
monitor.Stop("Update", dList);
|
||||
param_.learning_rate = lr;
|
||||
monitor_.Stop("Update", device_list_);
|
||||
}
|
||||
|
||||
void InitDataOnce(DMatrix* dmat) {
|
||||
info = &dmat->info();
|
||||
monitor.Start("Quantiles", dList);
|
||||
hmat_.Init(dmat, param.max_bin);
|
||||
info_ = &dmat->Info();
|
||||
monitor_.Start("Quantiles", device_list_);
|
||||
hmat_.Init(dmat, param_.max_bin);
|
||||
gmat_.cut = &hmat_;
|
||||
gmat_.Init(dmat);
|
||||
monitor.Stop("Quantiles", dList);
|
||||
n_bins = hmat_.row_ptr.back();
|
||||
monitor_.Stop("Quantiles", device_list_);
|
||||
n_bins_ = hmat_.row_ptr.back();
|
||||
|
||||
int n_devices = dh::n_devices(param.n_gpus, info->num_row);
|
||||
int n_devices = dh::NDevices(param_.n_gpus, info_->num_row_);
|
||||
|
||||
bst_uint row_begin = 0;
|
||||
bst_uint shard_size =
|
||||
std::ceil(static_cast<double>(info->num_row) / n_devices);
|
||||
std::ceil(static_cast<double>(info_->num_row_) / n_devices);
|
||||
|
||||
dList.resize(n_devices);
|
||||
device_list_.resize(n_devices);
|
||||
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
|
||||
int device_idx = (param.gpu_id + d_idx) % dh::n_visible_devices();
|
||||
dList[d_idx] = device_idx;
|
||||
int device_idx = (param_.gpu_id + d_idx) % dh::NVisibleDevices();
|
||||
device_list_[d_idx] = device_idx;
|
||||
}
|
||||
|
||||
reducer.Init(dList);
|
||||
reducer_.Init(device_list_);
|
||||
|
||||
// Partition input matrix into row segments
|
||||
std::vector<size_t> row_segments;
|
||||
shards.resize(n_devices);
|
||||
shards_.resize(n_devices);
|
||||
row_segments.push_back(0);
|
||||
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
|
||||
bst_uint row_end =
|
||||
std::min(static_cast<size_t>(row_begin + shard_size), info->num_row);
|
||||
std::min(static_cast<size_t>(row_begin + shard_size), info_->num_row_);
|
||||
row_segments.push_back(row_end);
|
||||
row_begin = row_end;
|
||||
}
|
||||
|
||||
// Create device shards
|
||||
omp_set_num_threads(shards.size());
|
||||
omp_set_num_threads(shards_.size());
|
||||
#pragma omp parallel
|
||||
{
|
||||
auto cpu_thread_id = omp_get_thread_num();
|
||||
shards[cpu_thread_id] = std::unique_ptr<DeviceShard>(
|
||||
new DeviceShard(dList[cpu_thread_id], cpu_thread_id, gmat_,
|
||||
shards_[cpu_thread_id] = std::unique_ptr<DeviceShard>(
|
||||
new DeviceShard(device_list_[cpu_thread_id], cpu_thread_id, gmat_,
|
||||
row_segments[cpu_thread_id],
|
||||
row_segments[cpu_thread_id + 1], n_bins, param));
|
||||
row_segments[cpu_thread_id + 1], n_bins_, param_));
|
||||
}
|
||||
|
||||
p_last_fmat_ = dmat;
|
||||
initialised = true;
|
||||
initialised_ = true;
|
||||
}
|
||||
|
||||
void InitData(HostDeviceVector<bst_gpair>* gpair, DMatrix* dmat,
|
||||
void InitData(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
|
||||
const RegTree& tree) {
|
||||
monitor.Start("InitDataOnce", dList);
|
||||
if (!initialised) {
|
||||
monitor_.Start("InitDataOnce", device_list_);
|
||||
if (!initialised_) {
|
||||
this->InitDataOnce(dmat);
|
||||
}
|
||||
monitor.Stop("InitDataOnce", dList);
|
||||
monitor_.Stop("InitDataOnce", device_list_);
|
||||
|
||||
column_sampler.Init(info->num_col, param);
|
||||
column_sampler_.Init(info_->num_col_, param_);
|
||||
|
||||
// Copy gpair & reset memory
|
||||
monitor.Start("InitDataReset", dList);
|
||||
omp_set_num_threads(shards.size());
|
||||
monitor_.Start("InitDataReset", device_list_);
|
||||
omp_set_num_threads(shards_.size());
|
||||
|
||||
// TODO(canonizer): make it parallel again once HostDeviceVector is
|
||||
// thread-safe
|
||||
for (int shard = 0; shard < shards.size(); ++shard)
|
||||
shards[shard]->Reset(gpair, param.gpu_id);
|
||||
monitor.Stop("InitDataReset", dList);
|
||||
for (int shard = 0; shard < shards_.size(); ++shard)
|
||||
shards_[shard]->Reset(gpair, param_.gpu_id);
|
||||
monitor_.Stop("InitDataReset", device_list_);
|
||||
}
|
||||
|
||||
void AllReduceHist(int nidx) {
|
||||
for (auto& shard : shards) {
|
||||
for (auto& shard : shards_) {
|
||||
auto d_node_hist = shard->hist.GetHistPtr(nidx);
|
||||
reducer.AllReduceSum(
|
||||
reducer_.AllReduceSum(
|
||||
shard->normalised_device_idx,
|
||||
reinterpret_cast<gpair_sum_t::value_t*>(d_node_hist),
|
||||
reinterpret_cast<gpair_sum_t::value_t*>(d_node_hist),
|
||||
n_bins * (sizeof(gpair_sum_t) / sizeof(gpair_sum_t::value_t)));
|
||||
reinterpret_cast<GradientPairSumT::ValueT*>(d_node_hist),
|
||||
reinterpret_cast<GradientPairSumT::ValueT*>(d_node_hist),
|
||||
n_bins_ * (sizeof(GradientPairSumT) / sizeof(GradientPairSumT::ValueT)));
|
||||
}
|
||||
|
||||
reducer.Synchronize();
|
||||
reducer_.Synchronize();
|
||||
}
|
||||
|
||||
void BuildHistLeftRight(int nidx_parent, int nidx_left, int nidx_right) {
|
||||
size_t left_node_max_elements = 0;
|
||||
size_t right_node_max_elements = 0;
|
||||
for (auto& shard : shards) {
|
||||
for (auto& shard : shards_) {
|
||||
left_node_max_elements = (std::max)(
|
||||
left_node_max_elements, shard->ridx_segments[nidx_left].Size());
|
||||
right_node_max_elements = (std::max)(
|
||||
@@ -677,13 +676,13 @@ class GPUHistMaker : public TreeUpdater {
|
||||
subtraction_trick_nidx = nidx_left;
|
||||
}
|
||||
|
||||
for (auto& shard : shards) {
|
||||
for (auto& shard : shards_) {
|
||||
shard->BuildHist(build_hist_nidx);
|
||||
}
|
||||
|
||||
this->AllReduceHist(build_hist_nidx);
|
||||
|
||||
for (auto& shard : shards) {
|
||||
for (auto& shard : shards_) {
|
||||
shard->SubtractionTrick(nidx_parent, build_hist_nidx,
|
||||
subtraction_trick_nidx);
|
||||
}
|
||||
@@ -692,12 +691,12 @@ class GPUHistMaker : public TreeUpdater {
|
||||
// Returns best loss
|
||||
std::vector<DeviceSplitCandidate> EvaluateSplits(
|
||||
const std::vector<int>& nidx_set, RegTree* p_tree) {
|
||||
auto columns = info->num_col;
|
||||
auto columns = info_->num_col_;
|
||||
std::vector<DeviceSplitCandidate> best_splits(nidx_set.size());
|
||||
std::vector<DeviceSplitCandidate> candidate_splits(nidx_set.size() *
|
||||
columns);
|
||||
// Use first device
|
||||
auto& shard = shards.front();
|
||||
auto& shard = shards_.front();
|
||||
dh::safe_cuda(cudaSetDevice(shard->device_idx));
|
||||
shard->temp_memory.LazyAllocate(sizeof(DeviceSplitCandidate) * columns *
|
||||
nidx_set.size());
|
||||
@@ -708,16 +707,16 @@ class GPUHistMaker : public TreeUpdater {
|
||||
// Use streams to process nodes concurrently
|
||||
for (auto i = 0; i < nidx_set.size(); i++) {
|
||||
auto nidx = nidx_set[i];
|
||||
DeviceNodeStats node(shard->node_sum_gradients[nidx], nidx, param);
|
||||
DeviceNodeStats node(shard->node_sum_gradients[nidx], nidx, param_);
|
||||
|
||||
const int BLOCK_THREADS = 256;
|
||||
evaluate_split_kernel<BLOCK_THREADS>
|
||||
<<<uint32_t(columns), BLOCK_THREADS, 0, streams[i]>>>(
|
||||
shard->hist.GetHistPtr(nidx), nidx, info->num_col, node,
|
||||
shard->feature_segments.data(), shard->min_fvalue.data(),
|
||||
shard->gidx_fvalue_map.data(), GPUTrainingParam(param),
|
||||
shard->hist.GetHistPtr(nidx), nidx, info_->num_col_, node,
|
||||
shard->feature_segments.Data(), shard->min_fvalue.Data(),
|
||||
shard->gidx_fvalue_map.Data(), GPUTrainingParam(param_),
|
||||
d_split + i * columns, node_value_constraints_[nidx],
|
||||
shard->monotone_constraints.data());
|
||||
shard->monotone_constraints.Data());
|
||||
}
|
||||
|
||||
dh::safe_cuda(
|
||||
@@ -730,9 +729,9 @@ class GPUHistMaker : public TreeUpdater {
|
||||
DeviceSplitCandidate nidx_best;
|
||||
for (auto fidx = 0; fidx < columns; fidx++) {
|
||||
auto& candidate = candidate_splits[i * columns + fidx];
|
||||
if (column_sampler.ColumnUsed(candidate.findex,
|
||||
if (column_sampler_.ColumnUsed(candidate.findex,
|
||||
p_tree->GetDepth(nidx))) {
|
||||
nidx_best.Update(candidate_splits[i * columns + fidx], param);
|
||||
nidx_best.Update(candidate_splits[i * columns + fidx], param_);
|
||||
}
|
||||
}
|
||||
best_splits[i] = nidx_best;
|
||||
@@ -743,34 +742,34 @@ class GPUHistMaker : public TreeUpdater {
|
||||
void InitRoot(RegTree* p_tree) {
|
||||
auto root_nidx = 0;
|
||||
// Sum gradients
|
||||
std::vector<bst_gpair> tmp_sums(shards.size());
|
||||
omp_set_num_threads(shards.size());
|
||||
std::vector<GradientPair> tmp_sums(shards_.size());
|
||||
omp_set_num_threads(shards_.size());
|
||||
#pragma omp parallel
|
||||
{
|
||||
auto cpu_thread_id = omp_get_thread_num();
|
||||
auto& shard = shards[cpu_thread_id];
|
||||
auto& shard = shards_[cpu_thread_id];
|
||||
dh::safe_cuda(cudaSetDevice(shard->device_idx));
|
||||
tmp_sums[cpu_thread_id] = dh::sumReduction(
|
||||
shard->temp_memory, shard->gpair.data(), shard->gpair.size());
|
||||
tmp_sums[cpu_thread_id] = dh::SumReduction(
|
||||
shard->temp_memory, shard->gpair.Data(), shard->gpair.Size());
|
||||
}
|
||||
auto sum_gradient =
|
||||
std::accumulate(tmp_sums.begin(), tmp_sums.end(), bst_gpair_precise());
|
||||
std::accumulate(tmp_sums.begin(), tmp_sums.end(), GradientPair());
|
||||
|
||||
// Generate root histogram
|
||||
for (auto& shard : shards) {
|
||||
for (auto& shard : shards_) {
|
||||
shard->BuildHist(root_nidx);
|
||||
}
|
||||
|
||||
this->AllReduceHist(root_nidx);
|
||||
|
||||
// Remember root stats
|
||||
p_tree->stat(root_nidx).sum_hess = sum_gradient.GetHess();
|
||||
auto weight = CalcWeight(param, sum_gradient);
|
||||
p_tree->stat(root_nidx).base_weight = weight;
|
||||
(*p_tree)[root_nidx].set_leaf(param.learning_rate * weight);
|
||||
p_tree->Stat(root_nidx).sum_hess = sum_gradient.GetHess();
|
||||
auto weight = CalcWeight(param_, sum_gradient);
|
||||
p_tree->Stat(root_nidx).base_weight = weight;
|
||||
(*p_tree)[root_nidx].SetLeaf(param_.learning_rate * weight);
|
||||
|
||||
// Store sum gradients
|
||||
for (auto& shard : shards) {
|
||||
for (auto& shard : shards_) {
|
||||
shard->node_sum_gradients[root_nidx] = sum_gradient;
|
||||
}
|
||||
|
||||
@@ -785,14 +784,14 @@ class GPUHistMaker : public TreeUpdater {
|
||||
|
||||
void UpdatePosition(const ExpandEntry& candidate, RegTree* p_tree) {
|
||||
auto nidx = candidate.nid;
|
||||
auto left_nidx = (*p_tree)[nidx].cleft();
|
||||
auto right_nidx = (*p_tree)[nidx].cright();
|
||||
auto left_nidx = (*p_tree)[nidx].LeftChild();
|
||||
auto right_nidx = (*p_tree)[nidx].RightChild();
|
||||
|
||||
// convert floating-point split_pt into corresponding bin_id
|
||||
// split_cond = -1 indicates that split_pt is less than all known cut points
|
||||
auto split_gidx = -1;
|
||||
auto fidx = candidate.split.findex;
|
||||
auto default_dir_left = candidate.split.dir == LeftDir;
|
||||
auto default_dir_left = candidate.split.dir == kLeftDir;
|
||||
auto fidx_begin = hmat_.row_ptr[fidx];
|
||||
auto fidx_end = hmat_.row_ptr[fidx + 1];
|
||||
for (auto i = fidx_begin; i < fidx_end; ++i) {
|
||||
@@ -801,13 +800,13 @@ class GPUHistMaker : public TreeUpdater {
|
||||
}
|
||||
}
|
||||
|
||||
auto is_dense = info->num_nonzero == info->num_row * info->num_col;
|
||||
auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
|
||||
|
||||
omp_set_num_threads(shards.size());
|
||||
omp_set_num_threads(shards_.size());
|
||||
#pragma omp parallel
|
||||
{
|
||||
auto cpu_thread_id = omp_get_thread_num();
|
||||
shards[cpu_thread_id]->UpdatePosition(nidx, left_nidx, right_nidx, fidx,
|
||||
shards_[cpu_thread_id]->UpdatePosition(nidx, left_nidx, right_nidx, fidx,
|
||||
split_gidx, default_dir_left,
|
||||
is_dense, fidx_begin, fidx_end);
|
||||
}
|
||||
@@ -818,55 +817,55 @@ class GPUHistMaker : public TreeUpdater {
|
||||
RegTree& tree = *p_tree;
|
||||
tree.AddChilds(candidate.nid);
|
||||
auto& parent = tree[candidate.nid];
|
||||
parent.set_split(candidate.split.findex, candidate.split.fvalue,
|
||||
candidate.split.dir == LeftDir);
|
||||
tree.stat(candidate.nid).loss_chg = candidate.split.loss_chg;
|
||||
parent.SetSplit(candidate.split.findex, candidate.split.fvalue,
|
||||
candidate.split.dir == kLeftDir);
|
||||
tree.Stat(candidate.nid).loss_chg = candidate.split.loss_chg;
|
||||
|
||||
// Set up child constraints
|
||||
node_value_constraints_.resize(tree.GetNodes().size());
|
||||
GradStats left_stats(param);
|
||||
GradStats left_stats(param_);
|
||||
left_stats.Add(candidate.split.left_sum);
|
||||
GradStats right_stats(param);
|
||||
GradStats right_stats(param_);
|
||||
right_stats.Add(candidate.split.right_sum);
|
||||
node_value_constraints_[candidate.nid].SetChild(
|
||||
param, parent.split_index(), left_stats, right_stats,
|
||||
&node_value_constraints_[parent.cleft()],
|
||||
&node_value_constraints_[parent.cright()]);
|
||||
param_, parent.SplitIndex(), left_stats, right_stats,
|
||||
&node_value_constraints_[parent.LeftChild()],
|
||||
&node_value_constraints_[parent.RightChild()]);
|
||||
|
||||
// Configure left child
|
||||
auto left_weight =
|
||||
node_value_constraints_[parent.cleft()].CalcWeight(param, left_stats);
|
||||
tree[parent.cleft()].set_leaf(left_weight * param.learning_rate, 0);
|
||||
tree.stat(parent.cleft()).base_weight = left_weight;
|
||||
tree.stat(parent.cleft()).sum_hess = candidate.split.left_sum.GetHess();
|
||||
node_value_constraints_[parent.LeftChild()].CalcWeight(param_, left_stats);
|
||||
tree[parent.LeftChild()].SetLeaf(left_weight * param_.learning_rate, 0);
|
||||
tree.Stat(parent.LeftChild()).base_weight = left_weight;
|
||||
tree.Stat(parent.LeftChild()).sum_hess = candidate.split.left_sum.GetHess();
|
||||
|
||||
// Configure right child
|
||||
auto right_weight =
|
||||
node_value_constraints_[parent.cright()].CalcWeight(param, right_stats);
|
||||
tree[parent.cright()].set_leaf(right_weight * param.learning_rate, 0);
|
||||
tree.stat(parent.cright()).base_weight = right_weight;
|
||||
tree.stat(parent.cright()).sum_hess = candidate.split.right_sum.GetHess();
|
||||
node_value_constraints_[parent.RightChild()].CalcWeight(param_, right_stats);
|
||||
tree[parent.RightChild()].SetLeaf(right_weight * param_.learning_rate, 0);
|
||||
tree.Stat(parent.RightChild()).base_weight = right_weight;
|
||||
tree.Stat(parent.RightChild()).sum_hess = candidate.split.right_sum.GetHess();
|
||||
// Store sum gradients
|
||||
for (auto& shard : shards) {
|
||||
shard->node_sum_gradients[parent.cleft()] = candidate.split.left_sum;
|
||||
shard->node_sum_gradients[parent.cright()] = candidate.split.right_sum;
|
||||
for (auto& shard : shards_) {
|
||||
shard->node_sum_gradients[parent.LeftChild()] = candidate.split.left_sum;
|
||||
shard->node_sum_gradients[parent.RightChild()] = candidate.split.right_sum;
|
||||
}
|
||||
this->UpdatePosition(candidate, p_tree);
|
||||
}
|
||||
|
||||
void UpdateTree(HostDeviceVector<bst_gpair>* gpair, DMatrix* p_fmat,
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
|
||||
RegTree* p_tree) {
|
||||
// Temporarily store number of threads so we can change it back later
|
||||
int nthread = omp_get_max_threads();
|
||||
|
||||
auto& tree = *p_tree;
|
||||
|
||||
monitor.Start("InitData", dList);
|
||||
monitor_.Start("InitData", device_list_);
|
||||
this->InitData(gpair, p_fmat, *p_tree);
|
||||
monitor.Stop("InitData", dList);
|
||||
monitor.Start("InitRoot", dList);
|
||||
monitor_.Stop("InitData", device_list_);
|
||||
monitor_.Start("InitRoot", device_list_);
|
||||
this->InitRoot(p_tree);
|
||||
monitor.Stop("InitRoot", dList);
|
||||
monitor_.Stop("InitRoot", device_list_);
|
||||
|
||||
auto timestamp = qexpand_->size();
|
||||
auto num_leaves = 1;
|
||||
@@ -874,25 +873,25 @@ class GPUHistMaker : public TreeUpdater {
|
||||
while (!qexpand_->empty()) {
|
||||
auto candidate = qexpand_->top();
|
||||
qexpand_->pop();
|
||||
if (!candidate.IsValid(param, num_leaves)) continue;
|
||||
if (!candidate.IsValid(param_, num_leaves)) continue;
|
||||
// std::cout << candidate;
|
||||
monitor.Start("ApplySplit", dList);
|
||||
monitor_.Start("ApplySplit", device_list_);
|
||||
this->ApplySplit(candidate, p_tree);
|
||||
monitor.Stop("ApplySplit", dList);
|
||||
monitor_.Stop("ApplySplit", device_list_);
|
||||
num_leaves++;
|
||||
|
||||
auto left_child_nidx = tree[candidate.nid].cleft();
|
||||
auto right_child_nidx = tree[candidate.nid].cright();
|
||||
auto left_child_nidx = tree[candidate.nid].LeftChild();
|
||||
auto right_child_nidx = tree[candidate.nid].RightChild();
|
||||
|
||||
// Only create child entries if needed
|
||||
if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
|
||||
if (ExpandEntry::ChildIsValid(param_, tree.GetDepth(left_child_nidx),
|
||||
num_leaves)) {
|
||||
monitor.Start("BuildHist", dList);
|
||||
monitor_.Start("BuildHist", device_list_);
|
||||
this->BuildHistLeftRight(candidate.nid, left_child_nidx,
|
||||
right_child_nidx);
|
||||
monitor.Stop("BuildHist", dList);
|
||||
monitor_.Stop("BuildHist", device_list_);
|
||||
|
||||
monitor.Start("EvaluateSplits", dList);
|
||||
monitor_.Start("EvaluateSplits", device_list_);
|
||||
auto splits =
|
||||
this->EvaluateSplits({left_child_nidx, right_child_nidx}, p_tree);
|
||||
qexpand_->push(ExpandEntry(left_child_nidx,
|
||||
@@ -901,7 +900,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
qexpand_->push(ExpandEntry(right_child_nidx,
|
||||
tree.GetDepth(right_child_nidx), splits[1],
|
||||
timestamp++));
|
||||
monitor.Stop("EvaluateSplits", dList);
|
||||
monitor_.Stop("EvaluateSplits", device_list_);
|
||||
}
|
||||
}
|
||||
// Reset omp num threads
|
||||
@@ -910,17 +909,17 @@ class GPUHistMaker : public TreeUpdater {
|
||||
|
||||
bool UpdatePredictionCache(
|
||||
const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) override {
|
||||
monitor.Start("UpdatePredictionCache", dList);
|
||||
if (shards.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data)
|
||||
monitor_.Start("UpdatePredictionCache", device_list_);
|
||||
if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data)
|
||||
return false;
|
||||
|
||||
bst_float* out_preds_d = p_out_preds->ptr_d(param.gpu_id);
|
||||
bst_float* out_preds_d = p_out_preds->DevicePointer(param_.gpu_id);
|
||||
|
||||
#pragma omp parallel for schedule(static, 1)
|
||||
for (int shard = 0; shard < shards.size(); ++shard) {
|
||||
shards[shard]->UpdatePredictionCache(out_preds_d);
|
||||
for (int shard = 0; shard < shards_.size(); ++shard) {
|
||||
shards_[shard]->UpdatePredictionCache(out_preds_d);
|
||||
}
|
||||
monitor.Stop("UpdatePredictionCache", dList);
|
||||
monitor_.Stop("UpdatePredictionCache", device_list_);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -933,7 +932,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
uint64_t timestamp)
|
||||
: nid(nid), depth(depth), split(split), timestamp(timestamp) {}
|
||||
bool IsValid(const TrainParam& param, int num_leaves) const {
|
||||
if (split.loss_chg <= rt_eps) return false;
|
||||
if (split.loss_chg <= kRtEps) return false;
|
||||
if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0)
|
||||
return false;
|
||||
if (param.max_depth > 0 && depth == param.max_depth) return false;
|
||||
@@ -959,38 +958,38 @@ class GPUHistMaker : public TreeUpdater {
|
||||
}
|
||||
};
|
||||
|
||||
inline static bool depth_wise(ExpandEntry lhs, ExpandEntry rhs) {
|
||||
inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) {
|
||||
if (lhs.depth == rhs.depth) {
|
||||
return lhs.timestamp > rhs.timestamp; // favor small timestamp
|
||||
} else {
|
||||
return lhs.depth > rhs.depth; // favor small depth
|
||||
}
|
||||
}
|
||||
inline static bool loss_guide(ExpandEntry lhs, ExpandEntry rhs) {
|
||||
inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
|
||||
if (lhs.split.loss_chg == rhs.split.loss_chg) {
|
||||
return lhs.timestamp > rhs.timestamp; // favor small timestamp
|
||||
} else {
|
||||
return lhs.split.loss_chg < rhs.split.loss_chg; // favor large loss_chg
|
||||
}
|
||||
}
|
||||
TrainParam param;
|
||||
TrainParam param_;
|
||||
common::HistCutMatrix hmat_;
|
||||
common::GHistIndexMatrix gmat_;
|
||||
MetaInfo* info;
|
||||
bool initialised;
|
||||
int n_devices;
|
||||
int n_bins;
|
||||
MetaInfo* info_;
|
||||
bool initialised_;
|
||||
int n_devices_;
|
||||
int n_bins_;
|
||||
|
||||
std::vector<std::unique_ptr<DeviceShard>> shards;
|
||||
ColumnSampler column_sampler;
|
||||
std::vector<std::unique_ptr<DeviceShard>> shards_;
|
||||
ColumnSampler column_sampler_;
|
||||
typedef std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
|
||||
std::function<bool(ExpandEntry, ExpandEntry)>>
|
||||
ExpandQueue;
|
||||
std::unique_ptr<ExpandQueue> qexpand_;
|
||||
common::Monitor monitor;
|
||||
dh::AllReducer reducer;
|
||||
common::Monitor monitor_;
|
||||
dh::AllReducer reducer_;
|
||||
std::vector<ValueConstraint> node_value_constraints_;
|
||||
std::vector<int> dList;
|
||||
std::vector<int> device_list_;
|
||||
|
||||
DMatrix* p_last_fmat_;
|
||||
};
|
||||
|
||||
@@ -21,18 +21,18 @@ DMLC_REGISTRY_FILE_TAG(updater_histmaker);
|
||||
template<typename TStats>
|
||||
class HistMaker: public BaseMaker {
|
||||
public:
|
||||
void Update(HostDeviceVector<bst_gpair> *gpair,
|
||||
void Update(HostDeviceVector<GradientPair> *gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector<RegTree*> &trees) override {
|
||||
TStats::CheckInfo(p_fmat->info());
|
||||
TStats::CheckInfo(p_fmat->Info());
|
||||
// rescale learning rate according to size of trees
|
||||
float lr = param.learning_rate;
|
||||
param.learning_rate = lr / trees.size();
|
||||
float lr = param_.learning_rate;
|
||||
param_.learning_rate = lr / trees.size();
|
||||
// build tree
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
this->Update(gpair->data_h(), p_fmat, trees[i]);
|
||||
for (auto tree : trees) {
|
||||
this->Update(gpair->HostVector(), p_fmat, tree);
|
||||
}
|
||||
param.learning_rate = lr;
|
||||
param_.learning_rate = lr;
|
||||
}
|
||||
|
||||
protected:
|
||||
@@ -45,13 +45,13 @@ class HistMaker: public BaseMaker {
|
||||
/*! \brief size of histogram */
|
||||
unsigned size;
|
||||
// default constructor
|
||||
HistUnit() {}
|
||||
HistUnit() = default;
|
||||
// constructor
|
||||
HistUnit(const bst_float *cut, TStats *data, unsigned size)
|
||||
: cut(cut), data(data), size(size) {}
|
||||
/*! \brief add a histogram to data */
|
||||
inline void Add(bst_float fv,
|
||||
const std::vector<bst_gpair> &gpair,
|
||||
const std::vector<GradientPair> &gpair,
|
||||
const MetaInfo &info,
|
||||
const bst_uint ridx) {
|
||||
unsigned i = std::upper_bound(cut, cut + size, fv) - cut;
|
||||
@@ -116,44 +116,44 @@ class HistMaker: public BaseMaker {
|
||||
}
|
||||
};
|
||||
// workspace of thread
|
||||
ThreadWSpace wspace;
|
||||
ThreadWSpace wspace_;
|
||||
// reducer for histogram
|
||||
rabit::Reducer<TStats, TStats::Reduce> histred;
|
||||
rabit::Reducer<TStats, TStats::Reduce> histred_;
|
||||
// set of working features
|
||||
std::vector<bst_uint> fwork_set;
|
||||
std::vector<bst_uint> fwork_set_;
|
||||
// update function implementation
|
||||
virtual void Update(const std::vector<bst_gpair> &gpair,
|
||||
virtual void Update(const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
RegTree *p_tree) {
|
||||
this->InitData(gpair, *p_fmat, *p_tree);
|
||||
this->InitWorkSet(p_fmat, *p_tree, &fwork_set);
|
||||
this->InitWorkSet(p_fmat, *p_tree, &fwork_set_);
|
||||
// mark root node as fresh.
|
||||
for (int i = 0; i < p_tree->param.num_roots; ++i) {
|
||||
(*p_tree)[i].set_leaf(0.0f, 0);
|
||||
(*p_tree)[i].SetLeaf(0.0f, 0);
|
||||
}
|
||||
|
||||
for (int depth = 0; depth < param.max_depth; ++depth) {
|
||||
for (int depth = 0; depth < param_.max_depth; ++depth) {
|
||||
// reset and propose candidate split
|
||||
this->ResetPosAndPropose(gpair, p_fmat, fwork_set, *p_tree);
|
||||
this->ResetPosAndPropose(gpair, p_fmat, fwork_set_, *p_tree);
|
||||
// create histogram
|
||||
this->CreateHist(gpair, p_fmat, fwork_set, *p_tree);
|
||||
this->CreateHist(gpair, p_fmat, fwork_set_, *p_tree);
|
||||
// find split based on histogram statistics
|
||||
this->FindSplit(depth, gpair, p_fmat, fwork_set, p_tree);
|
||||
this->FindSplit(depth, gpair, p_fmat, fwork_set_, p_tree);
|
||||
// reset position after split
|
||||
this->ResetPositionAfterSplit(p_fmat, *p_tree);
|
||||
this->UpdateQueueExpand(*p_tree);
|
||||
// if nothing left to be expand, break
|
||||
if (qexpand.size() == 0) break;
|
||||
if (qexpand_.size() == 0) break;
|
||||
}
|
||||
for (size_t i = 0; i < qexpand.size(); ++i) {
|
||||
const int nid = qexpand[i];
|
||||
(*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
|
||||
for (size_t i = 0; i < qexpand_.size(); ++i) {
|
||||
const int nid = qexpand_[i];
|
||||
(*p_tree)[nid].SetLeaf(p_tree->Stat(nid).base_weight * param_.learning_rate);
|
||||
}
|
||||
}
|
||||
// this function does two jobs
|
||||
// (1) reset the position in array position, to be the latest leaf id
|
||||
// (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly
|
||||
virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
|
||||
virtual void ResetPosAndPropose(const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector <bst_uint> &fset,
|
||||
const RegTree &tree) = 0;
|
||||
@@ -170,7 +170,7 @@ class HistMaker: public BaseMaker {
|
||||
virtual void ResetPositionAfterSplit(DMatrix *p_fmat,
|
||||
const RegTree &tree) {
|
||||
}
|
||||
virtual void CreateHist(const std::vector<bst_gpair> &gpair,
|
||||
virtual void CreateHist(const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector <bst_uint> &fset,
|
||||
const RegTree &tree) = 0;
|
||||
@@ -183,14 +183,14 @@ class HistMaker: public BaseMaker {
|
||||
TStats *left_sum) {
|
||||
if (hist.size == 0) return;
|
||||
|
||||
double root_gain = node_sum.CalcGain(param);
|
||||
TStats s(param), c(param);
|
||||
double root_gain = node_sum.CalcGain(param_);
|
||||
TStats s(param_), c(param_);
|
||||
for (bst_uint i = 0; i < hist.size; ++i) {
|
||||
s.Add(hist.data[i]);
|
||||
if (s.sum_hess >= param.min_child_weight) {
|
||||
if (s.sum_hess >= param_.min_child_weight) {
|
||||
c.SetSubstract(node_sum, s);
|
||||
if (c.sum_hess >= param.min_child_weight) {
|
||||
double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
|
||||
if (c.sum_hess >= param_.min_child_weight) {
|
||||
double loss_chg = s.CalcGain(param_) + c.CalcGain(param_) - root_gain;
|
||||
if (best->Update(static_cast<bst_float>(loss_chg), fid, hist.cut[i], false)) {
|
||||
*left_sum = s;
|
||||
}
|
||||
@@ -200,10 +200,10 @@ class HistMaker: public BaseMaker {
|
||||
s.Clear();
|
||||
for (bst_uint i = hist.size - 1; i != 0; --i) {
|
||||
s.Add(hist.data[i]);
|
||||
if (s.sum_hess >= param.min_child_weight) {
|
||||
if (s.sum_hess >= param_.min_child_weight) {
|
||||
c.SetSubstract(node_sum, s);
|
||||
if (c.sum_hess >= param.min_child_weight) {
|
||||
double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
|
||||
if (c.sum_hess >= param_.min_child_weight) {
|
||||
double loss_chg = s.CalcGain(param_) + c.CalcGain(param_) - root_gain;
|
||||
if (best->Update(static_cast<bst_float>(loss_chg), fid, hist.cut[i-1], true)) {
|
||||
*left_sum = c;
|
||||
}
|
||||
@@ -212,65 +212,64 @@ class HistMaker: public BaseMaker {
|
||||
}
|
||||
}
|
||||
inline void FindSplit(int depth,
|
||||
const std::vector<bst_gpair> &gpair,
|
||||
const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector <bst_uint> &fset,
|
||||
RegTree *p_tree) {
|
||||
const size_t num_feature = fset.size();
|
||||
// get the best split condition for each node
|
||||
std::vector<SplitEntry> sol(qexpand.size());
|
||||
std::vector<TStats> left_sum(qexpand.size());
|
||||
bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
|
||||
std::vector<SplitEntry> sol(qexpand_.size());
|
||||
std::vector<TStats> left_sum(qexpand_.size());
|
||||
auto nexpand = static_cast<bst_omp_uint>(qexpand_.size());
|
||||
#pragma omp parallel for schedule(dynamic, 1)
|
||||
for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
|
||||
const int nid = qexpand[wid];
|
||||
CHECK_EQ(node2workindex[nid], static_cast<int>(wid));
|
||||
const int nid = qexpand_[wid];
|
||||
CHECK_EQ(node2workindex_[nid], static_cast<int>(wid));
|
||||
SplitEntry &best = sol[wid];
|
||||
TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
|
||||
TStats &node_sum = wspace_.hset[0][num_feature + wid * (num_feature + 1)].data[0];
|
||||
for (size_t i = 0; i < fset.size(); ++i) {
|
||||
EnumerateSplit(this->wspace.hset[0][i + wid * (num_feature+1)],
|
||||
EnumerateSplit(this->wspace_.hset[0][i + wid * (num_feature+1)],
|
||||
node_sum, fset[i], &best, &left_sum[wid]);
|
||||
}
|
||||
}
|
||||
// get the best result, we can synchronize the solution
|
||||
for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
|
||||
const int nid = qexpand[wid];
|
||||
const int nid = qexpand_[wid];
|
||||
const SplitEntry &best = sol[wid];
|
||||
const TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
|
||||
const TStats &node_sum = wspace_.hset[0][num_feature + wid * (num_feature + 1)].data[0];
|
||||
this->SetStats(p_tree, nid, node_sum);
|
||||
// set up the values
|
||||
p_tree->stat(nid).loss_chg = best.loss_chg;
|
||||
p_tree->Stat(nid).loss_chg = best.loss_chg;
|
||||
// now we know the solution in snode[nid], set split
|
||||
if (best.loss_chg > rt_eps) {
|
||||
if (best.loss_chg > kRtEps) {
|
||||
p_tree->AddChilds(nid);
|
||||
(*p_tree)[nid].set_split(best.split_index(),
|
||||
best.split_value, best.default_left());
|
||||
(*p_tree)[nid].SetSplit(best.SplitIndex(),
|
||||
best.split_value, best.DefaultLeft());
|
||||
// mark right child as 0, to indicate fresh leaf
|
||||
(*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
|
||||
(*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
|
||||
(*p_tree)[(*p_tree)[nid].LeftChild()].SetLeaf(0.0f, 0);
|
||||
(*p_tree)[(*p_tree)[nid].RightChild()].SetLeaf(0.0f, 0);
|
||||
// right side sum
|
||||
TStats right_sum;
|
||||
right_sum.SetSubstract(node_sum, left_sum[wid]);
|
||||
this->SetStats(p_tree, (*p_tree)[nid].cleft(), left_sum[wid]);
|
||||
this->SetStats(p_tree, (*p_tree)[nid].cright(), right_sum);
|
||||
this->SetStats(p_tree, (*p_tree)[nid].LeftChild(), left_sum[wid]);
|
||||
this->SetStats(p_tree, (*p_tree)[nid].RightChild(), right_sum);
|
||||
} else {
|
||||
(*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
|
||||
(*p_tree)[nid].SetLeaf(p_tree->Stat(nid).base_weight * param_.learning_rate);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void SetStats(RegTree *p_tree, int nid, const TStats &node_sum) {
|
||||
p_tree->stat(nid).base_weight = static_cast<bst_float>(node_sum.CalcWeight(param));
|
||||
p_tree->stat(nid).sum_hess = static_cast<bst_float>(node_sum.sum_hess);
|
||||
node_sum.SetLeafVec(param, p_tree->leafvec(nid));
|
||||
p_tree->Stat(nid).base_weight = static_cast<bst_float>(node_sum.CalcWeight(param_));
|
||||
p_tree->Stat(nid).sum_hess = static_cast<bst_float>(node_sum.sum_hess);
|
||||
node_sum.SetLeafVec(param_, p_tree->Leafvec(nid));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename TStats>
|
||||
class CQHistMaker: public HistMaker<TStats> {
|
||||
public:
|
||||
CQHistMaker() : cache_dmatrix_(nullptr) {
|
||||
}
|
||||
CQHistMaker() = default;
|
||||
|
||||
protected:
|
||||
struct HistEntry {
|
||||
@@ -281,7 +280,7 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
* do linear scan, start from istart
|
||||
*/
|
||||
inline void Add(bst_float fv,
|
||||
const std::vector<bst_gpair> &gpair,
|
||||
const std::vector<GradientPair> &gpair,
|
||||
const MetaInfo &info,
|
||||
const bst_uint ridx) {
|
||||
while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
|
||||
@@ -293,7 +292,7 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
* do linear scan, start from istart
|
||||
*/
|
||||
inline void Add(bst_float fv,
|
||||
bst_gpair gstats) {
|
||||
GradientPair gstats) {
|
||||
if (fv < hist.cut[istart]) {
|
||||
hist.data[istart].Add(gstats);
|
||||
} else {
|
||||
@@ -311,190 +310,190 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
}
|
||||
};
|
||||
// sketch type used for this
|
||||
typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
|
||||
using WXQSketch = common::WXQuantileSketch<bst_float, bst_float>;
|
||||
// initialize the work set of tree
|
||||
void InitWorkSet(DMatrix *p_fmat,
|
||||
const RegTree &tree,
|
||||
std::vector<bst_uint> *p_fset) override {
|
||||
if (p_fmat != cache_dmatrix_) {
|
||||
feat_helper.InitByCol(p_fmat, tree);
|
||||
feat_helper_.InitByCol(p_fmat, tree);
|
||||
cache_dmatrix_ = p_fmat;
|
||||
}
|
||||
feat_helper.SyncInfo();
|
||||
feat_helper.SampleCol(this->param.colsample_bytree, p_fset);
|
||||
feat_helper_.SyncInfo();
|
||||
feat_helper_.SampleCol(this->param_.colsample_bytree, p_fset);
|
||||
}
|
||||
// code to create histogram
|
||||
void CreateHist(const std::vector<bst_gpair> &gpair,
|
||||
void CreateHist(const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector<bst_uint> &fset,
|
||||
const RegTree &tree) override {
|
||||
const MetaInfo &info = p_fmat->info();
|
||||
const MetaInfo &info = p_fmat->Info();
|
||||
// fill in reverse map
|
||||
feat2workindex.resize(tree.param.num_feature);
|
||||
std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
|
||||
feat2workindex_.resize(tree.param.num_feature);
|
||||
std::fill(feat2workindex_.begin(), feat2workindex_.end(), -1);
|
||||
for (size_t i = 0; i < fset.size(); ++i) {
|
||||
feat2workindex[fset[i]] = static_cast<int>(i);
|
||||
feat2workindex_[fset[i]] = static_cast<int>(i);
|
||||
}
|
||||
// start to work
|
||||
this->wspace.Init(this->param, 1);
|
||||
this->wspace_.Init(this->param_, 1);
|
||||
// if it is C++11, use lazy evaluation for Allreduce,
|
||||
// to gain speedup in recovery
|
||||
#if __cplusplus >= 201103L
|
||||
auto lazy_get_hist = [&]()
|
||||
#endif
|
||||
{
|
||||
thread_hist.resize(omp_get_max_threads());
|
||||
thread_hist_.resize(omp_get_max_threads());
|
||||
// start accumulating statistics
|
||||
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(fset);
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const ColBatch &batch = iter->Value();
|
||||
// start enumeration
|
||||
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(dynamic, 1)
|
||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||
int offset = feat2workindex[batch.col_index[i]];
|
||||
int offset = feat2workindex_[batch.col_index[i]];
|
||||
if (offset >= 0) {
|
||||
this->UpdateHistCol(gpair, batch[i], info, tree,
|
||||
fset, offset,
|
||||
&thread_hist[omp_get_thread_num()]);
|
||||
&thread_hist_[omp_get_thread_num()]);
|
||||
}
|
||||
}
|
||||
}
|
||||
// update node statistics.
|
||||
this->GetNodeStats(gpair, *p_fmat, tree,
|
||||
&thread_stats, &node_stats);
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const int nid = this->qexpand[i];
|
||||
const int wid = this->node2workindex[nid];
|
||||
this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
|
||||
.data[0] = node_stats[nid];
|
||||
&thread_stats_, &node_stats_);
|
||||
for (size_t i = 0; i < this->qexpand_.size(); ++i) {
|
||||
const int nid = this->qexpand_[i];
|
||||
const int wid = this->node2workindex_[nid];
|
||||
this->wspace_.hset[0][fset.size() + wid * (fset.size()+1)]
|
||||
.data[0] = node_stats_[nid];
|
||||
}
|
||||
};
|
||||
// sync the histogram
|
||||
// if it is C++11, use lazy evaluation for Allreduce
|
||||
#if __cplusplus >= 201103L
|
||||
this->histred.Allreduce(dmlc::BeginPtr(this->wspace.hset[0].data),
|
||||
this->wspace.hset[0].data.size(), lazy_get_hist);
|
||||
this->histred_.Allreduce(dmlc::BeginPtr(this->wspace_.hset[0].data),
|
||||
this->wspace_.hset[0].data.size(), lazy_get_hist);
|
||||
#else
|
||||
this->histred.Allreduce(dmlc::BeginPtr(this->wspace.hset[0].data),
|
||||
this->wspace.hset[0].data.size());
|
||||
this->histred_.Allreduce(dmlc::BeginPtr(this->wspace_.hset[0].data),
|
||||
this->wspace_.hset[0].data.size());
|
||||
#endif
|
||||
}
|
||||
void ResetPositionAfterSplit(DMatrix *p_fmat,
|
||||
const RegTree &tree) override {
|
||||
this->GetSplitSet(this->qexpand, tree, &fsplit_set);
|
||||
this->GetSplitSet(this->qexpand_, tree, &fsplit_set_);
|
||||
}
|
||||
void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
|
||||
void ResetPosAndPropose(const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector<bst_uint> &fset,
|
||||
const RegTree &tree) override {
|
||||
const MetaInfo &info = p_fmat->info();
|
||||
const MetaInfo &info = p_fmat->Info();
|
||||
// fill in reverse map
|
||||
feat2workindex.resize(tree.param.num_feature);
|
||||
std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
|
||||
work_set.clear();
|
||||
for (size_t i = 0; i < fset.size(); ++i) {
|
||||
if (feat_helper.Type(fset[i]) == 2) {
|
||||
feat2workindex[fset[i]] = static_cast<int>(work_set.size());
|
||||
work_set.push_back(fset[i]);
|
||||
feat2workindex_.resize(tree.param.num_feature);
|
||||
std::fill(feat2workindex_.begin(), feat2workindex_.end(), -1);
|
||||
work_set_.clear();
|
||||
for (auto fidx : fset) {
|
||||
if (feat_helper_.Type(fidx) == 2) {
|
||||
feat2workindex_[fidx] = static_cast<int>(work_set_.size());
|
||||
work_set_.push_back(fidx);
|
||||
} else {
|
||||
feat2workindex[fset[i]] = -2;
|
||||
feat2workindex_[fidx] = -2;
|
||||
}
|
||||
}
|
||||
const size_t work_set_size = work_set.size();
|
||||
const size_t work_set_size = work_set_.size();
|
||||
|
||||
sketchs.resize(this->qexpand.size() * work_set_size);
|
||||
for (size_t i = 0; i < sketchs.size(); ++i) {
|
||||
sketchs[i].Init(info.num_row, this->param.sketch_eps);
|
||||
sketchs_.resize(this->qexpand_.size() * work_set_size);
|
||||
for (size_t i = 0; i < sketchs_.size(); ++i) {
|
||||
sketchs_[i].Init(info.num_row_, this->param_.sketch_eps);
|
||||
}
|
||||
// intitialize the summary array
|
||||
summary_array.resize(sketchs.size());
|
||||
summary_array_.resize(sketchs_.size());
|
||||
// setup maximum size
|
||||
unsigned max_size = this->param.max_sketch_size();
|
||||
for (size_t i = 0; i < sketchs.size(); ++i) {
|
||||
summary_array[i].Reserve(max_size);
|
||||
unsigned max_size = this->param_.MaxSketchSize();
|
||||
for (size_t i = 0; i < sketchs_.size(); ++i) {
|
||||
summary_array_[i].Reserve(max_size);
|
||||
}
|
||||
{
|
||||
// get smmary
|
||||
thread_sketch.resize(omp_get_max_threads());
|
||||
thread_sketch_.resize(omp_get_max_threads());
|
||||
|
||||
// TWOPASS: use the real set + split set in the column iteration.
|
||||
this->SetDefaultPostion(p_fmat, tree);
|
||||
work_set.insert(work_set.end(), fsplit_set.begin(), fsplit_set.end());
|
||||
std::sort(work_set.begin(), work_set.end());
|
||||
work_set.resize(std::unique(work_set.begin(), work_set.end()) - work_set.begin());
|
||||
work_set_.insert(work_set_.end(), fsplit_set_.begin(), fsplit_set_.end());
|
||||
std::sort(work_set_.begin(), work_set_.end());
|
||||
work_set_.resize(std::unique(work_set_.begin(), work_set_.end()) - work_set_.begin());
|
||||
|
||||
// start accumulating statistics
|
||||
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(work_set);
|
||||
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(work_set_);
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const ColBatch &batch = iter->Value();
|
||||
// TWOPASS: use the real set + split set in the column iteration.
|
||||
this->CorrectNonDefaultPositionByBatch(batch, fsplit_set, tree);
|
||||
this->CorrectNonDefaultPositionByBatch(batch, fsplit_set_, tree);
|
||||
|
||||
// start enumeration
|
||||
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(dynamic, 1)
|
||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||
int offset = feat2workindex[batch.col_index[i]];
|
||||
int offset = feat2workindex_[batch.col_index[i]];
|
||||
if (offset >= 0) {
|
||||
this->UpdateSketchCol(gpair, batch[i], tree,
|
||||
work_set_size, offset,
|
||||
&thread_sketch[omp_get_thread_num()]);
|
||||
&thread_sketch_[omp_get_thread_num()]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < sketchs.size(); ++i) {
|
||||
for (size_t i = 0; i < sketchs_.size(); ++i) {
|
||||
common::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
|
||||
sketchs[i].GetSummary(&out);
|
||||
summary_array[i].SetPrune(out, max_size);
|
||||
sketchs_[i].GetSummary(&out);
|
||||
summary_array_[i].SetPrune(out, max_size);
|
||||
}
|
||||
CHECK_EQ(summary_array.size(), sketchs.size());
|
||||
CHECK_EQ(summary_array_.size(), sketchs_.size());
|
||||
}
|
||||
if (summary_array.size() != 0) {
|
||||
if (summary_array_.size() != 0) {
|
||||
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
|
||||
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
|
||||
sreducer_.Allreduce(dmlc::BeginPtr(summary_array_), nbytes, summary_array_.size());
|
||||
}
|
||||
// now we get the final result of sketch, setup the cut
|
||||
this->wspace.cut.clear();
|
||||
this->wspace.rptr.clear();
|
||||
this->wspace.rptr.push_back(0);
|
||||
for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
|
||||
for (size_t i = 0; i < fset.size(); ++i) {
|
||||
int offset = feat2workindex[fset[i]];
|
||||
this->wspace_.cut.clear();
|
||||
this->wspace_.rptr.clear();
|
||||
this->wspace_.rptr.push_back(0);
|
||||
for (size_t wid = 0; wid < this->qexpand_.size(); ++wid) {
|
||||
for (unsigned int i : fset) {
|
||||
int offset = feat2workindex_[i];
|
||||
if (offset >= 0) {
|
||||
const WXQSketch::Summary &a = summary_array[wid * work_set_size + offset];
|
||||
const WXQSketch::Summary &a = summary_array_[wid * work_set_size + offset];
|
||||
for (size_t i = 1; i < a.size; ++i) {
|
||||
bst_float cpt = a.data[i].value - rt_eps;
|
||||
if (i == 1 || cpt > this->wspace.cut.back()) {
|
||||
this->wspace.cut.push_back(cpt);
|
||||
bst_float cpt = a.data[i].value - kRtEps;
|
||||
if (i == 1 || cpt > this->wspace_.cut.back()) {
|
||||
this->wspace_.cut.push_back(cpt);
|
||||
}
|
||||
}
|
||||
// push a value that is greater than anything
|
||||
if (a.size != 0) {
|
||||
bst_float cpt = a.data[a.size - 1].value;
|
||||
// this must be bigger than last value in a scale
|
||||
bst_float last = cpt + fabs(cpt) + rt_eps;
|
||||
this->wspace.cut.push_back(last);
|
||||
bst_float last = cpt + fabs(cpt) + kRtEps;
|
||||
this->wspace_.cut.push_back(last);
|
||||
}
|
||||
this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
|
||||
this->wspace_.rptr.push_back(static_cast<unsigned>(this->wspace_.cut.size()));
|
||||
} else {
|
||||
CHECK_EQ(offset, -2);
|
||||
bst_float cpt = feat_helper.MaxValue(fset[i]);
|
||||
this->wspace.cut.push_back(cpt + fabs(cpt) + rt_eps);
|
||||
this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
|
||||
bst_float cpt = feat_helper_.MaxValue(i);
|
||||
this->wspace_.cut.push_back(cpt + fabs(cpt) + kRtEps);
|
||||
this->wspace_.rptr.push_back(static_cast<unsigned>(this->wspace_.cut.size()));
|
||||
}
|
||||
}
|
||||
// reserve last value for global statistics
|
||||
this->wspace.cut.push_back(0.0f);
|
||||
this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
|
||||
this->wspace_.cut.push_back(0.0f);
|
||||
this->wspace_.rptr.push_back(static_cast<unsigned>(this->wspace_.cut.size()));
|
||||
}
|
||||
CHECK_EQ(this->wspace.rptr.size(),
|
||||
(fset.size() + 1) * this->qexpand.size() + 1);
|
||||
CHECK_EQ(this->wspace_.rptr.size(),
|
||||
(fset.size() + 1) * this->qexpand_.size() + 1);
|
||||
}
|
||||
|
||||
inline void UpdateHistCol(const std::vector<bst_gpair> &gpair,
|
||||
inline void UpdateHistCol(const std::vector<GradientPair> &gpair,
|
||||
const ColBatch::Inst &c,
|
||||
const MetaInfo &info,
|
||||
const RegTree &tree,
|
||||
@@ -505,21 +504,21 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
// initialize sbuilder for use
|
||||
std::vector<HistEntry> &hbuilder = *p_temp;
|
||||
hbuilder.resize(tree.param.num_nodes);
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const unsigned nid = this->qexpand[i];
|
||||
const unsigned wid = this->node2workindex[nid];
|
||||
for (size_t i = 0; i < this->qexpand_.size(); ++i) {
|
||||
const unsigned nid = this->qexpand_[i];
|
||||
const unsigned wid = this->node2workindex_[nid];
|
||||
hbuilder[nid].istart = 0;
|
||||
hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)];
|
||||
hbuilder[nid].hist = this->wspace_.hset[0][fid_offset + wid * (fset.size()+1)];
|
||||
}
|
||||
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
|
||||
const bst_uint kBuffer = 32;
|
||||
if (TStats::kSimpleStats != 0 && this->param_.cache_opt != 0) {
|
||||
constexpr bst_uint kBuffer = 32;
|
||||
bst_uint align_length = c.length / kBuffer * kBuffer;
|
||||
int buf_position[kBuffer];
|
||||
bst_gpair buf_gpair[kBuffer];
|
||||
GradientPair buf_gpair[kBuffer];
|
||||
for (bst_uint j = 0; j < align_length; j += kBuffer) {
|
||||
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||
bst_uint ridx = c[j + i].index;
|
||||
buf_position[i] = this->position[ridx];
|
||||
buf_position[i] = this->position_[ridx];
|
||||
buf_gpair[i] = gpair[ridx];
|
||||
}
|
||||
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||
@@ -531,7 +530,7 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
}
|
||||
for (bst_uint j = align_length; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
const int nid = this->position_[ridx];
|
||||
if (nid >= 0) {
|
||||
hbuilder[nid].Add(c[j].fvalue, gpair[ridx]);
|
||||
}
|
||||
@@ -539,14 +538,14 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
} else {
|
||||
for (bst_uint j = 0; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
const int nid = this->position_[ridx];
|
||||
if (nid >= 0) {
|
||||
hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
|
||||
inline void UpdateSketchCol(const std::vector<GradientPair> &gpair,
|
||||
const ColBatch::Inst &c,
|
||||
const RegTree &tree,
|
||||
size_t work_set_size,
|
||||
@@ -556,45 +555,45 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
// initialize sbuilder for use
|
||||
std::vector<BaseMaker::SketchEntry> &sbuilder = *p_temp;
|
||||
sbuilder.resize(tree.param.num_nodes);
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const unsigned nid = this->qexpand[i];
|
||||
const unsigned wid = this->node2workindex[nid];
|
||||
for (size_t i = 0; i < this->qexpand_.size(); ++i) {
|
||||
const unsigned nid = this->qexpand_[i];
|
||||
const unsigned wid = this->node2workindex_[nid];
|
||||
sbuilder[nid].sum_total = 0.0f;
|
||||
sbuilder[nid].sketch = &sketchs[wid * work_set_size + offset];
|
||||
sbuilder[nid].sketch = &sketchs_[wid * work_set_size + offset];
|
||||
}
|
||||
|
||||
// first pass, get sum of weight, TODO, optimization to skip first pass
|
||||
for (bst_uint j = 0; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
const int nid = this->position_[ridx];
|
||||
if (nid >= 0) {
|
||||
sbuilder[nid].sum_total += gpair[ridx].GetHess();
|
||||
}
|
||||
}
|
||||
// if only one value, no need to do second pass
|
||||
if (c[0].fvalue == c[c.length-1].fvalue) {
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const int nid = this->qexpand[i];
|
||||
for (size_t i = 0; i < this->qexpand_.size(); ++i) {
|
||||
const int nid = this->qexpand_[i];
|
||||
sbuilder[nid].sketch->Push(c[0].fvalue, static_cast<bst_float>(sbuilder[nid].sum_total));
|
||||
}
|
||||
return;
|
||||
}
|
||||
// two pass scan
|
||||
unsigned max_size = this->param.max_sketch_size();
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const int nid = this->qexpand[i];
|
||||
unsigned max_size = this->param_.MaxSketchSize();
|
||||
for (size_t i = 0; i < this->qexpand_.size(); ++i) {
|
||||
const int nid = this->qexpand_[i];
|
||||
sbuilder[nid].Init(max_size);
|
||||
}
|
||||
// second pass, build the sketch
|
||||
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
|
||||
const bst_uint kBuffer = 32;
|
||||
if (TStats::kSimpleStats != 0 && this->param_.cache_opt != 0) {
|
||||
constexpr bst_uint kBuffer = 32;
|
||||
bst_uint align_length = c.length / kBuffer * kBuffer;
|
||||
int buf_position[kBuffer];
|
||||
bst_float buf_hess[kBuffer];
|
||||
for (bst_uint j = 0; j < align_length; j += kBuffer) {
|
||||
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||
bst_uint ridx = c[j + i].index;
|
||||
buf_position[i] = this->position[ridx];
|
||||
buf_position[i] = this->position_[ridx];
|
||||
buf_hess[i] = gpair[ridx].GetHess();
|
||||
}
|
||||
for (bst_uint i = 0; i < kBuffer; ++i) {
|
||||
@@ -606,7 +605,7 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
}
|
||||
for (bst_uint j = align_length; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
const int nid = this->position_[ridx];
|
||||
if (nid >= 0) {
|
||||
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].GetHess(), max_size);
|
||||
}
|
||||
@@ -614,136 +613,137 @@ class CQHistMaker: public HistMaker<TStats> {
|
||||
} else {
|
||||
for (bst_uint j = 0; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
const int nid = this->position_[ridx];
|
||||
if (nid >= 0) {
|
||||
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].GetHess(), max_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const int nid = this->qexpand[i];
|
||||
for (size_t i = 0; i < this->qexpand_.size(); ++i) {
|
||||
const int nid = this->qexpand_[i];
|
||||
sbuilder[nid].Finalize(max_size);
|
||||
}
|
||||
}
|
||||
// cached dmatrix where we initialized the feature on.
|
||||
const DMatrix* cache_dmatrix_;
|
||||
const DMatrix* cache_dmatrix_{nullptr};
|
||||
// feature helper
|
||||
BaseMaker::FMetaHelper feat_helper;
|
||||
BaseMaker::FMetaHelper feat_helper_;
|
||||
// temp space to map feature id to working index
|
||||
std::vector<int> feat2workindex;
|
||||
std::vector<int> feat2workindex_;
|
||||
// set of index from fset that are current work set
|
||||
std::vector<bst_uint> work_set;
|
||||
std::vector<bst_uint> work_set_;
|
||||
// set of index from that are split candidates.
|
||||
std::vector<bst_uint> fsplit_set;
|
||||
std::vector<bst_uint> fsplit_set_;
|
||||
// thread temp data
|
||||
std::vector<std::vector<BaseMaker::SketchEntry> > thread_sketch;
|
||||
std::vector<std::vector<BaseMaker::SketchEntry> > thread_sketch_;
|
||||
// used to hold statistics
|
||||
std::vector<std::vector<TStats> > thread_stats;
|
||||
std::vector<std::vector<TStats> > thread_stats_;
|
||||
// used to hold start pointer
|
||||
std::vector<std::vector<HistEntry> > thread_hist;
|
||||
std::vector<std::vector<HistEntry> > thread_hist_;
|
||||
// node statistics
|
||||
std::vector<TStats> node_stats;
|
||||
std::vector<TStats> node_stats_;
|
||||
// summary array
|
||||
std::vector<WXQSketch::SummaryContainer> summary_array;
|
||||
std::vector<WXQSketch::SummaryContainer> summary_array_;
|
||||
// reducer for summary
|
||||
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
|
||||
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer_;
|
||||
// per node, per feature sketch
|
||||
std::vector<common::WXQuantileSketch<bst_float, bst_float> > sketchs;
|
||||
std::vector<common::WXQuantileSketch<bst_float, bst_float> > sketchs_;
|
||||
};
|
||||
|
||||
// global proposal
|
||||
template<typename TStats>
|
||||
class GlobalProposalHistMaker: public CQHistMaker<TStats> {
|
||||
protected:
|
||||
void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
|
||||
void ResetPosAndPropose(const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector<bst_uint> &fset,
|
||||
const RegTree &tree) override {
|
||||
if (this->qexpand.size() == 1) {
|
||||
if (this->qexpand_.size() == 1) {
|
||||
cached_rptr_.clear();
|
||||
cached_cut_.clear();
|
||||
}
|
||||
if (cached_rptr_.size() == 0) {
|
||||
CHECK_EQ(this->qexpand.size(), 1U);
|
||||
CHECK_EQ(this->qexpand_.size(), 1U);
|
||||
CQHistMaker<TStats>::ResetPosAndPropose(gpair, p_fmat, fset, tree);
|
||||
cached_rptr_ = this->wspace.rptr;
|
||||
cached_cut_ = this->wspace.cut;
|
||||
cached_rptr_ = this->wspace_.rptr;
|
||||
cached_cut_ = this->wspace_.cut;
|
||||
} else {
|
||||
this->wspace.cut.clear();
|
||||
this->wspace.rptr.clear();
|
||||
this->wspace.rptr.push_back(0);
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
this->wspace_.cut.clear();
|
||||
this->wspace_.rptr.clear();
|
||||
this->wspace_.rptr.push_back(0);
|
||||
for (size_t i = 0; i < this->qexpand_.size(); ++i) {
|
||||
for (size_t j = 0; j < cached_rptr_.size() - 1; ++j) {
|
||||
this->wspace.rptr.push_back(
|
||||
this->wspace.rptr.back() + cached_rptr_[j + 1] - cached_rptr_[j]);
|
||||
this->wspace_.rptr.push_back(
|
||||
this->wspace_.rptr.back() + cached_rptr_[j + 1] - cached_rptr_[j]);
|
||||
}
|
||||
this->wspace.cut.insert(this->wspace.cut.end(), cached_cut_.begin(), cached_cut_.end());
|
||||
this->wspace_.cut.insert(this->wspace_.cut.end(), cached_cut_.begin(), cached_cut_.end());
|
||||
}
|
||||
CHECK_EQ(this->wspace.rptr.size(),
|
||||
(fset.size() + 1) * this->qexpand.size() + 1);
|
||||
CHECK_EQ(this->wspace.rptr.back(), this->wspace.cut.size());
|
||||
CHECK_EQ(this->wspace_.rptr.size(),
|
||||
(fset.size() + 1) * this->qexpand_.size() + 1);
|
||||
CHECK_EQ(this->wspace_.rptr.back(), this->wspace_.cut.size());
|
||||
}
|
||||
}
|
||||
|
||||
// code to create histogram
|
||||
// Builds the histograms for the nodes in the expansion queue over the features in fset.
// NOTE(review): this text is a rendered diff, so every statement touched by the rename appears
// twice -- first with the old identifier, then with the new one (e.g. wspace / wspace_).
// Visible steps:
//   1. build feat2workindex, the inverse of fset (-1 for features that are not in fset);
//   2. initialise wspace and size thread_hist to the maximum OpenMP thread count;
//   3. extend work_set with fsplit_set, then walk the column batches of work_set, calling
//      CorrectNonDefaultPositionByBatch and, for columns in fset, UpdateHistCol in parallel;
//   4. copy each queued node's statistics into the slot after its fset.size() feature slots;
//   5. Allreduce the data of hset[0].
void CreateHist(const std::vector<bst_gpair> &gpair,
|
||||
void CreateHist(const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector<bst_uint> &fset,
|
||||
const RegTree &tree) override {
|
||||
const MetaInfo &info = p_fmat->info();
|
||||
const MetaInfo &info = p_fmat->Info();
|
||||
// fill in reverse map
|
||||
this->feat2workindex.resize(tree.param.num_feature);
|
||||
this->work_set = fset;
|
||||
std::fill(this->feat2workindex.begin(), this->feat2workindex.end(), -1);
|
||||
this->feat2workindex_.resize(tree.param.num_feature);
|
||||
this->work_set_ = fset;
|
||||
std::fill(this->feat2workindex_.begin(), this->feat2workindex_.end(), -1);
|
||||
for (size_t i = 0; i < fset.size(); ++i) {
|
||||
this->feat2workindex[fset[i]] = static_cast<int>(i);
|
||||
this->feat2workindex_[fset[i]] = static_cast<int>(i);
|
||||
}
|
||||
// start to work
|
||||
this->wspace.Init(this->param, 1);
|
||||
this->wspace_.Init(this->param_, 1);
|
||||
// to gain speedup in recovery
|
||||
{
|
||||
this->thread_hist.resize(omp_get_max_threads());
|
||||
this->thread_hist_.resize(omp_get_max_threads());
|
||||
|
||||
// TWOPASS: use the real set + split set in the column iteration.
|
||||
this->SetDefaultPostion(p_fmat, tree);
|
||||
// append the split features to the work set, then sort and drop duplicates
this->work_set.insert(this->work_set.end(), this->fsplit_set.begin(), this->fsplit_set.end());
|
||||
std::sort(this->work_set.begin(), this->work_set.end());
|
||||
this->work_set.resize(
|
||||
std::unique(this->work_set.begin(), this->work_set.end()) - this->work_set.begin());
|
||||
this->work_set_.insert(this->work_set_.end(), this->fsplit_set_.begin(),
|
||||
this->fsplit_set_.end());
|
||||
std::sort(this->work_set_.begin(), this->work_set_.end());
|
||||
this->work_set_.resize(
|
||||
std::unique(this->work_set_.begin(), this->work_set_.end()) - this->work_set_.begin());
|
||||
|
||||
// start accumulating statistics
|
||||
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(this->work_set);
|
||||
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(this->work_set_);
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const ColBatch &batch = iter->Value();
|
||||
// TWOPASS: use the real set + split set in the column iteration.
|
||||
this->CorrectNonDefaultPositionByBatch(batch, this->fsplit_set, tree);
|
||||
this->CorrectNonDefaultPositionByBatch(batch, this->fsplit_set_, tree);
|
||||
|
||||
// start enumeration
|
||||
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(dynamic, 1)
|
||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||
// a negative offset marks a column that is not in fset (it is in the work set only
// because it was added from the split set), so no histogram is updated for it
int offset = this->feat2workindex[batch.col_index[i]];
|
||||
int offset = this->feat2workindex_[batch.col_index[i]];
|
||||
if (offset >= 0) {
|
||||
this->UpdateHistCol(gpair, batch[i], info, tree,
|
||||
fset, offset,
|
||||
&this->thread_hist[omp_get_thread_num()]);
|
||||
&this->thread_hist_[omp_get_thread_num()]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// update node statistics.
|
||||
this->GetNodeStats(gpair, *p_fmat, tree,
|
||||
&(this->thread_stats), &(this->node_stats));
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const int nid = this->qexpand[i];
|
||||
const int wid = this->node2workindex[nid];
|
||||
// each work index owns fset.size() + 1 consecutive slots; the last one holds the node statistics
this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
|
||||
.data[0] = this->node_stats[nid];
|
||||
&(this->thread_stats_), &(this->node_stats_));
|
||||
for (size_t i = 0; i < this->qexpand_.size(); ++i) {
|
||||
const int nid = this->qexpand_[i];
|
||||
const int wid = this->node2workindex_[nid];
|
||||
this->wspace_.hset[0][fset.size() + wid * (fset.size()+1)]
|
||||
.data[0] = this->node_stats_[nid];
|
||||
}
|
||||
}
|
||||
// reduce the whole data array of hset[0] through the histogram reducer
this->histred.Allreduce(dmlc::BeginPtr(this->wspace.hset[0].data),
|
||||
this->wspace.hset[0].data.size());
|
||||
this->histred_.Allreduce(dmlc::BeginPtr(this->wspace_.hset[0].data),
|
||||
this->wspace_.hset[0].data.size());
|
||||
}
|
||||
|
||||
// cached unit pointer
|
||||
@@ -756,17 +756,17 @@ class GlobalProposalHistMaker: public CQHistMaker<TStats> {
|
||||
template<typename TStats>
|
||||
class QuantileHistMaker: public HistMaker<TStats> {
|
||||
protected:
|
||||
typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
|
||||
void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
|
||||
using WXQSketch = common::WXQuantileSketch<bst_float, bst_float>;
|
||||
void ResetPosAndPropose(const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector <bst_uint> &fset,
|
||||
const RegTree &tree) override {
|
||||
const MetaInfo &info = p_fmat->info();
|
||||
const MetaInfo &info = p_fmat->Info();
|
||||
// initialize the data structure
|
||||
const int nthread = omp_get_max_threads();
|
||||
sketchs.resize(this->qexpand.size() * tree.param.num_feature);
|
||||
for (size_t i = 0; i < sketchs.size(); ++i) {
|
||||
sketchs[i].Init(info.num_row, this->param.sketch_eps);
|
||||
sketchs_.resize(this->qexpand_.size() * tree.param.num_feature);
|
||||
for (size_t i = 0; i < sketchs_.size(); ++i) {
|
||||
sketchs_[i].Init(info.num_row_, this->param_.sketch_eps);
|
||||
}
|
||||
// start accumulating statistics
|
||||
dmlc::DataIter<RowBatch> *iter = p_fmat->RowIterator();
|
||||
@@ -775,7 +775,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
|
||||
const RowBatch &batch = iter->Value();
|
||||
// parallel convert to column major format
|
||||
common::ParallelGroupBuilder<SparseBatch::Entry>
|
||||
builder(&col_ptr, &col_data, &thread_col_ptr);
|
||||
builder(&col_ptr_, &col_data_, &thread_col_ptr_);
|
||||
builder.InitBudget(tree.param.num_feature, nthread);
|
||||
|
||||
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
|
||||
@@ -783,13 +783,13 @@ class QuantileHistMaker: public HistMaker<TStats> {
|
||||
for (bst_omp_uint i = 0; i < nbatch; ++i) {
|
||||
RowBatch::Inst inst = batch[i];
|
||||
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
int nid = this->position[ridx];
|
||||
int nid = this->position_[ridx];
|
||||
if (nid >= 0) {
|
||||
if (!tree[nid].is_leaf()) {
|
||||
this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
|
||||
if (!tree[nid].IsLeaf()) {
|
||||
this->position_[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
|
||||
}
|
||||
if (this->node2workindex[nid] < 0) {
|
||||
this->position[ridx] = ~nid;
|
||||
if (this->node2workindex_[nid] < 0) {
|
||||
this->position_[ridx] = ~nid;
|
||||
} else {
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
builder.AddBudget(inst[j].index, omp_get_thread_num());
|
||||
@@ -802,7 +802,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
|
||||
for (bst_omp_uint i = 0; i < nbatch; ++i) {
|
||||
RowBatch::Inst inst = batch[i];
|
||||
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
const int nid = this->position[ridx];
|
||||
const int nid = this->position_[ridx];
|
||||
if (nid >= 0) {
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
builder.Push(inst[j].index,
|
||||
@@ -812,71 +812,71 @@ class QuantileHistMaker: public HistMaker<TStats> {
|
||||
}
|
||||
}
|
||||
// start putting things into sketch
|
||||
const bst_omp_uint nfeat = col_ptr.size() - 1;
|
||||
const bst_omp_uint nfeat = col_ptr_.size() - 1;
|
||||
#pragma omp parallel for schedule(dynamic, 1)
|
||||
for (bst_omp_uint k = 0; k < nfeat; ++k) {
|
||||
for (size_t i = col_ptr[k]; i < col_ptr[k+1]; ++i) {
|
||||
const SparseBatch::Entry &e = col_data[i];
|
||||
const int wid = this->node2workindex[e.index];
|
||||
sketchs[wid * tree.param.num_feature + k].Push(e.fvalue, gpair[e.index].GetHess());
|
||||
for (size_t i = col_ptr_[k]; i < col_ptr_[k+1]; ++i) {
|
||||
const SparseBatch::Entry &e = col_data_[i];
|
||||
const int wid = this->node2workindex_[e.index];
|
||||
sketchs_[wid * tree.param.num_feature + k].Push(e.fvalue, gpair[e.index].GetHess());
|
||||
}
|
||||
}
|
||||
}
|
||||
// setup maximum size
|
||||
unsigned max_size = this->param.max_sketch_size();
|
||||
unsigned max_size = this->param_.MaxSketchSize();
|
||||
// synchronize sketch
|
||||
summary_array.resize(sketchs.size());
|
||||
for (size_t i = 0; i < sketchs.size(); ++i) {
|
||||
summary_array_.resize(sketchs_.size());
|
||||
for (size_t i = 0; i < sketchs_.size(); ++i) {
|
||||
common::WQuantileSketch<bst_float, bst_float>::SummaryContainer out;
|
||||
sketchs[i].GetSummary(&out);
|
||||
summary_array[i].Reserve(max_size);
|
||||
summary_array[i].SetPrune(out, max_size);
|
||||
sketchs_[i].GetSummary(&out);
|
||||
summary_array_[i].Reserve(max_size);
|
||||
summary_array_[i].SetPrune(out, max_size);
|
||||
}
|
||||
|
||||
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
|
||||
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
|
||||
sreducer_.Allreduce(dmlc::BeginPtr(summary_array_), nbytes, summary_array_.size());
|
||||
// now we get the final result of sketch, setup the cut
|
||||
this->wspace.cut.clear();
|
||||
this->wspace.rptr.clear();
|
||||
this->wspace.rptr.push_back(0);
|
||||
for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
|
||||
this->wspace_.cut.clear();
|
||||
this->wspace_.rptr.clear();
|
||||
this->wspace_.rptr.push_back(0);
|
||||
for (size_t wid = 0; wid < this->qexpand_.size(); ++wid) {
|
||||
for (int fid = 0; fid < tree.param.num_feature; ++fid) {
|
||||
const WXQSketch::Summary &a = summary_array[wid * tree.param.num_feature + fid];
|
||||
const WXQSketch::Summary &a = summary_array_[wid * tree.param.num_feature + fid];
|
||||
for (size_t i = 1; i < a.size; ++i) {
|
||||
bst_float cpt = a.data[i].value - rt_eps;
|
||||
if (i == 1 || cpt > this->wspace.cut.back()) {
|
||||
this->wspace.cut.push_back(cpt);
|
||||
bst_float cpt = a.data[i].value - kRtEps;
|
||||
if (i == 1 || cpt > this->wspace_.cut.back()) {
|
||||
this->wspace_.cut.push_back(cpt);
|
||||
}
|
||||
}
|
||||
// push a value that is greater than anything
|
||||
if (a.size != 0) {
|
||||
bst_float cpt = a.data[a.size - 1].value;
|
||||
// this must be bigger than last value in a scale
|
||||
bst_float last = cpt + fabs(cpt) + rt_eps;
|
||||
this->wspace.cut.push_back(last);
|
||||
bst_float last = cpt + fabs(cpt) + kRtEps;
|
||||
this->wspace_.cut.push_back(last);
|
||||
}
|
||||
this->wspace.rptr.push_back(this->wspace.cut.size());
|
||||
this->wspace_.rptr.push_back(this->wspace_.cut.size());
|
||||
}
|
||||
// reserve last value for global statistics
|
||||
this->wspace.cut.push_back(0.0f);
|
||||
this->wspace.rptr.push_back(this->wspace.cut.size());
|
||||
this->wspace_.cut.push_back(0.0f);
|
||||
this->wspace_.rptr.push_back(this->wspace_.cut.size());
|
||||
}
|
||||
CHECK_EQ(this->wspace.rptr.size(),
|
||||
(tree.param.num_feature + 1) * this->qexpand.size() + 1);
|
||||
CHECK_EQ(this->wspace_.rptr.size(),
|
||||
(tree.param.num_feature + 1) * this->qexpand_.size() + 1);
|
||||
}
|
||||
|
||||
private:
|
||||
// summary array
|
||||
std::vector<WXQSketch::SummaryContainer> summary_array;
|
||||
std::vector<WXQSketch::SummaryContainer> summary_array_;
|
||||
// reducer for summary
|
||||
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
|
||||
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer_;
|
||||
// local temp column data structure
|
||||
std::vector<size_t> col_ptr;
|
||||
std::vector<size_t> col_ptr_;
|
||||
// local storage of column data
|
||||
std::vector<SparseBatch::Entry> col_data;
|
||||
std::vector<std::vector<size_t> > thread_col_ptr;
|
||||
std::vector<SparseBatch::Entry> col_data_;
|
||||
std::vector<std::vector<size_t> > thread_col_ptr_;
|
||||
// per node, per feature sketch
|
||||
std::vector<common::WQuantileSketch<bst_float, bst_float> > sketchs;
|
||||
std::vector<common::WQuantileSketch<bst_float, bst_float> > sketchs_;
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_TREE_UPDATER(LocalHistMaker, "grow_local_histmaker")
|
||||
|
||||
@@ -21,37 +21,37 @@ DMLC_REGISTRY_FILE_TAG(updater_prune);
|
||||
class TreePruner: public TreeUpdater {
|
||||
public:
|
||||
// Creates the "sync" tree updater and stores it in the syncher member
// (old name syncher, renamed syncher_ in this diff).
TreePruner() {
|
||||
syncher.reset(TreeUpdater::Create("sync"));
|
||||
syncher_.reset(TreeUpdater::Create("sync"));
|
||||
}
|
||||
// set training parameter
|
||||
// Initialises the training parameters from args via InitAllowUnknown and
// forwards the same args to the sync updater.
void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
|
||||
param.InitAllowUnknown(args);
|
||||
syncher->Init(args);
|
||||
param_.InitAllowUnknown(args);
|
||||
syncher_->Init(args);
|
||||
}
|
||||
// update the tree, do pruning
|
||||
// Prunes every tree in trees with DoPrune, then passes gpair, p_fmat and trees to the
// sync updater. While pruning, the learning rate is divided by the number of trees;
// the original value is restored before the sync updater runs.
void Update(HostDeviceVector<bst_gpair> *gpair,
|
||||
void Update(HostDeviceVector<GradientPair> *gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector<RegTree*> &trees) override {
|
||||
// rescale learning rate according to size of trees
|
||||
float lr = param.learning_rate;
|
||||
param.learning_rate = lr / trees.size();
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
this->DoPrune(*trees[i]);
|
||||
float lr = param_.learning_rate;
|
||||
param_.learning_rate = lr / trees.size();
|
||||
for (auto tree : trees) {
|
||||
this->DoPrune(*tree);
|
||||
}
|
||||
// set learning rate back before synchronizing
param.learning_rate = lr;
|
||||
syncher->Update(gpair, p_fmat, trees);
|
||||
param_.learning_rate = lr;
|
||||
syncher_->Update(gpair, p_fmat, trees);
|
||||
}
|
||||
|
||||
private:
|
||||
// try to prune off current leaf
|
||||
inline int TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) { // NOLINT(*)
|
||||
if (tree[nid].is_root()) return npruned;
|
||||
int pid = tree[nid].parent();
|
||||
RegTree::NodeStat &s = tree.stat(pid);
|
||||
if (tree[nid].IsRoot()) return npruned;
|
||||
int pid = tree[nid].Parent();
|
||||
RegTree::NodeStat &s = tree.Stat(pid);
|
||||
++s.leaf_child_cnt;
|
||||
if (s.leaf_child_cnt >= 2 && param.need_prune(s.loss_chg, depth - 1)) {
|
||||
if (s.leaf_child_cnt >= 2 && param_.NeedPrune(s.loss_chg, depth - 1)) {
|
||||
// need to be pruned
|
||||
tree.ChangeToLeaf(pid, param.learning_rate * s.base_weight);
|
||||
tree.ChangeToLeaf(pid, param_.learning_rate * s.base_weight);
|
||||
// tail recursion
|
||||
return this->TryPruneLeaf(tree, pid, depth - 1, npruned + 2);
|
||||
} else {
|
||||
@@ -63,25 +63,25 @@ class TreePruner: public TreeUpdater {
|
||||
int npruned = 0;
|
||||
// initialize auxiliary statistics
|
||||
for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
|
||||
tree.stat(nid).leaf_child_cnt = 0;
|
||||
tree.Stat(nid).leaf_child_cnt = 0;
|
||||
}
|
||||
for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
|
||||
if (tree[nid].is_leaf()) {
|
||||
if (tree[nid].IsLeaf()) {
|
||||
npruned = this->TryPruneLeaf(tree, nid, tree.GetDepth(nid), npruned);
|
||||
}
|
||||
}
|
||||
if (!param.silent) {
|
||||
if (!param_.silent) {
|
||||
LOG(INFO) << "tree pruning end, " << tree.param.num_roots << " roots, "
|
||||
<< tree.num_extra_nodes() << " extra nodes, " << npruned
|
||||
<< tree.NumExtraNodes() << " extra nodes, " << npruned
|
||||
<< " pruned nodes, max_depth=" << tree.MaxDepth();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// synchronizer
|
||||
std::unique_ptr<TreeUpdater> syncher;
|
||||
std::unique_ptr<TreeUpdater> syncher_;
|
||||
// training parameter
|
||||
TrainParam param;
|
||||
TrainParam param_;
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_TREE_UPDATER(TreePruner, "prune")
|
||||
|
||||
@@ -22,14 +22,14 @@ template<typename TStats>
|
||||
class TreeRefresher: public TreeUpdater {
|
||||
public:
|
||||
// Initialises the training parameters from args via InitAllowUnknown.
void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
|
||||
param.InitAllowUnknown(args);
|
||||
param_.InitAllowUnknown(args);
|
||||
}
|
||||
// update the trees: recompute node statistics from the data and refresh the trees
|
||||
void Update(HostDeviceVector<bst_gpair> *gpair,
|
||||
void Update(HostDeviceVector<GradientPair> *gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector<RegTree*> &trees) override {
|
||||
if (trees.size() == 0) return;
|
||||
std::vector<bst_gpair> &gpair_h = gpair->data_h();
|
||||
std::vector<GradientPair> &gpair_h = gpair->HostVector();
|
||||
// number of threads
|
||||
// thread temporal space
|
||||
std::vector<std::vector<TStats> > stemp;
|
||||
@@ -42,11 +42,11 @@ class TreeRefresher: public TreeUpdater {
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int num_nodes = 0;
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
num_nodes += trees[i]->param.num_nodes;
|
||||
for (auto tree : trees) {
|
||||
num_nodes += tree->param.num_nodes;
|
||||
}
|
||||
stemp[tid].resize(num_nodes, TStats(param));
|
||||
std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param));
|
||||
stemp[tid].resize(num_nodes, TStats(param_));
|
||||
std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param_));
|
||||
fvec_temp[tid].Init(trees[0]->param.num_feature);
|
||||
}
|
||||
// if it is C++11, use lazy evaluation for Allreduce,
|
||||
@@ -55,32 +55,32 @@ class TreeRefresher: public TreeUpdater {
|
||||
auto lazy_get_stats = [&]()
|
||||
#endif
|
||||
{
|
||||
const MetaInfo &info = p_fmat->info();
|
||||
const MetaInfo &info = p_fmat->Info();
|
||||
// start accumulating statistics
|
||||
dmlc::DataIter<RowBatch> *iter = p_fmat->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const RowBatch &batch = iter->Value();
|
||||
CHECK_LT(batch.size, std::numeric_limits<unsigned>::max());
|
||||
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
|
||||
const auto nbatch = static_cast<bst_omp_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nbatch; ++i) {
|
||||
RowBatch::Inst inst = batch[i];
|
||||
const int tid = omp_get_thread_num();
|
||||
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
const auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
RegTree::FVec &feats = fvec_temp[tid];
|
||||
feats.Fill(inst);
|
||||
int offset = 0;
|
||||
for (size_t j = 0; j < trees.size(); ++j) {
|
||||
AddStats(*trees[j], feats, gpair_h, info, ridx,
|
||||
for (auto tree : trees) {
|
||||
AddStats(*tree, feats, gpair_h, info, ridx,
|
||||
dmlc::BeginPtr(stemp[tid]) + offset);
|
||||
offset += trees[j]->param.num_nodes;
|
||||
offset += tree->param.num_nodes;
|
||||
}
|
||||
feats.Drop(inst);
|
||||
}
|
||||
}
|
||||
// aggregate the statistics
|
||||
int num_nodes = static_cast<int>(stemp[0].size());
|
||||
auto num_nodes = static_cast<int>(stemp[0].size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int nid = 0; nid < num_nodes; ++nid) {
|
||||
for (int tid = 1; tid < nthread; ++tid) {
|
||||
@@ -89,64 +89,64 @@ class TreeRefresher: public TreeUpdater {
|
||||
}
|
||||
};
|
||||
#if __cplusplus >= 201103L
|
||||
reducer.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
|
||||
reducer_.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
|
||||
#else
|
||||
reducer.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size());
|
||||
reducer_.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size());
|
||||
#endif
|
||||
// rescale learning rate according to size of trees
|
||||
float lr = param.learning_rate;
|
||||
param.learning_rate = lr / trees.size();
|
||||
float lr = param_.learning_rate;
|
||||
param_.learning_rate = lr / trees.size();
|
||||
int offset = 0;
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
for (int rid = 0; rid < trees[i]->param.num_roots; ++rid) {
|
||||
this->Refresh(dmlc::BeginPtr(stemp[0]) + offset, rid, trees[i]);
|
||||
for (auto tree : trees) {
|
||||
for (int rid = 0; rid < tree->param.num_roots; ++rid) {
|
||||
this->Refresh(dmlc::BeginPtr(stemp[0]) + offset, rid, tree);
|
||||
}
|
||||
offset += trees[i]->param.num_nodes;
|
||||
offset += tree->param.num_nodes;
|
||||
}
|
||||
// set learning rate back
|
||||
param.learning_rate = lr;
|
||||
param_.learning_rate = lr;
|
||||
}
|
||||
|
||||
private:
|
||||
// Adds row ridx to the statistics of every node it visits in tree.
// gstats is indexed by node id; feat supplies the row's feature values, which decide
// the next node at each split via tree.GetNext.
inline static void AddStats(const RegTree &tree,
|
||||
const RegTree::FVec &feat,
|
||||
const std::vector<bst_gpair> &gpair,
|
||||
const std::vector<GradientPair> &gpair,
|
||||
const MetaInfo &info,
|
||||
const bst_uint ridx,
|
||||
TStats *gstats) {
|
||||
// start from the root that the current row belongs to
|
||||
int pid = static_cast<int>(info.GetRoot(ridx));
|
||||
auto pid = static_cast<int>(info.GetRoot(ridx));
|
||||
gstats[pid].Add(gpair, info, ridx);
|
||||
// traverse the tree down to a leaf, adding the row to every node on the path
|
||||
while (!tree[pid].is_leaf()) {
|
||||
unsigned split_index = tree[pid].split_index();
|
||||
pid = tree.GetNext(pid, feat.fvalue(split_index), feat.is_missing(split_index));
|
||||
while (!tree[pid].IsLeaf()) {
|
||||
unsigned split_index = tree[pid].SplitIndex();
|
||||
pid = tree.GetNext(pid, feat.Fvalue(split_index), feat.IsMissing(split_index));
|
||||
gstats[pid].Add(gpair, info, ridx);
|
||||
}
|
||||
}
|
||||
// Recursively refreshes node nid of *p_tree and its subtree from gstats (indexed by node id).
// For every visited node it writes base_weight and sum_hess into the node stat and lets the
// statistics fill the leaf vector. A leaf gets its value reset to
// base_weight * learning_rate, but only when refresh_leaf is set; an internal node gets
// loss_chg = gain(left) + gain(right) - gain(node) and both children are refreshed in turn.
inline void Refresh(const TStats *gstats,
|
||||
int nid, RegTree *p_tree) {
|
||||
RegTree &tree = *p_tree;
|
||||
tree.stat(nid).base_weight = static_cast<bst_float>(gstats[nid].CalcWeight(param));
|
||||
tree.stat(nid).sum_hess = static_cast<bst_float>(gstats[nid].sum_hess);
|
||||
gstats[nid].SetLeafVec(param, tree.leafvec(nid));
|
||||
if (tree[nid].is_leaf()) {
|
||||
if (param.refresh_leaf) {
|
||||
tree[nid].set_leaf(tree.stat(nid).base_weight * param.learning_rate);
|
||||
tree.Stat(nid).base_weight = static_cast<bst_float>(gstats[nid].CalcWeight(param_));
|
||||
tree.Stat(nid).sum_hess = static_cast<bst_float>(gstats[nid].sum_hess);
|
||||
gstats[nid].SetLeafVec(param_, tree.Leafvec(nid));
|
||||
if (tree[nid].IsLeaf()) {
|
||||
if (param_.refresh_leaf) {
|
||||
tree[nid].SetLeaf(tree.Stat(nid).base_weight * param_.learning_rate);
|
||||
}
|
||||
} else {
|
||||
tree.stat(nid).loss_chg = static_cast<bst_float>(
|
||||
gstats[tree[nid].cleft()].CalcGain(param) +
|
||||
gstats[tree[nid].cright()].CalcGain(param) -
|
||||
gstats[nid].CalcGain(param));
|
||||
this->Refresh(gstats, tree[nid].cleft(), p_tree);
|
||||
this->Refresh(gstats, tree[nid].cright(), p_tree);
|
||||
tree.Stat(nid).loss_chg = static_cast<bst_float>(
|
||||
gstats[tree[nid].LeftChild()].CalcGain(param_) +
|
||||
gstats[tree[nid].RightChild()].CalcGain(param_) -
|
||||
gstats[nid].CalcGain(param_));
|
||||
this->Refresh(gstats, tree[nid].LeftChild(), p_tree);
|
||||
this->Refresh(gstats, tree[nid].RightChild(), p_tree);
|
||||
}
|
||||
}
|
||||
// training parameter
|
||||
TrainParam param;
|
||||
TrainParam param_;
|
||||
// reducer
|
||||
rabit::Reducer<TStats, TStats::Reduce> reducer;
|
||||
rabit::Reducer<TStats, TStats::Reduce> reducer_;
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_TREE_UPDATER(TreeRefresher, "refresh")
|
||||
|
||||
@@ -22,58 +22,57 @@ DMLC_REGISTRY_FILE_TAG(updater_skmaker);
|
||||
|
||||
class SketchMaker: public BaseMaker {
|
||||
public:
|
||||
// Builds every tree in trees by calling the per-tree Update overload with the host copy
// of the gradient pairs. The learning rate is divided by the number of trees for the
// duration of the call and restored afterwards.
void Update(HostDeviceVector<bst_gpair> *gpair,
|
||||
void Update(HostDeviceVector<GradientPair> *gpair,
|
||||
DMatrix *p_fmat,
|
||||
const std::vector<RegTree*> &trees) override {
|
||||
// rescale learning rate according to size of trees
|
||||
float lr = param.learning_rate;
|
||||
param.learning_rate = lr / trees.size();
|
||||
float lr = param_.learning_rate;
|
||||
param_.learning_rate = lr / trees.size();
|
||||
// build tree
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
this->Update(gpair->data_h(), p_fmat, trees[i]);
|
||||
for (auto tree : trees) {
|
||||
this->Update(gpair->HostVector(), p_fmat, tree);
|
||||
}
|
||||
// set learning rate back
param.learning_rate = lr;
|
||||
param_.learning_rate = lr;
|
||||
}
|
||||
|
||||
protected:
|
||||
// Grows a single tree.
// For each depth up to max_depth it calls, in order: GetNodeStats, BuildSketch,
// SyncNodeStats, FindSplit, ResetPositionCol and UpdateQueueExpand, stopping early once
// the expansion queue is empty. If nodes are still queued after the loop, their statistics
// are computed and synchronized once more. Finally the statistics of every node are
// stored, loss_chg is recomputed for internal nodes, and the nodes still in the queue
// are turned into leaves.
inline void Update(const std::vector<bst_gpair> &gpair,
|
||||
inline void Update(const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
RegTree *p_tree) {
|
||||
this->InitData(gpair, *p_fmat, *p_tree);
|
||||
for (int depth = 0; depth < param.max_depth; ++depth) {
|
||||
for (int depth = 0; depth < param_.max_depth; ++depth) {
|
||||
this->GetNodeStats(gpair, *p_fmat, *p_tree,
|
||||
&thread_stats, &node_stats);
|
||||
&thread_stats_, &node_stats_);
|
||||
this->BuildSketch(gpair, p_fmat, *p_tree);
|
||||
this->SyncNodeStats();
|
||||
this->FindSplit(depth, gpair, p_fmat, p_tree);
|
||||
this->ResetPositionCol(qexpand, p_fmat, *p_tree);
|
||||
this->ResetPositionCol(qexpand_, p_fmat, *p_tree);
|
||||
this->UpdateQueueExpand(*p_tree);
|
||||
// if nothing is left to expand, break
|
||||
if (qexpand.size() == 0) break;
|
||||
if (qexpand_.size() == 0) break;
|
||||
}
|
||||
// the depth limit was hit with nodes still queued: refresh their statistics once more
if (qexpand.size() != 0) {
|
||||
if (qexpand_.size() != 0) {
|
||||
this->GetNodeStats(gpair, *p_fmat, *p_tree,
|
||||
&thread_stats, &node_stats);
|
||||
&thread_stats_, &node_stats_);
|
||||
this->SyncNodeStats();
|
||||
}
|
||||
// set all statistics correctly
|
||||
for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
|
||||
this->SetStats(nid, node_stats[nid], p_tree);
|
||||
if (!(*p_tree)[nid].is_leaf()) {
|
||||
p_tree->stat(nid).loss_chg = static_cast<bst_float>(
|
||||
node_stats[(*p_tree)[nid].cleft()].CalcGain(param) +
|
||||
node_stats[(*p_tree)[nid].cright()].CalcGain(param) -
|
||||
node_stats[nid].CalcGain(param));
|
||||
this->SetStats(nid, node_stats_[nid], p_tree);
|
||||
if (!(*p_tree)[nid].IsLeaf()) {
|
||||
p_tree->Stat(nid).loss_chg = static_cast<bst_float>(
|
||||
node_stats_[(*p_tree)[nid].LeftChild()].CalcGain(param_) +
|
||||
node_stats_[(*p_tree)[nid].RightChild()].CalcGain(param_) -
|
||||
node_stats_[nid].CalcGain(param_));
|
||||
}
|
||||
}
|
||||
// turn the nodes that remain in the expansion queue into leaves
|
||||
for (size_t i = 0; i < qexpand.size(); ++i) {
|
||||
const int nid = qexpand[i];
|
||||
(*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
|
||||
for (int nid : qexpand_) {
|
||||
(*p_tree)[nid].SetLeaf(p_tree->Stat(nid).base_weight * param_.learning_rate);
|
||||
}
|
||||
}
|
||||
// define the sketch we want to use
|
||||
typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
|
||||
using WXQSketch = common::WXQuantileSketch<bst_float, bst_float>;
|
||||
|
||||
private:
|
||||
// statistics needed in the gradient calculation
|
||||
@@ -84,20 +83,20 @@ class SketchMaker: public BaseMaker {
|
||||
double neg_grad;
|
||||
/*! \brief sum of hessian statistics */
|
||||
double sum_hess;
|
||||
// Default constructor: performs no explicit initialisation of the fields.
SKStats(void) {}
|
||||
SKStats() = default;
|
||||
// constructor
// Takes the training parameters for interface compatibility; the argument is not read
// and the statistics are simply reset through Clear().
|
||||
explicit SKStats(const TrainParam &param) {
|
||||
this->Clear();
|
||||
}
|
||||
/*! \brief clear the statistics */
|
||||
// Resets neg_grad, pos_grad and sum_hess to zero.
inline void Clear(void) {
|
||||
inline void Clear() {
|
||||
neg_grad = pos_grad = sum_hess = 0.0f;
|
||||
}
|
||||
// accumulate statistics
|
||||
inline void Add(const std::vector<bst_gpair> &gpair,
|
||||
inline void Add(const std::vector<GradientPair> &gpair,
|
||||
const MetaInfo &info,
|
||||
bst_uint ridx) {
|
||||
const bst_gpair &b = gpair[ridx];
|
||||
const GradientPair &b = gpair[ridx];
|
||||
if (b.GetGrad() >= 0.0f) {
|
||||
pos_grad += b.GetGrad();
|
||||
} else {
|
||||
@@ -133,48 +132,48 @@ class SketchMaker: public BaseMaker {
|
||||
inline void SetLeafVec(const TrainParam ¶m, bst_float *vec) const {
|
||||
}
|
||||
};
|
||||
// Builds the quantile sketches for the nodes in the expansion queue and synchronizes them.
// Three sketches are allocated per (queued node, feature) pair, each initialised with the
// number of rows and sketch_eps. Every column of every column batch is then handed to
// UpdateSketchCol in parallel, with one scratch buffer per OpenMP thread. Afterwards each
// sketch is summarised, pruned to the maximum sketch size into summary_array, and the
// array is combined across workers through sketch_reducer.
inline void BuildSketch(const std::vector<bst_gpair> &gpair,
|
||||
inline void BuildSketch(const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
const RegTree &tree) {
|
||||
const MetaInfo& info = p_fmat->info();
|
||||
sketchs.resize(this->qexpand.size() * tree.param.num_feature * 3);
|
||||
for (size_t i = 0; i < sketchs.size(); ++i) {
|
||||
sketchs[i].Init(info.num_row, this->param.sketch_eps);
|
||||
const MetaInfo& info = p_fmat->Info();
|
||||
sketchs_.resize(this->qexpand_.size() * tree.param.num_feature * 3);
|
||||
for (auto & sketch : sketchs_) {
|
||||
sketch.Init(info.num_row_, this->param_.sketch_eps);
|
||||
}
|
||||
thread_sketch.resize(omp_get_max_threads());
|
||||
thread_sketch_.resize(omp_get_max_threads());
|
||||
// number of rows in the buffered row set
|
||||
const size_t nrows = p_fmat->buffered_rowset().size();
|
||||
const size_t nrows = p_fmat->BufferedRowset().Size();
|
||||
// start accumulating statistics
|
||||
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const ColBatch &batch = iter->Value();
|
||||
// start enumeration
|
||||
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(dynamic, 1)
|
||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||
this->UpdateSketchCol(gpair, batch[i], tree,
|
||||
node_stats,
|
||||
node_stats_,
|
||||
batch.col_index[i],
|
||||
// true when the column has as many entries as there are buffered rows
// NOTE(review): presumably received as the col_full flag -- confirm against UpdateSketchCol
batch[i].length == nrows,
|
||||
&thread_sketch[omp_get_thread_num()]);
|
||||
&thread_sketch_[omp_get_thread_num()]);
|
||||
}
|
||||
}
|
||||
// setup maximum size
|
||||
unsigned max_size = param.max_sketch_size();
|
||||
unsigned max_size = param_.MaxSketchSize();
|
||||
// synchronize sketch
|
||||
summary_array.resize(sketchs.size());
|
||||
for (size_t i = 0; i < sketchs.size(); ++i) {
|
||||
summary_array_.resize(sketchs_.size());
|
||||
for (size_t i = 0; i < sketchs_.size(); ++i) {
|
||||
common::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
|
||||
sketchs[i].GetSummary(&out);
|
||||
summary_array[i].Reserve(max_size);
|
||||
summary_array[i].SetPrune(out, max_size);
|
||||
sketchs_[i].GetSummary(&out);
|
||||
summary_array_[i].Reserve(max_size);
|
||||
summary_array_[i].SetPrune(out, max_size);
|
||||
}
|
||||
// size passed to the reducer for each summary, computed from max_size
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
|
||||
sketch_reducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
|
||||
sketch_reducer_.Allreduce(dmlc::BeginPtr(summary_array_), nbytes, summary_array_.size());
|
||||
}
|
||||
// update sketch information in column fid
|
||||
inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
|
||||
inline void UpdateSketchCol(const std::vector<GradientPair> &gpair,
|
||||
const ColBatch::Inst &c,
|
||||
const RegTree &tree,
|
||||
const std::vector<SKStats> &nstats,
|
||||
@@ -185,20 +184,19 @@ class SketchMaker: public BaseMaker {
|
||||
// initialize sbuilder for use
|
||||
std::vector<SketchEntry> &sbuilder = *p_temp;
|
||||
sbuilder.resize(tree.param.num_nodes * 3);
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const unsigned nid = this->qexpand[i];
|
||||
const unsigned wid = this->node2workindex[nid];
|
||||
for (unsigned int nid : this->qexpand_) {
|
||||
const unsigned wid = this->node2workindex_[nid];
|
||||
for (int k = 0; k < 3; ++k) {
|
||||
sbuilder[3 * nid + k].sum_total = 0.0f;
|
||||
sbuilder[3 * nid + k].sketch = &sketchs[(wid * tree.param.num_feature + fid) * 3 + k];
|
||||
sbuilder[3 * nid + k].sketch = &sketchs_[(wid * tree.param.num_feature + fid) * 3 + k];
|
||||
}
|
||||
}
|
||||
if (!col_full) {
|
||||
for (bst_uint j = 0; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
const int nid = this->position_[ridx];
|
||||
if (nid >= 0) {
|
||||
const bst_gpair &e = gpair[ridx];
|
||||
const GradientPair &e = gpair[ridx];
|
||||
if (e.GetGrad() >= 0.0f) {
|
||||
sbuilder[3 * nid + 0].sum_total += e.GetGrad();
|
||||
} else {
|
||||
@@ -208,8 +206,7 @@ class SketchMaker: public BaseMaker {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const unsigned nid = this->qexpand[i];
|
||||
for (unsigned int nid : this->qexpand_) {
|
||||
sbuilder[3 * nid + 0].sum_total = static_cast<bst_float>(nstats[nid].pos_grad);
|
||||
sbuilder[3 * nid + 1].sum_total = static_cast<bst_float>(nstats[nid].neg_grad);
|
||||
sbuilder[3 * nid + 2].sum_total = static_cast<bst_float>(nstats[nid].sum_hess);
|
||||
@@ -217,8 +214,7 @@ class SketchMaker: public BaseMaker {
|
||||
}
|
||||
// if only one value, no need to do second pass
|
||||
if (c[0].fvalue == c[c.length-1].fvalue) {
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const int nid = this->qexpand[i];
|
||||
for (int nid : this->qexpand_) {
|
||||
for (int k = 0; k < 3; ++k) {
|
||||
sbuilder[3 * nid + k].sketch->Push(c[0].fvalue,
|
||||
static_cast<bst_float>(
|
||||
@@ -228,9 +224,8 @@ class SketchMaker: public BaseMaker {
|
||||
return;
|
||||
}
|
||||
// two pass scan
|
||||
unsigned max_size = param.max_sketch_size();
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const int nid = this->qexpand[i];
|
||||
unsigned max_size = param_.MaxSketchSize();
|
||||
for (int nid : this->qexpand_) {
|
||||
for (int k = 0; k < 3; ++k) {
|
||||
sbuilder[3 * nid + k].Init(max_size);
|
||||
}
|
||||
@@ -238,9 +233,9 @@ class SketchMaker: public BaseMaker {
|
||||
// second pass, build the sketch
|
||||
for (bst_uint j = 0; j < c.length; ++j) {
|
||||
const bst_uint ridx = c[j].index;
|
||||
const int nid = this->position[ridx];
|
||||
const int nid = this->position_[ridx];
|
||||
if (nid >= 0) {
|
||||
const bst_gpair &e = gpair[ridx];
|
||||
const GradientPair &e = gpair[ridx];
|
||||
if (e.GetGrad() >= 0.0f) {
|
||||
sbuilder[3 * nid + 0].Push(c[j].fvalue, e.GetGrad(), max_size);
|
||||
} else {
|
||||
@@ -249,70 +244,69 @@ class SketchMaker: public BaseMaker {
|
||||
sbuilder[3 * nid + 2].Push(c[j].fvalue, e.GetHess(), max_size);
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < this->qexpand.size(); ++i) {
|
||||
const int nid = this->qexpand[i];
|
||||
for (int nid : this->qexpand_) {
|
||||
for (int k = 0; k < 3; ++k) {
|
||||
sbuilder[3 * nid + k].Finalize(max_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
inline void SyncNodeStats(void) {
|
||||
CHECK_NE(qexpand.size(), 0U);
|
||||
std::vector<SKStats> tmp(qexpand.size());
|
||||
for (size_t i = 0; i < qexpand.size(); ++i) {
|
||||
tmp[i] = node_stats[qexpand[i]];
|
||||
inline void SyncNodeStats() {
|
||||
CHECK_NE(qexpand_.size(), 0U);
|
||||
std::vector<SKStats> tmp(qexpand_.size());
|
||||
for (size_t i = 0; i < qexpand_.size(); ++i) {
|
||||
tmp[i] = node_stats_[qexpand_[i]];
|
||||
}
|
||||
stats_reducer.Allreduce(dmlc::BeginPtr(tmp), tmp.size());
|
||||
for (size_t i = 0; i < qexpand.size(); ++i) {
|
||||
node_stats[qexpand[i]] = tmp[i];
|
||||
stats_reducer_.Allreduce(dmlc::BeginPtr(tmp), tmp.size());
|
||||
for (size_t i = 0; i < qexpand_.size(); ++i) {
|
||||
node_stats_[qexpand_[i]] = tmp[i];
|
||||
}
|
||||
}
|
||||
inline void FindSplit(int depth,
|
||||
const std::vector<bst_gpair> &gpair,
|
||||
const std::vector<GradientPair> &gpair,
|
||||
DMatrix *p_fmat,
|
||||
RegTree *p_tree) {
|
||||
const bst_uint num_feature = p_tree->param.num_feature;
|
||||
// get the best split condition for each node
|
||||
std::vector<SplitEntry> sol(qexpand.size());
|
||||
bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
|
||||
std::vector<SplitEntry> sol(qexpand_.size());
|
||||
auto nexpand = static_cast<bst_omp_uint>(qexpand_.size());
|
||||
#pragma omp parallel for schedule(dynamic, 1)
|
||||
for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
|
||||
const int nid = qexpand[wid];
|
||||
CHECK_EQ(node2workindex[nid], static_cast<int>(wid));
|
||||
const int nid = qexpand_[wid];
|
||||
CHECK_EQ(node2workindex_[nid], static_cast<int>(wid));
|
||||
SplitEntry &best = sol[wid];
|
||||
for (bst_uint fid = 0; fid < num_feature; ++fid) {
|
||||
unsigned base = (wid * p_tree->param.num_feature + fid) * 3;
|
||||
EnumerateSplit(summary_array[base + 0],
|
||||
summary_array[base + 1],
|
||||
summary_array[base + 2],
|
||||
node_stats[nid], fid, &best);
|
||||
EnumerateSplit(summary_array_[base + 0],
|
||||
summary_array_[base + 1],
|
||||
summary_array_[base + 2],
|
||||
node_stats_[nid], fid, &best);
|
||||
}
|
||||
}
|
||||
// get the best result, we can synchronize the solution
|
||||
for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
|
||||
const int nid = qexpand[wid];
|
||||
const int nid = qexpand_[wid];
|
||||
const SplitEntry &best = sol[wid];
|
||||
// set up the values
|
||||
p_tree->stat(nid).loss_chg = best.loss_chg;
|
||||
this->SetStats(nid, node_stats[nid], p_tree);
|
||||
p_tree->Stat(nid).loss_chg = best.loss_chg;
|
||||
this->SetStats(nid, node_stats_[nid], p_tree);
|
||||
// now we know the solution in snode[nid], set split
|
||||
if (best.loss_chg > rt_eps) {
|
||||
if (best.loss_chg > kRtEps) {
|
||||
p_tree->AddChilds(nid);
|
||||
(*p_tree)[nid].set_split(best.split_index(),
|
||||
best.split_value, best.default_left());
|
||||
(*p_tree)[nid].SetSplit(best.SplitIndex(),
|
||||
best.split_value, best.DefaultLeft());
|
||||
// mark right child as 0, to indicate fresh leaf
|
||||
(*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
|
||||
(*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
|
||||
(*p_tree)[(*p_tree)[nid].LeftChild()].SetLeaf(0.0f, 0);
|
||||
(*p_tree)[(*p_tree)[nid].RightChild()].SetLeaf(0.0f, 0);
|
||||
} else {
|
||||
(*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
|
||||
(*p_tree)[nid].SetLeaf(p_tree->Stat(nid).base_weight * param_.learning_rate);
|
||||
}
|
||||
}
|
||||
}
|
||||
// set statistics on ptree
|
||||
inline void SetStats(int nid, const SKStats &node_sum, RegTree *p_tree) {
|
||||
p_tree->stat(nid).base_weight = static_cast<bst_float>(node_sum.CalcWeight(param));
|
||||
p_tree->stat(nid).sum_hess = static_cast<bst_float>(node_sum.sum_hess);
|
||||
node_sum.SetLeafVec(param, p_tree->leafvec(nid));
|
||||
p_tree->Stat(nid).base_weight = static_cast<bst_float>(node_sum.CalcWeight(param_));
|
||||
p_tree->Stat(nid).sum_hess = static_cast<bst_float>(node_sum.sum_hess);
|
||||
node_sum.SetLeafVec(param_, p_tree->Leafvec(nid));
|
||||
}
|
||||
inline void EnumerateSplit(const WXQSketch::Summary &pos_grad,
|
||||
const WXQSketch::Summary &neg_grad,
|
||||
@@ -321,7 +315,7 @@ class SketchMaker: public BaseMaker {
|
||||
bst_uint fid,
|
||||
SplitEntry *best) {
|
||||
if (sum_hess.size == 0) return;
|
||||
double root_gain = node_sum.CalcGain(param);
|
||||
double root_gain = node_sum.CalcGain(param_);
|
||||
std::vector<bst_float> fsplits;
|
||||
for (size_t i = 0; i < pos_grad.size; ++i) {
|
||||
fsplits.push_back(pos_grad.data[i].value);
|
||||
@@ -350,17 +344,17 @@ class SketchMaker: public BaseMaker {
|
||||
s.sum_hess = 0.5f * (hess.rmin + hess.rmax - hess.wmin);
|
||||
c.SetSubstract(node_sum, s);
|
||||
// forward
|
||||
if (s.sum_hess >= param.min_child_weight &&
|
||||
c.sum_hess >= param.min_child_weight) {
|
||||
double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
|
||||
if (s.sum_hess >= param_.min_child_weight &&
|
||||
c.sum_hess >= param_.min_child_weight) {
|
||||
double loss_chg = s.CalcGain(param_) + c.CalcGain(param_) - root_gain;
|
||||
best->Update(static_cast<bst_float>(loss_chg), fid, fsplits[i], false);
|
||||
}
|
||||
// backward
|
||||
c.SetSubstract(feat_sum, s);
|
||||
s.SetSubstract(node_sum, c);
|
||||
if (s.sum_hess >= param.min_child_weight &&
|
||||
c.sum_hess >= param.min_child_weight) {
|
||||
double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
|
||||
if (s.sum_hess >= param_.min_child_weight &&
|
||||
c.sum_hess >= param_.min_child_weight) {
|
||||
double loss_chg = s.CalcGain(param_) + c.CalcGain(param_) - root_gain;
|
||||
best->Update(static_cast<bst_float>(loss_chg), fid, fsplits[i], true);
|
||||
}
|
||||
}
|
||||
@@ -368,10 +362,10 @@ class SketchMaker: public BaseMaker {
|
||||
// all including
|
||||
SKStats s = feat_sum, c;
|
||||
c.SetSubstract(node_sum, s);
|
||||
if (s.sum_hess >= param.min_child_weight &&
|
||||
c.sum_hess >= param.min_child_weight) {
|
||||
if (s.sum_hess >= param_.min_child_weight &&
|
||||
c.sum_hess >= param_.min_child_weight) {
|
||||
bst_float cpt = fsplits.back();
|
||||
double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
|
||||
double loss_chg = s.CalcGain(param_) + c.CalcGain(param_) - root_gain;
|
||||
best->Update(static_cast<bst_float>(loss_chg),
|
||||
fid, cpt + std::abs(cpt) + 1.0f, false);
|
||||
}
|
||||
@@ -380,19 +374,19 @@ class SketchMaker: public BaseMaker {
|
||||
|
||||
// thread temp data
|
||||
// used to hold temporal sketch
|
||||
std::vector<std::vector<SketchEntry> > thread_sketch;
|
||||
std::vector<std::vector<SketchEntry> > thread_sketch_;
|
||||
// used to hold statistics
|
||||
std::vector<std::vector<SKStats> > thread_stats;
|
||||
std::vector<std::vector<SKStats> > thread_stats_;
|
||||
// node statistics
|
||||
std::vector<SKStats> node_stats;
|
||||
std::vector<SKStats> node_stats_;
|
||||
// summary array
|
||||
std::vector<WXQSketch::SummaryContainer> summary_array;
|
||||
std::vector<WXQSketch::SummaryContainer> summary_array_;
|
||||
// reducer for summary
|
||||
rabit::Reducer<SKStats, SKStats::Reduce> stats_reducer;
|
||||
rabit::Reducer<SKStats, SKStats::Reduce> stats_reducer_;
|
||||
// reducer for summary
|
||||
rabit::SerializeReducer<WXQSketch::SummaryContainer> sketch_reducer;
|
||||
rabit::SerializeReducer<WXQSketch::SummaryContainer> sketch_reducer_;
|
||||
// per node, per feature sketch
|
||||
std::vector<common::WXQuantileSketch<bst_float, bst_float> > sketchs;
|
||||
std::vector<common::WXQuantileSketch<bst_float, bst_float> > sketchs_;
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_TREE_UPDATER(SketchMaker, "grow_skmaker")
|
||||
|
||||
@@ -23,7 +23,7 @@ class TreeSyncher: public TreeUpdater {
|
||||
public:
|
||||
void Init(const std::vector<std::pair<std::string, std::string> >& args) override {}
|
||||
|
||||
void Update(HostDeviceVector<bst_gpair> *gpair,
|
||||
void Update(HostDeviceVector<GradientPair> *gpair,
|
||||
DMatrix* dmat,
|
||||
const std::vector<RegTree*> &trees) override {
|
||||
if (rabit::GetWorldSize() == 1) return;
|
||||
@@ -31,14 +31,14 @@ class TreeSyncher: public TreeUpdater {
|
||||
common::MemoryBufferStream fs(&s_model);
|
||||
int rank = rabit::GetRank();
|
||||
if (rank == 0) {
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
trees[i]->Save(&fs);
|
||||
for (auto tree : trees) {
|
||||
tree->Save(&fs);
|
||||
}
|
||||
}
|
||||
fs.Seek(0);
|
||||
rabit::Broadcast(&s_model, 0);
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
trees[i]->Load(&fs);
|
||||
for (auto tree : trees) {
|
||||
tree->Load(&fs);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user