/*!
 * Copyright 2017-2021 by Contributors
 * \file updater_quantile_hist.cc
 * \brief use quantized feature values to construct a tree
 * \author Philip Cho, Tianqi Chen, Egor Smirnov
 */
#include <dmlc/timer.h>
#include <rabit/rabit.h>

#include <algorithm>
#include <cmath>
#include <limits>
#include <memory>
#include <numeric>
#include <queue>
#include <string>
#include <utility>
#include <vector>

#include "xgboost/logging.h"
#include "xgboost/tree_updater.h"

#include "constraints.h"
#include "param.h"
#include "./updater_quantile_hist.h"
#include "./split_evaluator.h"
#include "../common/random.h"
#include "../common/hist_util.h"
#include "../common/row_set.h"
#include "../common/column_matrix.h"
#include "../common/threading_utils.h"

namespace xgboost {
namespace tree {

DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);

DMLC_REGISTER_PARAMETER(CPUHistMakerTrainParam);

void QuantileHistMaker::Configure(const Args& args) {
  // initialize pruner
  if (!pruner_) {
    pruner_.reset(TreeUpdater::Create("prune", tparam_, task_));
  }
  pruner_->Configure(args);
  param_.UpdateAllowUnknown(args);
  hist_maker_param_.UpdateAllowUnknown(args);
}

template <typename GradientSumT>
void QuantileHistMaker::SetBuilder(const size_t n_trees,
                                   std::unique_ptr<Builder<GradientSumT>>* builder,
                                   DMatrix* dmat) {
  builder->reset(new Builder<GradientSumT>(n_trees, param_, std::move(pruner_), dmat, task_));
}

template <typename GradientSumT>
void QuantileHistMaker::CallBuilderUpdate(const std::unique_ptr<Builder<GradientSumT>>& builder,
                                          HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
                                          GHistIndexMatrix const& gmat,
                                          const std::vector<RegTree*>& trees) {
  for (auto tree : trees) {
    builder->Update(gmat, column_matrix_, gpair, dmat, tree);
  }
}

void QuantileHistMaker::Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
                               const std::vector<RegTree*>& trees) {
  auto it =
      dmat->GetBatches<GHistIndexMatrix>(BatchParam{GenericParameter::kCpuId, param_.max_bin})
          .begin();
  auto p_gmat = it.Page();
  if (dmat != p_last_dmat_ || is_gmat_initialized_ == false) {
    updater_monitor_.Start("GmatInitialization");
    column_matrix_.Init(*p_gmat, param_.sparse_threshold);
    updater_monitor_.Stop("GmatInitialization");
    // A proper solution is putting the cut matrix in DMatrix, see:
    // https://github.com/dmlc/xgboost/issues/5143
    is_gmat_initialized_ = true;
  }
  // rescale learning rate according to size of trees
  float lr = param_.learning_rate;
  param_.learning_rate = lr / trees.size();
  // build tree
  const size_t n_trees = trees.size();
  if (hist_maker_param_.single_precision_histogram) {
    if (!float_builder_) {
      this->SetBuilder(n_trees, &float_builder_, dmat);
    }
    CallBuilderUpdate(float_builder_, gpair, dmat, *p_gmat, trees);
  } else {
    if (!double_builder_) {
      SetBuilder(n_trees, &double_builder_, dmat);
    }
    CallBuilderUpdate(double_builder_, gpair, dmat, *p_gmat, trees);
  }

  param_.learning_rate = lr;

  p_last_dmat_ = dmat;
}

bool QuantileHistMaker::UpdatePredictionCache(const DMatrix* data,
                                              linalg::VectorView<float> out_preds) {
  if (hist_maker_param_.single_precision_histogram && float_builder_) {
    return float_builder_->UpdatePredictionCache(data, out_preds);
  } else if (double_builder_) {
    return double_builder_->UpdatePredictionCache(data, out_preds);
  } else {
    return false;
  }
}
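// Illustrative usage sketch (not part of the build): how a caller typically drives this
// updater through the TreeUpdater interface. `ctx`, `task`, `args`, `gpair`, `p_fmat`,
// `tree`, and `out_preds` are hypothetical stand-ins for objects owned by the caller.
//
//   std::unique_ptr<TreeUpdater> updater{
//       TreeUpdater::Create("grow_quantile_histmaker", &ctx, task)};
//   updater->Configure(args);                           // e.g. {"max_bin", "256"}
//   updater->Update(&gpair, p_fmat, {&tree});           // grows one tree per entry
//   updater->UpdatePredictionCache(p_fmat, out_preds);  // cheap refresh of cached preds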
template <typename GradientSumT>
template <bool any_missing>
void QuantileHistMaker::Builder<GradientSumT>::InitRoot(
    DMatrix* p_fmat, RegTree* p_tree, const std::vector<GradientPair>& gpair_h, int* num_leaves,
    std::vector<CPUExpandEntry>* expand) {
  CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0), 0.0f);

  nodes_for_explicit_hist_build_.clear();
  nodes_for_subtraction_trick_.clear();
  nodes_for_explicit_hist_build_.push_back(node);

  size_t page_id = 0;
  for (auto const& gidx :
       p_fmat->GetBatches<GHistIndexMatrix>({GenericParameter::kCpuId, param_.max_bin})) {
    this->histogram_builder_->BuildHist(page_id, gidx, p_tree, row_set_collection_,
                                        nodes_for_explicit_hist_build_,
                                        nodes_for_subtraction_trick_, gpair_h);
    ++page_id;
  }

  {
    auto nid = RegTree::kRoot;
    GHistRowT hist = this->histogram_builder_->Histogram()[nid];
    GradientPairT grad_stat;
    if (data_layout_ == DataLayout::kDenseDataZeroBased ||
        data_layout_ == DataLayout::kDenseDataOneBased) {
      auto const& gmat = *(p_fmat
                               ->GetBatches<GHistIndexMatrix>(
                                   BatchParam{GenericParameter::kCpuId, param_.max_bin})
                               .begin());
      const std::vector<uint32_t>& row_ptr = gmat.cut.Ptrs();
      const uint32_t ibegin = row_ptr[fid_least_bins_];
      const uint32_t iend = row_ptr[fid_least_bins_ + 1];
      auto begin = hist.data();
      for (uint32_t i = ibegin; i < iend; ++i) {
        const GradientPairT et = begin[i];
        grad_stat.Add(et.GetGrad(), et.GetHess());
      }
    } else {
      const RowSetCollection::Elem e = row_set_collection_[nid];
      for (const size_t* it = e.begin; it < e.end; ++it) {
        grad_stat.Add(gpair_h[*it].GetGrad(), gpair_h[*it].GetHess());
      }
      rabit::Allreduce<rabit::op::Sum, GradientSumT>(
          reinterpret_cast<GradientSumT*>(&grad_stat), 2);
    }

    auto weight = evaluator_->InitRoot(GradStats{grad_stat});
    p_tree->Stat(RegTree::kRoot).sum_hess = grad_stat.GetHess();
    p_tree->Stat(RegTree::kRoot).base_weight = weight;
    (*p_tree)[RegTree::kRoot].SetLeaf(param_.learning_rate * weight);

    std::vector<CPUExpandEntry> entries{node};
    builder_monitor_.Start("EvaluateSplits");
    auto ft = p_fmat->Info().feature_types.ConstHostSpan();
    for (auto const& gmat : p_fmat->GetBatches<GHistIndexMatrix>(
             BatchParam{GenericParameter::kCpuId, param_.max_bin})) {
      evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree,
                                 &entries);
      break;
    }
    builder_monitor_.Stop("EvaluateSplits");
    node = entries.front();
  }

  expand->push_back(node);
  ++(*num_leaves);
}

template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::AddSplitsToTree(
    const std::vector<CPUExpandEntry>& expand, RegTree* p_tree, int* num_leaves,
    std::vector<CPUExpandEntry>* nodes_for_apply_split) {
  for (auto const& entry : expand) {
    if (entry.IsValid(param_, *num_leaves)) {
      nodes_for_apply_split->push_back(entry);
      evaluator_->ApplyTreeSplit(entry, p_tree);
      (*num_leaves)++;
    }
  }
}

// Split nodes into 2 sets depending on the amount of rows in each node.
// Histograms for small nodes will be built explicitly;
// histograms for big nodes will be built by the 'Subtraction Trick'.
// Exception: in the distributed setting, we always build the histogram for the left child node
// and use the 'Subtraction Trick' to build the histogram for the right child node.
// This ensures that the workers operate on the same set of tree nodes.
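// A minimal sketch of the arithmetic behind the 'Subtraction Trick' described above
// (illustrative only; the real work happens inside histogram_builder_). For every bin,
// parent = left + right, so once the smaller child's histogram is built explicitly the
// sibling comes for free:
//
//   // hypothetical gradient sums for one bin
//   GradientPairT parent{/*grad=*/10.0, /*hess=*/20.0};
//   GradientPairT built{/*grad=*/4.0, /*hess=*/7.0};  // smaller child, built explicitly
//   GradientPairT sibling{parent.GetGrad() - built.GetGrad(),
//                         parent.GetHess() - built.GetHess()};  // {6.0, 13.0}
//
// Building the smaller child costs O(rows in that child); the subtraction costs O(bins),
// independent of the number of rows.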
template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::SplitSiblings(
    const std::vector<CPUExpandEntry>& nodes_for_apply_split,
    std::vector<CPUExpandEntry>* nodes_to_evaluate, RegTree* p_tree) {
  builder_monitor_.Start("SplitSiblings");
  for (auto const& entry : nodes_for_apply_split) {
    int nid = entry.nid;

    const int cleft = (*p_tree)[nid].LeftChild();
    const int cright = (*p_tree)[nid].RightChild();
    const CPUExpandEntry left_node = CPUExpandEntry(cleft, p_tree->GetDepth(cleft), 0.0);
    const CPUExpandEntry right_node = CPUExpandEntry(cright, p_tree->GetDepth(cright), 0.0);
    nodes_to_evaluate->push_back(left_node);
    nodes_to_evaluate->push_back(right_node);
    if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) {
      nodes_for_explicit_hist_build_.push_back(left_node);
      nodes_for_subtraction_trick_.push_back(right_node);
    } else {
      nodes_for_explicit_hist_build_.push_back(right_node);
      nodes_for_subtraction_trick_.push_back(left_node);
    }
  }
  CHECK_EQ(nodes_for_subtraction_trick_.size(), nodes_for_explicit_hist_build_.size());
  builder_monitor_.Stop("SplitSiblings");
}

template <typename GradientSumT>
template <bool any_missing>
void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
    const GHistIndexMatrix& gmat, const ColumnMatrix& column_matrix, DMatrix* p_fmat,
    RegTree* p_tree, const std::vector<GradientPair>& gpair_h) {
  builder_monitor_.Start("ExpandTree");
  int num_leaves = 0;

  Driver<CPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param_.grow_policy));
  std::vector<CPUExpandEntry> expand;
  InitRoot<any_missing>(p_fmat, p_tree, gpair_h, &num_leaves, &expand);
  driver.Push(expand[0]);

  int32_t depth = 0;
  while (!driver.IsEmpty()) {
    expand = driver.Pop();
    depth = expand[0].depth + 1;
    std::vector<CPUExpandEntry> nodes_for_apply_split;
    std::vector<CPUExpandEntry> nodes_to_evaluate;
    nodes_for_explicit_hist_build_.clear();
    nodes_for_subtraction_trick_.clear();

    AddSplitsToTree(expand, p_tree, &num_leaves, &nodes_for_apply_split);
    if (nodes_for_apply_split.size() != 0) {
      ApplySplit<any_missing>(nodes_for_apply_split, gmat, column_matrix, p_tree);
      SplitSiblings(nodes_for_apply_split, &nodes_to_evaluate, p_tree);

      if (depth < param_.max_depth) {
        size_t i = 0;
        for (auto const& gidx :
             p_fmat->GetBatches<GHistIndexMatrix>({GenericParameter::kCpuId, param_.max_bin})) {
          this->histogram_builder_->BuildHist(i, gidx, p_tree, row_set_collection_,
                                              nodes_for_explicit_hist_build_,
                                              nodes_for_subtraction_trick_, gpair_h);
          ++i;
        }
      } else {
        int starting_index = std::numeric_limits<int>::max();
        int sync_count = 0;
        this->histogram_builder_->AddHistRows(&starting_index, &sync_count,
                                              nodes_for_explicit_hist_build_,
                                              nodes_for_subtraction_trick_, p_tree);
      }

      builder_monitor_.Start("EvaluateSplits");
      auto ft = p_fmat->Info().feature_types.ConstHostSpan();
      evaluator_->EvaluateSplits(this->histogram_builder_->Histogram(), gmat.cut, ft, *p_tree,
                                 &nodes_to_evaluate);
      builder_monitor_.Stop("EvaluateSplits");

      for (size_t i = 0; i < nodes_for_apply_split.size(); ++i) {
        CPUExpandEntry left_node = nodes_to_evaluate.at(i * 2 + 0);
        CPUExpandEntry right_node = nodes_to_evaluate.at(i * 2 + 1);
        driver.Push(left_node);
        driver.Push(right_node);
      }
    }
  }
  builder_monitor_.Stop("ExpandTree");
}
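// Illustrative note on the Driver used in ExpandTree above: with TrainParam::kDepthWise,
// candidates are popped level by level, while TrainParam::kLossGuide pops the candidate
// with the best loss change first, like a priority queue keyed on loss_chg. A hypothetical
// trace with three candidates written as {nid, depth, loss_chg}:
//
//   {1, 1, 0.9}, {2, 1, 0.1}, {3, 2, 0.5}
//   kDepthWise -> expands depth-1 nodes {1, 2} together, then {3}
//   kLossGuide -> expands 1 (0.9), then 3 (0.5), then 2 (0.1)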
template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::Update(const GHistIndexMatrix& gmat,
                                                      const ColumnMatrix& column_matrix,
                                                      HostDeviceVector<GradientPair>* gpair,
                                                      DMatrix* p_fmat, RegTree* p_tree) {
  builder_monitor_.Start("Update");

  std::vector<GradientPair>* gpair_ptr = &(gpair->HostVector());
  // in case 'num_parallel_trees != 1' there is no possibility to change the initial gpair
  if (GetNumberOfTrees() != 1) {
    gpair_local_.resize(gpair_ptr->size());
    gpair_local_ = *gpair_ptr;
    gpair_ptr = &gpair_local_;
  }
  p_last_fmat_mutable_ = p_fmat;

  this->InitData(gmat, *p_fmat, *p_tree, gpair_ptr);

  if (column_matrix.AnyMissing()) {
    ExpandTree<true>(gmat, column_matrix, p_fmat, p_tree, *gpair_ptr);
  } else {
    ExpandTree<false>(gmat, column_matrix, p_fmat, p_tree, *gpair_ptr);
  }

  pruner_->Update(gpair, p_fmat, std::vector<RegTree*>{p_tree});

  builder_monitor_.Stop("Update");
}

template <typename GradientSumT>
bool QuantileHistMaker::Builder<GradientSumT>::UpdatePredictionCache(
    const DMatrix* data, linalg::VectorView<float> out_preds) {
  // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
  // conjunction with Update().
  if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_ ||
      p_last_fmat_ != p_last_fmat_mutable_) {
    return false;
  }
  builder_monitor_.Start("UpdatePredictionCache");
  CHECK_GT(out_preds.Size(), 0U);

  size_t n_nodes = row_set_collection_.end() - row_set_collection_.begin();
  common::BlockedSpace2d space(
      n_nodes, [&](size_t node) { return row_set_collection_[node].Size(); }, 1024);
  CHECK_EQ(out_preds.DeviceIdx(), GenericParameter::kCpuId);
  common::ParallelFor2d(space, this->nthread_, [&](size_t node, common::Range1d r) {
    const RowSetCollection::Elem rowset = row_set_collection_[node];
    if (rowset.begin != nullptr && rowset.end != nullptr) {
      int nid = rowset.node_id;
      bst_float leaf_value;
      // if a node is marked as deleted by the pruner, traverse upward to locate
      // a non-deleted leaf.
      if ((*p_last_tree_)[nid].IsDeleted()) {
        while ((*p_last_tree_)[nid].IsDeleted()) {
          nid = (*p_last_tree_)[nid].Parent();
        }
        CHECK((*p_last_tree_)[nid].IsLeaf());
      }
      leaf_value = (*p_last_tree_)[nid].LeafValue();

      for (const size_t* it = rowset.begin + r.begin(); it < rowset.begin + r.end(); ++it) {
        out_preds(*it) += leaf_value;
      }
    }
  });

  builder_monitor_.Stop("UpdatePredictionCache");
  return true;
}
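// Illustrative sketch of the BlockedSpace2d/ParallelFor2d pattern used above (and again in
// ApplySplit below): a 2D iteration space is built from one row range per node, cut into
// fixed-size blocks that are distributed over threads, so large leaves do not serialize
// behind small ones. Sizes here are hypothetical.
//
//   // node 0 has 3000 rows, node 1 has 100
//   common::BlockedSpace2d space(
//       /*n_nodes=*/2, [&](size_t node) { return node == 0 ? 3000 : 100; },
//       /*grain=*/1024);  // -> blocks (0,[0,1024)), (0,[1024,2048)), (0,[2048,3000)), (1,[0,100))
//   common::ParallelFor2d(space, nthread, [&](size_t node, common::Range1d r) {
//     // process rows [r.begin(), r.end()) of `node` on the current thread
//   });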
template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::InitSampling(const DMatrix& fmat,
                                                            std::vector<GradientPair>* gpair,
                                                            std::vector<size_t>* row_indices) {
  const auto& info = fmat.Info();
  auto& rnd = common::GlobalRandom();
  std::vector<GradientPair>& gpair_ref = *gpair;

#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
  std::bernoulli_distribution coin_flip(param_.subsample);
  for (size_t i = 0; i < info.num_row_; ++i) {
    if (!(gpair_ref[i].GetHess() >= 0.0f && coin_flip(rnd)) || gpair_ref[i].GetGrad() == 0.0f) {
      gpair_ref[i] = GradientPair(0);
    }
  }
#else
  const size_t nthread = this->nthread_;
  uint64_t initial_seed = rnd();

  const size_t discard_size = info.num_row_ / nthread;
  std::bernoulli_distribution coin_flip(param_.subsample);

  dmlc::OMPException exc;
#pragma omp parallel num_threads(nthread)
  {
    exc.Run([&]() {
      const size_t tid = omp_get_thread_num();
      const size_t ibegin = tid * discard_size;
      const size_t iend = (tid == (nthread - 1)) ? info.num_row_ : ibegin + discard_size;
      RandomReplace::MakeIf(
          [&](size_t i, RandomReplace::EngineT& eng) {
            return !(gpair_ref[i].GetHess() >= 0.0f && coin_flip(eng));
          },
          GradientPair(0), initial_seed, ibegin, iend, &gpair_ref);
    });
  }
  exc.Rethrow();
#endif  // XGBOOST_CUSTOMIZE_GLOBAL_PRNG
}

template <typename GradientSumT>
size_t QuantileHistMaker::Builder<GradientSumT>::GetNumberOfTrees() { return n_trees_; }

template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix& gmat,
                                                        const DMatrix& fmat, const RegTree& tree,
                                                        std::vector<GradientPair>* gpair) {
  CHECK((param_.max_depth > 0 || param_.max_leaves > 0))
      << "max_depth and max_leaves cannot both be 0 (unlimited); "
      << "at least one should be a positive quantity.";
  if (param_.grow_policy == TrainParam::kDepthWise) {
    CHECK(param_.max_depth > 0) << "max_depth cannot be 0 (unlimited) "
                                << "when grow_policy is depthwise.";
  }
  builder_monitor_.Start("InitData");
  const auto& info = fmat.Info();

  {
    // initialize the row set
    row_set_collection_.Clear();
    // initialize histogram collection
    uint32_t nbins = gmat.cut.Ptrs().back();
    // initialize histogram builder
    dmlc::OMPException exc;
#pragma omp parallel
    {
      exc.Run([&]() { this->nthread_ = omp_get_num_threads(); });
    }
    exc.Rethrow();
    this->histogram_builder_->Reset(nbins, BatchParam{GenericParameter::kCpuId, param_.max_bin},
                                    this->nthread_, 1, rabit::IsDistributed());

    std::vector<size_t>& row_indices = *row_set_collection_.Data();
    row_indices.resize(info.num_row_);
    size_t* p_row_indices = row_indices.data();
    // mark subsample and build list of member rows
    if (param_.subsample < 1.0f) {
      CHECK_EQ(param_.sampling_method, TrainParam::kUniform)
          << "Only uniform sampling is supported; "
          << "gradient-based sampling is only supported by GPU Hist.";
      builder_monitor_.Start("InitSampling");
      InitSampling(fmat, gpair, &row_indices);
      builder_monitor_.Stop("InitSampling");
      CHECK_EQ(row_indices.size(), info.num_row_);
      // We should check that the partitioning was done correctly
      // and each row of the dataset fell into exactly one of the categories
    }
    common::MemStackAllocator<bool, 128> buff(this->nthread_);
    bool* p_buff = buff.Get();
    std::fill(p_buff, p_buff + this->nthread_, false);

    const size_t block_size =
        info.num_row_ / this->nthread_ + !!(info.num_row_ % this->nthread_);

#pragma omp parallel num_threads(this->nthread_)
    {
      exc.Run([&]() {
        const size_t tid = omp_get_thread_num();
        const size_t ibegin = tid * block_size;
        const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
                                     static_cast<size_t>(info.num_row_));
        for (size_t i = ibegin; i < iend; ++i) {
          if ((*gpair)[i].GetHess() < 0.0f) {
            p_buff[tid] = true;
            break;
          }
        }
      });
    }
    exc.Rethrow();

    bool has_neg_hess = false;
    for (int32_t tid = 0; tid < this->nthread_; ++tid) {
      if (p_buff[tid]) {
        has_neg_hess = true;
      }
    }

    if (has_neg_hess) {
      size_t j = 0;
      for (size_t i = 0; i < info.num_row_; ++i) {
        if ((*gpair)[i].GetHess() >= 0.0f) {
          p_row_indices[j++] = i;
        }
      }
      row_indices.resize(j);
    } else {
#pragma omp parallel num_threads(this->nthread_)
      {
        exc.Run([&]() {
          const size_t tid = omp_get_thread_num();
          const size_t ibegin = tid * block_size;
          const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
                                       static_cast<size_t>(info.num_row_));
          for (size_t i = ibegin; i < iend; ++i) {
            p_row_indices[i] = i;
          }
        });
      }
      exc.Rethrow();
    }
  }

  row_set_collection_.Init();

  {
    /* determine layout of data */
    const size_t nrow = info.num_row_;
    const size_t ncol = info.num_col_;
    const size_t nnz = info.num_nonzero_;
    // number of discrete bins for feature 0
    const uint32_t nbins_f0 = gmat.cut.Ptrs()[1] - gmat.cut.Ptrs()[0];
    if (nrow * ncol == nnz) {
      // dense data with zero-based indexing
      data_layout_ = DataLayout::kDenseDataZeroBased;
    } else if (nbins_f0 == 0 && nrow * (ncol - 1) == nnz) {
      // dense data with one-based indexing
      data_layout_ = DataLayout::kDenseDataOneBased;
    } else {
      // sparse data
      data_layout_ = DataLayout::kSparseData;
    }
  }
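  // Worked example (illustrative) for the layout detection above, with hypothetical
  // shapes nrow = 4, ncol = 3:
  //   nnz = 12 == 4 * 3                              -> kDenseDataZeroBased
  //   nnz =  8 == 4 * (3 - 1) and nbins_f0 == 0      -> kDenseDataOneBased (feature 0 unused)
  //   anything else                                  -> kSparseData (some entries missing)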
  // store a pointer to the tree
  p_last_tree_ = &tree;
  if (data_layout_ == DataLayout::kDenseDataOneBased) {
    evaluator_.reset(new HistEvaluator<GradientSumT, CPUExpandEntry>{
        param_, info, this->nthread_, column_sampler_, task_, true});
  } else {
    evaluator_.reset(new HistEvaluator<GradientSumT, CPUExpandEntry>{
        param_, info, this->nthread_, column_sampler_, task_, false});
  }

  if (data_layout_ == DataLayout::kDenseDataZeroBased ||
      data_layout_ == DataLayout::kDenseDataOneBased) {
    /* specialized code for dense data:
       choose the column that has the least positive number of discrete bins.
       For dense data (with no missing values), the sum of the gradient histogram
       is equal to snode[nid] */
    const std::vector<uint32_t>& row_ptr = gmat.cut.Ptrs();
    const auto nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
    uint32_t min_nbins_per_feature = 0;
    for (bst_uint i = 0; i < nfeature; ++i) {
      const uint32_t nbins = row_ptr[i + 1] - row_ptr[i];
      if (nbins > 0) {
        if (min_nbins_per_feature == 0 || min_nbins_per_feature > nbins) {
          min_nbins_per_feature = nbins;
          fid_least_bins_ = i;
        }
      }
    }
    CHECK_GT(min_nbins_per_feature, 0U);
  }

  builder_monitor_.Stop("InitData");
}

template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::FindSplitConditions(
    const std::vector<CPUExpandEntry>& nodes, const RegTree& tree, const GHistIndexMatrix& gmat,
    std::vector<int32_t>* split_conditions) {
  const size_t n_nodes = nodes.size();
  split_conditions->resize(n_nodes);

  for (size_t i = 0; i < nodes.size(); ++i) {
    const int32_t nid = nodes[i].nid;
    const bst_uint fid = tree[nid].SplitIndex();
    const bst_float split_pt = tree[nid].SplitCond();
    const uint32_t lower_bound = gmat.cut.Ptrs()[fid];
    const uint32_t upper_bound = gmat.cut.Ptrs()[fid + 1];
    int32_t split_cond = -1;
    // convert floating-point split_pt into the corresponding bin_id
    // split_cond = -1 indicates that split_pt is less than all known cut points
    CHECK_LT(upper_bound, static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
    for (uint32_t bound = lower_bound; bound < upper_bound; ++bound) {
      if (split_pt == gmat.cut.Values()[bound]) {
        split_cond = static_cast<int32_t>(bound);
      }
    }
    (*split_conditions)[i] = split_cond;
  }
}
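// Illustrative sketch of the bin-id lookup performed in FindSplitConditions above. Cut
// values within one feature are sorted, so with hypothetical cuts {0.5, 1.5, 3.0} spanning
// bins [lower_bound, upper_bound):
//   split_pt == 1.5 -> split_cond = lower_bound + 1
//   split_pt <  0.5 -> split_cond = -1 (less than all known cut points)
// Since the values are sorted, a binary search would also work:
//
//   auto const& vals = gmat.cut.Values();
//   auto it = std::lower_bound(vals.begin() + lower_bound,
//                              vals.begin() + upper_bound, split_pt);
//   // *it == split_pt on a hit; the linear scan above is equivalent.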
template <typename GradientSumT>
void QuantileHistMaker::Builder<GradientSumT>::AddSplitsToRowSet(
    const std::vector<CPUExpandEntry>& nodes, RegTree* p_tree) {
  const size_t n_nodes = nodes.size();
  for (unsigned int i = 0; i < n_nodes; ++i) {
    const int32_t nid = nodes[i].nid;
    const size_t n_left = partition_builder_.GetNLeftElems(i);
    const size_t n_right = partition_builder_.GetNRightElems(i);
    CHECK_EQ((*p_tree)[nid].LeftChild() + 1, (*p_tree)[nid].RightChild());
    row_set_collection_.AddSplit(nid, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild(),
                                 n_left, n_right);
  }
}

template <typename GradientSumT>
template <bool any_missing>
void QuantileHistMaker::Builder<GradientSumT>::ApplySplit(
    const std::vector<CPUExpandEntry> nodes, const GHistIndexMatrix& gmat,
    const ColumnMatrix& column_matrix, RegTree* p_tree) {
  builder_monitor_.Start("ApplySplit");
  // 1. Find split condition for each split
  const size_t n_nodes = nodes.size();
  std::vector<int32_t> split_conditions;
  FindSplitConditions(nodes, *p_tree, gmat, &split_conditions);
  // 2.1 Create a blocked space of size SUM(samples in each node)
  common::BlockedSpace2d space(
      n_nodes,
      [&](size_t node_in_set) {
        int32_t nid = nodes[node_in_set].nid;
        return row_set_collection_[nid].Size();
      },
      kPartitionBlockSize);
  // 2.2 Initialize the partition builder
  // allocate buffers for storing intermediate results by each thread
  partition_builder_.Init(space.Size(), n_nodes, [&](size_t node_in_set) {
    const int32_t nid = nodes[node_in_set].nid;
    const size_t size = row_set_collection_[nid].Size();
    const size_t n_tasks = size / kPartitionBlockSize + !!(size % kPartitionBlockSize);
    return n_tasks;
  });
  // 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node
  // Store results in intermediate buffers from partition_builder_
  common::ParallelFor2d(space, this->nthread_, [&](size_t node_in_set, common::Range1d r) {
    size_t begin = r.begin();
    const int32_t nid = nodes[node_in_set].nid;
    const size_t task_id = partition_builder_.GetTaskIdx(node_in_set, begin);
    partition_builder_.AllocateForTask(task_id);
    switch (column_matrix.GetTypeSize()) {
      case common::kUint8BinsTypeSize:
        partition_builder_.template Partition<uint8_t, any_missing>(
            node_in_set, nid, r, split_conditions[node_in_set], column_matrix, *p_tree,
            row_set_collection_[nid].begin);
        break;
      case common::kUint16BinsTypeSize:
        partition_builder_.template Partition<uint16_t, any_missing>(
            node_in_set, nid, r, split_conditions[node_in_set], column_matrix, *p_tree,
            row_set_collection_[nid].begin);
        break;
      case common::kUint32BinsTypeSize:
        partition_builder_.template Partition<uint32_t, any_missing>(
            node_in_set, nid, r, split_conditions[node_in_set], column_matrix, *p_tree,
            row_set_collection_[nid].begin);
        break;
      default:
        CHECK(false);  // no default behavior
    }
  });
  // 3. Compute offsets to copy blocks of row-indexes
  // from partition_builder_ to row_set_collection_
  partition_builder_.CalculateRowOffsets();
  // 4. Copy elements back from partition_builder_ to row_set_collection_
  // with updated row-indexes for each tree-node
  common::ParallelFor2d(space, this->nthread_, [&](size_t node_in_set, common::Range1d r) {
    const int32_t nid = nodes[node_in_set].nid;
    partition_builder_.MergeToArray(node_in_set, r.begin(),
                                    const_cast<size_t*>(row_set_collection_[nid].begin));
  });
  // 5. Add info about splits into row_set_collection_
  AddSplitsToRowSet(nodes, p_tree);

  builder_monitor_.Stop("ApplySplit");
}

template struct QuantileHistMaker::Builder<float>;
template struct QuantileHistMaker::Builder<double>;

XGBOOST_REGISTER_TREE_UPDATER(FastHistMaker, "grow_fast_histmaker")
    .describe("(Deprecated, use grow_quantile_histmaker instead.)"
              " Grow tree using quantized histogram.")
    .set_body([](ObjInfo task) {
      LOG(WARNING) << "grow_fast_histmaker is deprecated, "
                   << "use grow_quantile_histmaker instead.";
      return new QuantileHistMaker(task);
    });

XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker")
    .describe("Grow tree using quantized histogram.")
    .set_body([](ObjInfo task) { return new QuantileHistMaker(task); });
}  // namespace tree
}  // namespace xgboost
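// Usage note (illustrative): the names registered above are what the `updater` training
// parameter refers to; to the best of our understanding, `tree_method=hist` resolves to
// "grow_quantile_histmaker" internally. A hypothetical direct configuration:
//
//   Args args{{"updater", "grow_quantile_histmaker"}, {"max_bin", "256"}};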