/**
 * Copyright 2014-2023, XGBoost Contributors
 * \file gblinear.cc
 * \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
 *        the update rule is parallel coordinate descent (shotgun)
 * \author Tianqi Chen
 */
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

#include "xgboost/gbm.h"
#include "xgboost/json.h"
#include "xgboost/predictor.h"
#include "xgboost/linear_updater.h"
#include "xgboost/logging.h"
#include "xgboost/learner.h"
#include "xgboost/linalg.h"

#include "gblinear_model.h"
#include "../common/timer.h"
#include "../common/common.h"
#include "../common/threading_utils.h"
#include "../common/error_msg.h"

namespace xgboost::gbm {

DMLC_REGISTRY_FILE_TAG(gblinear);

// training parameters
struct GBLinearTrainParam : public XGBoostParameter<GBLinearTrainParam> {
  std::string updater;
  float tolerance;
  size_t max_row_perbatch;

  void CheckGPUSupport() {
    auto n_gpus = common::AllVisibleGPUs();
    if (n_gpus == 0 && this->updater == "gpu_coord_descent") {
      common::AssertGPUSupport();
      this->UpdateAllowUnknown(Args{{"updater", "coord_descent"}});
      LOG(WARNING) << "Loading configuration on a CPU only machine. Changing "
                      "updater to `coord_descent`.";
    }
  }

  DMLC_DECLARE_PARAMETER(GBLinearTrainParam) {
    DMLC_DECLARE_FIELD(updater)
        .set_default("shotgun")
        .describe("Update algorithm for linear model. One of shotgun/coord_descent");
    DMLC_DECLARE_FIELD(tolerance)
        .set_lower_bound(0.0f)
        .set_default(0.0f)
        .describe("Stop if largest weight update is smaller than this number.");
    DMLC_DECLARE_FIELD(max_row_perbatch)
        .set_default(std::numeric_limits<size_t>::max())
        .describe("Maximum rows per batch.");
  }
};

void LinearCheckLayer(unsigned layer_begin) {
  CHECK_EQ(layer_begin, 0) << "Linear booster does not support prediction range.";
}

/*!
 * \brief gradient boosted linear model
 */
class GBLinear : public GradientBooster {
 public:
  explicit GBLinear(LearnerModelParam const* learner_model_param, Context const* ctx)
      : GradientBooster{ctx},
        learner_model_param_{learner_model_param},
        model_{learner_model_param},
        previous_model_{learner_model_param} {}

  void Configure(const Args& cfg) override {
    if (model_.weight.size() == 0) {
      model_.Configure(cfg);
    }
    param_.UpdateAllowUnknown(cfg);
    param_.CheckGPUSupport();

    if (param_.updater == "gpu_coord_descent") {
      LOG(WARNING) << error::DeprecatedFunc("gpu_coord_descent", "2.0.0",
                                            R"(device="cuda", updater="coord_descent")");
    }
    if (param_.updater == "coord_descent" && ctx_->IsCUDA()) {
      updater_.reset(LinearUpdater::Create("gpu_coord_descent", ctx_));
    } else {
      updater_.reset(LinearUpdater::Create(param_.updater, ctx_));
    }
    updater_->Configure(cfg);
    monitor_.Init("GBLinear");
  }

  int32_t BoostedRounds() const override { return model_.num_boosted_rounds; }
  bool ModelFitted() const override { return BoostedRounds() != 0; }

  void Load(dmlc::Stream* fi) override { model_.Load(fi); }
  void Save(dmlc::Stream* fo) const override { model_.Save(fo); }

  void SaveModel(Json* p_out) const override {
    auto& out = *p_out;
    out["name"] = String{"gblinear"};
    out["model"] = Object();
    auto& model = out["model"];
    model_.SaveModel(&model);
  }

  void LoadModel(Json const& in) override {
    CHECK_EQ(get<String>(in["name"]), "gblinear");
    auto const& model = in["model"];
    model_.LoadModel(model);
  }

  void LoadConfig(Json const& in) override {
    CHECK_EQ(get<String>(in["name"]), "gblinear");
    FromJson(in["gblinear_train_param"], &param_);
    param_.CheckGPUSupport();
    updater_.reset(LinearUpdater::Create(param_.updater, ctx_));
    this->updater_->LoadConfig(in["updater"]);
  }
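  // For reference, the configuration round-tripped by LoadConfig/SaveConfig has
  // the following JSON shape (a sketch inferred from the keys read and written
  // here, not a normative schema; the contents of "updater" are owned by the
  // concrete LinearUpdater implementation):
  //
  //   {
  //     "name": "gblinear",
  //     "gblinear_train_param": {"updater": "shotgun", "tolerance": "0", ...},
  //     "updater": {...}
  //   }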
out["name"] = String{"gblinear"}; out["gblinear_train_param"] = ToJson(param_); out["updater"] = Object(); auto& j_updater = out["updater"]; CHECK(this->updater_); this->updater_->SaveConfig(&j_updater); } void DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, PredictionCacheEntry*, ObjFunction const*) override { monitor_.Start("DoBoost"); model_.LazyInitModel(); this->LazySumWeights(p_fmat); if (!this->CheckConvergence()) { updater_->Update(in_gpair, p_fmat, &model_, sum_instance_weight_); } model_.num_boosted_rounds++; monitor_.Stop("DoBoost"); } void PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* predts, bool /*training*/, bst_layer_t layer_begin, bst_layer_t) override { monitor_.Start("PredictBatch"); LinearCheckLayer(layer_begin); auto* out_preds = &predts->predictions; this->PredictBatchInternal(p_fmat, &out_preds->HostVector()); monitor_.Stop("PredictBatch"); } // add base margin void PredictInstance(const SparsePage::Inst& inst, std::vector* out_preds, uint32_t layer_begin, uint32_t) override { LinearCheckLayer(layer_begin); const int ngroup = model_.learner_model_param->num_output_group; auto base_score = learner_model_param_->BaseScore(ctx_); for (int gid = 0; gid < ngroup; ++gid) { this->Pred(inst, dmlc::BeginPtr(*out_preds), gid, base_score(0)); } } void PredictLeaf(DMatrix *, HostDeviceVector *, unsigned, unsigned) override { LOG(FATAL) << "gblinear does not support prediction of leaf index"; } void PredictContribution(DMatrix* p_fmat, HostDeviceVector* out_contribs, bst_layer_t layer_begin, bst_layer_t /*layer_end*/, bool) override { model_.LazyInitModel(); LinearCheckLayer(layer_begin); auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId); const int ngroup = model_.learner_model_param->num_output_group; const size_t ncolumns = model_.learner_model_param->num_feature + 1; // allocate space for (#features + bias) times #groups times #rows std::vector& contribs = out_contribs->HostVector(); contribs.resize(p_fmat->Info().num_row_ * ncolumns * ngroup); // make sure contributions is zeroed, we could be reusing a previously allocated one std::fill(contribs.begin(), contribs.end(), 0); auto base_score = learner_model_param_->BaseScore(ctx_); // start collecting the contributions for (const auto &batch : p_fmat->GetBatches()) { // parallel over local batch const auto nsize = static_cast(batch.Size()); auto page = batch.GetView(); common::ParallelFor(nsize, ctx_->Threads(), [&](bst_omp_uint i) { auto inst = page[i]; auto row_idx = static_cast(batch.base_rowid + i); // loop over output groups for (int gid = 0; gid < ngroup; ++gid) { bst_float *p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns]; // calculate linear terms' contributions for (auto& ins : inst) { if (ins.index >= model_.learner_model_param->num_feature) continue; p_contribs[ins.index] = ins.fvalue * model_[ins.index][gid]; } // add base margin to BIAS p_contribs[ncolumns - 1] = model_.Bias()[gid] + ((base_margin.Size() != 0) ? 
  void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
                                       bst_layer_t layer_begin, bst_layer_t /*layer_end*/,
                                       bool) override {
    LinearCheckLayer(layer_begin);
    std::vector<float>& contribs = out_contribs->HostVector();

    // linear models have no interaction effects
    const size_t nelements =
        model_.learner_model_param->num_feature * model_.learner_model_param->num_feature;
    contribs.resize(p_fmat->Info().num_row_ * nelements *
                    model_.learner_model_param->num_output_group);
    std::fill(contribs.begin(), contribs.end(), 0);
  }

  std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
                                     std::string format) const override {
    return model_.DumpModel(fmap, with_stats, format);
  }

  void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
                    std::vector<bst_feature_t>* out_features,
                    std::vector<float>* out_scores) const override {
    CHECK(!model_.weight.empty()) << "Model is not initialized";
    CHECK(trees.empty()) << "gblinear doesn't support number of trees for feature importance.";
    CHECK_EQ(importance_type, "weight")
        << "gblinear only has `weight` defined for feature importance.";
    out_features->resize(this->learner_model_param_->num_feature, 0);
    std::iota(out_features->begin(), out_features->end(), 0);
    // Don't include the bias term in the feature importance scores.
    // The bias is the last weight.
    out_scores->resize(model_.weight.size() - learner_model_param_->num_output_group, 0);
    auto n_groups = learner_model_param_->num_output_group;
    linalg::TensorView<float, 2> scores{
        *out_scores, {learner_model_param_->num_feature, n_groups}, Context::kCpuId};
    for (size_t i = 0; i < learner_model_param_->num_feature; ++i) {
      for (bst_group_t g = 0; g < n_groups; ++g) {
        scores(i, g) = model_[i][g];
      }
    }
  }

  bool UseGPU() const override { return param_.updater == "gpu_coord_descent"; }

 protected:
  void PredictBatchInternal(DMatrix* p_fmat, std::vector<bst_float>* out_preds) {
    monitor_.Start("PredictBatchInternal");
    model_.LazyInitModel();
    std::vector<bst_float>& preds = *out_preds;
    auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId);
    // start collecting the prediction
    const int ngroup = model_.learner_model_param->num_output_group;
    preds.resize(p_fmat->Info().num_row_ * ngroup);
    auto base_score = learner_model_param_->BaseScore(Context::kCpuId);
    for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
      auto const& batch = page.GetView();
      // output convention: nrow * k, where nrow is the number of rows
      // and k is the number of groups
      // parallel over local batch
      const auto nsize = static_cast<omp_ulong>(batch.Size());
      if (base_margin.Size() != 0) {
        CHECK_EQ(base_margin.Size(), nsize * ngroup);
      }
      common::ParallelFor(nsize, ctx_->Threads(), [&](omp_ulong i) {
        const size_t ridx = page.base_rowid + i;
        // loop over output groups
        for (int gid = 0; gid < ngroup; ++gid) {
          float margin = (base_margin.Size() != 0) ? base_margin(ridx, gid) : base_score(0);
          this->Pred(batch[i], &preds[ridx * ngroup], gid, margin);
        }
      });
    }
    monitor_.Stop("PredictBatchInternal");
  }
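  // Convergence test used by DoBoost (a restatement of the loop below, not an
  // additional rule): with tolerance eps > 0, boosting stops updating once
  //
  //   max_j |w_j^{(t)} - w_j^{(t-1)}| <= eps
  //
  // where the max runs over every entry of the weight vector, including the
  // bias terms stored at its end. With the default eps = 0 the check is
  // disabled and every round performs an update.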
  bool CheckConvergence() {
    if (param_.tolerance == 0.0f) return false;
    if (is_converged_) return true;
    if (previous_model_.weight.size() != model_.weight.size()) {
      previous_model_ = model_;
      return false;
    }
    float largest_dw = 0.0;
    for (size_t i = 0; i < model_.weight.size(); i++) {
      largest_dw =
          std::max(largest_dw, std::abs(model_.weight[i] - previous_model_.weight[i]));
    }
    previous_model_ = model_;

    is_converged_ = largest_dw <= param_.tolerance;
    return is_converged_;
  }

  void LazySumWeights(DMatrix* p_fmat) {
    if (!sum_weight_complete_) {
      auto& info = p_fmat->Info();
      for (size_t i = 0; i < info.num_row_; i++) {
        sum_instance_weight_ += info.GetWeight(i);
      }
      sum_weight_complete_ = true;
    }
  }

  void Pred(const SparsePage::Inst& inst, bst_float* preds, int gid, bst_float base) {
    bst_float psum = model_.Bias()[gid] + base;
    for (const auto& ins : inst) {
      if (ins.index >= model_.learner_model_param->num_feature) continue;
      psum += ins.fvalue * model_[ins.index][gid];
    }
    preds[gid] = psum;
  }

  // base margin score
  LearnerModelParam const* learner_model_param_;
  // model field
  GBLinearModel model_;
  GBLinearModel previous_model_;
  GBLinearTrainParam param_;
  std::unique_ptr<LinearUpdater> updater_;
  double sum_instance_weight_{};
  bool sum_weight_complete_{false};
  common::Monitor monitor_;
  bool is_converged_{false};
};

// register the training parameter and the booster
DMLC_REGISTER_PARAMETER(GBLinearTrainParam);

XGBOOST_REGISTER_GBM(GBLinear, "gblinear")
    .describe("Linear booster, implements a generalized linear model.")
    .set_body([](LearnerModelParam const* booster_config, Context const* ctx) {
      return new GBLinear(booster_config, ctx);
    });
}  // namespace xgboost::gbm
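// Usage sketch (illustrative only; it goes through the public Python API rather
// than this class directly). The booster registered above is selected with the
// `booster` parameter, and `updater` / `tolerance` map to the fields declared
// in GBLinearTrainParam. The input file name is a hypothetical placeholder.
//
//   import xgboost as xgb
//   dtrain = xgb.DMatrix("train.libsvm")  # hypothetical input file
//   params = {"booster": "gblinear", "updater": "coord_descent",
//             "tolerance": 1e-4}
//   bst = xgb.train(params, dtrain, num_boost_round=100)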