Improved gpu_hist_experimental algorithm (#2866)
- Implement colsampling, subsampling for gpu_hist_experimental - Optimised multi-GPU implementation for gpu_hist_experimental - Make nccl optional - Add Volta architecture flag - Optimise RegLossObj - Add timing utilities for debug verbose mode - Bump required cuda version to 8.0
This commit is contained in:
@@ -65,9 +65,9 @@ class SoftmaxMultiClassObj : public ObjFunction {
|
||||
bst_float p = rec[k];
|
||||
const bst_float h = 2.0f * p * (1.0f - p) * wt;
|
||||
if (label == k) {
|
||||
out_gpair->at(i * nclass + k) = bst_gpair((p - 1.0f) * wt, h);
|
||||
(*out_gpair)[i * nclass + k] = bst_gpair((p - 1.0f) * wt, h);
|
||||
} else {
|
||||
out_gpair->at(i * nclass + k) = bst_gpair(p* wt, h);
|
||||
(*out_gpair)[i * nclass + k] = bst_gpair(p* wt, h);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*!
|
||||
* Copyright 2015 by Contributors
|
||||
* \file regression.cc
|
||||
* \file regression_obj.cc
|
||||
* \brief Definition of single-value regression and classification objectives.
|
||||
* \author Tianqi Chen, Kailong Chen
|
||||
*/
|
||||
@@ -68,53 +68,61 @@ struct LogisticRaw : public LogisticRegression {
|
||||
|
||||
struct RegLossParam : public dmlc::Parameter<RegLossParam> {
|
||||
float scale_pos_weight;
|
||||
int nthread;
|
||||
// declare parameters
|
||||
DMLC_DECLARE_PARAMETER(RegLossParam) {
|
||||
DMLC_DECLARE_FIELD(scale_pos_weight).set_default(1.0f).set_lower_bound(0.0f)
|
||||
.describe("Scale the weight of positive examples by this factor");
|
||||
DMLC_DECLARE_FIELD(nthread).set_default(0).describe(
|
||||
"Number of threads to use.");
|
||||
}
|
||||
};
|
||||
|
||||
// regression loss function
|
||||
template<typename Loss>
|
||||
template <typename Loss>
|
||||
class RegLossObj : public ObjFunction {
|
||||
public:
|
||||
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
|
||||
RegLossObj() : labels_checked(false) {}
|
||||
|
||||
void Configure(
|
||||
const std::vector<std::pair<std::string, std::string> > &args) override {
|
||||
param_.InitAllowUnknown(args);
|
||||
}
|
||||
void GetGradient(const std::vector<bst_float> &preds,
|
||||
const MetaInfo &info,
|
||||
int iter,
|
||||
std::vector<bst_gpair> *out_gpair) override {
|
||||
void GetGradient(const std::vector<bst_float> &preds, const MetaInfo &info,
|
||||
int iter, std::vector<bst_gpair> *out_gpair) override {
|
||||
CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
|
||||
CHECK_EQ(preds.size(), info.labels.size())
|
||||
<< "labels are not correctly provided"
|
||||
<< "preds.size=" << preds.size() << ", label.size=" << info.labels.size();
|
||||
<< "preds.size=" << preds.size()
|
||||
<< ", label.size=" << info.labels.size();
|
||||
|
||||
this->LazyCheckLabels(info.labels);
|
||||
out_gpair->resize(preds.size());
|
||||
// check if label in range
|
||||
bool label_correct = true;
|
||||
|
||||
// start calculating gradient
|
||||
const omp_ulong ndata = static_cast<omp_ulong>(preds.size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
int nthread = param_.nthread == 0 ? omp_get_num_procs() : param_.nthread;
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (omp_ulong i = 0; i < ndata; ++i) {
|
||||
auto y = info.labels[i];
|
||||
bst_float p = Loss::PredTransform(preds[i]);
|
||||
bst_float w = info.GetWeight(i);
|
||||
if (info.labels[i] == 1.0f) w *= param_.scale_pos_weight;
|
||||
if (!Loss::CheckLabel(info.labels[i])) label_correct = false;
|
||||
out_gpair->at(i) = bst_gpair(Loss::FirstOrderGradient(p, info.labels[i]) * w,
|
||||
Loss::SecondOrderGradient(p, info.labels[i]) * w);
|
||||
}
|
||||
if (!label_correct) {
|
||||
LOG(FATAL) << Loss::LabelErrorMsg();
|
||||
// Branchless version of the below function
|
||||
// The branch is particularly slow as the cpu cannot predict the label
|
||||
// with any accuracy resulting in frequent pipeline stalls
|
||||
// if (info.labels[i] == 1.0f) w *= param_.scale_pos_weight;
|
||||
w += y * ((param_.scale_pos_weight * w) - w);
|
||||
(*out_gpair)[i] = bst_gpair(Loss::FirstOrderGradient(p, y) * w,
|
||||
Loss::SecondOrderGradient(p, y) * w);
|
||||
}
|
||||
}
|
||||
const char* DefaultEvalMetric() const override {
|
||||
const char *DefaultEvalMetric() const override {
|
||||
return Loss::DefaultEvalMetric();
|
||||
}
|
||||
void PredTransform(std::vector<bst_float> *io_preds) override {
|
||||
std::vector<bst_float> &preds = *io_preds;
|
||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
||||
preds[j] = Loss::PredTransform(preds[j]);
|
||||
}
|
||||
@@ -124,7 +132,15 @@ class RegLossObj : public ObjFunction {
|
||||
}
|
||||
|
||||
protected:
|
||||
void LazyCheckLabels(const std::vector<float> &labels) {
|
||||
if (labels_checked) return;
|
||||
for (auto &y : labels) {
|
||||
CHECK(Loss::CheckLabel(y)) << Loss::LabelErrorMsg();
|
||||
}
|
||||
labels_checked = true;
|
||||
}
|
||||
RegLossParam param_;
|
||||
bool labels_checked;
|
||||
};
|
||||
|
||||
// register the objective functions
|
||||
@@ -149,10 +165,13 @@ XGBOOST_REGISTER_OBJECTIVE(LogisticRaw, "binary:logitraw")
|
||||
// declare parameter
|
||||
struct PoissonRegressionParam : public dmlc::Parameter<PoissonRegressionParam> {
|
||||
float max_delta_step;
|
||||
int nthread;
|
||||
DMLC_DECLARE_PARAMETER(PoissonRegressionParam) {
|
||||
DMLC_DECLARE_FIELD(max_delta_step).set_lower_bound(0.0f).set_default(0.7f)
|
||||
.describe("Maximum delta step we allow each weight estimation to be." \
|
||||
" This parameter is required for possion regression.");
|
||||
DMLC_DECLARE_FIELD(nthread).set_default(0).describe(
|
||||
"Number of threads to use.");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -175,13 +194,14 @@ class PoissonRegression : public ObjFunction {
|
||||
bool label_correct = true;
|
||||
// start calculating gradient
|
||||
const omp_ulong ndata = static_cast<omp_ulong>(preds.size()); // NOLINT(*)
|
||||
#pragma omp parallel for schedule(static)
|
||||
int nthread = param_.nthread == 0 ? omp_get_num_procs() : param_.nthread;
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
|
||||
bst_float p = preds[i];
|
||||
bst_float w = info.GetWeight(i);
|
||||
bst_float y = info.labels[i];
|
||||
if (y >= 0.0f) {
|
||||
out_gpair->at(i) = bst_gpair((std::exp(p) - y) * w,
|
||||
(*out_gpair)[i] = bst_gpair((std::exp(p) - y) * w,
|
||||
std::exp(p + param_.max_delta_step) * w);
|
||||
} else {
|
||||
label_correct = false;
|
||||
@@ -192,7 +212,8 @@ class PoissonRegression : public ObjFunction {
|
||||
void PredTransform(std::vector<bst_float> *io_preds) override {
|
||||
std::vector<bst_float> &preds = *io_preds;
|
||||
const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
|
||||
#pragma omp parallel for schedule(static)
|
||||
int nthread = param_.nthread == 0 ? omp_get_num_procs() : param_.nthread;
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (long j = 0; j < ndata; ++j) { // NOLINT(*)
|
||||
preds[j] = std::exp(preds[j]);
|
||||
}
|
||||
@@ -242,7 +263,7 @@ class GammaRegression : public ObjFunction {
|
||||
bst_float w = info.GetWeight(i);
|
||||
bst_float y = info.labels[i];
|
||||
if (y >= 0.0f) {
|
||||
out_gpair->at(i) = bst_gpair((1 - y / std::exp(p)) * w, y / std::exp(p) * w);
|
||||
(*out_gpair)[i] = bst_gpair((1 - y / std::exp(p)) * w, y / std::exp(p) * w);
|
||||
} else {
|
||||
label_correct = false;
|
||||
}
|
||||
@@ -276,9 +297,12 @@ XGBOOST_REGISTER_OBJECTIVE(GammaRegression, "reg:gamma")
|
||||
// declare parameter
|
||||
struct TweedieRegressionParam : public dmlc::Parameter<TweedieRegressionParam> {
|
||||
float tweedie_variance_power;
|
||||
int nthread;
|
||||
DMLC_DECLARE_PARAMETER(TweedieRegressionParam) {
|
||||
DMLC_DECLARE_FIELD(tweedie_variance_power).set_range(1.0f, 2.0f).set_default(1.5f)
|
||||
.describe("Tweedie variance power. Must be between in range [1, 2).");
|
||||
DMLC_DECLARE_FIELD(nthread).set_default(0).describe(
|
||||
"Number of threads to use.");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -301,7 +325,8 @@ class TweedieRegression : public ObjFunction {
|
||||
bool label_correct = true;
|
||||
// start calculating gradient
|
||||
const omp_ulong ndata = static_cast<omp_ulong>(preds.size()); // NOLINT(*)
|
||||
#pragma omp parallel for schedule(static)
|
||||
int nthread = param_.nthread == 0 ? omp_get_num_procs() : param_.nthread;
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
|
||||
bst_float p = preds[i];
|
||||
bst_float w = info.GetWeight(i);
|
||||
@@ -311,7 +336,7 @@ class TweedieRegression : public ObjFunction {
|
||||
bst_float grad = -y * std::exp((1 - rho) * p) + std::exp((2 - rho) * p);
|
||||
bst_float hess = -y * (1 - rho) * \
|
||||
std::exp((1 - rho) * p) + (2 - rho) * std::exp((2 - rho) * p);
|
||||
out_gpair->at(i) = bst_gpair(grad * w, hess * w);
|
||||
(*out_gpair)[i] = bst_gpair(grad * w, hess * w);
|
||||
} else {
|
||||
label_correct = false;
|
||||
}
|
||||
@@ -321,7 +346,8 @@ class TweedieRegression : public ObjFunction {
|
||||
void PredTransform(std::vector<bst_float> *io_preds) override {
|
||||
std::vector<bst_float> &preds = *io_preds;
|
||||
const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
|
||||
#pragma omp parallel for schedule(static)
|
||||
int nthread = param_.nthread == 0 ? omp_get_num_procs() : param_.nthread;
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (long j = 0; j < ndata; ++j) { // NOLINT(*)
|
||||
preds[j] = std::exp(preds[j]);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user