Add warnings for large labels when using GPU histogram algorithms (#2834)
parent 13e7a2cff0
commit d9d5293cdb
@@ -12,13 +12,13 @@ Specify the 'tree_method' parameter as one of the following algorithms.
 
 ### Algorithms
 
 ```eval_rst
-+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
++--------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | tree_method  | Description |
-+==============+===============================================================================================================================================+
++==============+=================================================================================================================================================================================================================+
 | gpu_exact    | The standard XGBoost tree construction algorithm. Performs exact search for splits. Slower and uses considerably more memory than 'gpu_hist' |
-+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
-| gpu_hist     | Equivalent to the XGBoost fast histogram algorithm. Faster and uses considerably less memory. Splits may be less accurate. |
-+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
++--------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| gpu_hist     | Equivalent to the XGBoost fast histogram algorithm. Much faster and uses considerably less memory. NOTE: Cannot be used with labels larger in magnitude than 2^16 due to its histogram aggregation algorithm. |
++--------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 ```
 
 ### Supported parameters
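The new NOTE above exists because 'gpu_hist' now aggregates gradients in integer (fixed-point) form, as the code changes further down show. Below is a minimal, hypothetical pre-processing sketch of how a caller might bring out-of-range regression labels under the 2^16 bound; the helper name and the 0.5 headroom factor are illustrative assumptions, not part of XGBoost.

```cpp
// Hypothetical pre-processing sketch (not part of XGBoost): rescale regression
// labels so their magnitude stays below 2^16 before training with 'gpu_hist'.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

std::vector<float> rescale_labels(const std::vector<float>& labels) {
  float abs_max = 0.0f;
  for (float y : labels) abs_max = std::max(abs_max, std::abs(y));
  const float limit = 65536.0f;                  // 2^16, the documented bound
  if (abs_max < limit) return labels;            // already in range, leave untouched
  const float scale = (limit * 0.5f) / abs_max;  // illustrative headroom factor
  std::vector<float> out(labels.size());
  std::transform(labels.begin(), labels.end(), out.begin(),
                 [scale](float y) { return y * scale; });
  return out;  // remember to divide predictions by 'scale' afterwards
}

int main() {
  std::vector<float> labels = {1.0e7f, -3.0e6f, 2.5e5f};
  for (float y : rescale_labels(labels)) std::printf("%g\n", y);
  return 0;
}
```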
@@ -171,19 +171,19 @@ class bst_gpair_internal {
 
 template<>
 inline XGBOOST_DEVICE float bst_gpair_internal<int64_t>::GetGrad() const {
-  return grad_ * 1e-5f;
+  return grad_ * 1e-4f;
 }
 template<>
 inline XGBOOST_DEVICE float bst_gpair_internal<int64_t>::GetHess() const {
-  return hess_ * 1e-5f;
+  return hess_ * 1e-4f;
 }
 template<>
 inline XGBOOST_DEVICE void bst_gpair_internal<int64_t>::SetGrad(float g) {
-  grad_ = static_cast<int64_t>(std::round(g * 1e5));
+  grad_ = static_cast<int64_t>(std::round(g * 1e4));
 }
 template<>
 inline XGBOOST_DEVICE void bst_gpair_internal<int64_t>::SetHess(float h) {
-  hess_ = static_cast<int64_t>(std::round(h * 1e5));
+  hess_ = static_cast<int64_t>(std::round(h * 1e4));
 }
 
 }  // namespace detail
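For context, the accessors above implement a fixed-point encoding whose scale this commit lowers from 1e5 to 1e4, trading one decimal digit of resolution for extra accumulation headroom. A standalone host-only sketch of the same round trip, assuming nothing beyond the scale factors visible in the diff:

```cpp
// Host-only sketch of the fixed-point round trip implied by the accessors above:
// encode with the new 1e4 scale, decode with 1e-4f.
#include <cmath>
#include <cstdint>
#include <cstdio>

int64_t encode(float g) { return static_cast<int64_t>(std::round(g * 1e4)); }
float decode(int64_t v) { return v * 1e-4f; }

int main() {
  float g = 0.123456f;
  int64_t fixed = encode(g);  // 1235: values are quantized to steps of 1e-4
  std::printf("%.6f -> %lld -> %.6f\n", g, static_cast<long long>(fixed),
              decode(fixed));
  // Adding many such integers is exact, which is what the GPU histogram
  // accumulation relies on.
  return 0;
}
```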
@@ -194,10 +194,10 @@ typedef detail::bst_gpair_internal<float> bst_gpair;
 /*! \brief High precision gradient statistics pair */
 typedef detail::bst_gpair_internal<double> bst_gpair_precise;
 
 /*! \brief High precision gradient statistics pair with integer backed
  * storage. Operators are associative where floating point versions are not
  * associative. */
 typedef detail::bst_gpair_internal<int64_t> bst_gpair_integer;
 
 /*! \brief small eps gap for minimum split decision. */
 const bst_float rt_eps = 1e-6f;
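The associativity remark above is the reason for the integer-backed pair: a parallel reduction can add contributions in any order and still produce an identical result. A small host-only illustration, reusing the 1e4 fixed-point scale from the previous hunk; the test values are mine, chosen to sit at the documented 2^16 bound:

```cpp
// Float sums depend on grouping; 1e4 fixed-point integer sums do not
// (beyond the quantization step).
#include <cmath>
#include <cstdint>
#include <cstdio>

int64_t encode(float g) { return static_cast<int64_t>(std::round(g * 1e4)); }
float decode(int64_t v) { return v * 1e-4f; }

int main() {
  float big = 65536.0f, tiny = 0.0001f;
  // The float result changes with grouping: 0.0001 vs 0.
  std::printf("float:   %g vs %g\n", (big + (-big)) + tiny, big + ((-big) + tiny));
  // Fixed-point integers give the same answer either way.
  int64_t l = (encode(big) + encode(-big)) + encode(tiny);
  int64_t r = encode(big) + (encode(-big) + encode(tiny));
  std::printf("integer: %g vs %g\n", decode(l), decode(r));
  return 0;
}
```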
@@ -15,6 +15,27 @@
 namespace xgboost {
 namespace tree {
 
+/**
+ * \fn void CheckGradientMax(const dh::dvec<bst_gpair>& gpair)
+ *
+ * \brief Check maximum gradient value is below 2^16. This is to prevent
+ * overflow when using integer gradient summation.
+ */
+
+inline void CheckGradientMax(const dh::dvec<bst_gpair>& gpair) {
+  auto dptr = thrust::device_ptr<const float>(
+      reinterpret_cast<const float*>(gpair.data()));
+  float abs_max = thrust::reduce(dptr, dptr + (gpair.size() * 2), 0.f,
+                                 [=] __device__(float a, float b) {
+                                   a = abs(a);
+                                   b = abs(b);
+                                   return max(a, b);
+                                 });
+
+  CHECK_LT(abs_max, std::pow(2.0f, 16.0f))
+      << "Labels are too large for this algorithm. Rescale to less than 2^16.";
+}
+
 struct GPUTrainingParam {
   // minimum amount of hessian(weight) allowed in a child
   float min_child_weight;
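A rough sanity check on the 2^16 bound (my own arithmetic, not part of the commit): a gradient at the bound encodes to 65536 * 1e4, roughly 6.6e8, so an int64 accumulator with a maximum of about 9.2e18 can absorb on the order of 1.4e10 such worst-case contributions before overflowing:

```cpp
// Back-of-the-envelope headroom check for the integer gradient summation.
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  const double scale = 1e4;         // fixed-point scale from the earlier hunk
  const double max_grad = 65536.0;  // 2^16, the bound CheckGradientMax enforces
  const double per_element = max_grad * scale;  // ~6.6e8 per encoded gradient
  const double capacity =
      static_cast<double>(std::numeric_limits<int64_t>::max());  // ~9.2e18
  std::printf("worst-case gradients before overflow: ~%.1e\n",
              capacity / per_element);  // ~1.4e10
  return 0;
}
```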
@@ -64,7 +85,7 @@ struct DeviceSplitCandidate {
       : loss_chg(-FLT_MAX), dir(LeftDir), fvalue(0), findex(-1) {}
 
   template <typename param_t>
-  __host__ __device__ void Update(const DeviceSplitCandidate &other,
+  __host__ __device__ void Update(const DeviceSplitCandidate& other,
                                   const param_t& param) {
     if (other.loss_chg > loss_chg &&
         other.left_sum.GetHess() >= param.min_child_weight &&
@@ -170,8 +191,10 @@ struct SumCallbackOp {
 };
 
 template <typename gpair_t>
-__device__ inline float device_calc_loss_chg(
-    const GPUTrainingParam& param, const gpair_t& left, const gpair_t& parent_sum, const float& parent_gain) {
+__device__ inline float device_calc_loss_chg(const GPUTrainingParam& param,
+                                              const gpair_t& left,
+                                              const gpair_t& parent_sum,
+                                              const float& parent_gain) {
   gpair_t right = parent_sum - left;
   float left_gain = CalcGain(param, left.GetGrad(), left.GetHess());
   float right_gain = CalcGain(param, right.GetGrad(), right.GetHess());
@@ -187,8 +210,8 @@ __device__ float inline loss_chg_missing(const gpair_t& scan,
                                          bool& missing_left_out) {  // NOLINT
   float missing_left_loss =
       device_calc_loss_chg(param, scan + missing, parent_sum, parent_gain);
-  float missing_right_loss = device_calc_loss_chg(
-      param, scan, parent_sum, parent_gain);
+  float missing_right_loss =
+      device_calc_loss_chg(param, scan, parent_sum, parent_gain);
 
   if (missing_left_loss >= missing_right_loss) {
     missing_left_out = true;
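The two hunks above only re-wrap signatures, but they touch the missing-value handling the updater relies on: loss_chg_missing evaluates the split twice, once with the missing-value statistics sent left and once sent right, and keeps the better direction. A host-side sketch of that decision follows; since CalcGain's formula and device_calc_loss_chg's return statement are not shown in this diff, the standard G^2/(H + lambda) gain and the usual left_gain + right_gain - parent_gain combination are assumed purely for illustration.

```cpp
// Host-side sketch of the missing-value direction choice made above.
#include <cstdio>

struct Pair { float grad, hess; };
Pair operator+(Pair a, Pair b) { return {a.grad + b.grad, a.hess + b.hess}; }
Pair operator-(Pair a, Pair b) { return {a.grad - b.grad, a.hess - b.hess}; }

// Assumed gain formula for illustration only; the real CalcGain is elsewhere.
float CalcGainApprox(Pair p, float lambda) {
  return p.grad * p.grad / (p.hess + lambda);
}

float LossChgMissing(Pair scan, Pair missing, Pair parent_sum, float parent_gain,
                     float lambda, bool* missing_left_out) {
  auto loss_chg = [&](Pair left) {
    Pair right = parent_sum - left;
    return CalcGainApprox(left, lambda) + CalcGainApprox(right, lambda) -
           parent_gain;
  };
  float missing_left_loss = loss_chg(scan + missing);  // missing rows go left
  float missing_right_loss = loss_chg(scan);           // missing rows go right
  *missing_left_out = missing_left_loss >= missing_right_loss;
  return *missing_left_out ? missing_left_loss : missing_right_loss;
}

int main() {
  Pair parent{10.f, 20.f}, scan{8.f, 6.f}, missing{1.f, 2.f};
  const float lambda = 1.0f;
  float parent_gain = CalcGainApprox(parent, lambda);
  bool missing_left = false;
  float chg = LossChgMissing(scan, missing, parent, parent_gain, lambda,
                             &missing_left);
  std::printf("loss_chg=%.3f, missing goes %s\n", chg,
              missing_left ? "left" : "right");
  return 0;
}
```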
@@ -537,6 +537,9 @@ class GPUHistMaker : public TreeUpdater {
       device_gpair[d_idx].copy(gpair.begin() + device_row_segments[d_idx],
                                gpair.begin() + device_row_segments[d_idx + 1]);
 
+      // Check gradients are within acceptable size range
+      CheckGradientMax(device_gpair[d_idx]);
+
       subsample_gpair(&device_gpair[d_idx], param.subsample,
                       device_row_segments[d_idx]);
 
@@ -334,6 +334,8 @@ struct DeviceShard {
     ridx_segments.front() = std::make_pair(0, ridx.size());
     this->gpair.copy(host_gpair.begin() + row_start_idx,
                      host_gpair.begin() + row_end_idx);
+    // Check gradients are within acceptable size range
+    CheckGradientMax(gpair);
     hist.Reset();
   }
 