Use matrix for gradient. (#9508)

- Use `linalg::Matrix` for storing gradients.
- New API for the custom objective.
- Custom objective for multi-class/multi-target is now required to return the correct shape.
- Custom objectives for Python can accept arrays with any strides (row-major or column-major).
This commit is contained in:
Jiaming Yuan
2023-08-24 05:29:52 +08:00
committed by GitHub
parent 6103dca0bb
commit 972730cde0
77 changed files with 1052 additions and 651 deletions

View File

@@ -274,8 +274,8 @@ class GradientPairInt64 {
GradientPairInt64(GradientPairInt64 const &g) = default;
GradientPairInt64 &operator=(GradientPairInt64 const &g) = default;
XGBOOST_DEVICE [[nodiscard]] T GetQuantisedGrad() const { return grad_; }
XGBOOST_DEVICE [[nodiscard]] T GetQuantisedHess() const { return hess_; }
[[nodiscard]] XGBOOST_DEVICE T GetQuantisedGrad() const { return grad_; }
[[nodiscard]] XGBOOST_DEVICE T GetQuantisedHess() const { return hess_; }
XGBOOST_DEVICE GradientPairInt64 &operator+=(const GradientPairInt64 &rhs) {
grad_ += rhs.grad_;

View File

@@ -789,16 +789,14 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
* \param out The address to hold number of rows.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle,
bst_ulong *out);
XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle, bst_ulong *out);
/*!
* \brief get number of columns
* \param handle the handle to the DMatrix
* \param out The output of number of columns
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle,
bst_ulong *out);
XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, bst_ulong *out);
/*!
* \brief Get number of valid values from DMatrix.
@@ -945,21 +943,30 @@ XGB_DLL int XGBoosterUpdateOneIter(BoosterHandle handle, int iter, DMatrixHandle
* @example c-api-demo.c
*/
/*!
* \brief update the model, by directly specify gradient and second order gradient,
* this can be used to replace UpdateOneIter, to support customized loss function
* \param handle handle
* \param dtrain training data
* \param grad gradient statistics
* \param hess second order gradient statistics
* \param len length of grad/hess array
* \return 0 when success, -1 when failure happens
/**
* @deprecated since 2.1.0
*/
XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle,
DMatrixHandle dtrain,
float *grad,
float *hess,
bst_ulong len);
XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, float *grad,
float *hess, bst_ulong len);
/**
* @brief Update a model with gradient and Hessian. This is used for training with a
* custom objective function.
*
* @since 2.0.0
*
* @param handle handle
* @param dtrain The training data.
* @param iter The current iteration round. When training continuation is used, the count
* should restart.
* @param grad Json encoded __(cuda)_array_interface__ for gradient.
* @param hess Json encoded __(cuda)_array_interface__ for Hessian.
*
* @return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, int iter,
char const *grad, char const *hess);
/*!
* \brief get evaluation statistics for xgboost
* \param handle handle

View File

@@ -70,22 +70,25 @@ class GradientBooster : public Model, public Configurable {
GradientBooster* /*out*/, bool* /*out_of_bound*/) const {
LOG(FATAL) << "Slice is not supported by the current booster.";
}
/*! \brief Return number of boosted rounds.
/**
* @brief Return number of boosted rounds.
*/
virtual int32_t BoostedRounds() const = 0;
[[nodiscard]] virtual std::int32_t BoostedRounds() const = 0;
/**
* \brief Whether the model has already been trained. When tree booster is chosen, then
* returns true when there are existing trees.
*/
virtual bool ModelFitted() const = 0;
/*!
* \brief perform update to the model(boosting)
* \param p_fmat feature matrix that provide access to features
* \param in_gpair address of the gradient pair statistics of the data
* \param prediction The output prediction cache entry that needs to be updated.
* the booster may change content of gpair
[[nodiscard]] virtual bool ModelFitted() const = 0;
/**
* @brief perform update to the model(boosting)
*
* @param p_fmat feature matrix that provide access to features
* @param in_gpair address of the gradient pair statistics of the data
* @param prediction The output prediction cache entry that needs to be updated.
* the booster may change content of gpair
* @param obj The objective function used for boosting.
*/
virtual void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
virtual void DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
PredictionCacheEntry*, ObjFunction const* obj) = 0;
/**
@@ -165,18 +168,17 @@ class GradientBooster : public Model, public Configurable {
* \param format the format to dump the model in
* \return a vector of dump for boosters.
*/
virtual std::vector<std::string> DumpModel(const FeatureMap& fmap,
bool with_stats,
std::string format) const = 0;
[[nodiscard]] virtual std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const = 0;
virtual void FeatureScore(std::string const& importance_type,
common::Span<int32_t const> trees,
std::vector<bst_feature_t>* features,
std::vector<float>* scores) const = 0;
/*!
* \brief Whether the current booster uses GPU.
/**
* @brief Whether the current booster uses GPU.
*/
virtual bool UseGPU() const = 0;
[[nodiscard]] virtual bool UseGPU() const = 0;
/*!
* \brief create a gradient booster from given name
* \param name name of gradient booster

View File

@@ -76,17 +76,18 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
* \param iter current iteration number
* \param train reference to the data matrix.
*/
virtual void UpdateOneIter(int iter, std::shared_ptr<DMatrix> train) = 0;
/*!
* \brief Do customized gradient boosting with in_gpair.
* in_gair can be mutated after this call.
* \param iter current iteration number
* \param train reference to the data matrix.
* \param in_gpair The input gradient statistics.
virtual void UpdateOneIter(std::int32_t iter, std::shared_ptr<DMatrix> train) = 0;
/**
* @brief Do customized gradient boosting with in_gpair.
*
* @note in_gpair can be mutated after this call.
*
* @param iter current iteration number
* @param train reference to the data matrix.
* @param in_gpair The input gradient statistics.
*/
virtual void BoostOneIter(int iter,
std::shared_ptr<DMatrix> train,
HostDeviceVector<GradientPair>* in_gpair) = 0;
virtual void BoostOneIter(std::int32_t iter, std::shared_ptr<DMatrix> train,
linalg::Matrix<GradientPair>* in_gpair) = 0;
/*!
* \brief evaluate the model for specific iteration using the configured metrics.
* \param iter iteration number

View File

@@ -292,7 +292,7 @@ enum Order : std::uint8_t {
template <typename T, int32_t kDim>
class TensorView {
public:
using ShapeT = size_t[kDim];
using ShapeT = std::size_t[kDim];
using StrideT = ShapeT;
private:
@@ -400,10 +400,14 @@ class TensorView {
* \param shape shape of the tensor
* \param device Device ordinal
*/
template <typename I, int32_t D>
template <typename I, std::int32_t D>
LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], std::int32_t device)
: TensorView{data, shape, device, Order::kC} {}
template <typename I, std::int32_t D>
LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], DeviceOrd device)
: TensorView{data, shape, device.ordinal, Order::kC} {}
template <typename I, int32_t D>
LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], std::int32_t device, Order order)
: data_{data}, ptr_{data_.data()}, device_{device} {
@@ -446,6 +450,10 @@ class TensorView {
});
this->CalcSize();
}
template <typename I, std::int32_t D>
LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], I const (&stride)[D],
DeviceOrd device)
: TensorView{data, shape, stride, device.ordinal} {}
template <
typename U,
@@ -741,7 +749,7 @@ auto ArrayInterfaceStr(TensorView<T, D> const &t) {
template <typename T, int32_t kDim = 5>
class Tensor {
public:
using ShapeT = size_t[kDim];
using ShapeT = std::size_t[kDim];
using StrideT = ShapeT;
private:
@@ -775,6 +783,9 @@ class Tensor {
template <typename I, int32_t D>
explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC)
: Tensor{common::Span<I const, D>{shape}, device, order} {}
template <typename I, int32_t D>
explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC)
: Tensor{common::Span<I const, D>{shape}, device.ordinal, order} {}
template <typename I, size_t D>
explicit Tensor(common::Span<I const, D> shape, std::int32_t device, Order order = kC)
@@ -814,6 +825,10 @@ class Tensor {
// shape
this->Initialize(shape, device);
}
template <typename I, int32_t D>
explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
Order order = kC)
: Tensor{data, shape, device.ordinal, order} {}
/**
* \brief Index operator. Not thread safe, should not be used in performance critical
* region. For more efficient indexing, consider getting a view first.
@@ -832,9 +847,9 @@ class Tensor {
}
/**
* \brief Get a \ref TensorView for this tensor.
* @brief Get a @ref TensorView for this tensor.
*/
TensorView<T, kDim> View(int32_t device) {
TensorView<T, kDim> View(std::int32_t device) {
if (device >= 0) {
data_.SetDevice(device);
auto span = data_.DeviceSpan();
@@ -844,7 +859,7 @@ class Tensor {
return {span, shape_, device, order_};
}
}
TensorView<T const, kDim> View(int32_t device) const {
TensorView<T const, kDim> View(std::int32_t device) const {
if (device >= 0) {
data_.SetDevice(device);
auto span = data_.ConstDeviceSpan();
@@ -854,6 +869,26 @@ class Tensor {
return {span, shape_, device, order_};
}
}
auto View(DeviceOrd device) {
if (device.IsCUDA()) {
data_.SetDevice(device);
auto span = data_.DeviceSpan();
return TensorView<T, kDim>{span, shape_, device.ordinal, order_};
} else {
auto span = data_.HostSpan();
return TensorView<T, kDim>{span, shape_, device.ordinal, order_};
}
}
auto View(DeviceOrd device) const {
if (device.IsCUDA()) {
data_.SetDevice(device);
auto span = data_.ConstDeviceSpan();
return TensorView<T const, kDim>{span, shape_, device.ordinal, order_};
} else {
auto span = data_.ConstHostSpan();
return TensorView<T const, kDim>{span, shape_, device.ordinal, order_};
}
}
auto HostView() const { return this->View(-1); }
auto HostView() { return this->View(-1); }
@@ -931,6 +966,7 @@ class Tensor {
* \brief Set device ordinal for this tensor.
*/
void SetDevice(int32_t device) const { data_.SetDevice(device); }
void SetDevice(DeviceOrd device) const { data_.SetDevice(device); }
[[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); }
};

View File

@@ -49,9 +49,8 @@ class LinearUpdater : public Configurable {
* \param model Model to be updated.
* \param sum_instance_weight The sum instance weights, used to normalise l1/l2 penalty.
*/
virtual void Update(HostDeviceVector<GradientPair>* in_gpair, DMatrix* data,
gbm::GBLinearModel* model,
double sum_instance_weight) = 0;
virtual void Update(linalg::Matrix<GradientPair>* in_gpair, DMatrix* data,
gbm::GBLinearModel* model, double sum_instance_weight) = 0;
/*!
* \brief Create a linear updater given name

View File

@@ -41,17 +41,16 @@ class ObjFunction : public Configurable {
* \param args arguments to the objective function.
*/
virtual void Configure(const std::vector<std::pair<std::string, std::string> >& args) = 0;
/*!
* \brief Get gradient over each of predictions, given existing information.
* \param preds prediction of current round
* \param info information about labels, weights, groups in rank
* \param iteration current iteration number.
* \param out_gpair output of get gradient, saves gradient and second order gradient in
/**
* @brief Get gradient over each of predictions, given existing information.
*
* @param preds prediction of current round
* @param info information about labels, weights, groups in rank
* @param iteration current iteration number.
* @param out_gpair output of get gradient, saves gradient and second order gradient in
*/
virtual void GetGradient(const HostDeviceVector<bst_float>& preds,
const MetaInfo& info,
int iteration,
HostDeviceVector<GradientPair>* out_gpair) = 0;
virtual void GetGradient(const HostDeviceVector<bst_float>& preds, const MetaInfo& info,
std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) = 0;
/*! \return the default evaluation metric for the objective */
virtual const char* DefaultEvalMetric() const = 0;
@@ -81,9 +80,7 @@ class ObjFunction : public Configurable {
* used by gradient boosting
* \return transformed value
*/
virtual bst_float ProbToMargin(bst_float base_score) const {
return base_score;
}
[[nodiscard]] virtual bst_float ProbToMargin(bst_float base_score) const { return base_score; }
/**
* \brief Make initialize estimation of prediction.
*
@@ -94,14 +91,14 @@ class ObjFunction : public Configurable {
/*!
* \brief Return task of this objective.
*/
virtual struct ObjInfo Task() const = 0;
[[nodiscard]] virtual struct ObjInfo Task() const = 0;
/**
* \brief Return number of targets for input matrix. Right now XGBoost supports only
* @brief Return number of targets for input matrix. Right now XGBoost supports only
* multi-target regression.
*/
virtual bst_target_t Targets(MetaInfo const& info) const {
[[nodiscard]] virtual bst_target_t Targets(MetaInfo const& info) const {
if (info.labels.Shape(1) > 1) {
LOG(FATAL) << "multioutput is not supported by current objective function";
LOG(FATAL) << "multioutput is not supported by the current objective function";
}
return 1;
}

View File

@@ -71,7 +71,7 @@ class TreeUpdater : public Configurable {
* but maybe different random seeds, usually one tree is passed in at a time,
* there can be multiple trees when we train random forest style model
*/
virtual void Update(tree::TrainParam const* param, HostDeviceVector<GradientPair>* gpair,
virtual void Update(tree::TrainParam const* param, linalg::Matrix<GradientPair>* gpair,
DMatrix* data, common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& out_trees) = 0;