From dedd87662bf6bda6021496b09efcfae28a33943a Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 30 Dec 2015 18:22:15 -0800 Subject: [PATCH] [OBJ] Add basic objective function and registry --- dmlc-core | 2 +- include/xgboost/base.h | 14 + include/xgboost/data.h | 5 +- include/xgboost/objective.h | 96 +++++ old_src/learner/helper_utils.h | 80 ---- old_src/learner/objective-inl.hpp | 642 ------------------------------ old_src/learner/objective.h | 89 ----- src/c_api.cc | 2 - src/common/math.h | 107 +++++ src/common/random.h | 26 ++ src/data/data.cc | 31 +- src/data/simple_csr_source.h | 11 +- src/objective/multi_class.cc | 134 +++++++ src/objective/objective.cc | 17 + src/objective/rank.cc | 329 +++++++++++++++ src/objective/regression.cc | 216 ++++++++++ 16 files changed, 965 insertions(+), 836 deletions(-) create mode 100644 include/xgboost/objective.h delete mode 100644 old_src/learner/helper_utils.h delete mode 100644 old_src/learner/objective-inl.hpp delete mode 100644 old_src/learner/objective.h delete mode 100644 src/c_api.cc create mode 100644 src/common/math.h create mode 100644 src/common/random.h create mode 100644 src/objective/multi_class.cc create mode 100644 src/objective/objective.cc create mode 100644 src/objective/rank.cc create mode 100644 src/objective/regression.cc diff --git a/dmlc-core b/dmlc-core index 4b951c037..98879773f 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 4b951c0378386b7f4d9eae72be2ecd3b9c816afe +Subproject commit 98879773f062e1c5b8a380e002c25672f2b48b13 diff --git a/include/xgboost/base.h b/include/xgboost/base.h index b1d0e453a..61667928f 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -7,6 +7,7 @@ #define XGBOOST_BASE_H_ #include +#include namespace xgboost { /*! @@ -17,10 +18,23 @@ typedef uint32_t bst_uint; /*! \brief float type, used for storing statistics */ typedef float bst_float; +/*! \brief gradient statistics pair usually needed in gradient boosting */ +struct bst_gpair { + /*! \brief gradient statistics */ + bst_float grad; + /*! \brief second order gradient statistics */ + bst_float hess; + bst_gpair() {} + bst_gpair(bst_float grad, bst_float hess) : grad(grad), hess(hess) {} +}; + const float rt_eps = 1e-5f; // min gap between feature values to allow a split happen const float rt_2eps = rt_eps * 2.0f; +typedef dmlc::omp_ulong omp_ulong; +typedef dmlc::omp_uint bst_omp_uint; + /*! * \brief define compatible keywords in g++ * Used to support g++-4.6 and g++4.7 diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 160f5bad6..e85537347 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -10,6 +10,7 @@ #include #include #include +#include #include "./base.h" namespace xgboost { @@ -261,7 +262,7 @@ class DMatrix { * \return a Created DMatrix. */ static DMatrix* Create(std::unique_ptr&& source, - const char* cache_prefix=nullptr); + const char* cache_prefix = nullptr); /*! * \brief Create a DMatrix by loaidng data from parser. * Parser can later be deleted after the DMatrix i created. @@ -275,7 +276,7 @@ class DMatrix { * \return A created DMatrix. */ static DMatrix* Create(dmlc::Parser* parser, - const char* cache_prefix=nullptr); + const char* cache_prefix = nullptr); }; } // namespace xgboost diff --git a/include/xgboost/objective.h b/include/xgboost/objective.h new file mode 100644 index 000000000..0b1d895d2 --- /dev/null +++ b/include/xgboost/objective.h @@ -0,0 +1,96 @@ +/*! 
+ * Copyright 2014 by Contributors + * \file objective.h + * \brief interface of objective function used by xgboost. + * \author Tianqi Chen, Kailong Chen + */ +#ifndef XGBOOST_OBJECTIVE_H_ +#define XGBOOST_OBJECTIVE_H_ + +#include +#include +#include +#include +#include +#include "./data.h" +#include "./base.h" + +namespace xgboost { +/*! \brief interface of objective function */ +class ObjFunction { + public: + /*! \brief virtual destructor */ + virtual ~ObjFunction() {} + /*! + * \brief Initialize the objective with the specified parameters. + * \param args arguments to the objective function. + */ + virtual void Init(const std::vector >& args) = 0; + /*! + * \brief Get gradient over each of predictions, given existing information. + * \param preds prediction of current round + * \param info information about labels, weights, groups in rank + * \param iteration current iteration number. + * \param out_gpair output of get gradient, saves gradient and second order gradient in + */ + virtual void GetGradient(const std::vector& preds, + const MetaInfo& info, + int iteration, + std::vector* out_gpair) = 0; + /*! \return the default evaluation metric for the objective */ + virtual const char* DefaultEvalMetric() const = 0; + // the following functions are optional, most of time default implementation is good enough + /*! + * \brief transform prediction values, this is only called when Prediction is called + * \param io_preds prediction values, saves to this vector as well + */ + virtual void PredTransform(std::vector *io_preds) {} + /*! + * \brief transform prediction values, this is only called when Eval is called, + * usually it redirect to PredTransform + * \param io_preds prediction values, saves to this vector as well + */ + virtual void EvalTransform(std::vector *io_preds) { + this->PredTransform(io_preds); + } + /*! + * \brief transform probability value back to margin + * this is used to transform user-set base_score back to margin + * used by gradient boosting + * \return transformed value + */ + virtual float ProbToMargin(float base_score) const { + return base_score; + } + /*! + * \brief Create an objective function according to name. + * \param name Name of the objective. + */ + static ObjFunction* Create(const char* name); +}; + +/*! + * \brief Registry entry for DataIterator factory functions. + */ +struct ObjFunctionReg + : public dmlc::FunctionRegEntryBase > { +}; + +/*! + * \brief Macro to register objective + * + * \code + * // example of registering a objective + * XGBOOST_REGISTER_OBJECTIVE(LinearRegression, "reg:linear") + * .describe("Linear regression objective") + * .set_body([]() { + * return new RegLossObj(LossType::kLinearSquare); + * }); + * \endcode + */ +#define XGBOOST_REGISTER_OBJECTIVE(UniqueId, Name) \ + static ::xgboost::ObjFunctionReg & __make_ ## ObjFunctionReg ## _ ## UniqueId ## __ = \ + ::dmlc::Registry< ::xgboost::ObjFunctionReg>::Get()->__REGISTER__(#Name) +} // namespace xgboost +#endif // XGBOOST_OBJECTIVE_H_ diff --git a/old_src/learner/helper_utils.h b/old_src/learner/helper_utils.h deleted file mode 100644 index 0db1b46f3..000000000 --- a/old_src/learner/helper_utils.h +++ /dev/null @@ -1,80 +0,0 @@ -/*! 
- * Copyright 2014 by Contributors - * \file helper_utils.h - * \brief useful helper functions - * \author Tianqi Chen, Kailong Chen - */ -#ifndef XGBOOST_LEARNER_HELPER_UTILS_H_ -#define XGBOOST_LEARNER_HELPER_UTILS_H_ - -#include -#include -#include -#include -namespace xgboost { -namespace learner { -// simple helper function to do softmax -inline static void Softmax(std::vector* p_rec) { - std::vector &rec = *p_rec; - float wmax = rec[0]; - for (size_t i = 1; i < rec.size(); ++i) { - wmax = std::max(rec[i], wmax); - } - double wsum = 0.0f; - for (size_t i = 0; i < rec.size(); ++i) { - rec[i] = std::exp(rec[i]-wmax); - wsum += rec[i]; - } - for (size_t i = 0; i < rec.size(); ++i) { - rec[i] /= static_cast(wsum); - } -} - -inline static int FindMaxIndex(const float *rec, size_t size) { - size_t mxid = 0; - for (size_t i = 1; i < size; ++i) { - if (rec[i] > rec[mxid]) { - mxid = i; - } - } - return static_cast(mxid); -} - -// simple helper function to do softmax -inline static int FindMaxIndex(const std::vector& rec) { - return FindMaxIndex(BeginPtr(rec), rec.size()); -} - -// perform numerically safe logsum -inline float LogSum(float x, float y) { - if (x < y) { - return y + std::log(std::exp(x - y) + 1.0f); - } else { - return x + std::log(std::exp(y - x) + 1.0f); - } -} -// numerically safe logsum -inline float LogSum(const float *rec, size_t size) { - float mx = rec[0]; - for (size_t i = 1; i < size; ++i) { - mx = std::max(mx, rec[i]); - } - float sum = 0.0f; - for (size_t i = 0; i < size; ++i) { - sum += std::exp(rec[i] - mx); - } - return mx + std::log(sum); -} - -// comparator functions for sorting pairs in descending order -inline static bool CmpFirst(const std::pair &a, - const std::pair &b) { - return a.first > b.first; -} -inline static bool CmpSecond(const std::pair &a, - const std::pair &b) { - return a.second > b.second; -} -} // namespace learner -} // namespace xgboost -#endif // XGBOOST_LEARNER_HELPER_UTILS_H_ diff --git a/old_src/learner/objective-inl.hpp b/old_src/learner/objective-inl.hpp deleted file mode 100644 index ce23b02fb..000000000 --- a/old_src/learner/objective-inl.hpp +++ /dev/null @@ -1,642 +0,0 @@ -/*! - * Copyright 2014 by Contributors - * \file objective-inl.hpp - * \brief objective function implementations - * \author Tianqi Chen, Kailong Chen - */ -#ifndef XGBOOST_LEARNER_OBJECTIVE_INL_HPP_ -#define XGBOOST_LEARNER_OBJECTIVE_INL_HPP_ - -#include -#include -#include -#include -#include -#include "../data.h" -#include "./objective.h" -#include "./helper_utils.h" -#include "../utils/random.h" -#include "../utils/omp.h" - -namespace xgboost { -namespace learner { -/*! \brief defines functions to calculate some commonly used functions */ -struct LossType { - /*! \brief indicate which type we are using */ - int loss_type; - // list of constants - static const int kLinearSquare = 0; - static const int kLogisticNeglik = 1; - static const int kLogisticClassify = 2; - static const int kLogisticRaw = 3; - /*! - * \brief transform the linear sum to prediction - * \param x linear sum of boosting ensemble - * \return transformed prediction - */ - inline float PredTransform(float x) const { - switch (loss_type) { - case kLogisticRaw: - case kLinearSquare: return x; - case kLogisticClassify: - case kLogisticNeglik: return 1.0f / (1.0f + std::exp(-x)); - default: utils::Error("unknown loss_type"); return 0.0f; - } - } - /*! 
- * \brief check if label range is valid - */ - inline bool CheckLabel(float x) const { - if (loss_type != kLinearSquare) { - return x >= 0.0f && x <= 1.0f; - } - return true; - } - /*! - * \brief error message displayed when check label fail - */ - inline const char * CheckLabelErrorMsg(void) const { - if (loss_type != kLinearSquare) { - return "label must be in [0,1] for logistic regression"; - } else { - return ""; - } - } - /*! - * \brief calculate first order gradient of loss, given transformed prediction - * \param predt transformed prediction - * \param label true label - * \return first order gradient - */ - inline float FirstOrderGradient(float predt, float label) const { - switch (loss_type) { - case kLinearSquare: return predt - label; - case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt)); - case kLogisticClassify: - case kLogisticNeglik: return predt - label; - default: utils::Error("unknown loss_type"); return 0.0f; - } - } - /*! - * \brief calculate second order gradient of loss, given transformed prediction - * \param predt transformed prediction - * \param label true label - * \return second order gradient - */ - inline float SecondOrderGradient(float predt, float label) const { - // cap second order gradient to positive value - const float eps = 1e-16f; - switch (loss_type) { - case kLinearSquare: return 1.0f; - case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt)); - case kLogisticClassify: - case kLogisticNeglik: return std::max(predt * (1.0f - predt), eps); - default: utils::Error("unknown loss_type"); return 0.0f; - } - } - /*! - * \brief transform probability value back to margin - */ - inline float ProbToMargin(float base_score) const { - if (loss_type == kLogisticRaw || - loss_type == kLogisticClassify || - loss_type == kLogisticNeglik ) { - utils::Check(base_score > 0.0f && base_score < 1.0f, - "base_score must be in (0,1) for logistic loss"); - base_score = -std::log(1.0f / base_score - 1.0f); - } - return base_score; - } - /*! \brief get default evaluation metric for the objective */ - inline const char *DefaultEvalMetric(void) const { - if (loss_type == kLogisticClassify) return "error"; - if (loss_type == kLogisticRaw) return "auc"; - return "rmse"; - } -}; - -/*! 
\brief objective function that only need to */ -class RegLossObj : public IObjFunction { - public: - explicit RegLossObj(int loss_type) { - loss.loss_type = loss_type; - scale_pos_weight = 1.0f; - } - virtual ~RegLossObj(void) {} - virtual void SetParam(const char *name, const char *val) { - using namespace std; - if (!strcmp("scale_pos_weight", name)) { - scale_pos_weight = static_cast(atof(val)); - } - } - virtual void GetGradient(const std::vector &preds, - const MetaInfo &info, - int iter, - std::vector *out_gpair) { - utils::Check(info.labels.size() != 0, "label set cannot be empty"); - utils::Check(preds.size() % info.labels.size() == 0, - "labels are not correctly provided"); - std::vector &gpair = *out_gpair; - gpair.resize(preds.size()); - // check if label in range - bool label_correct = true; - // start calculating gradient - const unsigned nstep = static_cast(info.labels.size()); - const bst_omp_uint ndata = static_cast(preds.size()); - #pragma omp parallel for schedule(static) - for (bst_omp_uint i = 0; i < ndata; ++i) { - const unsigned j = i % nstep; - float p = loss.PredTransform(preds[i]); - float w = info.GetWeight(j); - if (info.labels[j] == 1.0f) w *= scale_pos_weight; - if (!loss.CheckLabel(info.labels[j])) label_correct = false; - gpair[i] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w, - loss.SecondOrderGradient(p, info.labels[j]) * w); - } - utils::Check(label_correct, loss.CheckLabelErrorMsg()); - } - virtual const char* DefaultEvalMetric(void) const { - return loss.DefaultEvalMetric(); - } - virtual void PredTransform(std::vector *io_preds) { - std::vector &preds = *io_preds; - const bst_omp_uint ndata = static_cast(preds.size()); - #pragma omp parallel for schedule(static) - for (bst_omp_uint j = 0; j < ndata; ++j) { - preds[j] = loss.PredTransform(preds[j]); - } - } - virtual float ProbToMargin(float base_score) const { - return loss.ProbToMargin(base_score); - } - - protected: - float scale_pos_weight; - LossType loss; -}; - -// poisson regression for count -class PoissonRegression : public IObjFunction { - public: - PoissonRegression(void) { - max_delta_step = 0.0f; - } - virtual ~PoissonRegression(void) {} - - virtual void SetParam(const char *name, const char *val) { - using namespace std; - if (!strcmp("max_delta_step", name)) { - max_delta_step = static_cast(atof(val)); - } - } - virtual void GetGradient(const std::vector &preds, - const MetaInfo &info, - int iter, - std::vector *out_gpair) { - utils::Check(max_delta_step != 0.0f, - "PoissonRegression: need to set max_delta_step"); - utils::Check(info.labels.size() != 0, "label set cannot be empty"); - utils::Check(preds.size() == info.labels.size(), - "labels are not correctly provided"); - std::vector &gpair = *out_gpair; - gpair.resize(preds.size()); - // check if label in range - bool label_correct = true; - // start calculating gradient - const long ndata = static_cast(preds.size()); // NOLINT(*) - #pragma omp parallel for schedule(static) - for (long i = 0; i < ndata; ++i) { // NOLINT(*) - float p = preds[i]; - float w = info.GetWeight(i); - float y = info.labels[i]; - if (y >= 0.0f) { - gpair[i] = bst_gpair((std::exp(p) - y) * w, - std::exp(p + max_delta_step) * w); - } else { - label_correct = false; - } - } - utils::Check(label_correct, - "PoissonRegression: label must be nonnegative"); - } - virtual void PredTransform(std::vector *io_preds) { - std::vector &preds = *io_preds; - const long ndata = static_cast(preds.size()); // NOLINT(*) - #pragma omp parallel for schedule(static) - for 
(long j = 0; j < ndata; ++j) { // NOLINT(*) - preds[j] = std::exp(preds[j]); - } - } - virtual void EvalTransform(std::vector *io_preds) { - PredTransform(io_preds); - } - virtual float ProbToMargin(float base_score) const { - return std::log(base_score); - } - virtual const char* DefaultEvalMetric(void) const { - return "poisson-nloglik"; - } - - private: - float max_delta_step; -}; - -// softmax multi-class classification -class SoftmaxMultiClassObj : public IObjFunction { - public: - explicit SoftmaxMultiClassObj(int output_prob) - : output_prob(output_prob) { - nclass = 0; - } - virtual ~SoftmaxMultiClassObj(void) {} - virtual void SetParam(const char *name, const char *val) { - using namespace std; - if (!strcmp( "num_class", name )) nclass = atoi(val); - } - virtual void GetGradient(const std::vector &preds, - const MetaInfo &info, - int iter, - std::vector *out_gpair) { - utils::Check(nclass != 0, "must set num_class to use softmax"); - utils::Check(info.labels.size() != 0, "label set cannot be empty"); - utils::Check(preds.size() % (static_cast(nclass) * info.labels.size()) == 0, - "SoftmaxMultiClassObj: label size and pred size does not match"); - std::vector &gpair = *out_gpair; - gpair.resize(preds.size()); - const unsigned nstep = static_cast(info.labels.size() * nclass); - const bst_omp_uint ndata = static_cast(preds.size() / nclass); - int label_error = 0; - #pragma omp parallel - { - std::vector rec(nclass); - #pragma omp for schedule(static) - for (bst_omp_uint i = 0; i < ndata; ++i) { - for (int k = 0; k < nclass; ++k) { - rec[k] = preds[i * nclass + k]; - } - Softmax(&rec); - const unsigned j = i % nstep; - int label = static_cast(info.labels[j]); - if (label < 0 || label >= nclass) { - label_error = label; label = 0; - } - const float wt = info.GetWeight(j); - for (int k = 0; k < nclass; ++k) { - float p = rec[k]; - const float h = 2.0f * p * (1.0f - p) * wt; - if (label == k) { - gpair[i * nclass + k] = bst_gpair((p - 1.0f) * wt, h); - } else { - gpair[i * nclass + k] = bst_gpair(p* wt, h); - } - } - } - } - utils::Check(label_error >= 0 && label_error < nclass, - "SoftmaxMultiClassObj: label must be in [0, num_class),"\ - " num_class=%d but found %d in label", nclass, label_error); - } - virtual void PredTransform(std::vector *io_preds) { - this->Transform(io_preds, output_prob); - } - virtual void EvalTransform(std::vector *io_preds) { - this->Transform(io_preds, 1); - } - virtual const char* DefaultEvalMetric(void) const { - return "merror"; - } - - private: - inline void Transform(std::vector *io_preds, int prob) { - utils::Check(nclass != 0, "must set num_class to use softmax"); - std::vector &preds = *io_preds; - std::vector tmp; - const bst_omp_uint ndata = static_cast(preds.size()/nclass); - if (prob == 0) tmp.resize(ndata); - #pragma omp parallel - { - std::vector rec(nclass); - #pragma omp for schedule(static) - for (bst_omp_uint j = 0; j < ndata; ++j) { - for (int k = 0; k < nclass; ++k) { - rec[k] = preds[j * nclass + k]; - } - if (prob == 0) { - tmp[j] = static_cast(FindMaxIndex(rec)); - } else { - Softmax(&rec); - for (int k = 0; k < nclass; ++k) { - preds[j * nclass + k] = rec[k]; - } - } - } - } - if (prob == 0) preds = tmp; - } - // data field - int nclass; - int output_prob; -}; - -/*! 
\brief objective for lambda rank */ -class LambdaRankObj : public IObjFunction { - public: - LambdaRankObj(void) { - loss.loss_type = LossType::kLogisticRaw; - fix_list_weight = 0.0f; - num_pairsample = 1; - } - virtual ~LambdaRankObj(void) {} - virtual void SetParam(const char *name, const char *val) { - using namespace std; - if (!strcmp( "loss_type", name )) loss.loss_type = atoi(val); - if (!strcmp( "fix_list_weight", name)) fix_list_weight = static_cast(atof(val)); - if (!strcmp( "num_pairsample", name)) num_pairsample = atoi(val); - } - virtual void GetGradient(const std::vector &preds, - const MetaInfo &info, - int iter, - std::vector *out_gpair) { - utils::Check(preds.size() == info.labels.size(), "label size predict size not match"); - std::vector &gpair = *out_gpair; - gpair.resize(preds.size()); - // quick consistency when group is not available - std::vector tgptr(2, 0); tgptr[1] = static_cast(info.labels.size()); - const std::vector &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr; - utils::Check(gptr.size() != 0 && gptr.back() == info.labels.size(), - "group structure not consistent with #rows"); - const bst_omp_uint ngroup = static_cast(gptr.size() - 1); - #pragma omp parallel - { - // parall construct, declare random number generator here, so that each - // thread use its own random number generator, seed by thread id and current iteration - random::Random rnd; rnd.Seed(iter* 1111 + omp_get_thread_num()); - std::vector pairs; - std::vector lst; - std::vector< std::pair > rec; - #pragma omp for schedule(static) - for (bst_omp_uint k = 0; k < ngroup; ++k) { - lst.clear(); pairs.clear(); - for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) { - lst.push_back(ListEntry(preds[j], info.labels[j], j)); - gpair[j] = bst_gpair(0.0f, 0.0f); - } - std::sort(lst.begin(), lst.end(), ListEntry::CmpPred); - rec.resize(lst.size()); - for (unsigned i = 0; i < lst.size(); ++i) { - rec[i] = std::make_pair(lst[i].label, i); - } - std::sort(rec.begin(), rec.end(), CmpFirst); - // enumerate buckets with same label, for each item in the lst, grab another sample randomly - for (unsigned i = 0; i < rec.size(); ) { - unsigned j = i + 1; - while (j < rec.size() && rec[j].first == rec[i].first) ++j; - // bucket in [i,j), get a sample outside bucket - unsigned nleft = i, nright = static_cast(rec.size() - j); - if (nleft + nright != 0) { - int nsample = num_pairsample; - while (nsample --) { - for (unsigned pid = i; pid < j; ++pid) { - unsigned ridx = static_cast(rnd.RandDouble() * (nleft+nright)); - if (ridx < nleft) { - pairs.push_back(LambdaPair(rec[ridx].second, rec[pid].second)); - } else { - pairs.push_back(LambdaPair(rec[pid].second, rec[ridx+j-i].second)); - } - } - } - } - i = j; - } - // get lambda weight for the pairs - this->GetLambdaWeight(lst, &pairs); - // rescale each gradient and hessian so that the lst have constant weighted - float scale = 1.0f / num_pairsample; - if (fix_list_weight != 0.0f) { - scale *= fix_list_weight / (gptr[k+1] - gptr[k]); - } - for (size_t i = 0; i < pairs.size(); ++i) { - const ListEntry &pos = lst[pairs[i].pos_index]; - const ListEntry &neg = lst[pairs[i].neg_index]; - const float w = pairs[i].weight * scale; - float p = loss.PredTransform(pos.pred - neg.pred); - float g = loss.FirstOrderGradient(p, 1.0f); - float h = loss.SecondOrderGradient(p, 1.0f); - // accumulate gradient and hessian in both pid, and nid - gpair[pos.rindex].grad += g * w; - gpair[pos.rindex].hess += 2.0f * w * h; - gpair[neg.rindex].grad -= g * w; - gpair[neg.rindex].hess += 2.0f * 
w * h; - } - } - } - } - virtual const char* DefaultEvalMetric(void) const { - return "map"; - } - - protected: - /*! \brief helper information in a list */ - struct ListEntry { - /*! \brief the predict score we in the data */ - float pred; - /*! \brief the actual label of the entry */ - float label; - /*! \brief row index in the data matrix */ - unsigned rindex; - // constructor - ListEntry(float pred, float label, unsigned rindex) - : pred(pred), label(label), rindex(rindex) {} - // comparator by prediction - inline static bool CmpPred(const ListEntry &a, const ListEntry &b) { - return a.pred > b.pred; - } - // comparator by label - inline static bool CmpLabel(const ListEntry &a, const ListEntry &b) { - return a.label > b.label; - } - }; - /*! \brief a pair in the lambda rank */ - struct LambdaPair { - /*! \brief positive index: this is a position in the list */ - unsigned pos_index; - /*! \brief negative index: this is a position in the list */ - unsigned neg_index; - /*! \brief weight to be filled in */ - float weight; - // constructor - LambdaPair(unsigned pos_index, unsigned neg_index) - : pos_index(pos_index), neg_index(neg_index), weight(1.0f) {} - }; - /*! - * \brief get lambda weight for existing pairs - * \param list a list that is sorted by pred score - * \param io_pairs record of pairs, containing the pairs to fill in weights - */ - virtual void GetLambdaWeight(const std::vector &sorted_list, - std::vector *io_pairs) = 0; - - private: - // loss function - LossType loss; - // number of samples peformed for each instance - int num_pairsample; - // fix weight of each elements in list - float fix_list_weight; -}; - -class PairwiseRankObj: public LambdaRankObj{ - public: - virtual ~PairwiseRankObj(void) {} - - protected: - virtual void GetLambdaWeight(const std::vector &sorted_list, - std::vector *io_pairs) {} -}; - -// beta version: NDCG lambda rank -class LambdaRankObjNDCG : public LambdaRankObj { - public: - virtual ~LambdaRankObjNDCG(void) {} - - protected: - virtual void GetLambdaWeight(const std::vector &sorted_list, - std::vector *io_pairs) { - std::vector &pairs = *io_pairs; - float IDCG; - { - std::vector labels(sorted_list.size()); - for (size_t i = 0; i < sorted_list.size(); ++i) { - labels[i] = sorted_list[i].label; - } - std::sort(labels.begin(), labels.end(), std::greater()); - IDCG = CalcDCG(labels); - } - if (IDCG == 0.0) { - for (size_t i = 0; i < pairs.size(); ++i) { - pairs[i].weight = 0.0f; - } - } else { - IDCG = 1.0f / IDCG; - for (size_t i = 0; i < pairs.size(); ++i) { - unsigned pos_idx = pairs[i].pos_index; - unsigned neg_idx = pairs[i].neg_index; - float pos_loginv = 1.0f / std::log(pos_idx + 2.0f); - float neg_loginv = 1.0f / std::log(neg_idx + 2.0f); - int pos_label = static_cast(sorted_list[pos_idx].label); - int neg_label = static_cast(sorted_list[neg_idx].label); - float original = - ((1 << pos_label) - 1) * pos_loginv + ((1 << neg_label) - 1) * neg_loginv; - float changed = - ((1 << neg_label) - 1) * pos_loginv + ((1 << pos_label) - 1) * neg_loginv; - float delta = (original - changed) * IDCG; - if (delta < 0.0f) delta = - delta; - pairs[i].weight = delta; - } - } - } - inline static float CalcDCG(const std::vector &labels) { - double sumdcg = 0.0; - for (size_t i = 0; i < labels.size(); ++i) { - const unsigned rel = static_cast(labels[i]); - if (rel != 0) { - sumdcg += ((1 << rel) - 1) / std::log(static_cast(i + 2)); - } - } - return static_cast(sumdcg); - } -}; - -class LambdaRankObjMAP : public LambdaRankObj { - public: - virtual 
~LambdaRankObjMAP(void) {} - - protected: - struct MAPStats { - /*! \brief the accumulated precision */ - float ap_acc; - /*! - * \brief the accumulated precision, - * assuming a positive instance is missing - */ - float ap_acc_miss; - /*! - * \brief the accumulated precision, - * assuming that one more positive instance is inserted ahead - */ - float ap_acc_add; - /* \brief the accumulated positive instance count */ - float hits; - MAPStats(void) {} - MAPStats(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits) - : ap_acc(ap_acc), ap_acc_miss(ap_acc_miss), ap_acc_add(ap_acc_add), hits(hits) {} - }; - /*! - * \brief Obtain the delta MAP if trying to switch the positions of instances in index1 or index2 - * in sorted triples - * \param sorted_list the list containing entry information - * \param index1,index2 the instances switched - * \param map_stats a vector containing the accumulated precisions for each position in a list - */ - inline float GetLambdaMAP(const std::vector &sorted_list, - int index1, int index2, - std::vector *p_map_stats) { - std::vector &map_stats = *p_map_stats; - if (index1 == index2 || map_stats[map_stats.size() - 1].hits == 0) { - return 0.0f; - } - if (index1 > index2) std::swap(index1, index2); - float original = map_stats[index2].ap_acc; - if (index1 != 0) original -= map_stats[index1 - 1].ap_acc; - float changed = 0; - float label1 = sorted_list[index1].label > 0.0f ? 1.0f : 0.0f; - float label2 = sorted_list[index2].label > 0.0f ? 1.0f : 0.0f; - if (label1 == label2) { - return 0.0; - } else if (label1 < label2) { - changed += map_stats[index2 - 1].ap_acc_add - map_stats[index1].ap_acc_add; - changed += (map_stats[index1].hits + 1.0f) / (index1 + 1); - } else { - changed += map_stats[index2 - 1].ap_acc_miss - map_stats[index1].ap_acc_miss; - changed += map_stats[index2].hits / (index2 + 1); - } - float ans = (changed - original) / (map_stats[map_stats.size() - 1].hits); - if (ans < 0) ans = -ans; - return ans; - } - /* - * \brief obtain preprocessing results for calculating delta MAP - * \param sorted_list the list containing entry information - * \param map_stats a vector containing the accumulated precisions for each position in a list - */ - inline void GetMAPStats(const std::vector &sorted_list, - std::vector *p_map_acc) { - std::vector &map_acc = *p_map_acc; - map_acc.resize(sorted_list.size()); - float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0; - for (size_t i = 1; i <= sorted_list.size(); ++i) { - if (sorted_list[i - 1].label > 0.0f) { - hit++; - acc1 += hit / i; - acc2 += (hit - 1) / i; - acc3 += (hit + 1) / i; - } - map_acc[i - 1] = MAPStats(acc1, acc2, acc3, hit); - } - } - virtual void GetLambdaWeight(const std::vector &sorted_list, - std::vector *io_pairs) { - std::vector &pairs = *io_pairs; - std::vector map_stats; - GetMAPStats(sorted_list, &map_stats); - for (size_t i = 0; i < pairs.size(); ++i) { - pairs[i].weight = - GetLambdaMAP(sorted_list, pairs[i].pos_index, - pairs[i].neg_index, &map_stats); - } - } -}; - -} // namespace learner -} // namespace xgboost -#endif // XGBOOST_LEARNER_OBJECTIVE_INL_HPP_ diff --git a/old_src/learner/objective.h b/old_src/learner/objective.h deleted file mode 100644 index 774286854..000000000 --- a/old_src/learner/objective.h +++ /dev/null @@ -1,89 +0,0 @@ -/*! 
- * Copyright 2014 by Contributors - * \file objective.h - * \brief interface of objective function used for gradient boosting - * \author Tianqi Chen, Kailong Chen - */ -#ifndef XGBOOST_LEARNER_OBJECTIVE_H_ -#define XGBOOST_LEARNER_OBJECTIVE_H_ - -#include -#include "./dmatrix.h" - -namespace xgboost { -namespace learner { -/*! \brief interface of objective function */ -class IObjFunction{ - public: - /*! \brief virtual destructor */ - virtual ~IObjFunction(void) {} - /*! - * \brief set parameters from outside - * \param name name of the parameter - * \param val value of the parameter - */ - virtual void SetParam(const char *name, const char *val) = 0; - /*! - * \brief get gradient over each of predictions, given existing information - * \param preds prediction of current round - * \param info information about labels, weights, groups in rank - * \param iter current iteration number - * \param out_gpair output of get gradient, saves gradient and second order gradient in - */ - virtual void GetGradient(const std::vector &preds, - const MetaInfo &info, - int iter, - std::vector *out_gpair) = 0; - /*! \return the default evaluation metric for the objective */ - virtual const char* DefaultEvalMetric(void) const = 0; - // the following functions are optional, most of time default implementation is good enough - /*! - * \brief transform prediction values, this is only called when Prediction is called - * \param io_preds prediction values, saves to this vector as well - */ - virtual void PredTransform(std::vector *io_preds) {} - /*! - * \brief transform prediction values, this is only called when Eval is called, - * usually it redirect to PredTransform - * \param io_preds prediction values, saves to this vector as well - */ - virtual void EvalTransform(std::vector *io_preds) { - this->PredTransform(io_preds); - } - /*! - * \brief transform probability value back to margin - * this is used to transform user-set base_score back to margin - * used by gradient boosting - * \return transformed value - */ - virtual float ProbToMargin(float base_score) const { - return base_score; - } -}; -} // namespace learner -} // namespace xgboost - -// this are implementations of objective functions -#include "objective-inl.hpp" -// factory function -namespace xgboost { -namespace learner { -/*! 
\brief factory function to create objective function by name */ -inline IObjFunction* CreateObjFunction(const char *name) { - using namespace std; - if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare); - if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik); - if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify); - if (!strcmp("binary:logitraw", name)) return new RegLossObj(LossType::kLogisticRaw); - if (!strcmp("count:poisson", name)) return new PoissonRegression(); - if (!strcmp("multi:softmax", name)) return new SoftmaxMultiClassObj(0); - if (!strcmp("multi:softprob", name)) return new SoftmaxMultiClassObj(1); - if (!strcmp("rank:pairwise", name )) return new PairwiseRankObj(); - if (!strcmp("rank:ndcg", name)) return new LambdaRankObjNDCG(); - if (!strcmp("rank:map", name)) return new LambdaRankObjMAP(); - utils::Error("unknown objective function type: %s", name); - return NULL; -} -} // namespace learner -} // namespace xgboost -#endif // XGBOOST_LEARNER_OBJECTIVE_H_ diff --git a/src/c_api.cc b/src/c_api.cc deleted file mode 100644 index cf1b06d3c..000000000 --- a/src/c_api.cc +++ /dev/null @@ -1,2 +0,0 @@ -#include - diff --git a/src/common/math.h b/src/common/math.h new file mode 100644 index 000000000..fd2f9d5c3 --- /dev/null +++ b/src/common/math.h @@ -0,0 +1,107 @@ +/*! + * Copyright 2015 by Contributors + * \file math.h + * \brief additional math utils + * \author Tianqi Chen + */ +#ifndef XGBOOST_COMMON_MATH_H_ +#define XGBOOST_COMMON_MATH_H_ + +#include +#include +#include +#include + +namespace xgboost { +namespace common { +/*! + * \brief calculate the sigmoid of the input. + * \param x input parameter + * \return the transformed value. + */ +inline float Sigmoid(float x) { + return 1.0f / (1.0f + std::exp(-x)); +} + +/*! + * \brief do inplace softmax transformaton on p_rec + * \param p_rec the input/output vector of the values. + */ +inline void Softmax(std::vector* p_rec) { + std::vector &rec = *p_rec; + float wmax = rec[0]; + for (size_t i = 1; i < rec.size(); ++i) { + wmax = std::max(rec[i], wmax); + } + double wsum = 0.0f; + for (size_t i = 0; i < rec.size(); ++i) { + rec[i] = std::exp(rec[i] - wmax); + wsum += rec[i]; + } + for (size_t i = 0; i < rec.size(); ++i) { + rec[i] /= static_cast(wsum); + } +} + +/*! + * \brief Find the maximum iterator within the iterators + * \param begin The begining iterator. + * \param end The end iterator. + * \return the iterator point to the maximum value. + * \tparam Iterator The type of the iterator. + */ +template +inline Iterator FindMaxIndex(Iterator begin, Iterator end) { + Iterator maxit = begin; + for (Iterator it = begin; it != end; ++it) { + if (*it > *maxit) maxit = it; + } + return maxit; +} + +/*! + * \brief perform numerically safe logsum + * \param x left input operand + * \param y right input operand + * \return log(exp(x) + exp(y)) + */ +inline float LogSum(float x, float y) { + if (x < y) { + return y + std::log(std::exp(x - y) + 1.0f); + } else { + return x + std::log(std::exp(y - x) + 1.0f); + } +} + +/*! + * \brief perform numerically safe logsum + * \param begin The begining iterator. + * \param end The end iterator. + * \return the iterator point to the maximum value. + * \tparam Iterator The type of the iterator. 
+ */ +template +inline float LogSum(Iterator begin, Iterator end) { + float mx = *begin; + for (Iterator it = begin; it != end; ++it) { + mx = std::max(mx, *it); + } + float sum = 0.0f; + for (Iterator it = begin; it != end; ++it) { + sum += std::exp(*it - mx); + } + return mx + std::log(sum); +} + +// comparator functions for sorting pairs in descending order +inline static bool CmpFirst(const std::pair &a, + const std::pair &b) { + return a.first > b.first; +} +inline static bool CmpSecond(const std::pair &a, + const std::pair &b) { + return a.second > b.second; +} +} // namespace common +} // namespace xgboost +#endif // XGBOOST_COMMON_MATH_H_ diff --git a/src/common/random.h b/src/common/random.h new file mode 100644 index 000000000..16441a9c9 --- /dev/null +++ b/src/common/random.h @@ -0,0 +1,26 @@ +/*! + * Copyright 2015 by Contributors + * \file random.h + * \brief Utility related to random. + * \author Tianqi Chen + */ +#ifndef XGBOOST_COMMON_RANDOM_H_ +#define XGBOOST_COMMON_RANDOM_H_ + +#include + +namespace xgboost { +namespace common { +/*! + * \brief Random Engine + */ +typedef std::mt19937 RandomEngine; +/*! + * \brief global singleton of a random engine. + * Only use this engine when necessary, not thread-safe. + */ +static RandomEngine* GlobalRandom(); + +} // namespace common +} // namespace xgboost +#endif // XGBOOST_COMMON_RANDOM_H_ diff --git a/src/data/data.cc b/src/data/data.cc index d020ff0de..11bef8a9b 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -2,8 +2,8 @@ * Copyright 2015 by Contributors * \file data.cc */ -#include #include +#include namespace xgboost { // implementation of inline functions @@ -35,7 +35,8 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) { CHECK_EQ(version, kVersion) << "MetaInfo: invalid format"; CHECK(fi->Read(&num_row, sizeof(num_row)) == sizeof(num_row)) << "MetaInfo: invalid format"; CHECK(fi->Read(&num_col, sizeof(num_col)) == sizeof(num_col)) << "MetaInfo: invalid format"; - CHECK(fi->Read(&num_nonzero, sizeof(num_nonzero)) == sizeof(num_nonzero)) << "MetaInfo: invalid format"; + CHECK(fi->Read(&num_nonzero, sizeof(num_nonzero)) == sizeof(num_nonzero)) + << "MetaInfo: invalid format"; CHECK(fi->Read(&labels)) << "MetaInfo: invalid format"; CHECK(fi->Read(&group_ptr)) << "MetaInfo: invalid format"; CHECK(fi->Read(&weights)) << "MetaInfo: invalid format"; @@ -45,19 +46,19 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) { // macro to dispatch according to specified pointer types #define DISPATCH_CONST_PTR(dtype, old_ptr, cast_ptr, proc) \ - switch(dtype) { \ - case kFloat32: { \ - const float* cast_ptr = reinterpret_cast(old_ptr); proc; break; \ - } \ - case kDouble: { \ - const double* cast_ptr = reinterpret_cast(old_ptr); proc; break; \ - } \ - case kUInt32: { \ - const uint32_t* cast_ptr = reinterpret_cast(old_ptr); proc; break; \ - } \ - case kUInt64: { \ - const uint64_t* cast_ptr = reinterpret_cast(old_ptr); proc; break; \ - } \ + switch (dtype) { \ + case kFloat32: { \ + const float* cast_ptr = reinterpret_cast(old_ptr); proc; break; \ + } \ + case kDouble: { \ + const double* cast_ptr = reinterpret_cast(old_ptr); proc; break; \ + } \ + case kUInt32: { \ + const uint32_t* cast_ptr = reinterpret_cast(old_ptr); proc; break; \ + } \ + case kUInt64: { \ + const uint64_t* cast_ptr = reinterpret_cast(old_ptr); proc; break; \ + } \ default: LOG(FATAL) << "Unknown data type" << dtype; \ } \ diff --git a/src/data/simple_csr_source.h b/src/data/simple_csr_source.h index 3832ba852..a9faa98e3 100644 --- a/src/data/simple_csr_source.h +++ 
b/src/data/simple_csr_source.h @@ -5,13 +5,14 @@ * This is an in-memory data structure that holds the data in row oriented format. * \author Tianqi Chen */ -#ifndef XGBOOST_DATA_SIMPLE_CSR_ROW_ITER_H_ -#define XGBOOST_DATA_SIMPLE_CSR_ROW_ITER_H_ +#ifndef XGBOOST_DATA_SIMPLE_CSR_SOURCE_H_ +#define XGBOOST_DATA_SIMPLE_CSR_SOURCE_H_ -#include -#include #include #include +#include +#include + namespace xgboost { /*! \brief namespace of internal data structures*/ @@ -78,4 +79,4 @@ class SimpleCSRSource : public DataSource { }; } // namespace data } // namespace xgboost -#endif // XGBOOST_DATA_SIMPLE_CSR_ROW_ITER_H_ +#endif // XGBOOST_DATA_SIMPLE_CSR_SOURCE_H_ diff --git a/src/objective/multi_class.cc b/src/objective/multi_class.cc new file mode 100644 index 000000000..119a0f377 --- /dev/null +++ b/src/objective/multi_class.cc @@ -0,0 +1,134 @@ +/*! + * Copyright 2015 by Contributors + * \file multi_class.cc + * \brief Definition of multi-class classification objectives. + * \author Tianqi Chen + */ +#include +#include +#include +#include +#include +#include +#include "../common/math.h" + +namespace xgboost { +namespace obj { + +struct SoftmaxMultiClassParam : public dmlc::Parameter { + int num_class; + // declare parameters + DMLC_DECLARE_PARAMETER(SoftmaxMultiClassParam) { + DMLC_DECLARE_FIELD(num_class).set_lower_bound(1) + .describe("Number of output class in the multi-class classification."); + } +}; + +class SoftmaxMultiClassObj : public ObjFunction { + public: + explicit SoftmaxMultiClassObj(bool output_prob) + : output_prob_(output_prob) { + } + void Init(const std::vector >& args) override { + param_.Init(args); + } + void GetGradient(const std::vector& preds, + const MetaInfo& info, + int iter, + std::vector* out_gpair) override { + CHECK_NE(info.labels.size(), 0) << "label set cannot be empty"; + CHECK(preds.size() == (static_cast(param_.num_class) * info.labels.size())) + << "SoftmaxMultiClassObj: label size and pred size does not match"; + out_gpair->resize(preds.size()); + const int nclass = param_.num_class; + const omp_ulong ndata = static_cast(preds.size() / nclass); + + int label_error = 0; + #pragma omp parallel + { + std::vector rec(nclass); + #pragma omp for schedule(static) + for (omp_ulong i = 0; i < ndata; ++i) { + for (int k = 0; k < nclass; ++k) { + rec[k] = preds[i * nclass + k]; + } + common::Softmax(&rec); + int label = static_cast(info.labels[i]); + if (label < 0 || label >= nclass) { + label_error = label; label = 0; + } + const float wt = info.GetWeight(i); + for (int k = 0; k < nclass; ++k) { + float p = rec[k]; + const float h = 2.0f * p * (1.0f - p) * wt; + if (label == k) { + out_gpair->at(i * nclass + k) = bst_gpair((p - 1.0f) * wt, h); + } else { + out_gpair->at(i * nclass + k) = bst_gpair(p* wt, h); + } + } + } + } + CHECK(label_error >= 0 && label_error < nclass) + << "SoftmaxMultiClassObj: label must be in [0, num_class)," + << " num_class=" << nclass + << " but found " << label_error << " in label."; + } + void PredTransform(std::vector* io_preds) override { + this->Transform(io_preds, output_prob_); + } + void EvalTransform(std::vector* io_preds) override { + this->Transform(io_preds, true); + } + const char* DefaultEvalMetric() const override { + return "merror"; + } + + private: + inline void Transform(std::vector *io_preds, bool prob) { + std::vector &preds = *io_preds; + std::vector tmp; + const int nclass = param_.num_class; + const omp_ulong ndata = static_cast(preds.size() / nclass); + if (!prob) tmp.resize(ndata); + + #pragma omp parallel + { + 
std::vector rec(nclass); + #pragma omp for schedule(static) + for (omp_ulong j = 0; j < ndata; ++j) { + for (int k = 0; k < nclass; ++k) { + rec[k] = preds[j * nclass + k]; + } + if (!prob) { + tmp[j] = static_cast( + common::FindMaxIndex(rec.begin(), rec.end()) - rec.begin()); + } else { + common::Softmax(&rec); + for (int k = 0; k < nclass; ++k) { + preds[j * nclass + k] = rec[k]; + } + } + } + } + if (!prob) preds = tmp; + } + // output probability + bool output_prob_; + // parameter + SoftmaxMultiClassParam param_; +}; + +// register the ojective functions +DMLC_REGISTER_PARAMETER(SoftmaxMultiClassParam); + +XGBOOST_REGISTER_OBJECTIVE(SoftmaxMultiClass, "multi:softmax") +.describe("Softmax for multi-class classification, output class index.") +.set_body([]() { return new SoftmaxMultiClassObj(false); }); + +XGBOOST_REGISTER_OBJECTIVE(SoftprobMultiClass, "multi:softprob") +.describe("Softmax for multi-class classification, output probability distribution.") +.set_body([]() { return new SoftmaxMultiClassObj(true); }); + +} // namespace obj +} // namespace xgboost diff --git a/src/objective/objective.cc b/src/objective/objective.cc new file mode 100644 index 000000000..acba980c2 --- /dev/null +++ b/src/objective/objective.cc @@ -0,0 +1,17 @@ +/*! + * Copyright 2015 by Contributors + * \file objective.cc + * \brief global objective function definition. + */ +#include + +namespace dmlc { +DMLC_REGISTRY_ENABLE(::xgboost::ObjFunctionReg); +} // namespace dmlc + +namespace xgboost { +/*! \brief namespace of objective function */ +namespace obj { +} // namespace obj +} // namespace xgboost + diff --git a/src/objective/rank.cc b/src/objective/rank.cc new file mode 100644 index 000000000..649661ced --- /dev/null +++ b/src/objective/rank.cc @@ -0,0 +1,329 @@ +/*! + * Copyright 2015 by Contributors + * \file rank.cc + * \brief Definition of rank loss. + * \author Tianqi Chen, Kailong Chen + */ +#include +#include +#include +#include +#include +#include +#include "../common/math.h" +#include "../common/random.h" + +namespace xgboost { +namespace obj { + +struct LambdaRankParam : public dmlc::Parameter { + int num_pairsample; + float fix_list_weight; + // declare parameters + DMLC_DECLARE_PARAMETER(LambdaRankParam) { + DMLC_DECLARE_FIELD(num_pairsample).set_lower_bound(1).set_default(1) + .describe("Number of pair generated for each instance."); + DMLC_DECLARE_FIELD(fix_list_weight).set_lower_bound(0.0f).set_default(0.0f) + .describe("Normalize the weight of each list by this value," + " if equals 0, no effect will happen"); + + } +}; + +// objective for lambda rank +class LambdaRankObj : public ObjFunction { + public: + void Init(const std::vector >& args) override { + param_.Init(args); + } + void GetGradient(const std::vector& preds, + const MetaInfo& info, + int iter, + std::vector* out_gpair) override { + CHECK_EQ(preds.size(),info.labels.size()) << "label size predict size not match"; + std::vector& gpair = *out_gpair; + + gpair.resize(preds.size()); + // quick consistency when group is not available + std::vector tgptr(2, 0); tgptr[1] = static_cast(info.labels.size()); + const std::vector &gptr = info.group_ptr.size() == 0 ? 
tgptr : info.group_ptr; + CHECK(gptr.size() != 0 && gptr.back() == info.labels.size()) + << "group structure not consistent with #rows"; + const bst_omp_uint ngroup = static_cast(gptr.size() - 1); + #pragma omp parallel + { + // parall construct, declare random number generator here, so that each + // thread use its own random number generator, seed by thread id and current iteration + common::RandomEngine rnd(iter * 1111 + omp_get_thread_num()); + + std::vector pairs; + std::vector lst; + std::vector< std::pair > rec; + #pragma omp for schedule(static) + for (bst_omp_uint k = 0; k < ngroup; ++k) { + lst.clear(); pairs.clear(); + for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) { + lst.push_back(ListEntry(preds[j], info.labels[j], j)); + gpair[j] = bst_gpair(0.0f, 0.0f); + } + std::sort(lst.begin(), lst.end(), ListEntry::CmpPred); + rec.resize(lst.size()); + for (unsigned i = 0; i < lst.size(); ++i) { + rec[i] = std::make_pair(lst[i].label, i); + } + std::sort(rec.begin(), rec.end(), common::CmpFirst); + // enumerate buckets with same label, for each item in the lst, grab another sample randomly + for (unsigned i = 0; i < rec.size(); ) { + unsigned j = i + 1; + while (j < rec.size() && rec[j].first == rec[i].first) ++j; + // bucket in [i,j), get a sample outside bucket + unsigned nleft = i, nright = static_cast(rec.size() - j); + if (nleft + nright != 0) { + int nsample = param_.num_pairsample; + while (nsample --) { + for (unsigned pid = i; pid < j; ++pid) { + unsigned ridx = std::uniform_int_distribution(0, nleft + nright - 1)(rnd); + if (ridx < nleft) { + pairs.push_back(LambdaPair(rec[ridx].second, rec[pid].second)); + } else { + pairs.push_back(LambdaPair(rec[pid].second, rec[ridx+j-i].second)); + } + } + } + } + i = j; + } + // get lambda weight for the pairs + this->GetLambdaWeight(lst, &pairs); + // rescale each gradient and hessian so that the lst have constant weighted + float scale = 1.0f / param_.num_pairsample; + if (param_.fix_list_weight != 0.0f) { + scale *= param_.fix_list_weight / (gptr[k + 1] - gptr[k]); + } + for (size_t i = 0; i < pairs.size(); ++i) { + const ListEntry &pos = lst[pairs[i].pos_index]; + const ListEntry &neg = lst[pairs[i].neg_index]; + const float w = pairs[i].weight * scale; + const float eps = 1e-16f; + float p = common::Sigmoid(pos.pred - neg.pred); + float g = p - 1.0f; + float h = std::max(p * (1.0f - p), eps); + // accumulate gradient and hessian in both pid, and nid + gpair[pos.rindex].grad += g * w; + gpair[pos.rindex].hess += 2.0f * w * h; + gpair[neg.rindex].grad -= g * w; + gpair[neg.rindex].hess += 2.0f * w * h; + } + } + } + } + const char* DefaultEvalMetric(void) const override { + return "map"; + } + + protected: + /*! \brief helper information in a list */ + struct ListEntry { + /*! \brief the predict score we in the data */ + float pred; + /*! \brief the actual label of the entry */ + float label; + /*! \brief row index in the data matrix */ + unsigned rindex; + // constructor + ListEntry(float pred, float label, unsigned rindex) + : pred(pred), label(label), rindex(rindex) {} + // comparator by prediction + inline static bool CmpPred(const ListEntry &a, const ListEntry &b) { + return a.pred > b.pred; + } + // comparator by label + inline static bool CmpLabel(const ListEntry &a, const ListEntry &b) { + return a.label > b.label; + } + }; + /*! \brief a pair in the lambda rank */ + struct LambdaPair { + /*! \brief positive index: this is a position in the list */ + unsigned pos_index; + /*! 
\brief negative index: this is a position in the list */ + unsigned neg_index; + /*! \brief weight to be filled in */ + float weight; + // constructor + LambdaPair(unsigned pos_index, unsigned neg_index) + : pos_index(pos_index), neg_index(neg_index), weight(1.0f) {} + }; + /*! + * \brief get lambda weight for existing pairs + * \param list a list that is sorted by pred score + * \param io_pairs record of pairs, containing the pairs to fill in weights + */ + virtual void GetLambdaWeight(const std::vector &sorted_list, + std::vector *io_pairs) = 0; + + private: + LambdaRankParam param_; +}; + +class PairwiseRankObj: public LambdaRankObj{ + protected: + void GetLambdaWeight(const std::vector &sorted_list, + std::vector *io_pairs) override {} +}; + +// beta version: NDCG lambda rank +class LambdaRankObjNDCG : public LambdaRankObj { + protected: + void GetLambdaWeight(const std::vector &sorted_list, + std::vector *io_pairs) override { + std::vector &pairs = *io_pairs; + float IDCG; + { + std::vector labels(sorted_list.size()); + for (size_t i = 0; i < sorted_list.size(); ++i) { + labels[i] = sorted_list[i].label; + } + std::sort(labels.begin(), labels.end(), std::greater()); + IDCG = CalcDCG(labels); + } + if (IDCG == 0.0) { + for (size_t i = 0; i < pairs.size(); ++i) { + pairs[i].weight = 0.0f; + } + } else { + IDCG = 1.0f / IDCG; + for (size_t i = 0; i < pairs.size(); ++i) { + unsigned pos_idx = pairs[i].pos_index; + unsigned neg_idx = pairs[i].neg_index; + float pos_loginv = 1.0f / std::log(pos_idx + 2.0f); + float neg_loginv = 1.0f / std::log(neg_idx + 2.0f); + int pos_label = static_cast(sorted_list[pos_idx].label); + int neg_label = static_cast(sorted_list[neg_idx].label); + float original = + ((1 << pos_label) - 1) * pos_loginv + ((1 << neg_label) - 1) * neg_loginv; + float changed = + ((1 << neg_label) - 1) * pos_loginv + ((1 << pos_label) - 1) * neg_loginv; + float delta = (original - changed) * IDCG; + if (delta < 0.0f) delta = - delta; + pairs[i].weight = delta; + } + } + } + inline static float CalcDCG(const std::vector &labels) { + double sumdcg = 0.0; + for (size_t i = 0; i < labels.size(); ++i) { + const unsigned rel = static_cast(labels[i]); + if (rel != 0) { + sumdcg += ((1 << rel) - 1) / std::log(static_cast(i + 2)); + } + } + return static_cast(sumdcg); + } +}; + +class LambdaRankObjMAP : public LambdaRankObj { + protected: + struct MAPStats { + /*! \brief the accumulated precision */ + float ap_acc; + /*! + * \brief the accumulated precision, + * assuming a positive instance is missing + */ + float ap_acc_miss; + /*! + * \brief the accumulated precision, + * assuming that one more positive instance is inserted ahead + */ + float ap_acc_add; + /* \brief the accumulated positive instance count */ + float hits; + MAPStats(void) {} + MAPStats(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits) + : ap_acc(ap_acc), ap_acc_miss(ap_acc_miss), ap_acc_add(ap_acc_add), hits(hits) {} + }; + /*! 
+ * \brief Obtain the delta MAP if trying to switch the positions of instances in index1 or index2 + * in sorted triples + * \param sorted_list the list containing entry information + * \param index1,index2 the instances switched + * \param map_stats a vector containing the accumulated precisions for each position in a list + */ + inline float GetLambdaMAP(const std::vector &sorted_list, + int index1, int index2, + std::vector *p_map_stats) { + std::vector &map_stats = *p_map_stats; + if (index1 == index2 || map_stats[map_stats.size() - 1].hits == 0) { + return 0.0f; + } + if (index1 > index2) std::swap(index1, index2); + float original = map_stats[index2].ap_acc; + if (index1 != 0) original -= map_stats[index1 - 1].ap_acc; + float changed = 0; + float label1 = sorted_list[index1].label > 0.0f ? 1.0f : 0.0f; + float label2 = sorted_list[index2].label > 0.0f ? 1.0f : 0.0f; + if (label1 == label2) { + return 0.0; + } else if (label1 < label2) { + changed += map_stats[index2 - 1].ap_acc_add - map_stats[index1].ap_acc_add; + changed += (map_stats[index1].hits + 1.0f) / (index1 + 1); + } else { + changed += map_stats[index2 - 1].ap_acc_miss - map_stats[index1].ap_acc_miss; + changed += map_stats[index2].hits / (index2 + 1); + } + float ans = (changed - original) / (map_stats[map_stats.size() - 1].hits); + if (ans < 0) ans = -ans; + return ans; + } + /* + * \brief obtain preprocessing results for calculating delta MAP + * \param sorted_list the list containing entry information + * \param map_stats a vector containing the accumulated precisions for each position in a list + */ + inline void GetMAPStats(const std::vector &sorted_list, + std::vector *p_map_acc) { + std::vector &map_acc = *p_map_acc; + map_acc.resize(sorted_list.size()); + float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0; + for (size_t i = 1; i <= sorted_list.size(); ++i) { + if (sorted_list[i - 1].label > 0.0f) { + hit++; + acc1 += hit / i; + acc2 += (hit - 1) / i; + acc3 += (hit + 1) / i; + } + map_acc[i - 1] = MAPStats(acc1, acc2, acc3, hit); + } + } + void GetLambdaWeight(const std::vector &sorted_list, + std::vector *io_pairs) override { + std::vector &pairs = *io_pairs; + std::vector map_stats; + GetMAPStats(sorted_list, &map_stats); + for (size_t i = 0; i < pairs.size(); ++i) { + pairs[i].weight = + GetLambdaMAP(sorted_list, pairs[i].pos_index, + pairs[i].neg_index, &map_stats); + } + } +}; + +// register the ojective functions +DMLC_REGISTER_PARAMETER(LambdaRankParam); + +XGBOOST_REGISTER_OBJECTIVE(PairwieRankObj, "rank:pairwise") +.describe("Pairwise rank objective.") +.set_body([]() { return new PairwiseRankObj(); }); + +XGBOOST_REGISTER_OBJECTIVE(LambdaRankNDCG, "rank:ndcg") +.describe("LambdaRank with NDCG as objective.") +.set_body([]() { return new LambdaRankObjNDCG(); }); + +XGBOOST_REGISTER_OBJECTIVE(LambdaRankObjMAP, "rank:map") +.describe("LambdaRank with MAP as objective.") +.set_body([]() { return new LambdaRankObjMAP(); }); + +} // namespace obj +} // namespace xgboost + diff --git a/src/objective/regression.cc b/src/objective/regression.cc new file mode 100644 index 000000000..2c0db1e02 --- /dev/null +++ b/src/objective/regression.cc @@ -0,0 +1,216 @@ +/*! + * Copyright 2015 by Contributors + * \file regression.cc + * \brief Definition of single-value regression and classification objectives. 
+ * \author Tianqi Chen, Kailong Chen + */ +#include +#include +#include +#include +#include +#include +#include "../common/math.h" + +namespace xgboost { +namespace obj { +// common regressions +// linear regression +struct LinearSquareLoss { + static float PredTransform(float x) { return x; } + static bool CheckLabel(float x) { return true; } + static float FirstOrderGradient(float predt, float label) { return predt - label; } + static float SecondOrderGradient(float predt, float label) { return 1.0f; } + static float ProbToMargin(float base_score) { return base_score; } + static const char* LabelErrorMsg() { return ""; } + static const char* DefaultEvalMetric() { return "rmse"; } +}; +// logistic loss for probability regression task +struct LogisticRegression { + static float PredTransform(float x) { return common::Sigmoid(x); } + static bool CheckLabel(float x) { return x >= 0.0f && x <= 1.0f; } + static float FirstOrderGradient(float predt, float label) { return predt - label; } + static float SecondOrderGradient(float predt, float label) { + const float eps = 1e-16f; + return std::max(predt * (1.0f - predt), eps); + } + static float ProbToMargin(float base_score) { + CHECK(base_score > 0.0f && base_score < 1.0f) + << "base_score must be in (0,1) for logistic loss"; + return -std::log(1.0f / base_score - 1.0f); + } + static const char* LabelErrorMsg() { + return "label must be in [0,1] for logistic regression"; + } + static const char* DefaultEvalMetric() { return "rmse"; } +}; +// logistic loss for binary classification task. +struct LogisticClassification : public LogisticRegression { + static const char* DefaultEvalMetric() { return "error"; } +}; +// logistic loss, but predict un-transformed margin +struct LogisticRaw : public LogisticRegression { + static float PredTransform(float x) { return x; } + static float FirstOrderGradient(float predt, float label) { + predt = common::Sigmoid(predt); + return predt - label; + } + static float SecondOrderGradient(float predt, float label) { + const float eps = 1e-16f; + predt = common::Sigmoid(predt); + return std::max(predt * (1.0f - predt), eps); + } + static const char* DefaultEvalMetric() { return "auc"; } +}; + +struct RegLossParam : public dmlc::Parameter { + float scale_pos_weight; + // declare parameters + DMLC_DECLARE_PARAMETER(RegLossParam) { + DMLC_DECLARE_FIELD(scale_pos_weight).set_default(1.0f).set_lower_bound(0.0f) + .describe("Scale the weight of positive examples by this factor"); + } +}; + +// regression los function +template +class RegLossObj : public ObjFunction { + public: + void Init(const std::vector >& args) override { + param_.Init(args); + } + void GetGradient(const std::vector &preds, + const MetaInfo &info, + int iter, + std::vector *out_gpair) override { + CHECK_NE(info.labels.size(), 0) << "label set cannot be empty"; + CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided"; + out_gpair->resize(preds.size()); + // check if label in range + bool label_correct = true; + // start calculating gradient + const omp_ulong ndata = static_cast(preds.size()); + #pragma omp parallel for schedule(static) + for (omp_ulong i = 0; i < ndata; ++i) { + float p = Loss::PredTransform(preds[i]); + float w = info.GetWeight(i); + if (info.labels[i] == 1.0f) w *= param_.scale_pos_weight; + if (Loss::CheckLabel(info.labels[i])) label_correct = false; + out_gpair->at(i) = bst_gpair(Loss::FirstOrderGradient(p, info.labels[i]) * w, + Loss::SecondOrderGradient(p, info.labels[i]) * w); + } + if (!label_correct) 
{ + LOG(FATAL) << Loss::LabelErrorMsg(); + } + } + virtual const char* DefaultEvalMetric() const { + return Loss::DefaultEvalMetric(); + } + virtual void PredTransform(std::vector *io_preds) { + std::vector &preds = *io_preds; + const bst_omp_uint ndata = static_cast(preds.size()); + #pragma omp parallel for schedule(static) + for (bst_omp_uint j = 0; j < ndata; ++j) { + preds[j] = Loss::PredTransform(preds[j]); + } + } + virtual float ProbToMargin(float base_score) const { + return Loss::ProbToMargin(base_score); + } + + protected: + RegLossParam param_; +}; + +// register the ojective functions +DMLC_REGISTER_PARAMETER(RegLossParam); + +XGBOOST_REGISTER_OBJECTIVE(LinearRegression, "reg:linear") +.describe("Linear regression.") +.set_body([]() { return new RegLossObj(); }); + +XGBOOST_REGISTER_OBJECTIVE(LogisticRegression, "reg:logistic") +.describe("Logistic regression for probability regression task.") +.set_body([]() { return new RegLossObj(); }); + +XGBOOST_REGISTER_OBJECTIVE(LogisticClassification, "binary:logistic") +.describe("Logistic regression for binary classification task.") +.set_body([]() { return new RegLossObj(); }); + +XGBOOST_REGISTER_OBJECTIVE(LogisticRaw, "binary:logitraw") +.describe("Logistic regression for classification, output score before logistic transformation") +.set_body([]() { return new RegLossObj(); }); + +// declare parameter +struct PoissonRegressionParam : public dmlc::Parameter { + float max_delta_step; + DMLC_DECLARE_PARAMETER(PoissonRegressionParam) { + DMLC_DECLARE_FIELD(max_delta_step).set_lower_bound(0.0f) + .describe("Maximum delta step we allow each weight estimation to be." \ + " This parameter is required for possion regression."); + } +}; + +// poisson regression for count +class PoissonRegression : public ObjFunction { + public: + // declare functions + void Init(const std::vector >& args) override { + param_.Init(args); + } + + void GetGradient(const std::vector &preds, + const MetaInfo &info, + int iter, + std::vector *out_gpair) override { + CHECK_NE(info.labels.size(), 0) << "label set cannot be empty"; + CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided"; + out_gpair->resize(preds.size()); + // check if label in range + bool label_correct = true; + // start calculating gradient + const omp_ulong ndata = static_cast(preds.size()); // NOLINT(*) + #pragma omp parallel for schedule(static) + for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*) + float p = preds[i]; + float w = info.GetWeight(i); + float y = info.labels[i]; + if (y >= 0.0f) { + out_gpair->at(i) = bst_gpair((std::exp(p) - y) * w, + std::exp(p + param_.max_delta_step) * w); + } else { + label_correct = false; + } + } + CHECK(label_correct) << "PoissonRegression: label must be nonnegative"; + } + void PredTransform(std::vector *io_preds) override { + std::vector &preds = *io_preds; + const long ndata = static_cast(preds.size()); // NOLINT(*) + #pragma omp parallel for schedule(static) + for (long j = 0; j < ndata; ++j) { // NOLINT(*) + preds[j] = std::exp(preds[j]); + } + } + void EvalTransform(std::vector *io_preds) override { + PredTransform(io_preds); + } + float ProbToMargin(float base_score) const override { + return std::log(base_score); + } + const char* DefaultEvalMetric(void) const override { + return "poisson-nloglik"; + } + + private: + PoissonRegressionParam param_; +}; + +// register the ojective functions +DMLC_REGISTER_PARAMETER(PoissonRegressionParam); + +XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson") 
+.describe("Possion regression for count data.") +.set_body([]() { return new PoissonRegression(); }); +} // namespace obj +} // namespace xgboost