diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp
index a2ca9536f..15957480c 100644
--- a/R-package/src/xgboost_R.cpp
+++ b/R-package/src/xgboost_R.cpp
@@ -28,13 +28,13 @@ extern "C" {
   void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R;
   void (*Error)(const char *fmt, ...) = error;
 }
-}  // namespace utils
-
-namespace wrapper {
-bool CheckNAN(float v) {
+bool CheckNAN(double v) {
   return ISNAN(v);
 }
-}  // namespace wrapper
+double LogGamma(double v) {
+  return lgammafn(v);
+}
+}  // namespace utils
 
 namespace random {
 void Seed(unsigned seed) {
diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h
index 61b84a80e..58a179192 100644
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -8,6 +8,7 @@
 extern "C" {
 #include <Rinternals.h>
 #include <R_ext/Random.h>
+#include <Rmath.h>
 }
 
 extern "C" {
diff --git a/doc/parameter.md b/doc/parameter.md
index ec37a819c..7fca58286 100644
--- a/doc/parameter.md
+++ b/doc/parameter.md
@@ -55,6 +55,8 @@ From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parame
   - "reg:logistic" --logistic regression
   - "binary:logistic" --logistic regression for binary classification, output probability
   - "binary:logitraw" --logistic regression for binary classification, output score before logistic transformation
+  - "count:poisson" --Poisson regression for count data, output mean of the Poisson distribution
+    - max_delta_step is set to 0.7 by default in Poisson regression (used to safeguard optimization)
   - "multi:softmax" --set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes)
   - "multi:softprob" --same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class.
   - "rank:pairwise" --set XGBoost to do ranking task by minimizing the pairwise loss
diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp
index 8798ff99b..433b5a00b 100644
--- a/src/learner/evaluation-inl.hpp
+++ b/src/learner/evaluation-inl.hpp
@@ -12,6 +12,7 @@
 #include <cmath>
 #include <algorithm>
 #include "../sync/sync.h"
+#include "../utils/math.h"
 #include "./evaluation.h"
 #include "./helper_utils.h"
 
@@ -106,6 +107,18 @@ struct EvalError : public EvalEWiseBase<EvalError> {
   }
 };
 
+/*! \brief negative log-likelihood of the Poisson distribution */
+struct EvalPoissionNegLogLik : public EvalEWiseBase<EvalPoissionNegLogLik> {
+  virtual const char *Name(void) const {
+    return "poisson-nloglik";
+  }
+  inline static float EvalRow(float y, float py) {
+    const float eps = 1e-16f;
+    if (py < eps) py = eps;
+    return utils::LogGamma(y + 1.0f) + py - std::log(py) * y;
+  }
+};
+
 /*!
 * \brief base class of multi-class evaluation
 * \tparam Derived the name of subclass
diff --git a/src/learner/evaluation.h b/src/learner/evaluation.h
index 146f757a2..85358e72e 100644
--- a/src/learner/evaluation.h
+++ b/src/learner/evaluation.h
@@ -46,6 +46,7 @@ inline IEvaluator* CreateEvaluator(const char *name) {
   if (!strcmp(name, "merror")) return new EvalMatchError();
   if (!strcmp(name, "logloss")) return new EvalLogLoss();
   if (!strcmp(name, "mlogloss")) return new EvalMultiLogLoss();
+  if (!strcmp(name, "poisson-nloglik")) return new EvalPoissionNegLogLik();
   if (!strcmp(name, "auc")) return new EvalAuc();
   if (!strncmp(name, "ams@", 4)) return new EvalAMS(name);
   if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index ecbc17615..9ceec969e 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -107,7 +107,9 @@ class BoostLearner : public rabit::Serializable {
     }
     if (!strcmp("seed_per_iter", name)) seed_per_iteration = atoi(val);
     if (!strcmp("save_base64", name)) save_base64 = atoi(val);
-    if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val);
+    if (!strcmp(name, "num_class")) {
+      this->SetParam("num_output_group", val);
+    }
     if (!strcmp(name, "nthread")) {
       omp_set_num_threads(atoi(val));
     }
@@ -383,15 +385,25 @@
     utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
     obj_ = CreateObjFunction(name_obj_.c_str());
     gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
-
+    this->InitAdditionDefaultParam();
+    // set parameters
     for (size_t i = 0; i < cfg_.size(); ++i) {
       obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
       gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
-    }    
+    }
     if (evaluator_.Size() == 0) {
       evaluator_.AddEval(obj_->DefaultEvalMetric());
     }
   }
+  /*!
+   * \brief set additional default parameters for specific objectives
+   */
+  inline void InitAdditionDefaultParam(void) {
+    if (name_obj_ == "count:poisson") {
+      obj_->SetParam("max_delta_step", "0.7");
+      gbm_->SetParam("max_delta_step", "0.7");
+    }
+  }
   /*!
    * \brief get un-transformed prediction
    * \param data training data matrix
diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp
index e5e4b05e1..d0ecf7a27 100644
--- a/src/learner/objective-inl.hpp
+++ b/src/learner/objective-inl.hpp
@@ -114,7 +114,7 @@
 };
 /*! \brief objective function that only need to */
-class RegLossObj : public IObjFunction{
+class RegLossObj : public IObjFunction {
  public:
  explicit RegLossObj(int loss_type) {
    loss.loss_type = loss_type;
  }
@@ -173,6 +173,72 @@
   LossType loss;
 };
 
+// poisson regression for count
+class PoissonRegression : public IObjFunction {
+ public:
+  explicit PoissonRegression(void) {
+    max_delta_step = 0.0f;
+  }
+  virtual ~PoissonRegression(void) {}
+
+  virtual void SetParam(const char *name, const char *val) {
+    using namespace std;
+    if (!strcmp("max_delta_step", name)) {
+      max_delta_step = static_cast<float>(atof(val));
+    }
+  }
+  virtual void GetGradient(const std::vector<float> &preds,
+                           const MetaInfo &info,
+                           int iter,
+                           std::vector<bst_gpair> *out_gpair) {
+    utils::Check(max_delta_step != 0.0f,
+                 "PoissonRegression: need to set max_delta_step");
+    utils::Check(info.labels.size() != 0, "label set cannot be empty");
+    utils::Check(preds.size() == info.labels.size(),
+                 "labels are not correctly provided");
+    std::vector<bst_gpair> &gpair = *out_gpair;
+    gpair.resize(preds.size());
+    // check if label in range
+    bool label_correct = true;
+    // start calculating gradient
+    const long ndata = static_cast<long>(preds.size());
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < ndata; ++i) {
+      float p = preds[i];
+      float w = info.GetWeight(i);
+      float y = info.labels[i];
+      if (y >= 0.0f) {
+        gpair[i] = bst_gpair((std::exp(p) - y) * w,
+                             std::exp(p + max_delta_step) * w);
+      } else {
+        label_correct = false;
+      }
+    }
+    utils::Check(label_correct,
+                 "PoissonRegression: label must be nonnegative");
+  }
+  virtual void PredTransform(std::vector<float> *io_preds) {
+    std::vector<float> &preds = *io_preds;
+    const long ndata = static_cast<long>(preds.size());
+    #pragma omp parallel for schedule(static)
+    for (long j = 0; j < ndata; ++j) {
+      preds[j] = std::exp(preds[j]);
+    }
+  }
+  virtual void EvalTransform(std::vector<float> *io_preds) {
+    PredTransform(io_preds);
+  }
+  virtual float ProbToMargin(float base_score) const {
+    return std::log(base_score);
+  }
+  virtual const char* DefaultEvalMetric(void) const {
+    return "poisson-nloglik";
+  }
+
+ private:
+  float max_delta_step;
+};
+
 // softmax multi-class classification
 class SoftmaxMultiClassObj : public IObjFunction {
  public:
diff --git a/src/learner/objective.h b/src/learner/objective.h
index 6b11b7d18..c0a525a43 100644
--- a/src/learner/objective.h
+++ b/src/learner/objective.h
@@ -72,6 +72,7 @@ inline IObjFunction* CreateObjFunction(const char *name) {
   if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik);
   if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify);
   if (!strcmp("binary:logitraw", name)) return new RegLossObj(LossType::kLogisticRaw);
+  if (!strcmp("count:poisson", name)) return new PoissonRegression();
   if (!strcmp("multi:softmax", name)) return new SoftmaxMultiClassObj(0);
   if (!strcmp("multi:softprob", name)) return new SoftmaxMultiClassObj(1);
   if (!strcmp("rank:pairwise", name)) return new PairwiseRankObj();
diff --git a/src/utils/math.h b/src/utils/math.h
new file mode 100644
index 000000000..06612b1a8
--- /dev/null
+++ b/src/utils/math.h
@@ -0,0 +1,36 @@
+#ifndef XGBOOST_UTILS_MATH_H_
+#define XGBOOST_UTILS_MATH_H_
+/*!
+ * \file math.h
+ * \brief support additional math utilities
+ * \author Tianqi Chen
+ */
+#include <cmath>
+#ifdef _MSC_VER
+extern "C" {
+#include <float.h>
+}
+#endif
+namespace xgboost {
+namespace utils {
+#ifdef XGBOOST_STRICT_CXX98_
+// check nan
+bool CheckNAN(double v);
+double LogGamma(double v);
+#else
+template<typename T>
+inline bool CheckNAN(T v) {
+#ifdef _MSC_VER
+  return (_isnan(v) != 0);
+#else
+  return isnan(v);
+#endif
+}
+template<typename T>
+inline T LogGamma(T v) {
+  return lgamma(v);
+}
+#endif
+}  // namespace utils
+}  // namespace xgboost
+#endif  // XGBOOST_UTILS_MATH_H_
diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
index d2aa5169f..8ec3aa3f4 100644
--- a/wrapper/xgboost_wrapper.cpp
+++ b/wrapper/xgboost_wrapper.cpp
@@ -9,16 +9,12 @@
 #include <cmath>
 // include all std functions
 using namespace std;
-
-#ifdef _MSC_VER
-#define isnan(x) (_isnan(x) != 0)
-#endif
-
 #include "./xgboost_wrapper.h"
 #include "../src/data.h"
 #include "../src/learner/learner-inl.hpp"
 #include "../src/io/io.h"
 #include "../src/utils/utils.h"
+#include "../src/utils/math.h"
 #include "../src/utils/group_data.h"
 #include "../src/io/simple_dmatrix-inl.hpp"
 
@@ -97,14 +93,6 @@ class Booster: public learner::BoostLearner {
  private:
  bool init_model;
 };
 
-#if !defined(XGBOOST_STRICT_CXX98_)
-inline bool CheckNAN(float v) {
-  return isnan(v);
-}
-#else
-// redirect to defs in R
-bool CheckNAN(float v);
-#endif
 }  // namespace wrapper
 }  // namespace xgboost
@@ -175,7 +163,7 @@ extern "C"{
                                     bst_ulong nrow,
                                     bst_ulong ncol,
                                     float missing) {
-    bool nan_missing = CheckNAN(missing);
+    bool nan_missing = utils::CheckNAN(missing);
     DMatrixSimple *p_mat = new DMatrixSimple();
     DMatrixSimple &mat = *p_mat;
     mat.info.info.num_row = nrow;
@@ -183,7 +171,7 @@
     for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
       bst_ulong nelem = 0;
       for (bst_ulong j = 0; j < ncol; ++j) {
-        if (CheckNAN(data[j])) {
+        if (utils::CheckNAN(data[j])) {
           utils::Check(nan_missing,
                        "There are NAN in the matrix, however, you did not set missing=NAN");
         } else {
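
Editor's note on the math this patch implements (not part of the diff itself): count:poisson fits the log of the Poisson mean, so a raw margin p predicts the mean exp(p). Dropping the lgamma(y+1) constant, the per-row negative log-likelihood is exp(p) - y*p, whose gradient in p is exp(p) - y and whose true Hessian is exp(p). PoissonRegression::GetGradient deliberately reports the Hessian inflated to exp(p + max_delta_step), which shrinks each Newton step by a factor of exp(max_delta_step); that is the "safeguard" the parameter.md line refers to, and why GetGradient refuses to run with max_delta_step == 0 (InitAdditionDefaultParam supplies 0.7 so users normally never see that check fire). The standalone sketch below recomputes these quantities for a single data point; the file name and the values of y and p are illustrative only, and std::lgamma assumes C++11 (the patch itself routes through utils::LogGamma).

// poisson_check.cc -- illustrative sketch, mirrors the formulas in the patch
#include <cmath>
#include <cstdio>

int main() {
  const float max_delta_step = 0.7f;  // default installed by InitAdditionDefaultParam
  const float y = 3.0f;   // observed count (label)
  const float p = 0.5f;   // raw margin produced by the trees

  // gradient and inflated hessian, as in PoissonRegression::GetGradient
  // (per-row weight w omitted, i.e. w = 1)
  const float grad = std::exp(p) - y;
  const float hess = std::exp(p + max_delta_step);

  // a leaf's Newton step -grad/hess is exp(max_delta_step) times smaller
  // than the step the true hessian exp(p) would allow
  std::printf("grad = %g  hess = %g  step = %g\n", grad, hess, -grad / hess);

  // poisson-nloglik, as in EvalPoissionNegLogLik::EvalRow, evaluated on the
  // transformed prediction py = exp(p) produced by PredTransform
  float py = std::exp(p);
  const float eps = 1e-16f;
  if (py < eps) py = eps;
  const float nloglik = std::lgamma(y + 1.0f) + py - std::log(py) * y;
  std::printf("poisson-nloglik = %g\n", nloglik);
  return 0;
}

Compiled with, say, g++ -std=c++11 poisson_check.cc, this prints a damped step of about 0.41 where the undamped Newton step would be about 0.82, which makes the effect of the max_delta_step default concrete: the larger the configured value, the more conservatively each boosting round moves the predicted log-mean.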