add poisson regression

tqchen 2015-05-04 10:48:25 -07:00
parent a310db86a1
commit 667a752e04
10 changed files with 144 additions and 24 deletions

View File

@@ -28,13 +28,13 @@ extern "C" {
void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R;
void (*Error)(const char *fmt, ...) = error;
}
} // namespace utils
namespace wrapper {
bool CheckNAN(float v) {
bool CheckNAN(double v) {
return ISNAN(v);
}
} // namespace wrapper
double LogGamma(double v) {
return lgammafn(v);
}
} // namespace utils
namespace random {
void Seed(unsigned seed) {

View File

@@ -8,6 +8,7 @@
extern "C" {
#include <Rinternals.h>
#include <R_ext/Random.h>
#include <Rmath.h>
}
extern "C" {

View File

@@ -55,6 +55,8 @@ From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parame
- "reg:logistic" --logistic regression
- "binary:logistic" --logistic regression for binary classification, output probability
- "binary:logitraw" --logistic regression for binary classification, output score before logistic transformation
- "count:poisson" --poisson regression for count data, output mean of poisson distribution
- max_delta_step is set to 1 by default in poisson regression(used to safeguard optimization)
- "multi:softmax" --set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes)
- "multi:softprob" --same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class.
- "rank:pairwise" --set XGBoost to do ranking task by minimizing the pairwise loss

View File

@@ -12,6 +12,7 @@
#include <climits>
#include <algorithm>
#include "../sync/sync.h"
#include "../utils/math.h"
#include "./evaluation.h"
#include "./helper_utils.h"
@@ -106,6 +107,18 @@ struct EvalError : public EvalEWiseBase<EvalError> {
}
};
/*! \brief negative log-likelihood of the Poisson distribution */
struct EvalPoissionNegLogLik : public EvalEWiseBase<EvalPoissionNegLogLik> {
virtual const char *Name(void) const {
return "poisson-nloglik";
}
inline static float EvalRow(float y, float py) {
const float eps = 1e-16f;
if (py < eps) py = eps;
return utils::LogGamma(y + 1.0f) + py - std::log(py) * y;
}
};
/*!
* \brief base class of multi-class evaluation
* \tparam Derived the name of subclass

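For reference, `EvalRow` above is the exact Poisson negative log-likelihood: with label $y$ and predicted mean $\lambda$ (`py`, clamped to `eps` so the log is defined),

```latex
-\log P(y \mid \lambda)
  = -\log \frac{\lambda^{y} e^{-\lambda}}{y!}
  = \log\Gamma(y + 1) + \lambda - y \log\lambda
```

which matches `utils::LogGamma(y + 1.0f) + py - std::log(py) * y` term by term, using $\log y! = \log\Gamma(y+1)$.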
View File

@@ -46,6 +46,7 @@ inline IEvaluator* CreateEvaluator(const char *name) {
if (!strcmp(name, "merror")) return new EvalMatchError();
if (!strcmp(name, "logloss")) return new EvalLogLoss();
if (!strcmp(name, "mlogloss")) return new EvalMultiLogLoss();
if (!strcmp(name, "poisson-nloglik")) return new EvalPoissionNegLogLik();
if (!strcmp(name, "auc")) return new EvalAuc();
if (!strncmp(name, "ams@", 4)) return new EvalAMS(name);
if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);

View File

@@ -107,7 +107,9 @@ class BoostLearner : public rabit::Serializable {
}
if (!strcmp("seed_per_iter", name)) seed_per_iteration = atoi(val);
if (!strcmp("save_base64", name)) save_base64 = atoi(val);
if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val);
if (!strcmp(name, "num_class")) {
this->SetParam("num_output_group", val);
}
if (!strcmp(name, "nthread")) {
omp_set_num_threads(atoi(val));
}
@@ -383,7 +385,8 @@ class BoostLearner : public rabit::Serializable {
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
obj_ = CreateObjFunction(name_obj_.c_str());
gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
this->InitAdditionDefaultParam();
// set parameters
for (size_t i = 0; i < cfg_.size(); ++i) {
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
@@ -392,6 +395,15 @@ class BoostLearner : public rabit::Serializable {
evaluator_.AddEval(obj_->DefaultEvalMetric());
}
}
/*!
 * \brief set additional default parameter values for specific objectives
 */
inline void InitAdditionDefaultParam(void) {
if (name_obj_ == "count:poisson") {
obj_->SetParam("max_delta_step", "0.7");
gbm_->SetParam("max_delta_step", "0.7");
}
}
/*!
* \brief get un-transformed prediction
* \param data training data matrix

View File

@@ -114,7 +114,7 @@ struct LossType {
};
/*! \brief objective function for elementwise regression losses */
class RegLossObj : public IObjFunction{
class RegLossObj : public IObjFunction {
public:
explicit RegLossObj(int loss_type) {
loss.loss_type = loss_type;
@@ -173,6 +173,72 @@ class RegLossObj : public IObjFunction{
LossType loss;
};
// poisson regression for count data
class PoissonRegression : public IObjFunction {
public:
explicit PoissonRegression(void) {
max_delta_step = 0.0f;
}
virtual ~PoissonRegression(void) {}
virtual void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp( "max_delta_step", name )) {
max_delta_step = static_cast<float>(atof(val));
}
}
virtual void GetGradient(const std::vector<float> &preds,
const MetaInfo &info,
int iter,
std::vector<bst_gpair> *out_gpair) {
utils::Check(max_delta_step != 0.0f,
"PoissonRegression: need to set max_delta_step");
utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Check(preds.size() == info.labels.size(),
"labels are not correctly provided");
std::vector<bst_gpair> &gpair = *out_gpair;
gpair.resize(preds.size());
// check if label in range
bool label_correct = true;
// start calculating gradient
const long ndata = static_cast<long>(preds.size());
#pragma omp parallel for schedule(static)
for (long i = 0; i < ndata; ++i) {
float p = preds[i];
float w = info.GetWeight(i);
float y = info.labels[i];
if (y >= 0.0f) {
gpair[i] = bst_gpair((std::exp(p) - y) * w,
std::exp(p + max_delta_step) * w);
} else {
label_correct = false;
}
}
utils::Check(label_correct,
"PoissonRegression: label must be nonnegative");
}
virtual void PredTransform(std::vector<float> *io_preds) {
std::vector<float> &preds = *io_preds;
const long ndata = static_cast<long>(preds.size());
#pragma omp parallel for schedule(static)
for (long j = 0; j < ndata; ++j) {
preds[j] = std::exp(preds[j]);
}
}
virtual void EvalTransform(std::vector<float> *io_preds) {
PredTransform(io_preds);
}
virtual float ProbToMargin(float base_score) const {
return std::log(base_score);
}
virtual const char* DefaultEvalMetric(void) const {
return "poisson-nloglik";
}
private:
float max_delta_step;
};
// softmax multi-class classification
class SoftmaxMultiClassObj : public IObjFunction {
public:

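A short note on the gradient statistics in `GetGradient` above, since the `max_delta_step` trick is easy to miss. The booster's raw prediction $p$ is the margin, i.e. the log of the Poisson mean (which is why `PredTransform` exponentiates and `ProbToMargin` takes a log). Dropping the constant $\log\Gamma(y+1)$, the per-instance loss and its derivatives are

```latex
\ell(p) = e^{p} - y\,p, \qquad
g = \frac{\partial \ell}{\partial p} = e^{p} - y, \qquad
h = \frac{\partial^{2} \ell}{\partial p^{2}} = e^{p}
```

matching `(std::exp(p) - y) * w` for the gradient. The hessian is inflated to $e^{p + \text{max\_delta\_step}} \ge e^{p}$, so when $e^{p}$ is near zero the Newton step $g/h$ stays bounded; this reading of the safeguard is inferred from the code and the parameter docs, not stated in the commit itself.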
View File

@@ -72,6 +72,7 @@ inline IObjFunction* CreateObjFunction(const char *name) {
if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik);
if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify);
if (!strcmp("binary:logitraw", name)) return new RegLossObj(LossType::kLogisticRaw);
if (!strcmp("count:poisson", name)) return new PoissonRegression();
if (!strcmp("multi:softmax", name)) return new SoftmaxMultiClassObj(0);
if (!strcmp("multi:softprob", name)) return new SoftmaxMultiClassObj(1);
if (!strcmp("rank:pairwise", name )) return new PairwiseRankObj();

src/utils/math.h (new file, 36 lines)
View File

@@ -0,0 +1,36 @@
#ifndef XGBOOST_UTILS_MATH_H_
#define XGBOOST_UTILS_MATH_H_
/*!
* \file math.h
* \brief support additional math
* \author Tianqi Chen
*/
#include <cmath>
#ifdef _MSC_VER
extern "C" {
#include <amp_math.h>
}
#endif
namespace xgboost {
namespace utils {
#ifdef XGBOOST_STRICT_CXX98_
// check nan
bool CheckNAN(double v);
double LogGamma(double v);
#else
template<typename T>
inline bool CheckNAN(T v) {
#ifdef _MSC_VER
return (_isnan(v) != 0);
#else
return isnan(v);
#endif
}
template<typename T>
inline T LogGamma(T v) {
return lgamma(v);
}
#endif
} // namespace utils
} // namespace xgboost
#endif // XGBOOST_UTILS_MATH_H_
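A small, assumed usage sketch of the two helpers, mirroring how the evaluation and wrapper code in this commit call them (the include path is illustrative):

```c++
#include <cstdio>
#include <limits>
#include "../src/utils/math.h"

int main() {
  // CheckNAN is templated, so it works for both float and double
  double miss = std::numeric_limits<double>::quiet_NaN();
  std::printf("%d\n", xgboost::utils::CheckNAN(miss) ? 1 : 0);  // prints 1
  // LogGamma(n + 1) == log(n!); poisson-nloglik uses it for the log(y!) term
  std::printf("%f\n", xgboost::utils::LogGamma(5.0));  // log(4!) = log 24 ~ 3.178
  return 0;
}
```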

View File

@@ -9,16 +9,12 @@
#include <algorithm>
// include all std functions
using namespace std;
#ifdef _MSC_VER
#define isnan(x) (_isnan(x) != 0)
#endif
#include "./xgboost_wrapper.h"
#include "../src/data.h"
#include "../src/learner/learner-inl.hpp"
#include "../src/io/io.h"
#include "../src/utils/utils.h"
#include "../src/utils/math.h"
#include "../src/utils/group_data.h"
#include "../src/io/simple_dmatrix-inl.hpp"
@@ -97,14 +93,6 @@ class Booster: public learner::BoostLearner {
private:
bool init_model;
};
#if !defined(XGBOOST_STRICT_CXX98_)
inline bool CheckNAN(float v) {
return isnan(v);
}
#else
// redirect to defs in R
bool CheckNAN(float v);
#endif
} // namespace wrapper
} // namespace xgboost
@@ -175,7 +163,7 @@ extern "C"{
bst_ulong nrow,
bst_ulong ncol,
float missing) {
bool nan_missing = CheckNAN(missing);
bool nan_missing = utils::CheckNAN(missing);
DMatrixSimple *p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
mat.info.info.num_row = nrow;
@@ -183,7 +171,7 @@ extern "C"{
for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
bst_ulong nelem = 0;
for (bst_ulong j = 0; j < ncol; ++j) {
if (CheckNAN(data[j])) {
if (utils::CheckNAN(data[j])) {
utils::Check(nan_missing,
"There are NAN in the matrix, however, you did not set missing=NAN");
} else {
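The loop above drops a cell when it matches `missing`, and when `missing` itself is NaN it relies on `utils::CheckNAN` for the comparison (NaN never compares equal to anything). A sketch of calling this entry point, assuming the era's `void* XGDMatrixCreateFromMat(const float*, bst_ulong, bst_ulong, float)` signature and a matching `XGDMatrixFree`:

```c++
#include <limits>
#include "xgboost_wrapper.h"

// Illustrative only: a 2x3 dense matrix where NaN cells become missing entries.
void BuildDenseSketch() {
  const float nan_v = std::numeric_limits<float>::quiet_NaN();
  const float data[6] = {1.0f, nan_v, 3.0f,
                         4.0f, 5.0f, nan_v};
  // missing = NaN: NaN cells are skipped instead of tripping the check above
  void *dmat = XGDMatrixCreateFromMat(data, 2, 3, nan_v);
  XGDMatrixFree(dmat);
}
```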