Tweedie Regression Post-Rebase (#1737)

* add support for tweedie regression * added back readme line that was accidentally deleted * fixed linting errors * add support for tweedie regression * added back readme line that was accidentally deleted * fixed linting errors * rebased with upstream master and added R example * changed parameter name to tweedie_variance_power * linting error fix * refactored tweedie-nloglik metric to be more like the other parameterized metrics * added upper and lower bound check to tweedie metric * add support for tweedie regression * added back readme line that was accidentally deleted * fixed linting errors * added upper and lower bound check to tweedie metric * added back readme line that was accidentally deleted * rebased with upstream master and added R example * rebased again on top of upstream master * linting error fix * added upper and lower bound check to tweedie metric * rebased with master * lint fix * removed whitespace at end of line 186 - elementwise_metric.cc
2016-11-05 20:02:32 -04:00
parent 52b9867be5
commit 2ad0948444
4 changed files with 156 additions and 0 deletions
--- a/R-package/demo/tweedie_regression.R
+++ b/R-package/demo/tweedie_regression.R
@@ -0,0 +1,49 @@
+library(xgboost)
+library(data.table)
+library(cplm)
+
+data(AutoClaim)
+
+# auto insurance dataset analyzed by Yip and Yau (2005)
+dt <- data.table(AutoClaim)
+
+# exclude these columns from the model matrix
+exclude <-  c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')
+
+# retains the missing values
+# NOTE: this dataset is comes ready out of the box
+options(na.action = 'na.pass')
+x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = F])
+options(na.action = 'na.omit')
+
+# response
+y <- dt[, CLM_AMT5]
+
+d_train <- xgb.DMatrix(data = x, label = y, missing = NA)
+
+# the tweedie_variance_power parameter determines the shape of 
+# distribution
+# - closer to 1 is more poisson like and the mass
+#   is more concentrated near zero 
+# - closer to 2 is more gamma like and the mass spreads to the 
+#   the right with less concentration near zero
+
+params <- list(
+  objective = 'reg:tweedie',
+  eval_metric = 'rmse', 
+  tweedie_variance_power = 1.4,
+  max_depth = 6,
+  eta = 1)
+
+bst <- xgb.train(
+  data = d_train, 
+  params = params, 
+  maximize = FALSE,
+  watchlist = list(train = d_train), 
+  nrounds = 20)
+
+var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst)
+
+preds <- predict(bst, d_train)
+
+rmse <- sqrt(sum(mean((y - preds)^2)))
--- a/doc/parameter.md
+++ b/doc/parameter.md
@@ -107,6 +107,11 @@ Parameters for Linear Booster
 * lambda_bias
  - L2 regularization term on bias, default 0(no L1 reg on bias because it is not important)

+Parameters for Tweedie Regression
+-----------------------------
+* tweedie_variance_power [default=1.5]
+  - Parameter that controls the variance of the tweedie distribution.  Set closer to 2 to shift towards a gamma distribution and closer to 1 to shift towards a poisson distribution.
+
 Learning Task Parameters
 ------------------------
 Specify the learning task and the corresponding learning objective. The objective options are below:
@@ -121,6 +126,8 @@ Specify the learning task and the corresponding learning objective. The objectiv
 - "multi:softprob" --same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class.
 - "rank:pairwise" --set XGBoost to do ranking task by minimizing the pairwise loss
 - "reg:gamma" --gamma regression for severity data, output mean of gamma distribution
+ - "reg:tweedie" --tweedie regression for insurance data
+   - tweedie_variance_power is set to 1.5 by default in tweedie regression and must be in the range [1, 2)
 * base_score [ default=0.5 ]
  - the initial prediction score of all instances, global bias
  - for sufficient number of iterations, changing this value will not have too much effect.
--- a/src/metric/elementwise_metric.cc
+++ b/src/metric/elementwise_metric.cc
@@ -163,6 +163,30 @@ struct EvalGammaNLogLik: public EvalEWiseBase<EvalGammaNLogLik> {
  }
 };

+struct EvalTweedieNLogLik: public EvalEWiseBase<EvalTweedieNLogLik> {
+  explicit EvalTweedieNLogLik(const char* param) {
+    CHECK(param != nullptr)
+        << "tweedie-nloglik must be in format tweedie-nloglik@rho";
+    rho_ = atof(param);
+    CHECK(rho_ < 2 && rho_ >= 1)
+        << "tweedie variance power must be in interval [1, 2)";
+    std::ostringstream os;
+    os << "tweedie-nloglik@" << rho_;
+    name_ = os.str();
+  }
+  const char *Name() const override {
+    return name_.c_str();
+  }
+  inline float EvalRow(float y, float p) const {
+    float a = y * std::exp((1 - rho_) * std::log(p)) / (1 - rho_);
+    float b = std::exp((2 - rho_) * std::log(p)) / (2 - rho_);
+    return -a + b;
+  }
+ protected:
+  std::string name_;
+  float rho_;
+};
+
 XGBOOST_REGISTER_METRIC(RMSE, "rmse")
 .describe("Rooted mean square error.")
 .set_body([](const char* param) { return new EvalRMSE(); });
@@ -191,5 +215,11 @@ XGBOOST_REGISTER_METRIC(GammaNLogLik, "gamma-nloglik")
 .describe("Negative log-likelihood for gamma regression.")
 .set_body([](const char* param) { return new EvalGammaNLogLik(); });

+XGBOOST_REGISTER_METRIC(TweedieNLogLik, "tweedie-nloglik")
+.describe("tweedie-nloglik@rho for tweedie regression.")
+.set_body([](const char* param) {
+  return new EvalTweedieNLogLik(param);
+});
+
 }  // namespace metric
 }  // namespace xgboost
--- a/src/objective/regression_obj.cc
+++ b/src/objective/regression_obj.cc
@@ -272,5 +272,75 @@ class GammaRegression : public ObjFunction {
 XGBOOST_REGISTER_OBJECTIVE(GammaRegression, "reg:gamma")
 .describe("Gamma regression for severity data.")
 .set_body([]() { return new GammaRegression(); });
+
+// declare parameter
+struct TweedieRegressionParam : public dmlc::Parameter<TweedieRegressionParam> {
+  float tweedie_variance_power;
+  DMLC_DECLARE_PARAMETER(TweedieRegressionParam) {
+    DMLC_DECLARE_FIELD(tweedie_variance_power).set_range(1.0f, 2.0f).set_default(1.5f)
+      .describe("Tweedie variance power.  Must be between in range [1, 2).");
+  }
+};
+
+// tweedie regression
+class TweedieRegression : public ObjFunction {
+ public:
+  // declare functions
+  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param_.InitAllowUnknown(args);
+  }
+
+  void GetGradient(const std::vector<float> &preds,
+                   const MetaInfo &info,
+                   int iter,
+                   std::vector<bst_gpair> *out_gpair) override {
+    CHECK_NE(info.labels.size(), 0) << "label set cannot be empty";
+    CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
+    out_gpair->resize(preds.size());
+    // check if label in range
+    bool label_correct = true;
+    // start calculating gradient
+    const omp_ulong ndata = static_cast<omp_ulong>(preds.size()); // NOLINT(*)
+    #pragma omp parallel for schedule(static)
+    for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
+      float p = preds[i];
+      float w = info.GetWeight(i);
+      float y = info.labels[i];
+      float rho = param_.tweedie_variance_power;
+      if (y >= 0.0f) {
+        float grad = -y * std::exp((1 - rho) * p) + std::exp((2 - rho) * p);
+        float hess = -y * (1 - rho) * std::exp((1 - rho) * p) + (2 - rho) * std::exp((2 - rho) * p);
+        out_gpair->at(i) = bst_gpair(grad * w, hess * w);
+      } else {
+        label_correct = false;
+      }
+    }
+    CHECK(label_correct) << "TweedieRegression: label must be nonnegative";
+  }
+  void PredTransform(std::vector<float> *io_preds) override {
+    std::vector<float> &preds = *io_preds;
+    const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
+    #pragma omp parallel for schedule(static)
+    for (long j = 0; j < ndata; ++j) {  // NOLINT(*)
+      preds[j] = std::exp(preds[j]);
+    }
+  }
+  const char* DefaultEvalMetric(void) const override {
+    std::ostringstream os;
+    os << "tweedie-nloglik@" << param_.tweedie_variance_power;
+    std::string metric = os.str();
+    return metric.c_str();
+  }
+
+ private:
+  TweedieRegressionParam param_;
+};
+
+// register the ojective functions
+DMLC_REGISTER_PARAMETER(TweedieRegressionParam);
+
+XGBOOST_REGISTER_OBJECTIVE(TweedieRegression, "reg:tweedie")
+.describe("Tweedie regression for insurance data.")
+.set_body([]() { return new TweedieRegression(); });
 }  // namespace obj
 }  // namespace xgboost