Add rmsle metric and reg:squaredlogerror objective (#4541)

parent 9683fd433e
commit 2f1319f273
@@ -151,7 +151,7 @@ Parameters for Tree Booster
   - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
   - ``prune``: prunes the splits where loss < min_split_loss (or gamma).

-  In a distributed setting, the implicit updater sequence value would be adjusted to ``grow_histmaker,prune`` by default, and you can set ``tree_method`` as ``hist`` to use ``grow_histmaker``.
+  In a distributed setting, the implicit updater sequence value would be adjusted to ``grow_histmaker,prune`` by default, and you can set ``tree_method`` as ``hist`` to use ``grow_histmaker``.

 * ``refresh_leaf`` [default=1]

@@ -295,6 +295,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
 * ``objective`` [default=reg:squarederror]

   - ``reg:squarederror``: regression with squared loss
+  - ``reg:squaredlogerror``: regression with squared log loss :math:`\frac{1}{2}[\log(pred + 1) - \log(label + 1)]^2`. All input labels are required to be greater than -1. Also, see metric ``rmsle`` for possible issues with this objective.
   - ``reg:logistic``: logistic regression
   - ``binary:logistic``: logistic regression for binary classification, output probability
   - ``binary:logitraw``: logistic regression for binary classification, output score before logistic transformation
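As a quick sanity check on the loss definition above, here is a minimal standalone sketch (plain C++, not part of this commit; ``SquaredLogLoss`` is a hypothetical helper name) that evaluates the per-instance squared log loss and shows why inputs must stay above -1:

#include <cmath>
#include <cstdio>

// Per-instance squared log loss: 0.5 * (log(pred + 1) - log(label + 1))^2
float SquaredLogLoss(float pred, float label) {
  float diff = std::log1p(pred) - std::log1p(label);
  return 0.5f * diff * diff;
}

int main() {
  std::printf("%f\n", SquaredLogLoss(0.1f, 1.0f));   // finite: both inputs > -1
  std::printf("%f\n", SquaredLogLoss(-1.5f, 1.0f));  // nan: log1p is undefined below -1
}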
@@ -325,6 +326,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
   - The choices are listed below:

     - ``rmse``: `root mean square error <http://en.wikipedia.org/wiki/Root_mean_square_error>`_
+    - ``rmsle``: root mean square log error: :math:`\sqrt{\frac{1}{N}\sum_{i=1}^{N}[\log(pred_i + 1) - \log(label_i + 1)]^2}`. Default metric of the ``reg:squaredlogerror`` objective. This metric reduces errors generated by outliers in the dataset, but since the ``log`` function is employed, ``rmsle`` might output ``nan`` when a prediction value is less than -1. See ``reg:squaredlogerror`` for other requirements.
     - ``mae``: `mean absolute error <https://en.wikipedia.org/wiki/Mean_absolute_error>`_
     - ``logloss``: `negative log-likelihood <http://en.wikipedia.org/wiki/Log-likelihood>`_
     - ``error``: Binary classification error rate. It is calculated as ``#(wrong cases)/#(all cases)``. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
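The metric is just the square root of the mean squared log error over the N rows. A minimal standalone sketch (not part of this commit) that applies the formula above to five predictions against unit labels, reproducing the expectation used by the metric unit test further down in this diff:

#include <cmath>
#include <cstdio>

int main() {
  const float pred[]  = {0.1f, 0.2f, 0.4f, 0.8f, 1.6f};
  const float label[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  const int n = 5;
  double sum = 0.0;
  for (int i = 0; i < n; ++i) {
    double diff = std::log1p(pred[i]) - std::log1p(label[i]);
    sum += diff * diff;  // accumulate squared log errors
  }
  // ~0.40633, within the test's 1e-4 tolerance of 0.40632
  std::printf("rmsle = %.5f\n", std::sqrt(sum / n));
}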
@@ -24,8 +24,9 @@ private[spark] trait LearningTaskParams extends Params {

   /**
    * Specify the learning task and the corresponding learning objective.
-   * options: reg:squarederror, reg:logistic, binary:logistic, binary:logitraw, count:poisson,
-   * multi:softmax, multi:softprob, rank:pairwise, reg:gamma. default: reg:squarederror
+   * options: reg:squarederror, reg:squaredlogerror, reg:logistic, binary:logistic, binary:logitraw,
+   * count:poisson, multi:softmax, multi:softprob, rank:pairwise, reg:gamma.
+   * default: reg:squarederror
    */
   final val objective = new Param[String](this, "objective", "objective function used for " +
     s"training, options: {${LearningTaskParams.supportedObjective.mkString(",")}}",
@@ -56,7 +57,7 @@ private[spark] trait LearningTaskParams extends Params {
   /**
    * evaluation metrics for validation data, a default metric will be assigned according to
    * objective (rmse for regression, error for classification, and mean average precision for
-   * ranking). options: rmse, mae, logloss, error, merror, mlogloss, auc, aucpr, ndcg, map,
+   * ranking). options: rmse, rmsle, mae, logloss, error, merror, mlogloss, auc, aucpr, ndcg, map,
    * gamma-deviance
    */
   final val evalMetric = new Param[String](this, "evalMetric", "evaluation metrics for " +
@@ -106,14 +107,14 @@ private[spark] trait LearningTaskParams extends Params {

 private[spark] object LearningTaskParams {
   val supportedObjective = HashSet("reg:linear", "reg:squarederror", "reg:logistic",
-    "binary:logistic", "binary:logitraw", "count:poisson", "multi:softmax", "multi:softprob",
-    "rank:pairwise", "rank:ndcg", "rank:map", "reg:gamma", "reg:tweedie")
+    "reg:squaredlogerror", "binary:logistic", "binary:logitraw", "count:poisson", "multi:softmax",
+    "multi:softprob", "rank:pairwise", "rank:ndcg", "rank:map", "reg:gamma", "reg:tweedie")

   val supportedObjectiveType = HashSet("regression", "classification")

   val evalMetricsToMaximize = HashSet("auc", "aucpr", "ndcg", "map")

-  val evalMetricsToMinimize = HashSet("rmse", "mae", "logloss", "error", "merror",
+  val evalMetricsToMinimize = HashSet("rmse", "rmsle", "mae", "logloss", "error", "merror",
     "mlogloss", "gamma-deviance")

   val supportedEvalMetrics = evalMetricsToMaximize union evalMetricsToMinimize

@@ -153,6 +153,19 @@ struct EvalRowRMSE {
   }
 };

+struct EvalRowRMSLE {
+  char const* Name() const {
+    return "rmsle";
+  }
+  XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float pred) const {
+    bst_float diff = std::log1p(label) - std::log1p(pred);
+    return diff * diff;
+  }
+  static bst_float GetFinal(bst_float esum, bst_float wsum) {
+    return std::sqrt(esum / wsum);
+  }
+};
+
 struct EvalRowMAE {
   const char *Name() const {
     return "mae";
@@ -349,6 +362,10 @@ XGBOOST_REGISTER_METRIC(RMSE, "rmse")
 .describe("Rooted mean square error.")
 .set_body([](const char* param) { return new EvalEWiseBase<EvalRowRMSE>(); });

+XGBOOST_REGISTER_METRIC(RMSLE, "rmsle")
+.describe("Rooted mean square log error.")
+.set_body([](const char* param) { return new EvalEWiseBase<EvalRowRMSLE>(); });
+
 XGBOOST_REGISTER_METRIC(MAE, "mae")
 .describe("Mean absolute error.")
 .set_body([](const char* param) { return new EvalEWiseBase<EvalRowMAE>(); });

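``EvalEWiseBase`` supplies the reduction around the struct above: as far as one can tell from this hunk, it accumulates the weighted per-row errors into ``esum`` and the total weight into ``wsum``, then applies ``GetFinal`` once. The sketch below is a hedged illustration of that contract, not the actual ``EvalEWiseBase`` code; only ``EvalRow`` and ``GetFinal`` come from this diff:

#include <cmath>
#include <cstdio>
#include <vector>

// Mirror of EvalRowRMSLE from the hunk above, using plain float.
struct RowRMSLE {
  static float EvalRow(float label, float pred) {
    float diff = std::log1p(label) - std::log1p(pred);
    return diff * diff;
  }
  static float GetFinal(float esum, float wsum) { return std::sqrt(esum / wsum); }
};

// Assumed shape of the element-wise reduction: weighted sum of per-row
// errors in esum, sum of weights in wsum, then one call to GetFinal.
float Reduce(const std::vector<float>& labels, const std::vector<float>& preds,
             const std::vector<float>& weights) {
  float esum = 0.0f, wsum = 0.0f;
  for (size_t i = 0; i < labels.size(); ++i) {
    float w = weights.empty() ? 1.0f : weights[i];  // missing weights count as 1
    esum += w * RowRMSLE::EvalRow(labels[i], preds[i]);
    wsum += w;
  }
  return RowRMSLE::GetFinal(esum, wsum);
}

int main() {
  // ~0.40633, matching the metric test later in this diff to within 1e-4
  std::printf("%.5f\n",
              Reduce({1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
                     {0.1f, 0.2f, 0.4f, 0.8f, 1.6f}, {}));
}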
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2017 XGBoost contributors
+ * Copyright 2017-2019 XGBoost contributors
  */
 #ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_
 #define XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_
@@ -36,6 +36,29 @@ struct LinearSquareLoss {
   static const char* DefaultEvalMetric() { return "rmse"; }
 };

+struct SquaredLogError {
+  XGBOOST_DEVICE static bst_float PredTransform(bst_float x) { return x; }
+  XGBOOST_DEVICE static bool CheckLabel(bst_float label) {
+    return label > -1;
+  }
+  XGBOOST_DEVICE static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
+    predt = fmaxf(predt, -1 + 1e-6);  // ensure correct value for log1p
+    return (std::log1p(predt) - std::log1p(label)) / (predt + 1);
+  }
+  XGBOOST_DEVICE static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
+    predt = fmaxf(predt, -1 + 1e-6);
+    float res = (-std::log1p(predt) + std::log1p(label) + 1) /
+                std::pow(predt + 1, 2);
+    res = fmaxf(res, 1e-6f);  // keep the hessian strictly positive for stability
+    return res;
+  }
+  static bst_float ProbToMargin(bst_float base_score) { return base_score; }
+  static const char* LabelErrorMsg() {
+    return "label must be greater than -1 for rmsle so that log(label + 1) can be valid.";
+  }
+  static const char* DefaultEvalMetric() { return "rmsle"; }
+};
+
 // logistic loss for probability regression task
 struct LogisticRegression {
   // duplication is necessary, as __device__ specifier

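The gradients follow directly from L = 1/2 * (log(1+p) - log(1+y))^2: differentiating with respect to the prediction p gives dL/dp = (log(1+p) - log(1+y)) / (1+p), and differentiating once more gives d2L/dp2 = (1 - log(1+p) + log(1+y)) / (1+p)^2, which is exactly what the struct computes. A standalone check (not part of this commit) against the first expected gradient/hessian pair from the objective test later in this diff:

#include <cmath>
#include <cstdio>

// Gradient and hessian of 0.5 * (log1p(p) - log1p(y))^2, mirroring
// SquaredLogError::FirstOrderGradient / SecondOrderGradient above.
double Grad(double p, double y) { return (std::log1p(p) - std::log1p(y)) / (p + 1); }
double Hess(double p, double y) { return (-std::log1p(p) + std::log1p(y) + 1) / ((p + 1) * (p + 1)); }

int main() {
  // Expected by the objective test for pred = 0.1, label = 1.0:
  // gradient -0.5435, hessian 1.3205.
  std::printf("grad = %.4f, hess = %.4f\n", Grad(0.1, 1.0), Hess(0.1, 1.0));
}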
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2015-2018 by Contributors
+ * Copyright 2015-2019 by Contributors
  * \file regression_obj.cu
  * \brief Definition of single-value regression and classification objectives.
  * \author Tianqi Chen, Kailong Chen
@@ -124,6 +124,10 @@ XGBOOST_REGISTER_OBJECTIVE(SquaredLossRegression, "reg:squarederror")
 .describe("Regression with squared error.")
 .set_body([]() { return new RegLossObj<LinearSquareLoss>(); });

+XGBOOST_REGISTER_OBJECTIVE(SquareLogError, "reg:squaredlogerror")
+.describe("Regression with root mean squared logarithmic error.")
+.set_body([]() { return new RegLossObj<SquaredLogError>(); });
+
 XGBOOST_REGISTER_OBJECTIVE(LogisticRegression, "reg:logistic")
 .describe("Logistic regression for probability regression task.")
 .set_body([]() { return new RegLossObj<LogisticRegression>(); });

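This registration block is what makes the new name resolvable at runtime; the objective test later in this diff looks it up with ``ObjFunction::Create("reg:squaredlogerror", ...)``. Below is a toy sketch of the name-to-factory pattern such macros implement; the real registry (in dmlc-core) differs in its details:

#include <cstdio>
#include <functional>
#include <map>
#include <string>

// Toy objective interface and name-keyed factory registry, illustrating
// the pattern behind XGBOOST_REGISTER_OBJECTIVE (details differ upstream).
struct Objective {
  virtual const char* DefaultEvalMetric() const = 0;
  virtual ~Objective() = default;
};
struct SquaredLogErrorObj : Objective {
  const char* DefaultEvalMetric() const override { return "rmsle"; }
};

std::map<std::string, std::function<Objective*()>>& Registry() {
  static std::map<std::string, std::function<Objective*()>> r;
  return r;
}

int main() {
  Registry()["reg:squaredlogerror"] = []() { return new SquaredLogErrorObj(); };
  Objective* obj = Registry().at("reg:squaredlogerror")();  // lookup by name, like Create()
  std::printf("%s\n", obj->DefaultEvalMetric());            // prints rmsle
  delete obj;
}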
@@ -19,6 +19,18 @@ TEST(Metric, DeclareUnifiedTest(RMSE)) {
   delete metric;
 }

+TEST(Metric, DeclareUnifiedTest(RMSLE)) {
+  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &lparam);
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "rmsle");
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
+                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f}), 0.40632, 1e-4);
+  delete metric;
+}
+
 TEST(Metric, DeclareUnifiedTest(MAE)) {
   auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
   xgboost::Metric * metric = xgboost::Metric::Create("mae", &lparam);

@@ -31,6 +31,30 @@ TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) {
   delete obj;
 }

+TEST(Objective, DeclareUnifiedTest(SquaredLog)) {
+  xgboost::LearnerTrainParam tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  std::vector<std::pair<std::string, std::string>> args;
+
+  xgboost::ObjFunction * obj =
+      xgboost::ObjFunction::Create("reg:squaredlogerror", &tparam);
+  obj->Configure(args);
+
+  CheckObjFunction(obj,
+                   {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},  // pred
+                   {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},  // labels
+                   {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},  // weights
+                   {-0.5435f, -0.4257f, -0.25475f, -0.05855f, 0.1009f},  // expected gradients
+                   { 1.3205f,  1.0492f,  0.69215f,  0.34115f, 0.1091f});  // expected hessians
+  CheckObjFunction(obj,
+                   {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},  // pred
+                   {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},  // labels
+                   {},                              // empty weights
+                   {-0.5435f, -0.4257f, -0.25475f, -0.05855f, 0.1009f},  // expected gradients
+                   { 1.3205f,  1.0492f,  0.69215f,  0.34115f, 0.1091f});  // expected hessians
+  ASSERT_EQ(obj->DefaultEvalMetric(), std::string{"rmsle"});
+  delete obj;
+}
+
 TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) {
   xgboost::LearnerTrainParam tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
   std::vector<std::pair<std::string, std::string>> args;
