diff --git a/R-package/demo/tweedie_regression.R b/R-package/demo/tweedie_regression.R new file mode 100755 index 000000000..4d272f696 --- /dev/null +++ b/R-package/demo/tweedie_regression.R @@ -0,0 +1,49 @@ +library(xgboost) +library(data.table) +library(cplm) + +data(AutoClaim) + +# auto insurance dataset analyzed by Yip and Yau (2005) +dt <- data.table(AutoClaim) + +# exclude these columns from the model matrix +exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY') + +# retains the missing values +# NOTE: this dataset is comes ready out of the box +options(na.action = 'na.pass') +x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = F]) +options(na.action = 'na.omit') + +# response +y <- dt[, CLM_AMT5] + +d_train <- xgb.DMatrix(data = x, label = y, missing = NA) + +# the tweedie_variance_power parameter determines the shape of +# distribution +# - closer to 1 is more poisson like and the mass +# is more concentrated near zero +# - closer to 2 is more gamma like and the mass spreads to the +# the right with less concentration near zero + +params <- list( + objective = 'reg:tweedie', + eval_metric = 'rmse', + tweedie_variance_power = 1.4, + max_depth = 6, + eta = 1) + +bst <- xgb.train( + data = d_train, + params = params, + maximize = FALSE, + watchlist = list(train = d_train), + nrounds = 20) + +var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst) + +preds <- predict(bst, d_train) + +rmse <- sqrt(sum(mean((y - preds)^2))) \ No newline at end of file diff --git a/doc/parameter.md b/doc/parameter.md index 739b542bf..8efe07410 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -107,6 +107,11 @@ Parameters for Linear Booster * lambda_bias - L2 regularization term on bias, default 0(no L1 reg on bias because it is not important) +Parameters for Tweedie Regression +----------------------------- +* tweedie_variance_power [default=1.5] + - Parameter that controls the variance of the tweedie distribution. Set closer to 2 to shift towards a gamma distribution and closer to 1 to shift towards a poisson distribution. + Learning Task Parameters ------------------------ Specify the learning task and the corresponding learning objective. The objective options are below: @@ -121,6 +126,8 @@ Specify the learning task and the corresponding learning objective. The objectiv - "multi:softprob" --same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class. - "rank:pairwise" --set XGBoost to do ranking task by minimizing the pairwise loss - "reg:gamma" --gamma regression for severity data, output mean of gamma distribution + - "reg:tweedie" --tweedie regression for insurance data + - tweedie_variance_power is set to 1.5 by default in tweedie regression and must be in the range [1, 2) * base_score [ default=0.5 ] - the initial prediction score of all instances, global bias - for sufficient number of iterations, changing this value will not have too much effect. diff --git a/src/metric/elementwise_metric.cc b/src/metric/elementwise_metric.cc index 6e087ba26..b7047e495 100644 --- a/src/metric/elementwise_metric.cc +++ b/src/metric/elementwise_metric.cc @@ -163,6 +163,30 @@ struct EvalGammaNLogLik: public EvalEWiseBase { } }; +struct EvalTweedieNLogLik: public EvalEWiseBase { + explicit EvalTweedieNLogLik(const char* param) { + CHECK(param != nullptr) + << "tweedie-nloglik must be in format tweedie-nloglik@rho"; + rho_ = atof(param); + CHECK(rho_ < 2 && rho_ >= 1) + << "tweedie variance power must be in interval [1, 2)"; + std::ostringstream os; + os << "tweedie-nloglik@" << rho_; + name_ = os.str(); + } + const char *Name() const override { + return name_.c_str(); + } + inline float EvalRow(float y, float p) const { + float a = y * std::exp((1 - rho_) * std::log(p)) / (1 - rho_); + float b = std::exp((2 - rho_) * std::log(p)) / (2 - rho_); + return -a + b; + } + protected: + std::string name_; + float rho_; +}; + XGBOOST_REGISTER_METRIC(RMSE, "rmse") .describe("Rooted mean square error.") .set_body([](const char* param) { return new EvalRMSE(); }); @@ -191,5 +215,11 @@ XGBOOST_REGISTER_METRIC(GammaNLogLik, "gamma-nloglik") .describe("Negative log-likelihood for gamma regression.") .set_body([](const char* param) { return new EvalGammaNLogLik(); }); +XGBOOST_REGISTER_METRIC(TweedieNLogLik, "tweedie-nloglik") +.describe("tweedie-nloglik@rho for tweedie regression.") +.set_body([](const char* param) { + return new EvalTweedieNLogLik(param); +}); + } // namespace metric } // namespace xgboost diff --git a/src/objective/regression_obj.cc b/src/objective/regression_obj.cc index 743ff083f..d26838048 100644 --- a/src/objective/regression_obj.cc +++ b/src/objective/regression_obj.cc @@ -272,5 +272,75 @@ class GammaRegression : public ObjFunction { XGBOOST_REGISTER_OBJECTIVE(GammaRegression, "reg:gamma") .describe("Gamma regression for severity data.") .set_body([]() { return new GammaRegression(); }); + +// declare parameter +struct TweedieRegressionParam : public dmlc::Parameter { + float tweedie_variance_power; + DMLC_DECLARE_PARAMETER(TweedieRegressionParam) { + DMLC_DECLARE_FIELD(tweedie_variance_power).set_range(1.0f, 2.0f).set_default(1.5f) + .describe("Tweedie variance power. Must be between in range [1, 2)."); + } +}; + +// tweedie regression +class TweedieRegression : public ObjFunction { + public: + // declare functions + void Configure(const std::vector >& args) override { + param_.InitAllowUnknown(args); + } + + void GetGradient(const std::vector &preds, + const MetaInfo &info, + int iter, + std::vector *out_gpair) override { + CHECK_NE(info.labels.size(), 0) << "label set cannot be empty"; + CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided"; + out_gpair->resize(preds.size()); + // check if label in range + bool label_correct = true; + // start calculating gradient + const omp_ulong ndata = static_cast(preds.size()); // NOLINT(*) + #pragma omp parallel for schedule(static) + for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*) + float p = preds[i]; + float w = info.GetWeight(i); + float y = info.labels[i]; + float rho = param_.tweedie_variance_power; + if (y >= 0.0f) { + float grad = -y * std::exp((1 - rho) * p) + std::exp((2 - rho) * p); + float hess = -y * (1 - rho) * std::exp((1 - rho) * p) + (2 - rho) * std::exp((2 - rho) * p); + out_gpair->at(i) = bst_gpair(grad * w, hess * w); + } else { + label_correct = false; + } + } + CHECK(label_correct) << "TweedieRegression: label must be nonnegative"; + } + void PredTransform(std::vector *io_preds) override { + std::vector &preds = *io_preds; + const long ndata = static_cast(preds.size()); // NOLINT(*) + #pragma omp parallel for schedule(static) + for (long j = 0; j < ndata; ++j) { // NOLINT(*) + preds[j] = std::exp(preds[j]); + } + } + const char* DefaultEvalMetric(void) const override { + std::ostringstream os; + os << "tweedie-nloglik@" << param_.tweedie_variance_power; + std::string metric = os.str(); + return metric.c_str(); + } + + private: + TweedieRegressionParam param_; +}; + +// register the ojective functions +DMLC_REGISTER_PARAMETER(TweedieRegressionParam); + +XGBOOST_REGISTER_OBJECTIVE(TweedieRegression, "reg:tweedie") +.describe("Tweedie regression for insurance data.") +.set_body([]() { return new TweedieRegression(); }); } // namespace obj } // namespace xgboost