Tweedie Regression Post-Rebase (#1737)

* add support for tweedie regression * added back readme line that was accidentally deleted * fixed linting errors * add support for tweedie regression * added back readme line that was accidentally deleted * fixed linting errors * rebased with upstream master and added R example * changed parameter name to tweedie_variance_power * linting error fix * refactored tweedie-nloglik metric to be more like the other parameterized metrics * added upper and lower bound check to tweedie metric * add support for tweedie regression * added back readme line that was accidentally deleted * fixed linting errors * added upper and lower bound check to tweedie metric * added back readme line that was accidentally deleted * rebased with upstream master and added R example * rebased again on top of upstream master * linting error fix * added upper and lower bound check to tweedie metric * rebased with master * lint fix * removed whitespace at end of line 186 - elementwise_metric.cc
2016-11-05 20:02:32 -04:00
parent 52b9867be5
commit 2ad0948444
4 changed files with 156 additions and 0 deletions
--- a/R-package/demo/tweedie_regression.R
+++ b/R-package/demo/tweedie_regression.R
@@ -0,0 +1,49 @@
+library(xgboost)
+library(data.table)
+library(cplm)
+
+data(AutoClaim)
+
+# auto insurance dataset analyzed by Yip and Yau (2005)
+dt <- data.table(AutoClaim)
+
+# exclude these columns from the model matrix
+exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')
+
+# retains the missing values; the previous na.action is restored afterwards
+# NOTE: this dataset comes ready to use out of the box
+old_opts <- options(na.action = 'na.pass')
+x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = FALSE])
+options(old_opts)
+
+# response
+y <- dt[, CLM_AMT5]
+
+d_train <- xgb.DMatrix(data = x, label = y, missing = NA)
+
+# the tweedie_variance_power parameter determines the shape of the
+# distribution
+# - closer to 1 is more poisson like and the mass
+#   is more concentrated near zero
+# - closer to 2 is more gamma like and the mass spreads to
+#   the right with less concentration near zero
+
+params <- list(
+  objective = 'reg:tweedie',
+  eval_metric = 'rmse',
+  tweedie_variance_power = 1.4,
+  max_depth = 6,
+  eta = 1)
+
+bst <- xgb.train(
+  data = d_train,
+  params = params,
+  maximize = FALSE,
+  watchlist = list(train = d_train),
+  nrounds = 20)
+
+var_imp <- xgb.importance(colnames(x), model = bst)
+
+# root mean squared error of the fitted model on the training data
+preds <- predict(bst, d_train)
+rmse <- sqrt(mean((y - preds)^2))