Add support for Gamma regression (#1258)

* Add support for Gamma regression
* Use base_score to replace the lp_bias
* Remove the lp_bias config block
* Add a demo for running gamma regression in Python
* Typo fix
* Revise the description for objective
* Add a script to generate the autoclaims dataset
2016-07-07 01:22:46 +08:00
parent f74e2439e0
commit 77d17f6264
5 changed files with 181 additions and 0 deletions
--- a/demo/data/gen_autoclaims.R
+++ b/demo/data/gen_autoclaims.R
@@ -0,0 +1,18 @@
+site <- 'http://cran.r-project.org'  # CRAN repository used for the installs below
+if (!require('dummies'))  # install 'dummies' only when it cannot be loaded
+    install.packages('dummies', repos=site)
+if (!require('insuranceData'))  # likewise for 'insuranceData'
+    install.packages('insuranceData', repos=site)
+# Attach both packages (covers the case where they were just installed above).
+library(dummies)
+library(insuranceData)
+# Load the AutoClaims data set (presumably shipped with insuranceData) as 'data'.
+data(AutoClaims)
+data = AutoClaims
+# Convert the STATE, CLASS and GENDER columns to factors before dummy encoding.
+data$STATE = as.factor(data$STATE)
+data$CLASS = as.factor(data$CLASS)
+data$GENDER = as.factor(data$GENDER)
+# Dummy-encode the factor columns, then write a CSV with no header, row names or quotes.
+data.dummy <- dummy.data.frame(data, dummy.class='factor', omit.constants=T);
+write.table(data.dummy, 'autoclaims.csv', sep=',', row.names=F, col.names=F, quote=F)
--- a/demo/guide-python/gamma_regression.py
+++ b/demo/guide-python/gamma_regression.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python
+import xgboost as xgb
+import numpy as np
+
+#  This script demonstrates how to fit a gamma regression model (with a log link
+#  function) in xgboost. Before running the demo, generate the autoclaims dataset
+#  by running gen_autoclaims.R, located in xgboost/demo/data.
+
+data = np.genfromtxt('../data/autoclaims.csv', delimiter=',')
+dtrain = xgb.DMatrix(data[0:4741, 0:34], data[0:4741, 34])  # train: rows 0-4740; cols 0-33 features, col 34 label
+dtest = xgb.DMatrix(data[4741:6773, 0:34], data[4741:6773, 34])  # test: rows 4741-6772, same column split
+
+# For gamma regression, set the objective to 'reg:gamma'. It is also suggested
+# to set base_score to a value between 1 and 5 if the number of iterations is small.
+param = {'silent':1, 'objective':'reg:gamma', 'booster':'gbtree', 'base_score':3}
+
+# The rest of the settings are the same.
+watchlist  = [(dtest,'eval'), (dtrain,'train')]
+num_round = 30
+
+# Train, predict on the test set, and print 2 * sum((y - p) / p - log(y) + log(p)) as the test deviance.
+bst = xgb.train(param, dtrain, num_round, watchlist)
+preds = bst.predict(dtest)
+labels = dtest.get_label()
+print ('test deviance=%f' % (2 * np.sum((labels - preds) / preds - np.log(labels) + np.log(preds))))