# Viewer metadata: 50 lines, 1.2 KiB, R
library(xgboost)
library(data.table)
library(cplm)

# Load the AutoClaim data shipped with cplm: the auto insurance dataset
# analyzed by Yip and Yau (2005).
data("AutoClaim", package = "cplm")

# Work on a data.table copy of the data.
dt <- data.table(AutoClaim)

# Columns that must not enter the model matrix (this includes CLM_AMT5,
# which is used as the response further down).
exclude <- c(
  "POLICYNO", "PLCYDATE", "CLM_FREQ5",
  "CLM_AMT5", "CLM_FLAG", "IN_YY"
)

# Build the sparse design matrix from every column of `dt` except those named
# in `exclude`; `- 1` in the formula drops the intercept column.
#
# Rows containing missing values must be retained, so `na.action` is switched
# to 'na.pass' for this call only. The previous setting is captured and put
# back afterwards (also when the call fails) rather than being overwritten
# with a hard-coded 'na.omit', which would clobber a non-default user setting.
#
# sparse.model.matrix() is namespace-qualified so the call does not depend on
# Matrix happening to be attached by another package.
#
# NOTE: this dataset comes ready out of the box
old_na_action <- options(na.action = 'na.pass')
x <- tryCatch(
  Matrix::sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = FALSE]),
  finally = options(old_na_action)
)

# Response vector: the CLM_AMT5 column, which was kept out of the design
# matrix above.
y <- dt[["CLM_AMT5"]]

# Bundle features and label into an xgb.DMatrix; NA marks missing entries.
d_train <- xgb.DMatrix(data = x, label = y, missing = NA)

# Training parameters.
#
# tweedie_variance_power determines the shape of the distribution:
#   - closer to 1 is more Poisson-like, with the mass more concentrated
#     near zero
#   - closer to 2 is more gamma-like, with the mass spreading to the right
#     and less concentration near zero
params <- list(
  objective = "reg:tweedie",
  eval_metric = "rmse",
  tweedie_variance_power = 1.4,
  max_depth = 6,
  eta = 1
)

# Fit the booster for 20 rounds on the training matrix. The same matrix is
# passed under `evals` with the name "train", and `maximize = FALSE` is set
# explicitly for the evaluation metric.
bst <- xgb.train(
  params = params,
  data = d_train,
  nrounds = 20,
  evals = list(train = d_train),
  maximize = FALSE
)

# Feature importance of the fitted model, labelled with the column names of
# the design matrix. Both arguments are passed by name.
var_imp <- xgb.importance(feature_names = colnames(x), model = bst)

# Predictions on the training matrix itself.
preds <- predict(bst, d_train)

# Root-mean-squared error of those predictions against the response.
rmse <- sqrt(mean((y - preds)^2))