Previously, `libsvm` was used as the default when the format was not specified. However, the dmlc data parser is not particularly robust against errors, and the most common type of error is an undefined format. For this reason, we now recommend that users use other data loaders instead. We will continue to maintain the parsers, as they are currently used for many internal tests, including federated learning.
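For example, the same LIBSVM text files can be parsed with an external reader and the resulting in-memory arrays passed to `DMatrix` directly. The sketch below is not part of the demo; it assumes scikit-learn is available and that the relative path matches the demo data used further down.

from sklearn.datasets import load_svmlight_file

import xgboost as xgb

# Parse the LIBSVM text file with scikit-learn, then hand the in-memory
# sparse matrix and labels to XGBoost instead of using the built-in parser.
X, y = load_svmlight_file("../data/agaricus.txt.train")
dtrain = xgb.DMatrix(X, label=y)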
"""
|
|
Demo for GLM
|
|
============
|
|
"""
|
|
import os
|
|
|
|
import xgboost as xgb
|
|
|
|
##
|
|
# this script demonstrate how to fit generalized linear model in xgboost
|
|
# basically, we are using linear model, instead of tree for our boosters
|
|
##
|
|
CURRENT_DIR = os.path.dirname(__file__)
# The "?format=libsvm" suffix tells DMatrix to parse the files as LIBSVM text.
dtrain = xgb.DMatrix(
    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
)
dtest = xgb.DMatrix(
    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
)
# Change the booster to gblinear so that we fit a linear model.
# alpha is the L1 regularizer and lambda is the L2 regularizer;
# you can also set lambda_bias, the L2 regularizer on the bias term.
param = {
    "objective": "binary:logistic",
    "booster": "gblinear",
    "alpha": 0.0001,
    "lambda": 1,
}
# Normally you do not need to set eta (the step size).
# XGBoost uses a parallel coordinate descent algorithm (shotgun), and
# parallelization can affect convergence in certain cases.
# Setting eta to a smaller value, e.g. 0.5, can make the optimization more stable.
# param["eta"] = 1
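# Aside (not in the original demo): if the parallel shotgun updater leads to
# unstable convergence, gblinear can also be switched to the sequential
# coordinate descent updater instead of, or in addition to, lowering eta.
# param["updater"] = "coord_descent"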
##
# The rest of the settings are the same.
##
watchlist = [(dtest, "eval"), (dtrain, "train")]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest)
labels = dtest.get_label()
print(
    "error=%f"
    % (
        sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
        / float(len(preds))
    )
)
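For reference, roughly the same model can be fitted through the scikit-learn estimator interface. This is a minimal sketch rather than part of the demo; it assumes scikit-learn is available and reuses the alpha/lambda values from above via the `reg_alpha`/`reg_lambda` aliases.

from sklearn.datasets import load_svmlight_files

from xgboost import XGBClassifier

# Load train and test together so both share the same feature dimension.
X_train, y_train, X_test, y_test = load_svmlight_files(
    ("../data/agaricus.txt.train", "../data/agaricus.txt.test")
)

# gblinear booster with L1 (reg_alpha) and L2 (reg_lambda) regularization.
clf = XGBClassifier(
    booster="gblinear", reg_alpha=0.0001, reg_lambda=1, n_estimators=4
)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print("error=%f" % float((preds != y_test).mean()))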