Previously, `libsvm` was used as the default when the format was not specified. However, the dmlc data parser is not particularly robust against errors, and the most common type of error is an unspecified format. Accordingly, we now recommend that users load data with other loaders instead. We will continue to maintain the parsers, as they are currently used by many internal tests, including federated learning.
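As a minimal sketch of the recommended alternative, one can parse the LIBSVM file with an external loader and build the `DMatrix` from the result, assuming scikit-learn is installed (the relative path mirrors the one used in the demo below):

import xgboost as xgb
from sklearn.datasets import load_svmlight_file

# parse the LIBSVM file with scikit-learn, then construct the DMatrix from
# the resulting sparse matrix and label array instead of the dmlc parser
X, y = load_svmlight_file("../data/agaricus.txt.train")
dtrain = xgb.DMatrix(X, label=y)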
"""
|
|
Demo for using cross validation
|
|
===============================
|
|
"""
|
|
import os
|
|
|
|
import numpy as np
|
|
|
|
import xgboost as xgb
|
|
|
|
# load data in do training
|
|
CURRENT_DIR = os.path.dirname(__file__)
|
|
dtrain = xgb.DMatrix(
|
|
os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
|
|
)
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
num_round = 2

print("running cross validation")
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(
    param,
    dtrain,
    num_round,
    nfold=5,
    metrics={"error"},
    seed=0,
    callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)],
)
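# NOTE: xgb.cv also returns the evaluation history; it is discarded here
# because the EvaluationMonitor callback already prints per-iteration results.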

print("running cross validation, disable standard deviation display")
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value
res = xgb.cv(
    param,
    dtrain,
    num_boost_round=10,
    nfold=5,
    metrics={"error"},
    seed=0,
    callbacks=[
        xgb.callback.EvaluationMonitor(show_stdv=False),
        xgb.callback.EarlyStopping(3),
    ],
)
print(res)
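# res holds the evaluation history, one row per boosting round, with mean/std
# columns such as test-error-mean and test-error-std (a pandas DataFrame when
# pandas is available, otherwise a dict of lists)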
print("running cross validation, with preprocessing function")


# define the preprocessing function
# used to return the preprocessed training data, test data, and parameters;
# we can use this to do weight rescaling, etc.
# as an example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    # ratio of negative to positive examples in this fold's training split
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    param["scale_pos_weight"] = ratio
    return (dtrain, dtest, param)


# do cross validation: for each fold, dtrain, dtest, and param are passed
# into fpreproc, and its return value is used to generate the results of
# that fold
xgb.cv(param, dtrain, num_round, nfold=5, metrics={"auc"}, seed=0, fpreproc=fpreproc)
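# scale_pos_weight multiplies the weight of positive examples, so setting it
# to (#negatives / #positives) balances the two classes within each fold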

###
# you can also do cross validation with a customized loss function
# See custom_objective.py
##
print("running cross validation, with customized loss function")


def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    # transform the raw margin scores into probabilities with the sigmoid
    preds = 1.0 / (1.0 + np.exp(-preds))
    # gradient and hessian of the logistic loss w.r.t. the raw scores
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess


def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # classification error: fraction of examples predicted on the wrong side of 0
    return "error", float(sum(labels != (preds > 0.0))) / len(labels)


param = {"max_depth": 2, "eta": 1}
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror)
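# note: with a custom objective, the predictions handed to logregobj and
# evalerror are raw margin scores (no sigmoid applied by XGBoost), which is
# why logregobj applies the transform itself and evalerror thresholds at 0.0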