in the middle of guide-r

This commit is contained in:
parent bc1817ca2f
commit 905051b7cb
@@ -1,7 +1,14 @@
require(xgboost)

dtrain <- xgb.DMatrix('../data/agaricus.txt.train')
dtest <- xgb.DMatrix('../data/agaricus.txt.test')
data(iris)
iris[,5] <- as.numeric(iris[,5]=='setosa')
iris <- as.matrix(iris)
set.seed(20)
test_ind <- sample(1:nrow(iris),50)
train_ind <- setdiff(1:nrow(iris),test_ind)
dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])

param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
@@ -18,36 +25,17 @@ preds2 <- predict(bst2,dtest)
stopifnot(sum((preds-preds2)^2)==0)

cat('start running example of build DMatrix from scipy.sparse CSR Matrix\n')
read.libsvm <- function(fname, maxcol) {
  content <- readLines(fname)
  nline <- length(content)
  label <- numeric(nline)
  mat <- matrix(0, nline, maxcol + 1)
  for (i in 1:nline) {
    arr <- as.vector(strsplit(content[i], " ")[[1]])
    label[i] <- as.numeric(arr[[1]])
    for (j in 2:length(arr)) {
      kv <- strsplit(arr[j], ":")[[1]]
      # to avoid 0 index
      findex <- as.integer(kv[1]) + 1
      fvalue <- as.numeric(kv[2])
      mat[i, findex] <- fvalue
    }
  }
  mat <- as(mat, "sparseMatrix")
  return(list(label = label, data = mat))
}
csc <- read.libsvm("../data/agaricus.txt.train", 126)
y <- csc$label
x <- csc$data
class(x)
dtrain <- xgb.DMatrix(x, label = y)
bst <- xgb.train(param, dtrain, num_round, watchlist)

cat('start running example of build DMatrix from numpy array\n')
x <- as.matrix(x)
x <- iris[,1:4]
y <- iris[,5]
class(x)
dtrain <- xgb.DMatrix(x, label = y)
bst <- xgb.train(param, dtrain, num_round, watchlist)

cat('start running example of build DMatrix from scipy.sparse CSR Matrix\n')
x <- as(x,'dgCMatrix')
class(x)
dtrain <- xgb.DMatrix(x, label = y)
bst <- xgb.train(param, dtrain, num_round, watchlist)
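As an aside, the same libsvm file can also be passed straight to xgb.DMatrix, exactly as the top of this walkthrough already does; the hand-rolled read.libsvm above only exists to demonstrate building a DMatrix from an in-memory sparse matrix. A minimal check (dtrain2 is just an illustrative name, not part of the commit):

dtrain2 <- xgb.DMatrix('../data/agaricus.txt.train')  # direct libsvm load, same data as read.libsvm
class(dtrain2)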
demo/guide-R/boost_from_prediction.R (new executable file, 29 lines)
@@ -0,0 +1,29 @@
require(xgboost)

data(iris)
iris[,5] <- as.numeric(iris[,5]=='setosa')
iris <- as.matrix(iris)
set.seed(20)
test_ind <- sample(1:nrow(iris),50)
train_ind <- setdiff(1:nrow(iris),test_ind)
dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])

watchlist <- list(eval = dtest, train = dtrain)
print('start running example to start from a initial prediction\n')
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
bst <- xgb.train( param, dtrain, 1, watchlist )

ptrain <- predict(bst, dtrain, outputmargin=TRUE)
ptest <- predict(bst, dtest, outputmargin=TRUE)
# dtrain.set_base_margin(ptrain)
# dtest.set_base_margin(ptest)

cat('this is result of running from initial prediction\n')
bst <- xgb.train( param, dtrain, 1, watchlist )
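The two commented-out set_base_margin calls are the Python wrapper's API; in the R package the equivalent is setinfo. A hedged sketch of how the margin-continuation step might look in R (assuming setinfo accepts the base_margin field; this is not shown in the commit):

# attach the margin predictions so the next round of boosting starts from them
setinfo(dtrain, 'base_margin', ptrain)   # sketch, not part of the commit
setinfo(dtest, 'base_margin', ptest)
bst <- xgb.train(param, dtrain, 1, watchlist)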
demo/guide-R/cross_validation.R (new executable file, 63 lines)
@@ -0,0 +1,63 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
num_round = 2

print ('running cross validation')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 0)

print ('running cross validation, disable standard deviation display')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 0, show_stdv = False)

print ('running cross validation, with preprocessing function')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as a example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label==1)
    param['scale_pos_weight'] = ratio
    return (dtrain, dtest, param)

# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'auc'}, seed = 0, fpreproc = fpreproc)

###
# you can also do cross validation with cutomized loss function
# See custom_objective.py
##
print ('running cross validation, with cutomsized loss function')
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0-preds)
    return grad, hess
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

param = {'max_depth':2, 'eta':1, 'silent':1}
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
       obj = logregobj, feval=evalerror)
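Since this file still carries the Python demo verbatim, here is a hedged sketch of the corresponding calls through the R package's xgb.cv (the argument names params/data/nrounds/nfold/metrics/showsd are assumed from the R API, not taken from this commit):

require(xgboost)
dtrain <- xgb.DMatrix('../data/agaricus.txt.train')
param  <- list(max_depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
# 5-fold cross validation, reporting the error metric per round
xgb.cv(params = param, data = dtrain, nrounds = 2, nfold = 5, metrics = list('error'))
# same run without the standard-deviation column
xgb.cv(params = param, data = dtrain, nrounds = 2, nfold = 5, metrics = list('error'), showsd = FALSE)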
demo/guide-R/custom_objective.R (new executable file, 44 lines)
@@ -0,0 +1,44 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
###
# advanced: cutomsized loss function
#
print ('start running example to used cutomized objective function')

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')

# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param = {'max_depth':2, 'eta':1, 'silent':1 }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2

# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0-preds)
    return grad, hess

# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # return a pair metric_name, result
    # since preds are margin(before logistic transformation, cutoff at 0)
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
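For the R port, the same custom objective and evaluation function could plausibly be written as below (a hedged sketch: getinfo and the obj/feval arguments of xgb.train are assumed from the R package, not taken from this commit):

logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, 'label')
  preds <- 1 / (1 + exp(-preds))        # margin -> probability
  grad <- preds - labels                # first-order gradient
  hess <- preds * (1 - preds)           # second-order gradient
  return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, 'label')
  err <- mean(labels != (preds > 0))    # preds are margins, cutoff at 0
  return(list(metric = 'error', value = err))
}
param <- list(max_depth = 2, eta = 1, silent = 1)
bst <- xgb.train(param, dtrain, 2, watchlist, obj = logregobj, feval = evalerror)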
demo/guide-R/generalized_linear_model.R (new executable file, 32 lines)
@@ -0,0 +1,32 @@
#!/usr/bin/python
import sys
sys.path.append('../../wrapper')
import xgboost as xgb
##
# this script demonstrate how to fit generalized linear model in xgboost
# basically, we are using linear model, instead of tree for our boosters
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
         'alpha': 0.0001, 'lambda': 1 }

# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# there could be affection on convergence with parallelization on certain cases
# setting eta to be smaller value, e.g 0.5 can make the optimization more stable
# param['eta'] = 1

##
# the rest of settings are the same
##
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
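A hedged R sketch of the same gblinear setup (parameter names are copied from the Python demo above; the R calls themselves are assumptions, not part of this commit):

param <- list(booster = 'gblinear', silent = 1, objective = 'binary:logistic',
              alpha = 0.0001, lambda = 1)      # L1 and L2 regularizers
watchlist <- list(eval = dtest, train = dtrain)
bst <- xgb.train(param, dtrain, 4, watchlist)
preds <- predict(bst, dtest)
labels <- getinfo(dtest, 'label')
cat('error=', mean(as.numeric(preds > 0.5) != labels), '\n')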
demo/guide-R/predict_first_ntree.R (new executable file, 22 lines)
@@ -0,0 +1,22 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)

print ('start testing prediction from first n trees')
### predict using first 1 tree
label = dtest.get_label()
ypred1 = bst.predict(dtest, ntree_limit=1)
# by default, we predict using all the trees
ypred2 = bst.predict(dtest)
print ('error of ypred1=%f' % (np.sum((ypred1>0.5)!=label) /float(len(label))))
print ('error of ypred2=%f' % (np.sum((ypred2>0.5)!=label) /float(len(label))))
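And a hedged R sketch of predicting with only the first tree (the ntreelimit argument of predict for xgboost boosters is assumed from the R package, not shown in this commit):

label <- getinfo(dtest, 'label')
ypred1 <- predict(bst, dtest, ntreelimit = 1)  # use only the first tree
ypred2 <- predict(bst, dtest)                  # default: all trees
cat('error of ypred1=', mean((ypred1 > 0.5) != label), '\n')
cat('error of ypred2=', mean((ypred2 > 0.5) != label), '\n')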