diff --git a/demo/guide-R/basic_walkthrough.R b/demo/guide-R/basic_walkthrough.R
index 959e5f0ed..bc10ebc41 100644
--- a/demo/guide-R/basic_walkthrough.R
+++ b/demo/guide-R/basic_walkthrough.R
@@ -1,7 +1,14 @@
 require(xgboost)
-dtrain <- xgb.DMatrix('../data/agaricus.txt.train')
-dtest <- xgb.DMatrix('../data/agaricus.txt.test')
+data(iris)
+iris[,5] <- as.numeric(iris[,5]=='setosa')
+iris <- as.matrix(iris)
+set.seed(20)
+test_ind <- sample(1:nrow(iris),50)
+train_ind <- setdiff(1:nrow(iris),test_ind)
+dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
+dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
 watchlist <- list(eval = dtest, train = dtrain)
 num_round <- 2
@@ -18,36 +25,17 @@
 preds2 <- predict(bst2,dtest)
 stopifnot(sum((preds-preds2)^2)==0)
 
-cat('start running example of build DMatrix from scipy.sparse CSR Matrix\n')
-read.libsvm <- function(fname, maxcol) {
-  content <- readLines(fname)
-  nline <- length(content)
-  label <- numeric(nline)
-  mat <- matrix(0, nline, maxcol + 1)
-  for (i in 1:nline) {
-    arr <- as.vector(strsplit(content[i], " ")[[1]])
-    label[i] <- as.numeric(arr[[1]])
-    for (j in 2:length(arr)) {
-      kv <- strsplit(arr[j], ":")[[1]]
-      # to avoid 0 index
-      findex <- as.integer(kv[1]) + 1
-      fvalue <- as.numeric(kv[2])
-      mat[i, findex] <- fvalue
-    }
-  }
-  mat <- as(mat, "sparseMatrix")
-  return(list(label = label, data = mat))
-}
-csc <- read.libsvm("../data/agaricus.txt.train", 126)
-y <- csc$label
-x <- csc$data
-class(x)
-dtrain <- xgb.DMatrix(x, label = y)
-bst <- xgb.train(param, dtrain, num_round, watchlist)
-
 cat('start running example of build DMatrix from numpy array\n')
-x <- as.matrix(x)
+x <- iris[,1:4]
+y <- iris[,5]
 class(x)
 dtrain <- xgb.DMatrix(x, label = y)
 bst <- xgb.train(param, dtrain, num_round, watchlist)
+cat('start running example of build DMatrix from scipy.sparse CSR Matrix\n')
+x <- as(x,'dgCMatrix')
+class(x)
+dtrain <- xgb.DMatrix(x, label = y)
+bst <- xgb.train(param, dtrain, num_round, watchlist)
+
+
diff --git a/demo/guide-R/boost_from_prediction.R b/demo/guide-R/boost_from_prediction.R
new file mode 100755
index 000000000..69fe4153c
--- /dev/null
+++ b/demo/guide-R/boost_from_prediction.R
@@ -0,0 +1,29 @@
+require(xgboost)
+
+data(iris)
+iris[,5] <- as.numeric(iris[,5]=='setosa')
+iris <- as.matrix(iris)
+set.seed(20)
+test_ind <- sample(1:nrow(iris),50)
+train_ind <- setdiff(1:nrow(iris),test_ind)
+dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
+dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+
+
+watchlist <- list(eval = dtest, train = dtrain)
+print('start running example to start from an initial prediction\n')
+param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
+bst <- xgb.train( param, dtrain, 1, watchlist )
+
+ptrain <- predict(bst, dtrain, outputmargin=TRUE)
+ptest <- predict(bst, dtest, outputmargin=TRUE)
+# dtrain.set_base_margin(ptrain)
+# dtest.set_base_margin(ptest)
+
+
+cat('this is the result of running from an initial prediction\n')
+bst <- xgb.train( param, dtrain, 1, watchlist )
+
+
+
+
diff --git a/demo/guide-R/cross_validation.R b/demo/guide-R/cross_validation.R
new file mode 100755
index 000000000..a50586c58
--- /dev/null
+++ b/demo/guide-R/cross_validation.R
@@ -0,0 +1,63 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+
+### load data and do training
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
+num_round = 2
+
+print ('running cross validation')
+# do cross validation, this will print the result out as
+# [iteration]  metric_name:mean_value+std_value
+# std_value is the standard deviation of the metric
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'error'}, seed = 0)
+
+print ('running cross validation, disable standard deviation display')
+# do cross validation, this will print the result out as
+# [iteration]  metric_name:mean_value+std_value
+# std_value is the standard deviation of the metric
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'error'}, seed = 0, show_stdv = False)
+
+print ('running cross validation, with preprocessing function')
+# define the preprocessing function
+# it returns the preprocessed training data, test data, and parameters
+# we can use this to do weight rescaling, etc.
+# as an example, we try to set scale_pos_weight
+def fpreproc(dtrain, dtest, param):
+    label = dtrain.get_label()
+    ratio = float(np.sum(label == 0)) / np.sum(label==1)
+    param['scale_pos_weight'] = ratio
+    return (dtrain, dtest, param)
+
+# do cross validation; for each fold
+# the dtrain, dtest, param will be passed into fpreproc
+# then the return value of fpreproc will be used to generate
+# the results of that fold
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'auc'}, seed = 0, fpreproc = fpreproc)
+
+###
+# you can also do cross validation with a customized loss function
+# See custom_objective.py
+##
+print ('running cross validation, with customized loss function')
+def logregobj(preds, dtrain):
+    labels = dtrain.get_label()
+    preds = 1.0 / (1.0 + np.exp(-preds))
+    grad = preds - labels
+    hess = preds * (1.0-preds)
+    return grad, hess
+def evalerror(preds, dtrain):
+    labels = dtrain.get_label()
+    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+
+param = {'max_depth':2, 'eta':1, 'silent':1}
+# train with customized objective
+xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
+       obj = logregobj, feval=evalerror)
+
diff --git a/demo/guide-R/custom_objective.R b/demo/guide-R/custom_objective.R
new file mode 100755
index 000000000..5a7f110f4
--- /dev/null
+++ b/demo/guide-R/custom_objective.R
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+###
+# advanced: customized loss function
+#
+print ('start running example to use customized objective function')
+
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+dtest = xgb.DMatrix('../data/agaricus.txt.test')
+
+# note: for a customized objective function, we leave objective as default
+# note: what we are getting is the margin value in prediction
+# you must know what you are doing
+param = {'max_depth':2, 'eta':1, 'silent':1 }
+watchlist = [(dtest,'eval'), (dtrain,'train')]
+num_round = 2
+
+# user-defined objective function: given the prediction, return the gradient and second order gradient
+# this is the log-likelihood loss
+def logregobj(preds, dtrain):
+    labels = dtrain.get_label()
+    preds = 1.0 / (1.0 + np.exp(-preds))
+    grad = preds - labels
+    hess = preds * (1.0-preds)
+    return grad, hess
+
+# user-defined evaluation function: return a pair metric_name, result
+# NOTE: when you use a customized loss function, the default prediction value is the margin
+# this may make the built-in evaluation metrics not function properly
+# for example, with logistic loss the prediction is the score before the logistic transformation
+# the built-in evaluation error assumes the input is after the logistic transformation
+# keep this in mind when you use the customization, and you may need to write a customized evaluation function
+def evalerror(preds, dtrain):
+    labels = dtrain.get_label()
+    # return a pair metric_name, result
+    # since preds are margins (before the logistic transformation, cutoff at 0)
+    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+
+# training with a customized objective; we can also do step-by-step training
+# simply look at xgboost.py's implementation of train
+bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
diff --git a/demo/guide-R/generalized_linear_model.R b/demo/guide-R/generalized_linear_model.R
new file mode 100755
index 000000000..b6b60be35
--- /dev/null
+++ b/demo/guide-R/generalized_linear_model.R
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+import sys
+sys.path.append('../../wrapper')
+import xgboost as xgb
+##
+# this script demonstrates how to fit a generalized linear model in xgboost
+# basically, we are using a linear model, instead of a tree, for our boosters
+##
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+dtest = xgb.DMatrix('../data/agaricus.txt.test')
+# change booster to gblinear, so that we are fitting a linear model
+# alpha is the L1 regularizer
+# lambda is the L2 regularizer
+# you can also set lambda_bias, which is the L2 regularizer on the bias term
+param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
+         'alpha': 0.0001, 'lambda': 1 }
+
+# normally, you do not need to set eta (step_size)
+# XGBoost uses a parallel coordinate descent algorithm (shotgun);
+# parallelization can affect convergence in certain cases
+# setting eta to a smaller value, e.g. 0.5, can make the optimization more stable
+# param['eta'] = 1
+
+##
+# the rest of the settings are the same
+##
+watchlist = [(dtest,'eval'), (dtrain,'train')]
+num_round = 4
+bst = xgb.train(param, dtrain, num_round, watchlist)
+preds = bst.predict(dtest)
+labels = dtest.get_label()
+print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
diff --git a/demo/guide-R/predict_first_ntree.R b/demo/guide-R/predict_first_ntree.R
new file mode 100755
index 000000000..03f327e7f
--- /dev/null
+++ b/demo/guide-R/predict_first_ntree.R
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+
+### load data and do training
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+dtest = xgb.DMatrix('../data/agaricus.txt.test')
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
+watchlist = [(dtest,'eval'), (dtrain,'train')]
+num_round = 3
+bst = xgb.train(param, dtrain, num_round, watchlist)
+
+print ('start testing prediction from the first n trees')
+### predict using the first tree only
+label = dtest.get_label()
+ypred1 = bst.predict(dtest, ntree_limit=1)
+# by default, we predict using all the trees
+ypred2 = bst.predict(dtest)
+print ('error of ypred1=%f' % (np.sum((ypred1>0.5)!=label) /float(len(label))))
+print ('error of ypred2=%f' % (np.sum((ypred2>0.5)!=label) /float(len(label))))
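
boost_from_prediction.R above leaves the base-margin step commented out in Python syntax (dtrain.set_base_margin(ptrain)). A minimal R sketch of that step, assuming the xgboost R package's setinfo() accessor accepts a 'base_margin' field and reusing the objects built earlier in that script, could look like:

# assumption: setinfo() supports the 'base_margin' field on an xgb.DMatrix
ptrain <- predict(bst, dtrain, outputmargin = TRUE)
ptest  <- predict(bst, dtest, outputmargin = TRUE)
setinfo(dtrain, 'base_margin', ptrain)          # later rounds start from these margins
setinfo(dtest, 'base_margin', ptest)
bst <- xgb.train(param, dtrain, 1, watchlist)   # continues from the initial prediction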
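custom_objective.R above still carries the Python demo it was copied from. A hedged R sketch of the same customized log-loss objective and error metric, assuming getinfo() exposes the labels, that xgb.train() accepts obj and feval arguments, and reusing dtrain, dtest, watchlist and num_round from basic_walkthrough.R, might be:

# gradient and hessian of the logistic loss; preds arrive as margins
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, 'label')
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  list(grad = grad, hess = hess)
}
# error rate computed on margins, so the cutoff is 0 rather than 0.5
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, 'label')
  list(metric = 'error', value = mean(as.numeric(preds > 0) != labels))
}
param <- list(max_depth = 2, eta = 1, silent = 1)
bst <- xgb.train(param, dtrain, num_round, watchlist, obj = logregobj, feval = evalerror)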
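cross_validation.R is likewise still the Python script. Assuming the R package exposes xgb.cv() with nfold, metrics, obj and feval arguments, the core calls might translate roughly to:

# 5-fold cross validation reporting classification error each round
xgb.cv(param, dtrain, nrounds = num_round, nfold = 5, metrics = 'error')
# cross validation with the customized objective and metric sketched above
xgb.cv(param, dtrain, nrounds = num_round, nfold = 5, obj = logregobj, feval = evalerror)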
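generalized_linear_model.R also remains the Python version; the switch to a linear booster is only a parameter change, so an R sketch (assuming the alpha/lambda regularization parameters are passed through to the learner as listed) could be:

# fit a boosted linear model (gblinear) with L1/L2 regularization instead of trees
param <- list(booster = 'gblinear', objective = 'binary:logistic',
              alpha = 0.0001, lambda = 1, silent = 1)
bst <- xgb.train(param, dtrain, 4, watchlist)
preds <- predict(bst, dtest)
labels <- getinfo(dtest, 'label')
cat('error=', mean(as.numeric(preds > 0.5) != labels), '\n')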
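Finally, predict_first_ntree.R is still the Python script as well; a sketch of limiting prediction to the first tree in R, assuming predict() for an xgb.Booster accepts an ntreelimit argument, might look like:

label <- getinfo(dtest, 'label')
ypred1 <- predict(bst, dtest, ntreelimit = 1)   # use only the first tree
ypred2 <- predict(bst, dtest)                   # default: use all trees
cat('error of ypred1=', mean(as.numeric(ypred1 > 0.5) != label), '\n')
cat('error of ypred2=', mean(as.numeric(ypred2 > 0.5) != label), '\n')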