in the middle of guide-r

2014-09-05 23:03:04 -07:00
parent bc1817ca2f
commit 905051b7cb
6 changed files with 208 additions and 30 deletions
--- a/demo/guide-R/basic_walkthrough.R
+++ b/demo/guide-R/basic_walkthrough.R
@@ -1,7 +1,14 @@
 require(xgboost)

-dtrain <- xgb.DMatrix('../data/agaricus.txt.train')
-dtest <- xgb.DMatrix('../data/agaricus.txt.test')
+data(iris)
+iris[,5] <- as.numeric(iris[,5]=='setosa')
+iris <- as.matrix(iris)
+set.seed(20)
+test_ind <- sample(1:nrow(iris),50)
+train_ind <- setdiff(1:nrow(iris),test_ind)
+dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
+dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
 watchlist <- list(eval = dtest, train = dtrain)
 num_round <- 2
@@ -18,36 +25,17 @@ preds2 <- predict(bst2,dtest)
 stopifnot(sum((preds-preds2)^2)==0)


-cat('start running example of build DMatrix from scipy.sparse CSR Matrix\n')
-read.libsvm <- function(fname, maxcol) {
-    content <- readLines(fname)
-    nline <- length(content)
-    label <- numeric(nline)
-    mat <- matrix(0, nline, maxcol + 1)
-    for (i in 1:nline) {
-        arr <- as.vector(strsplit(content[i], " ")[[1]])
-        label[i] <- as.numeric(arr[[1]])
-        for (j in 2:length(arr)) {
-            kv <- strsplit(arr[j], ":")[[1]]
-            # to avoid 0 index
-            findex <- as.integer(kv[1]) + 1
-            fvalue <- as.numeric(kv[2])
-            mat[i, findex] <- fvalue
-        }
-    }
-    mat <- as(mat, "sparseMatrix")
-    return(list(label = label, data = mat))
-}
-csc <- read.libsvm("../data/agaricus.txt.train", 126)
-y <- csc$label
-x <- csc$data
-class(x)
-dtrain <- xgb.DMatrix(x, label = y)
-bst <- xgb.train(param, dtrain, num_round, watchlist)
-
 cat('start running example of build DMatrix from numpy array\n')
-x <- as.matrix(x)
+x <- iris[,1:4]
+y <- iris[,5]
 class(x)
 dtrain <- xgb.DMatrix(x, label = y)
 bst <- xgb.train(param, dtrain, num_round, watchlist)

+cat('start running example of build DMatrix from scipy.sparse CSR Matrix\n')
+x <- as(x,'dgCMatrix')
+class(x)
+dtrain <- xgb.DMatrix(x, label = y)
+bst <- xgb.train(param, dtrain, num_round, watchlist)
+
+
--- a/demo/guide-R/boost_from_prediction.R
+++ b/demo/guide-R/boost_from_prediction.R
@@ -0,0 +1,29 @@
+require(xgboost)
+
+data(iris)
+iris[,5] <- as.numeric(iris[,5]=='setosa')
+iris <- as.matrix(iris)
+set.seed(20)
+test_ind <- sample(1:nrow(iris),50)
+train_ind <- setdiff(1:nrow(iris),test_ind)
+dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
+dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+
+
+watchlist <- list(eval = dtest, train = dtrain)
+print('start running example to start from a initial prediction\n')
+param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
+bst <- xgb.train( param, dtrain, 1, watchlist )
+
+ptrain <- predict(bst, dtrain, outputmargin=TRUE)
+ptest  <- predict(bst, dtest, outputmargin=TRUE)
+# dtrain.set_base_margin(ptrain)
+# dtest.set_base_margin(ptest)
+
+
+cat('this is result of running from initial prediction\n')
+bst <- xgb.train( param, dtrain, 1, watchlist )
+
+
+
+
--- a/demo/guide-R/cross_validation.R
+++ b/demo/guide-R/cross_validation.R
@@ -0,0 +1,63 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+
+### load data and do training
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
+num_round = 2
+
+print ('running cross validation')
+# do cross validation, this will print result out as
+# [iteration]  metric_name:mean_value+std_value
+# std_value is standard deviation of the metric
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'error'}, seed = 0)
+
+print ('running cross validation, disable standard deviation display')
+# do cross validation again, this time with show_stdv = False
+# by default the result is printed as [iteration]  metric_name:mean_value+std_value
+# show_stdv = False disables display of the std_value (standard deviation) part
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'error'}, seed = 0, show_stdv = False)
+
+print ('running cross validation, with preprocessing function')
+# define the preprocessing function
+# used to return the preprocessed training, test data, and parameter
+# we can use this to do weight rescale, etc.
+# as an example, we try to set scale_pos_weight
+def fpreproc(dtrain, dtest, param):
+    label = dtrain.get_label()
+    ratio = float(np.sum(label == 0)) / np.sum(label==1)
+    param['scale_pos_weight'] = ratio
+    return (dtrain, dtest, param)
+
+# do cross validation, for each fold
+# the dtrain, dtest, param will be passed into fpreproc
+# then the return value of fpreproc will be used to generate
+# results of that fold
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'auc'}, seed = 0, fpreproc = fpreproc)
+
+###
+# you can also do cross validation with a customized loss function
+# See custom_objective.py
+##
+print ('running cross validation, with cutomsized loss function')
+def logregobj(preds, dtrain):
+    labels = dtrain.get_label()
+    preds = 1.0 / (1.0 + np.exp(-preds))
+    grad = preds - labels
+    hess = preds * (1.0-preds)
+    return grad, hess
+def evalerror(preds, dtrain):
+    labels = dtrain.get_label()
+    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+
+param = {'max_depth':2, 'eta':1, 'silent':1} 
+# train with customized objective
+xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
+       obj = logregobj, feval=evalerror)
+
--- a/demo/guide-R/custom_objective.R
+++ b/demo/guide-R/custom_objective.R
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+###
+# advanced: customized loss function
+# 
+print ('start running example to used cutomized objective function')
+
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+dtest = xgb.DMatrix('../data/agaricus.txt.test')
+
+# note: for customized objective function, we leave objective as default
+# note: what we are getting is margin value in prediction
+# you must know what you are doing
+param = {'max_depth':2, 'eta':1, 'silent':1 }
+watchlist  = [(dtest,'eval'), (dtrain,'train')]
+num_round = 2
+
+# user defined objective function: given the prediction, return the gradient and second order gradient
+# this is loglikelihood loss
+def logregobj(preds, dtrain):
+    labels = dtrain.get_label()
+    preds = 1.0 / (1.0 + np.exp(-preds))
+    grad = preds - labels
+    hess = preds * (1.0-preds)
+    return grad, hess
+
+# user defined evaluation function, return a pair metric_name, result
+# NOTE: when you use a customized loss function, the default prediction value is the margin
+# this may make the built-in evaluation metrics not function properly
+# for example, we are doing logistic loss, so the prediction is the score before logistic transformation
+# the built-in evaluation error assumes the input is after logistic transformation
+# Keep this in mind when you use the customization; you may need to write a customized evaluation function
+def evalerror(preds, dtrain):
+    labels = dtrain.get_label()
+    # return a pair metric_name, result
+    # since preds are margin(before logistic transformation, cutoff at 0)
+    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+
+# training with customized objective, we can also do step by step training
+# simply look at xgboost.py's implementation of train
+bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
--- a/demo/guide-R/generalized_linear_model.R
+++ b/demo/guide-R/generalized_linear_model.R
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+import sys
+sys.path.append('../../wrapper')
+import xgboost as xgb
+##
+#  this script demonstrates how to fit a generalized linear model in xgboost
+#  basically, we are using a linear model, instead of trees, for our boosters
+##
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+dtest = xgb.DMatrix('../data/agaricus.txt.test')
+# change booster to gblinear, so that we are fitting a linear model
+# alpha is the L1 regularizer 
+# lambda is the L2 regularizer
+# you can also set lambda_bias which is L2 regularizer on the bias term
+param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
+         'alpha': 0.0001, 'lambda': 1 }
+
+# normally, you do not need to set eta (step_size)
+# XGBoost uses a parallel coordinate descent algorithm (shotgun),
+# so parallelization could affect convergence in certain cases
+# setting eta to a smaller value, e.g. 0.5, can make the optimization more stable
+# param['eta'] = 1 
+
+##
+# the rest of settings are the same
+##
+watchlist  = [(dtest,'eval'), (dtrain,'train')]
+num_round = 4
+bst = xgb.train(param, dtrain, num_round, watchlist)
+preds = bst.predict(dtest)
+labels = dtest.get_label()
+print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
--- a/demo/guide-R/predict_first_ntree.R
+++ b/demo/guide-R/predict_first_ntree.R
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+
+### load data and do training
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+dtest = xgb.DMatrix('../data/agaricus.txt.test')
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
+watchlist  = [(dtest,'eval'), (dtrain,'train')]
+num_round = 3
+bst = xgb.train(param, dtrain, num_round, watchlist)
+
+print ('start testing prediction from first n trees')
+### predict using first 1 tree
+label = dtest.get_label()
+ypred1 = bst.predict(dtest, ntree_limit=1)
+# by default, we predict using all the trees
+ypred2 = bst.predict(dtest)
+print ('error of ypred1=%f' % (np.sum((ypred1>0.5)!=label) /float(len(label))))
+print ('error of ypred2=%f' % (np.sum((ypred2>0.5)!=label) /float(len(label))))