From b858283ec5246009842ed680be6fb30179005ae2 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 6 Sep 2014 10:11:45 -0700
Subject: [PATCH 1/2] add basic walkthrough

---
 R-package/R/xgb.train.R            | 15 ++++-
 R-package/R/xgboost.R              | 11 +---
 R-package/data/README.md           |  2 +
 R-package/demo/basic_walkthrough.R | 93 ++++++++++++++++++++++++++++++
 4 files changed, 111 insertions(+), 10 deletions(-)
 create mode 100644 R-package/data/README.md
 create mode 100644 R-package/demo/basic_walkthrough.R

diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index 5a7b03090..135fa4485 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -29,6 +29,9 @@
 #' @param feval customized evaluation function. Returns
 #' \code{list(metric='metric-name', value='metric-value')} with given
 #' prediction and dtrain.
+#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
+#' information of performance. If 2, xgboost will print information of both
+#' performance and tree construction progress.
 #' @param ... other parameters to pass to \code{params}.
 #'
 #' @details
@@ -65,7 +68,7 @@
 #' @export
 #'
 xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
-                      obj = NULL, feval = NULL, ...) {
+                      obj = NULL, feval = NULL, verbose = 1, ...) {
   dtrain <- data
   if (typeof(params) != "list") {
     stop("xgb.train: first argument params must be list")
@@ -73,7 +76,17 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
   if (class(dtrain) != "xgb.DMatrix") {
     stop("xgb.train: second argument dtrain must be xgb.DMatrix")
   }
+  if (verbose > 1) {
+    params <- append(params, list(silent = 0))
+  } else {
+    params <- append(params, list(silent = 1))
+  }
+  if (length(watchlist) != 0 && verbose == 0) {
+    warning('watchlist is provided but verbose=0, no evaluation information will be printed')
+    watchlist <- list()
+  }
   params = append(params, list(...))
+
   bst <- xgb.Booster(params, append(watchlist, dtrain))
   for (i in 1:nrounds) {
     succ <- xgb.iter.update(bst, dtrain, i - 1, obj)
diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R
index e8888afcd..24c50fec0 100644
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -40,14 +40,7 @@
 #'
 xgboost <- function(data = NULL, label = NULL, params = list(), nrounds,
                     verbose = 1, ...) {
-  dtrain <- xgb.get.DMatrix(data, label)
-  if (verbose > 1) {
-    silent <- 0
-  } else {
-    silent <- 1
-  }
-
-  params <- append(params, list(silent = silent))
+  dtrain <- xgb.get.DMatrix(data, label)
   params <- append(params, list(...))
 
   if (verbose > 0) {
@@ -56,7 +49,7 @@ xgboost <- function(data = NULL, label = NULL, params = list(), nrounds,
     watchlist <- list()
   }
 
-  bst <- xgb.train(params, dtrain, nrounds, watchlist)
+  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose)
   return(bst)
 }
diff --git a/R-package/data/README.md b/R-package/data/README.md
new file mode 100644
index 000000000..d2d63ec11
--- /dev/null
+++ b/R-package/data/README.md
@@ -0,0 +1,2 @@
+This folder contains the processed example dataset used by the demos.
+Copyright of the dataset belongs to the original copyright holder.
diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R
new file mode 100644
index 000000000..c4a38e7f8
--- /dev/null
+++ b/R-package/demo/basic_walkthrough.R
@@ -0,0 +1,93 @@
+require(xgboost)
+require(methods)
+# load in the agaricus dataset
+# in this example, we aim to predict whether a mushroom can be eaten
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+dtrain <- agaricus.train
+dtest <- agaricus.test
+# the loaded data is stored in a sparseMatrix, and the label is a numeric vector in {0,1}
+class(dtrain$label)
+class(dtrain$data)
+
+#-------------Basic Training using XGBoost-----------------
+# this is the basic usage of xgboost: you can put a matrix in the data field
+# note: we are putting in a sparse matrix here; xgboost naturally handles sparse input
+# use a sparse matrix when your features are sparse (e.g. when using one-hot encoded vectors)
+print("training xgboost with sparseMatrix")
+bst <- xgboost(data = dtrain$data, label = dtrain$label, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic")
+# alternatively, you can put in a dense matrix, i.e. a basic R matrix
+print("training xgboost with Matrix")
+bst <- xgboost(data = as.matrix(dtrain$data), label = dtrain$label, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic")
+
+# you can also put in an xgb.DMatrix object, which stores the label, the data and other metadata needed for advanced features
+print("training xgboost with xgb.DMatrix")
+dmat <- xgb.DMatrix(data = dtrain$data, label = dtrain$label)
+bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
+
+# verbose = 0, 1, 2
+print('train xgboost with verbose 0, no message')
+bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 0)
+print('train xgboost with verbose 1, print evaluation metric')
+bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 1)
+print('train xgboost with verbose 2, also print information about the trees')
+bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 2)
+
+# you can also specify data as a file path to a LibSVM-format input
+# since we do not ship this file, the following line is just for illustration
+# bst <- xgboost(data = 'agaricus.train.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
+
+#--------------------basic prediction using xgboost--------------
+# you can do prediction using the following line
+# you can put in a Matrix, a sparseMatrix, or an xgb.DMatrix
+pred <- predict(bst, dtest$data)
+err <- as.numeric(sum(as.integer(pred > 0.5) != dtest$label))/length(dtest$label)
+print(paste("test-error=", err))
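+
+# a minimal sketch (pred_dense is just an illustrative name): as noted above,
+# predict also accepts a dense matrix, and the predictions should agree with
+# the ones obtained from the sparse input
+pred_dense <- predict(bst, as.matrix(dtest$data))
+print(paste("sum(abs(pred_dense-pred))=", sum(abs(pred_dense - pred))))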
+
+#-------------------save and load models-------------------------
+# save the model to a binary local file
+xgb.save(bst, "xgboost.model")
+# load the binary model back into R
+bst2 <- xgb.load("xgboost.model")
+pred2 <- predict(bst2, dtest$data)
+# pred2 should be identical to pred
+print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
+
+#----------------Advanced features--------------
+# to use advanced features, we need to put the data in an xgb.DMatrix
+dtrain <- xgb.DMatrix(data = dtrain$data, label = dtrain$label)
+dtest <- xgb.DMatrix(data = dtest$data, label = dtest$label)
+#---------------Using watchlist----------------
+# watchlist is a list of xgb.DMatrix objects, each of them tagged with a name
+watchlist <- list(train=dtrain, test=dtest)
+# to train with a watchlist, use xgb.train, which offers more advanced features
+# the watchlist allows us to monitor the evaluation result on all data in the list
+print('train xgboost using xgb.train with watchlist')
+bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nround=2, watchlist=watchlist,
+                 objective = "binary:logistic")
+# we can change the evaluation metric, or use multiple evaluation metrics
+print('train xgboost using xgb.train with watchlist, watch logloss and error')
+bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nround=2, watchlist=watchlist,
+                 "eval_metric" = "error", "eval_metric" = "logloss",
+                 objective = "binary:logistic")
+
+# an xgb.DMatrix can also be saved using xgb.DMatrix.save
+xgb.DMatrix.save(dtrain, "dtrain.buffer")
+# to load it back in, simply call xgb.DMatrix on the saved file
+dtrain2 <- xgb.DMatrix("dtrain.buffer")
+bst <- xgb.train(data=dtrain2, max_depth=2, eta=1, nround=2, watchlist=watchlist,
+                 objective = "binary:logistic")
+# information can be extracted from an xgb.DMatrix using getinfo
+label <- getinfo(dtest, "label")
+pred <- predict(bst, dtest)
+err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
+print(paste("test-error=", err))
+
+# finally, you can dump the trees you learned using xgb.dump into a text file
+xgb.dump(bst, "dump.raw.txt")
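+
+# a small optional sketch: the dump written above is plain text, so we can
+# peek at its first few lines with base R to inspect the tree structure
+print(readLines("dump.raw.txt", n = 10))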

From 4a8612defcd3932ad25f3e6be03ba664799da4fb Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 6 Sep 2014 10:19:19 -0700
Subject: [PATCH 2/2] add customized objective

---
 R-package/R/utils.R               |  6 ++---
 R-package/demo/custom_objective.R | 39 +++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 R-package/demo/custom_objective.R

diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 2dddcc980..4ed6b14fe 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -133,9 +133,9 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
     .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
           PACKAGE = "xgboost")
   } else {
-    pred <- xgb.predict(bst, dtrain)
+    pred <- xgb.predict(booster, dtrain)
     gpair <- obj(pred, dtrain)
-    succ <- xgb.iter.boost(bst, dtrain, gpair)
+    succ <- xgb.iter.boost(booster, dtrain, gpair)
   }
   return(TRUE)
 }
@@ -172,7 +172,7 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
     if (length(names(w)) == 0) {
       stop("xgb.eval: a name tag must be present for every element in watchlist")
     }
-    ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
+    ret <- feval(xgb.predict(booster, w[[1]]), w[[1]])
     msg <- paste(msg, "\t", names(w), "-", ret$metric, ":", ret$value, sep="")
   }
 }
diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R
new file mode 100644
index 000000000..017961876
--- /dev/null
+++ b/R-package/demo/custom_objective.R
@@ -0,0 +1,39 @@
+require(xgboost)
+# load in the agaricus dataset
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+
+# note: for a customized objective function, we leave the objective parameter at its default
+# note: what we get from prediction is then the margin value
+# you must know what you are doing
+param <- list(max_depth = 2, eta = 1, silent = 1)
+watchlist <- list(eval = dtest, train = dtrain)
+num_round <- 2
+
+# user-defined objective function: given predictions, return the gradient and second order gradient
+# this is the log-likelihood (logistic) loss
+logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1/(1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
+  return(list(grad = grad, hess = hess))
+}
+
+# user-defined evaluation function: returns a pair (metric_name, result)
+# NOTE: with a customized objective, the default prediction value is the margin
+# this may make the built-in evaluation metrics not function properly
+# for example, with logistic loss the predictions are scores before the logistic transformation
+# while the built-in evaluation error assumes the input comes after the logistic transformation
+# keep this in mind when you customize, and write a customized evaluation function if needed
+evalerror <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
+  return(list(metric = "error", value = err))
+}
+print('start training with user customized objective')
+# training with a customized objective; we can also do training step by step
+# simply look at xgboost.py's implementation of train
+bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
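+
+# a minimal closing sketch (uses only the objects defined above): since we
+# trained with a customized objective, predict() returns margin scores, so we
+# apply the logistic transformation ourselves to obtain probabilities
+pred <- predict(bst, dtest)
+prob <- 1/(1 + exp(-pred))
+err <- as.numeric(sum(getinfo(dtest, "label") != (prob > 0.5)))/length(prob)
+print(paste("test-error after logistic transform=", err))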