From b858283ec5246009842ed680be6fb30179005ae2 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 6 Sep 2014 10:11:45 -0700
Subject: [PATCH 1/2] add basic walkthrough

---
 R-package/R/xgb.train.R            | 15 ++++-
 R-package/R/xgboost.R              | 11 +---
 R-package/data/README.md           |  2 +
 R-package/demo/basic_walkthrough.R | 93 ++++++++++++++++++++++++++++++
 4 files changed, 111 insertions(+), 10 deletions(-)
 create mode 100644 R-package/data/README.md
 create mode 100644 R-package/demo/basic_walkthrough.R

diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index 5a7b03090..135fa4485 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -29,6 +29,9 @@
 #' @param feval customized evaluation function. Returns
 #' \code{list(metric='metric-name', value='metric-value')} with given
 #' prediction and dtrain.
+#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
+#' information of performance. If 2, xgboost will print information of both
+#' performance and tree construction progress.
 #' @param ... other parameters to pass to \code{params}.
 #'
 #' @details
@@ -65,7 +68,7 @@
 #' @export
 #'
 xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
-                      obj = NULL, feval = NULL, ...) {
+                      obj = NULL, feval = NULL, verbose = 1, ...) {
   dtrain <- data
   if (typeof(params) != "list") {
     stop("xgb.train: first argument params must be list")
@@ -73,7 +76,17 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
   if (class(dtrain) != "xgb.DMatrix") {
     stop("xgb.train: second argument dtrain must be xgb.DMatrix")
   }
+  if (verbose > 1) {
+    params <- append(params, list(silent = 0))
+  } else {
+    params <- append(params, list(silent = 1))
+  }
+  if (length(watchlist) != 0 && verbose == 0) {
+    warning('watchlist is provided but verbose=0, no evaluation information will be printed')
+    watchlist <- list()
+  }
   params = append(params, list(...))
+
   bst <- xgb.Booster(params, append(watchlist, dtrain))
   for (i in 1:nrounds) {
     succ <- xgb.iter.update(bst, dtrain, i - 1, obj)
diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R
index e8888afcd..24c50fec0 100644
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -40,14 +40,7 @@
 #'
 xgboost <- function(data = NULL, label = NULL, params = list(), nrounds,
                     verbose = 1, ...) {
-  dtrain <- xgb.get.DMatrix(data, label)
-  if (verbose > 1) {
-    silent <- 0
-  } else {
-    silent <- 1
-  }
-
-  params <- append(params, list(silent = silent))
+  dtrain <- xgb.get.DMatrix(data, label)
   params <- append(params, list(...))
 
   if (verbose > 0) {
@@ -56,7 +49,7 @@ xgboost <- function(data = NULL, label = NULL, params = list(), nrounds,
     watchlist <- list()
   }
 
-  bst <- xgb.train(params, dtrain, nrounds, watchlist)
+  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose)
   return(bst)
 }
diff --git a/R-package/data/README.md b/R-package/data/README.md
new file mode 100644
index 000000000..d2d63ec11
--- /dev/null
+++ b/R-package/data/README.md
@@ -0,0 +1,2 @@
+This folder contains the processed example dataset used by the demos.
+Copyright of the dataset belongs to the original copyright holder.
diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R
new file mode 100644
index 000000000..c4a38e7f8
--- /dev/null
+++ b/R-package/demo/basic_walkthrough.R
@@ -0,0 +1,93 @@
+require(xgboost)
+require(methods)
+# load in the agaricus dataset
+# in this example, we aim to predict whether a mushroom can be eaten
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+dtrain <- agaricus.train
+dtest <- agaricus.test
+# the loaded data is stored in a sparseMatrix, and the label is a numeric vector in {0,1}
+class(dtrain$label)
+class(dtrain$data)
+
+#-------------Basic Training using XGBoost-----------------
+# this is the basic usage of xgboost: you can put a matrix in the data field
+# note: we are putting in a sparse matrix here; xgboost naturally handles sparse input
+# use a sparse matrix when your features are sparse (e.g. when using one-hot encoded vectors)
+print("training xgboost with sparseMatrix")
+bst <- xgboost(data = dtrain$data, label = dtrain$label, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic")
+# alternatively, you can put in a dense matrix, i.e. a basic R matrix
+print("training xgboost with Matrix")
+bst <- xgboost(data = as.matrix(dtrain$data), label = dtrain$label, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic")
+
+# you can also put in an xgb.DMatrix object, which stores the label, the data and other metadata needed for advanced features
+print("training xgboost with xgb.DMatrix")
+dmat <- xgb.DMatrix(data = dtrain$data, label = dtrain$label)
+bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
+
+# verbose = 0, 1, 2
+print('train xgboost with verbose 0, no message')
+bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 0)
+print('train xgboost with verbose 1, print evaluation metric')
+bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 1)
+print('train xgboost with verbose 2, also print information about the trees')
+bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 2)
+
+# you can also specify data as a file path to a LibSVM-format input
+# since we do not ship this file, the following line is just for illustration
+# bst <- xgboost(data = 'agaricus.train.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
+
+#--------------------basic prediction using xgboost--------------
+# you can do prediction using the following line
+# you can put in a Matrix, a sparseMatrix, or an xgb.DMatrix
+pred <- predict(bst, dtest$data)
+err <- as.numeric(sum(as.integer(pred > 0.5) != dtest$label))/length(dtest$label)
+print(paste("test-error=", err))
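+
+# a minimal sketch (pred_dense is just an illustrative name): as noted above,
+# predict also accepts a dense matrix, and the predictions should agree with
+# the ones obtained from the sparse input
+pred_dense <- predict(bst, as.matrix(dtest$data))
+print(paste("sum(abs(pred_dense-pred))=", sum(abs(pred_dense - pred))))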
+
+#-------------------save and load models-------------------------
+# save the model to a binary local file
+xgb.save(bst, "xgboost.model")
+# load the binary model back into R
+bst2 <- xgb.load("xgboost.model")
+pred2 <- predict(bst2, dtest$data)
+# pred2 should be identical to pred
+print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
+
+#----------------Advanced features--------------
+# to use advanced features, we need to put the data in an xgb.DMatrix
+dtrain <- xgb.DMatrix(data = dtrain$data, label = dtrain$label)
+dtest <- xgb.DMatrix(data = dtest$data, label = dtest$label)
+#---------------Using watchlist----------------
+# watchlist is a list of xgb.DMatrix objects, each of them tagged with a name
+watchlist <- list(train=dtrain, test=dtest)
+# to train with a watchlist, use xgb.train, which offers more advanced features
+# the watchlist allows us to monitor the evaluation result on all data in the list
+print('train xgboost using xgb.train with watchlist')
+bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nround=2, watchlist=watchlist,
+                 objective = "binary:logistic")
+# we can change the evaluation metric, or use multiple evaluation metrics
+print('train xgboost using xgb.train with watchlist, watch logloss and error')
+bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nround=2, watchlist=watchlist,
+                 "eval_metric" = "error", "eval_metric" = "logloss",
+                 objective = "binary:logistic")
+
+# an xgb.DMatrix can also be saved using xgb.DMatrix.save
+xgb.DMatrix.save(dtrain, "dtrain.buffer")
+# to load it back in, simply call xgb.DMatrix on the saved file
+dtrain2 <- xgb.DMatrix("dtrain.buffer")
+bst <- xgb.train(data=dtrain2, max_depth=2, eta=1, nround=2, watchlist=watchlist,
+                 objective = "binary:logistic")
+# information can be extracted from an xgb.DMatrix using getinfo
+label <- getinfo(dtest, "label")
+pred <- predict(bst, dtest)
+err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
+print(paste("test-error=", err))
+
+# finally, you can dump the trees you learned using xgb.dump into a text file
+xgb.dump(bst, "dump.raw.txt")
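+
+# a small optional sketch: the dump written above is plain text, so we can
+# peek at its first few lines with base R to inspect the tree structure
+print(readLines("dump.raw.txt", n = 10))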

From 4a8612defcd3932ad25f3e6be03ba664799da4fb Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 6 Sep 2014 10:19:19 -0700
Subject: [PATCH 2/2] add customized objective

---
 R-package/R/utils.R               |  6 ++---
 R-package/demo/custom_objective.R | 39 +++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 R-package/demo/custom_objective.R

diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 2dddcc980..4ed6b14fe 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -133,9 +133,9 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
     .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
           PACKAGE = "xgboost")
   } else {
-    pred <- xgb.predict(bst, dtrain)
+    pred <- xgb.predict(booster, dtrain)
     gpair <- obj(pred, dtrain)
-    succ <- xgb.iter.boost(bst, dtrain, gpair)
+    succ <- xgb.iter.boost(booster, dtrain, gpair)
   }
   return(TRUE)
 }
@@ -172,7 +172,7 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
     if (length(names(w)) == 0) {
       stop("xgb.eval: a name tag must be present for every element in watchlist")
     }
-    ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
+    ret <- feval(xgb.predict(booster, w[[1]]), w[[1]])
     msg <- paste(msg, "\t", names(w), "-", ret$metric, ":", ret$value, sep="")
   }
 }
diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R
new file mode 100644
index 000000000..017961876
--- /dev/null
+++ b/R-package/demo/custom_objective.R
@@ -0,0 +1,39 @@
+require(xgboost)
+# load in the agaricus dataset
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+
+# note: for a customized objective function, we leave the objective parameter at its default
+# note: what we get from prediction is then the margin value
+# you must know what you are doing
+param <- list(max_depth = 2, eta = 1, silent = 1)
+watchlist <- list(eval = dtest, train = dtrain)
+num_round <- 2
+
+# user-defined objective function: given predictions, return the gradient and second order gradient
+# this is the log-likelihood (logistic) loss
+logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1/(1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
+  return(list(grad = grad, hess = hess))
+}
+
+# user-defined evaluation function: returns a pair (metric_name, result)
+# NOTE: with a customized objective, the default prediction value is the margin
+# this may make the built-in evaluation metrics not function properly
+# for example, with logistic loss the predictions are scores before the logistic transformation
+# while the built-in evaluation error assumes the input comes after the logistic transformation
+# keep this in mind when you customize, and write a customized evaluation function if needed
+evalerror <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
+  return(list(metric = "error", value = err))
+}
+print('start training with user customized objective')
+# training with a customized objective; we can also do training step by step
+# simply look at xgboost.py's implementation of train
+bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
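+
+# a minimal closing sketch (uses only the objects defined above): since we
+# trained with a customized objective, predict() returns margin scores, so we
+# apply the logistic transformation ourselves to obtain probabilities
+pred <- predict(bst, dtest)
+prob <- 1/(1 + exp(-pred))
+err <- as.numeric(sum(getinfo(dtest, "label") != (prob > 0.5)))/length(prob)
+print(paste("test-error after logistic transform=", err))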