xgboost/R-package/inst/examples/demo.R

require(xgboost)
require(methods)

# helper function to read libsvm format this is very badly written, load in dense, and convert to sparse
# use this only for demo purpose adopted from
# https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
  content <- readLines(fname)
  nline <- length(content)
  label <- numeric(nline)
  mat <- matrix(0, nline, maxcol + 1)
  for (i in 1:nline) {
    arr <- as.vector(strsplit(content[i], " ")[[1]])
    label[i] <- as.numeric(arr[[1]])
    for (j in 2:length(arr)) {
      kv <- strsplit(arr[j], ":")[[1]]
      # to avoid 0 index
      findex <- as.integer(kv[1]) + 1
      fvalue <- as.numeric(kv[2])
      mat[i, findex] <- fvalue
    }
  }
  mat <- as(mat, "sparseMatrix")
  return(list(label = label, data = mat))
}

############################ Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.

# Directly read in local file
dtrain <- xgb.DMatrix("agaricus.txt.train")
class(dtrain)

# read file in R
csc <- read.libsvm("agaricus.txt.train", 126)
y <- csc$label
x <- csc$data

# x as Sparse Matrix
class(x)
dtrain <- xgb.DMatrix(x, label = y)

# x as dense matrix
dense.x <- as.matrix(x)
dtrain <- xgb.DMatrix(dense.x, label = y)

############################ Test xgboost with local file, sparse matrix and dense matrix in R.

# Test with DMatrix object
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# Verbose = 0,1,2
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 0)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 1)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 2)

# Test with local file
bst <- xgboost(data = "agaricus.txt.train", max_depth = 2, eta = 1,nround = 2,
               objective = "binary:logistic")

# Test with Sparse Matrix
bst <- xgboost(data = x, label = y, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# Test with dense Matrix
bst <- xgboost(data = dense.x, label = y, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")


############################ Test predict

# Prediction with DMatrix object
dtest <- xgb.DMatrix("agaricus.txt.test")
pred <- predict(bst, dtest)

# Prediction with local test file
pred <- predict(bst, "agaricus.txt.test")

# Prediction with Sparse Matrix
csc <- read.libsvm("agaricus.txt.test", 126)
test.y <- csc$label
test.x <- csc$data
pred <- predict(bst, test.x)

# Extrac label with getinfo
labels <- getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(pred > 0.5) != labels))/length(labels)
print(paste("error=", err))

############################ Save and load model to hard disk

# save model to binary local file
xgb.save(bst, "xgboost.model")

# load binary model to R
bst <- xgb.load("xgboost.model")
pred <- predict(bst, test.x)

# save model to text file
xgb.dump(bst, "dump.raw.txt")
# save model to text file, with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")

# save a DMatrix object to hard disk
xgb.DMatrix.save(dtrain, "dtrain.buffer")

# load a DMatrix object to R
dtrain <- xgb.DMatrix("dtrain.buffer")

############################ More flexible training function xgb.train

param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
watchlist <- list(eval = dtest, train = dtrain)

# training xgboost model
bst <- xgb.train(param, dtrain, nround = 2, watchlist = watchlist)

############################ cutomsized loss function

param <- list(max_depth = 2, eta = 1, silent = 1)

# note: for customized objective function, we leave objective as default note: what we are getting is
# margin value in prediction you must know what you are doing

# user define objective function, given prediction, return gradient and second order gradient this is
# loglikelihood loss
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
# user defined evaluation function, return a list(metric='metric-name', value='metric-value') NOTE: when
# you do customized loss function, the default prediction value is margin this may make buildin
# evalution metric not function properly for example, we are doing logistic loss, the prediction is
# score before logistic transformation the buildin evaluation error assumes input is after logistic
# transformation Take this in mind when you use the customization, and maybe you need write customized
# evaluation function
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
  return(list(metric = "error", value = err))
}

# training with customized objective, we can also do step by step training simply look at xgboost.py's
# implementation of train
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)