require(xgboost)
require(Matrix)
require(methods)

# Helper function to read the libsvm format.
# This is badly written: it loads the data as dense and then converts to sparse.
# Use it for demo purposes only.
# Adapted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
  content <- readLines(fname)
  nline <- length(content)
  label <- numeric(nline)
  mat <- matrix(0, nline, maxcol + 1)
  for (i in 1:nline) {
    arr <- as.vector(strsplit(content[i], " ")[[1]])
    label[i] <- as.numeric(arr[[1]])
    for (j in 2:length(arr)) {
      kv <- strsplit(arr[j], ":")[[1]]
      # shift by one to avoid the 0 index
      findex <- as.integer(kv[1]) + 1
      fvalue <- as.numeric(kv[2])
      mat[i, findex] <- fvalue
    }
  }
  mat <- as(mat, "sparseMatrix")
  return(list(label = label, data = mat))
}

############################
# Test xgb.DMatrix with a local file, a sparse matrix and a dense matrix in R.
############################
# Directly read in a local file
dtrain <- xgb.DMatrix('agaricus.txt.train')
class(dtrain)
# Read the file in R
csc <- read.libsvm("agaricus.txt.train", 126)
y <- csc$label
x <- csc$data
# x as a sparse matrix
class(x)
dtrain <- xgb.DMatrix(x, label = y)
# x as a dense matrix
dense.x <- as.matrix(x)
dtrain <- xgb.DMatrix(dense.x, label = y)

############################
# Test xgboost with a local file, a sparse matrix and a dense matrix in R.
############################
# Test with a DMatrix object
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, objective = 'binary:logistic')
# verbose = 0, 1, 2
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, objective = 'binary:logistic', verbose = 0)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, objective = 'binary:logistic', verbose = 1)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, objective = 'binary:logistic', verbose = 2)
# Test with a local file
bst <- xgboost(data = 'agaricus.txt.train', max_depth = 2, eta = 1, objective = 'binary:logistic')
# Test with a sparse matrix
bst <- xgboost(data = x, label = y, max_depth = 2, eta = 1, objective = 'binary:logistic')
# Test with a dense matrix
bst <- xgboost(data = dense.x, label = y, max_depth = 2, eta = 1, objective = 'binary:logistic')

############################
# Test predict
############################
# Prediction with a DMatrix object
dtest <- xgb.DMatrix('agaricus.txt.test')
pred <- predict(bst, dtest)
# Prediction with a local test file
pred <- predict(bst, 'agaricus.txt.test')
# Prediction with a sparse matrix
csc <- read.libsvm("agaricus.txt.test", 126)
test.y <- csc$label
test.x <- csc$data
pred <- predict(bst, test.x)
# Extract the label with xgb.getinfo
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=", err))

############################
# Save and load model to hard disk
############################
# Save the model to a binary local file
xgb.save(bst, 'model.save')
# Load the binary model back into R
bst <- xgb.load('model.save')
pred <- predict(bst, test.x)
# Dump the model to a text file
xgb.dump(bst, 'model.dump')
# Save a DMatrix object to hard disk
xgb.DMatrix.save(dtrain, 'dtrain.save')
# Load a DMatrix object back into R
dtrain <- xgb.DMatrix('dtrain.save')

############################
# More flexible training function: xgb.train
############################
param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
watchlist <- list("eval" = dtest, "train" = dtrain)
# Train the xgboost model, printing the evaluation metric on both watchlist entries
bst <- xgb.train(param, dtrain, nround = 2, watchlist = watchlist)

############################
# Customized loss function
############################
param <- list(max_depth = 2, eta = 1, silent = 1)
# NOTE: for a customized objective function, we leave objective at its default.
# NOTE: what we then get in prediction is the margin value;
# you must know what you are doing.

# User-defined objective function: given the predictions and the training DMatrix,
# return the gradient and the second-order gradient.
# This is the log-likelihood (logistic) loss.
logregobj <- function(preds, dtrain) {
  labels <- xgb.getinfo(dtrain, "label")
  preds <- 1.0 / (1.0 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1.0 - preds)
  return(list(grad = grad, hess = hess))
}
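# Why these formulas: with margin m and p = 1 / (1 + exp(-m)), the logistic
# loss is L = -(y * log(p) + (1 - y) * log(1 - p)); differentiating with
# respect to the margin gives dL/dm = p - y and d^2L/dm^2 = p * (1 - p),
# which are exactly the grad and hess returned above.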
# User-defined evaluation function: return list(metric = "metric-name", value = "metric-value").
# NOTE: with a customized objective function, the prediction passed in is the margin.
# This may make built-in evaluation metrics misbehave: for example, with logistic
# loss the prediction is the score before the logistic transformation, while the
# built-in evaluation error assumes the input is after the logistic transformation.
# Keep this in mind when you customize, and write a customized evaluation function if needed.
evalerror <- function(preds, dtrain) {
  labels <- xgb.getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
  return(list(metric = "error", value = err))
}

# Training with a customized objective; we can also do step-by-step training.
# Simply look at xgboost.py's implementation of train.
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
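# As noted above, a model trained with a customized objective outputs margin
# values from predict. A minimal sketch of recovering probabilities by applying
# the logistic transformation ourselves and recomputing the error:
margin <- predict(bst, dtest)
prob <- 1.0 / (1.0 + exp(-margin))
err <- as.numeric(sum(as.integer(prob > 0.5) != labels)) / length(labels)
print(paste("custom objective error=", err))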