refinement of R package

unknown 2014-08-27 12:57:37 -07:00
parent 0fe5470a4f
commit d747172d37
6 changed files with 89 additions and 60 deletions

View File

@@ -8,3 +8,4 @@ export(xgb.train)
 export(xgb.save)
 export(xgb.load)
 export(xgb.dump)
+export(xgb.DMatrix.save)

View File

@@ -1,5 +1,5 @@
 # constructing DMatrix
-xgb.DMatrix <- function(data, missing=0.0, ...) {
+xgb.DMatrix <- function(data, info=list(), missing=0.0, ...) {
   if (typeof(data) == "character") {
     handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE="xgboost")
   } else if(is.matrix(data)) {
@@ -11,7 +11,7 @@ xgb.DMatrix <- function(data, missing=0.0, ...) {
   }
   dmat <- structure(handle, class="xgb.DMatrix")
-  info = list(...)
+  info = append(info,list(...))
   if (length(info)==0)
     return(dmat)
   for (i in 1:length(info)) {
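
With this change, metadata can be supplied either through the new info argument or, as before, through the dots. A minimal usage sketch, assuming a feature matrix x and label vector y as in the package demo:

# the two calls below are equivalent under the new signature
dtrain <- xgb.DMatrix(x, info = list(label = y))
dtrain <- xgb.DMatrix(x, label = y)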

View File

@@ -0,0 +1,12 @@
+# save a DMatrix object to a binary file
+xgb.DMatrix.save <- function(handle, fname) {
+  if (typeof(fname) != "character") {
+    stop("xgb.DMatrix.save: fname must be character")
+  }
+  if (class(handle) == "xgb.DMatrix") {
+    .Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE), PACKAGE="xgboost")
+    return(TRUE)
+  }
+  stop("xgb.DMatrix.save: the input must be xgb.DMatrix")
+  return(FALSE)
+}
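
A round-trip sketch for the new helper; the file name dtrain.buffer is illustrative:

dtrain <- xgb.DMatrix('agaricus.txt.train')
xgb.DMatrix.save(dtrain, 'dtrain.buffer')
# the binary file can be loaded back with the xgb.DMatrix constructor
dtrain2 <- xgb.DMatrix('dtrain.buffer')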

View File

@@ -7,10 +7,6 @@ xgb.save <- function(handle, fname) {
     .Call("XGBoosterSaveModel_R", handle, fname, PACKAGE="xgboost")
     return(TRUE)
   }
-  if (class(handle) == "xgb.DMatrix") {
-    .Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE), PACKAGE="xgboost")
-    return(TRUE)
-  }
   stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
   return(FALSE)
 }
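
After this removal, xgb.save handles only trained boosters; DMatrix objects go through the new xgb.DMatrix.save helper instead. A minimal sketch, assuming dtrain from the demo:

bst <- xgb.train(list(max_depth=2, eta=1, objective='binary:logistic'), dtrain, nround=2)
xgb.save(bst, 'xgb.model')                  # booster: handled here
xgb.DMatrix.save(dtrain, 'dtrain.buffer')   # DMatrix: handled by the new helper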

View File

@@ -1,49 +1,41 @@
 # Main function for xgboost-package
-xgboost = function(x=NULL,y=NULL,DMatrix=NULL, file=NULL, validation=NULL,
-                   nrounds=10, obj=NULL, feval=NULL, margin=NULL, verbose = T, ...)
+xgboost = function(data=NULL, label = NULL, params=list(), nrounds=10,
+                   verbose = 1, ...)
 {
-  if (!is.null(DMatrix))
-    dtrain = DMatrix
+  inClass = class(data)
+  if (inClass=='dgCMatrix' || inClass=='matrix')
+  {
+    if (is.null(label))
+      stop('xgboost: need label when data is a matrix')
+    dtrain = xgb.DMatrix(data, label=label)
+  }
   else
   {
-    if (is.null(x) && is.null(y))
-    {
-      if (is.null(file))
-        stop('xgboost need input data, either R objects, local files or DMatrix object.')
-      dtrain = xgb.DMatrix(file)
-    }
+    if (!is.null(label))
+      warning('xgboost: label will be ignored.')
+    if (inClass=='character')
+      dtrain = xgb.DMatrix(data)
+    else if (inClass=='xgb.DMatrix')
+      dtrain = data
     else
-      dtrain = xgb.DMatrix(x, label=y)
-    if (!is.null(margin))
-    {
-      succ <- xgb.setinfo(dtrain, "base_margin", margin)
-      if (!succ)
-        warning('Attemp to use margin failed.')
-    }
+      stop('xgboost: Invalid input of data')
   }
-  params = list(...)
   if (verbose>1)
     silent = 0
   else
     silent = 1
-  watchlist=list()
-  if (verbose)
-  {
-    if (!is.null(validation))
-    {
-      if (class(validation)!='xgb.DMatrix')
-        dtest = xgb.DMatrix(validation)
-      else
-        dtest = validation
-      watchlist = list(eval=dtest,train=dtrain)
-    }
-    else
-      watchlist = list(train=dtrain)
-  }
   params = append(params, list(silent=silent))
+  params = append(params, list(...))
-  bst <- xgb.train(params, dtrain, nrounds, watchlist, obj, feval)
+  if (verbose>0)
+    watchlist = list(train=dtrain)
+  else
+    watchlist = list()
+  bst <- xgb.train(params, dtrain, nrounds, watchlist)
   return(bst)
 }
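
For reference, a minimal call against the rewritten interface; extra arguments such as max_depth and eta travel through the dots into params (x and y as in the demo below):

bst <- xgboost(data = x, label = y, max_depth = 2, eta = 1,
               objective = 'binary:logistic', nrounds = 2, verbose = 1)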

View File

@@ -51,20 +51,25 @@ dtrain = xgb.DMatrix(dense.x, label=y)
 ############################
 # Test with DMatrix object
-bst = xgboost(DMatrix=dtrain, max_depth=2, eta=1, silent=1, objective='binary:logistic')
+bst = xgboost(data=dtrain, max_depth=2, eta=1, objective='binary:logistic')
+# Verbose = 0,1,2
+bst = xgboost(data=dtrain, max_depth=2, eta=1, objective='binary:logistic',
+              verbose = 0)
+bst = xgboost(data=dtrain, max_depth=2, eta=1, objective='binary:logistic',
+              verbose = 1)
+bst = xgboost(data=dtrain, max_depth=2, eta=1, objective='binary:logistic',
+              verbose = 2)
 # Test with local file
-bst = xgboost(file='agaricus.txt.train', max_depth=2, eta=1, silent=1, objective='binary:logistic')
+bst = xgboost(data='agaricus.txt.train', max_depth=2, eta=1, objective='binary:logistic')
 # Test with Sparse Matrix
-bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')
+bst = xgboost(data = x, label = y, max_depth=2, eta=1, objective='binary:logistic')
 # Test with dense Matrix
-bst = xgboost(x = dense.x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')
+bst = xgboost(data = dense.x, label = y, max_depth=2, eta=1, objective='binary:logistic')
-# Test with validation set
-bst = xgboost(file='agaricus.txt.train', validation='agaricus.txt.test',
-              max_depth=2, eta=1, silent=1, objective='binary:logistic')
 ############################
 # Test predict
@@ -102,17 +107,39 @@ pred = predict(bst, test.x)
 # save model to text file
 xgb.dump(bst, 'model.dump')
+# save a DMatrix object to hard disk
+xgb.DMatrix.save(dtrain,'dtrain.save')
+# load a DMatrix object back into R
+dtrain = xgb.DMatrix('dtrain.save')
 ############################
-# Customized objective and evaluation function
+# More flexible training function xgb.train
 ############################
+param = list(max_depth=2, eta=1, silent = 1, objective="binary:logistic")
+watchlist <- list("eval"=dtest,"train"=dtrain)
+# training xgboost model
+bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
+############################
+# customized loss function
+############################
+param <- list(max_depth = 2, eta = 1, silent = 1)
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
 # you must know what you are doing
 # user-defined objective function: given prediction, return gradient and second-order gradient
 # this is log-likelihood loss
-logregobj = function(preds, dtrain) {
-  labels = xgb.getinfo(dtrain, "label")
-  preds = 1.0 / (1.0 + exp(-preds))
-  grad = preds - labels
-  hess = preds * (1.0-preds)
+logregobj <- function(preds, dtrain) {
+  labels <- xgb.getinfo(dtrain, "label")
+  preds <- 1.0 / (1.0 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1.0-preds)
   return(list(grad=grad, hess=hess))
 }
 # user-defined evaluation function, return a list(metric="metric-name", value="metric-value")
@@ -121,13 +148,14 @@ logregobj = function(preds, dtrain) {
 # for example, we are doing logistic loss: the prediction is the score before logistic transformation
 # the built-in evaluation error assumes input is after logistic transformation
 # keep this in mind when you use the customization; you may need to write a customized evaluation function
-evalerror = function(preds, dtrain) {
-  labels = xgb.getinfo(dtrain, "label")
-  err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
+evalerror <- function(preds, dtrain) {
+  labels <- xgb.getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
   return(list(metric="error", value=err))
 }
-bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic',
-              obj=logregobj, feval=evalerror)
+# training with customized objective; we can also do step-by-step training
+# simply look at xgboost.py's implementation of train
+bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
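
Because the customized objective works on margin scores, predict on this booster also returns margins; a short sketch of recovering probabilities, with test.x as loaded earlier in the demo:

pred.margin <- predict(bst, test.x)
# apply the logistic transformation by hand
pred.prob <- 1 / (1 + exp(-pred.margin))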