Major change in the design of the R interface

This commit is contained in:
unknown
2014-08-26 23:41:03 -07:00
parent 84e5fc285b
commit 0130be4acc
21 changed files with 1175 additions and 128 deletions

@@ -0,0 +1,133 @@
require(xgboost)
require(methods)
require(Matrix)  # provides the sparseMatrix class used by read.libsvm below
# helper function to read the libsvm format
# this is a naive implementation: it loads the data as a dense matrix and then converts it to sparse
# use it for demo purposes only
# adapted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm = function(fname, maxcol) {
  content = readLines(fname)
  nline = length(content)
  label = numeric(nline)
  mat = matrix(0, nline, maxcol + 1)
  for (i in 1:nline) {
    arr = strsplit(content[i], " ")[[1]]
    label[i] = as.numeric(arr[1])
    # guard against lines that carry only a label and no features
    if (length(arr) > 1) {
      for (j in 2:length(arr)) {
        kv = strsplit(arr[j], ":")[[1]]
        # shift by one to avoid a zero feature index in R
        findex = as.integer(kv[1]) + 1
        fvalue = as.numeric(kv[2])
        mat[i, findex] = fvalue
      }
    }
  }
  mat = as(mat, "sparseMatrix")
  return(list(label = label, data = mat))
}
############################
# Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
############################
# Directly read in local file
dtrain = xgb.DMatrix('agaricus.txt.train')
class(dtrain)
# parse the file in R with the helper above
csc = read.libsvm("agaricus.txt.train", 126)
y = csc$label
x = csc$data
# x as Sparse Matrix
class(x)
dtrain = xgb.DMatrix(x, label=y)
# x as dense matrix
dense.x = as.matrix(x)
dtrain = xgb.DMatrix(dense.x, label=y)
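# A minimal sanity check (not part of the original demo): the three DMatrix
# constructions above should carry identical labels; xgb.getinfo is the same
# call this script uses later to extract labels from a DMatrix.
stopifnot(all(xgb.getinfo(xgb.DMatrix('agaricus.txt.train'), "label") == y))
stopifnot(all(xgb.getinfo(dtrain, "label") == y))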
############################
# Test xgboost with local file, sparse matrix and dense matrix in R.
############################
# Test with DMatrix object
bst = xgboost(DMatrix=dtrain, max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with local file
bst = xgboost(file='agaricus.txt.train', max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with Sparse Matrix
bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with dense Matrix
bst = xgboost(x = dense.x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with validation set
bst = xgboost(file='agaricus.txt.train', validation='agaricus.txt.test',
              max_depth=2, eta=1, silent=1, objective='binary:logistic')
############################
# Test predict
############################
# Prediction with DMatrix object
dtest = xgb.DMatrix('agaricus.txt.test')
pred = predict(bst, dtest)
# Prediction with local test file
pred = predict(bst, 'agaricus.txt.test')
# Prediction with Sparse Matrix
csc = read.libsvm("agaricus.txt.test", 126)
test.y = csc$label
test.x = csc$data
pred = predict(bst, test.x)
# Extract labels with xgb.getinfo
labels = xgb.getinfo(dtest, "label")
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=",err))
############################
# Save and load model to hard disk
############################
# save model to binary local file
xgb.save(bst, 'model.save')
# load the binary model back into R
bst = xgb.load('model.save')
pred = predict(bst, test.x)
# save model to text file
xgb.dump(bst, 'model.dump')
############################
# Customized objective and evaluation function
############################
# user-defined objective function: given predictions, return the gradient and second-order gradient
# this is the logistic log-likelihood loss
logregobj = function(preds, dtrain) {
  labels = xgb.getinfo(dtrain, "label")
  # transform the margin predictions into probabilities
  preds = 1.0 / (1.0 + exp(-preds))
  grad = preds - labels
  hess = preds * (1.0 - preds)
  return(list(grad = grad, hess = hess))
}
# user-defined evaluation function: returns list(metric="metric-name", value="metric-value")
# NOTE: with a customized objective, the predictions passed in are raw margin values,
# which can make the built-in evaluation metrics misbehave
# for example, with logistic loss the predictions are scores before the logistic transformation,
# while the built-in error metric assumes values after the logistic transformation
# keep this in mind when customizing, and write a matching evaluation function if needed
evalerror = function(preds, dtrain) {
  labels = xgb.getinfo(dtrain, "label")
  # preds are margins here, so the decision threshold is 0 rather than 0.5
  err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
  return(list(metric = "error", value = err))
}
bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic',
              obj=logregobj, feval=evalerror)
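# Since training with a customized objective makes predict() return raw
# margins (see the NOTE above), here is a minimal sketch of evaluating that
# model: apply the same logistic transformation as logregobj before
# thresholding. It assumes test.x and test.y from the prediction section
# are still in scope.
margin = predict(bst, test.x)
prob = 1.0 / (1.0 + exp(-margin))
err = sum(as.integer(prob > 0.5) != test.y) / length(test.y)
print(paste("error=", err))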

@@ -1,4 +1,5 @@
require(xgboost)
require(methods)
# helper function to read libsvm format
# this is a naive implementation: it loads the data as a dense matrix and then converts it to sparse

@@ -0,0 +1,72 @@
booster[0]:
0:[f28<1.00001] yes=1,no=2,missing=2
	1:[f108<1.00001] yes=3,no=4,missing=4
		3:leaf=1.85965
		4:leaf=-1.94071
	2:[f55<1.00001] yes=5,no=6,missing=6
		5:leaf=-1.70044
		6:leaf=1.71218
booster[1]:
0:[f59<1.00001] yes=1,no=2,missing=2
	1:leaf=-6.23624
	2:[f28<1.00001] yes=3,no=4,missing=4
		3:leaf=-0.96853
		4:leaf=0.784718
booster[2]:
0:[f101<1.00001] yes=1,no=2,missing=2
	1:[f110<1.00001] yes=3,no=4,missing=4
		3:leaf=-9.42142
		4:leaf=-0.791407
	2:[f66<1.00001] yes=5,no=6,missing=6
		5:leaf=5.77229
		6:leaf=0.658725
booster[3]:
0:[f26<1.00001] yes=1,no=2,missing=2
	1:leaf=1.07748
	2:[f38<1.00001] yes=3,no=4,missing=4
		3:leaf=-0.877906
		4:leaf=0.614153
booster[4]:
0:[f108<1.00001] yes=1,no=2,missing=2
	1:leaf=2.92191
	2:[f35<1.00001] yes=3,no=4,missing=4
		3:leaf=0.152607
		4:leaf=-1.26934
booster[5]:
0:[f22<1.00001] yes=1,no=2,missing=2
	1:[f35<1.00001] yes=3,no=4,missing=4
		3:leaf=-1.02315
		4:leaf=-3.02414
	2:[f23<1.00001] yes=5,no=6,missing=6
		5:leaf=-1.53846
		6:leaf=0.431742
booster[6]:
0:[f28<1.00001] yes=1,no=2,missing=2
	1:[f108<1.00001] yes=3,no=4,missing=4
		3:leaf=0.836115
		4:leaf=-0.912605
	2:[f23<1.00001] yes=5,no=6,missing=6
		5:leaf=-1.1971
		6:leaf=0.777142
booster[7]:
0:[f38<1.00001] yes=1,no=2,missing=2
	1:[f26<1.00001] yes=3,no=4,missing=4
		3:leaf=0.890623
		4:leaf=-0.908312
	2:[f111<1.00001] yes=5,no=6,missing=6
		5:leaf=1.43619
		6:leaf=-0.0180106
booster[8]:
0:[f22<1.00001] yes=1,no=2,missing=2
	1:leaf=-1.01502
	2:[f101<1.00001] yes=3,no=4,missing=4
		3:leaf=0.568838
		4:leaf=-0.515293
booster[9]:
0:[f114<1.00001] yes=1,no=2,missing=2
	1:[f60<1.00001] yes=3,no=4,missing=4
		3:leaf=-0.609475
		4:leaf=3.63443
	2:[f28<1.00001] yes=5,no=6,missing=6
		5:leaf=-0.734556
		6:leaf=0.217203
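
Each booster[k] block above is one tree of the dumped model: non-leaf lines give the split feature and threshold plus the yes/no/missing child ids, leaf lines give the output weight, and indentation encodes depth. As a minimal sketch (assuming the model.dump file written by xgb.dump earlier in this commit), the tree count can be recovered from the text dump:

dump = readLines('model.dump')
ntree = sum(grepl("^booster", dump))
print(ntree)  # this dump has booster[0] through booster[9], i.e. 10 trees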