Merge branch 'master' of ssh://github.com/tqchen/xgboost

tqchen 2014-09-06 10:29:42 -07:00
commit 7879db8702
5 changed files with 85 additions and 72 deletions

View File

@@ -1,37 +1,32 @@
 require(xgboost)
 require(methods)
-data(iris)
-# we use iris data as example dataset
-# iris is a dataset with 3 types of iris
+data(agaricus.train)
+data(agaricus.test)
+# we use agaricus data as example dataset
 # we will show how to use xgboost to do binary classification here
-# so the class label will be whether the flower is of type setosa
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-# random split train and test set
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-trainX = iris[train_ind,1:4]
-trainY = iris[train_ind,5]
-testX = iris[train_ind,1:4]
-testY = iris[test_ind,5]
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
 #-------------------------------------
 # this is the basic usage of xgboost
-# you can put matrix in data field
+# you can put sparse matrix in data field. this is helpful when your data is sparse
+# for example, when you use one-hot encoding for feature vectors
 bst <- xgboost(data = trainX, label = trainY, max_depth = 1, eta = 1, nround = 2,
                objective = "binary:logistic")
-# alternatively, you can put sparse matrix, this is helpful when your data is sparse
-# for example, when you use one-hot encoding for feature vectors
-sparseX <- as(trainX, "sparseMatrix")
-bst <- xgboost(data = sparseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
+# alternatively, you can put dense matrix
+denseX <- as(trainX, "matrix")
+bst <- xgboost(data = denseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
                objective = "binary:logistic")
 # you can also specify data as file path to a LibSVM format input
-# since we do not have libsvm format file for iris, next line is only for illustration
-# bst <- xgboost(data = 'iris.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
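To see where the rewritten demo ends up, here is a minimal end-to-end sketch under the same setup. It assumes only the agaricus datasets bundled with the xgboost R package; the 0.5 threshold and the error computation are illustrative additions, not lines from the demo itself.

require(xgboost)
data(agaricus.train)
data(agaricus.test)
# agaricus.train$data is a sparse matrix (dgCMatrix); xgboost consumes it directly
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max_depth = 1, eta = 1, nround = 2,
               objective = "binary:logistic")
# binary:logistic yields probabilities, so threshold at 0.5 to get class labels
pred <- predict(bst, agaricus.test$data)
err <- mean(as.numeric(pred > 0.5) != agaricus.test$label)
cat('test error:', err, '\n')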

View File

@@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 watchlist <- list(eval = dtest, train = dtrain)
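The watchlist built here is what drives per-round evaluation output; a short sketch of how it is typically passed to xgb.train (the param list is assumed to match the one defined in the other demos of this commit):

param <- list(max_depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
# each boosting round, xgb.train evaluates every DMatrix in the watchlist and
# prints one result per entry, labeled with the list names (eval, train)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)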

View File

@@ -1,38 +1,45 @@
-#!/usr/bin/python
-import sys
-import numpy as np
-sys.path.append('../../wrapper')
-import xgboost as xgb
+require(xgboost)
 ### load data and do training
-dtrain = xgb.DMatrix('../data/agaricus.txt.train')
-param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
-num_round = 2
-print ('running cross validation')
+data(agaricus.train)
+data(agaricus.test)
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
+num_round <- 2
+param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
+cat('running cross validation\n')
 # do cross validation, this will print result out as
 # [iteration] metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, num_round, nfold=5,
        metrics={'error'}, seed = 0)
-print ('running cross validation, disable standard deviation display')
+cat('running cross validation, disable standard deviation display\n')
 # do cross validation, this will print result out as
 # [iteration] metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, num_round, nfold=5,
-       metrics={'error'}, seed = 0, show_stdv = False)
+       metrics={'error'}, seed = 0, show_stdv = FALSE)
-print ('running cross validation, with preprocessing function')
+cat('running cross validation, with preprocessing function\n')
 # define the preprocessing function
 # used to return the preprocessed training, test data, and parameter
 # we can use this to do weight rescale, etc.
 # as an example, we try to set scale_pos_weight
-def fpreproc(dtrain, dtest, param):
-    label = dtrain.get_label()
-    ratio = float(np.sum(label == 0)) / np.sum(label==1)
-    param['scale_pos_weight'] = ratio
-    return (dtrain, dtest, param)
+fpreproc <- function(dtrain, dtest, param){
+  label <- getinfo(dtrain, 'label')
+  # same negative/positive ratio the python version computes
+  ratio <- sum(label == 0) / sum(label == 1)
+  param <- append(param, list(scale_pos_weight = ratio))
+  return(list(dtrain = dtrain, dtest = dtest, param = param))
+}
 # do cross validation, for each fold
 # the dtrain, dtest, param will be passed into fpreproc
@@ -46,17 +53,22 @@ xgb.cv(param, dtrain, num_round, nfold=5,
 # See custom_objective.py
 ##
-print ('running cross validation, with cutomsized loss function')
+cat('running cross validation, with customized loss function\n')
-def logregobj(preds, dtrain):
-    labels = dtrain.get_label()
-    preds = 1.0 / (1.0 + np.exp(-preds))
-    grad = preds - labels
-    hess = preds * (1.0-preds)
-    return grad, hess
-def evalerror(preds, dtrain):
-    labels = dtrain.get_label()
-    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
-param = {'max_depth':2, 'eta':1, 'silent':1}
+logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1/(1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
+  return(list(grad = grad, hess = hess))
+}
+evalerror <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
+  return(list(metric = "error", value = err))
+}
+param <- list(max_depth=2,eta=1,silent=1)
 # train with customized objective
 xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
        obj = logregobj, feval=evalerror)
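As a quick sanity check on the ported objective: the gradient of the logistic loss with respect to the margin is sigmoid(preds) - labels and the hessian is sigmoid(preds) * (1 - sigmoid(preds)). A toy evaluation, independent of any trained model:

preds <- c(-2, 0, 2)   # raw margin scores, as a custom objective receives them
labels <- c(0, 1, 1)
p <- 1 / (1 + exp(-preds))        # the sigmoid applied inside logregobj
data.frame(grad = p - labels,     # should match logregobj's grad
           hess = p * (1 - p))    # and its hess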

View File

@@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
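Those two notes imply that predict() on a model trained with a custom objective returns raw margin scores rather than probabilities. A hedged sketch of that behavior, assuming logregobj as defined in the cross-validation demo above:

param <- list(max_depth = 2, eta = 1, silent = 1)   # objective left as default
bst <- xgb.train(param, dtrain, nrounds = 2,
                 watchlist = list(train = dtrain), obj = logregobj)
margin <- predict(bst, dtest)    # raw margin values, not probabilities
prob <- 1 / (1 + exp(-margin))   # apply the logistic link manually if needed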

View File

@@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
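For completeness, one plausible way the setup above is driven afterwards; xgb.train with a watchlist is an assumption here, mirroring the other demos in this commit rather than the truncated file itself:

watchlist <- list(eval = dtest, train = dtrain)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
# held-out error, thresholding the predicted probabilities at 0.5
pred <- predict(bst, dtest)
err <- mean(as.numeric(pred > 0.5) != getinfo(dtest, 'label'))
cat('test error:', err, '\n')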