Merge branch 'master' of ssh://github.com/tqchen/xgboost

2014-09-06 10:29:42 -07:00 · 2014-09-06 10:29:42 -07:00 · 7879db8702
commit 7879db8702
parent 35431e664e 166df74024
5 changed files with 85 additions and 72 deletions
--- a/demo/guide-R/basic_walkthrough.R
+++ b/demo/guide-R/basic_walkthrough.R
@ -1,37 +1,32 @@
 require(xgboost)
 require(methods)
-data(iris)
+data(agaricus.train)
-# we use iris data as example dataset
+data(agaricus.test)
-# iris is a dataset with 3 types of iris
+
 # we use agaricus data as example dataset
 # we will show how to use xgboost to do binary classification here
-# so the class label will be whether the flower is of type setosa
+
-iris[,5] <- as.numeric(iris[,5]=='setosa')
+trainX = agaricus.train$data
-iris <- as.matrix(iris)
+trainY = agaricus.train$label
-set.seed(20)
+testX = agaricus.test$data
-# random split train and test set
+testY = agaricus.test$label
 test_ind <- sample(1:nrow(iris),50)
 train_ind <- setdiff(1:nrow(iris),test_ind)
 trainX = iris[train_ind,1:4]
 trainY = iris[train_ind,5]
 testX = iris[train_ind,1:4]
 testY = iris[test_ind,5]
 #-------------------------------------
 # this is the basic usage of xgboost
-# you can put matrix in data field
+# you can put a sparse matrix in the data field; this is helpful when your data is sparse
 # for example, when you use one-hot encoding for feature vectors
 bst <- xgboost(data = trainX, label = trainY, max_depth = 1, eta = 1, nround = 2,
               objective = "binary:logistic")
-# alternatively, you can put sparse matrix, this is helpful when your data is sparse
+# alternatively, you can put a dense matrix
-# for example, when you use one-hot encoding for feature vectors
+denseX <- as(trainX, "matrix")
-sparseX <- as(trainX, "sparseMatrix")
+bst <- xgboost(data = denseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
 bst <- xgboost(data = sparseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
               objective = "binary:logistic")
 # you can also specify data as file path to a LibSVM format input
 # since we do not have a libsvm format file for iris, the next line is only for illustration
 # bst <- xgboost(data = 'iris.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
+dtrain <- xgb.DMatrix(trainX, label=trainY)
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+dtest <- xgb.DMatrix(testX, label=testY)
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
--- a/demo/guide-R/boost_from_prediction.R
+++ b/demo/guide-R/boost_from_prediction.R
@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
+data(agaricus.train)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
+data(agaricus.test)
-iris <- as.matrix(iris)
+
-set.seed(20)
+trainX = agaricus.train$data
-test_ind <- sample(1:nrow(iris),50)
+trainY = agaricus.train$label
-train_ind <- setdiff(1:nrow(iris),test_ind)
+testX = agaricus.test$data
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
+testY = agaricus.test$label
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+
 dtrain <- xgb.DMatrix(trainX, label=trainY)
 dtest <- xgb.DMatrix(testX, label=testY)
 watchlist <- list(eval = dtest, train = dtrain)
--- a/demo/guide-R/cross_validation.R
+++ b/demo/guide-R/cross_validation.R
@ -1,38 +1,45 @@
-#!/usr/bin/python
+require(xgboost)
 import sys
 import numpy as np
 sys.path.append('../../wrapper')
 import xgboost as xgb
-### load data in do training
+data(agaricus.train)
-dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+data(agaricus.test)
 param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
 num_round = 2
-print ('running cross validation')
+trainX = agaricus.train$data
 trainY = agaricus.train$label
 testX = agaricus.test$data
 testY = agaricus.test$label
 dtrain <- xgb.DMatrix(trainX, label=trainY)
 dtest <- xgb.DMatrix(testX, label=testY)
 num_round <- 2
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
 cat('running cross validation\n')
 # do cross validation, this will print result out as
 # [iteration]  metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 0)
-print ('running cross validation, disable standard deviation display')
+cat('running cross validation, disable standard deviation display\n')
 # do cross validation, this will print result out as
 # [iteration]  metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 0, show_stdv = False)
-print ('running cross validation, with preprocessing function')
+cat('running cross validation, with preprocessing function\n')
 # define the preprocessing function
 # used to return the preprocessed training, test data, and parameter
 # we can use this to do weight rescale, etc.
 # as an example, we try to set scale_pos_weight
-def fpreproc(dtrain, dtest, param):
+fpreproc <- function(dtrain, dtest, param){
-    label = dtrain.get_label()
+  label <- getinfo(dtrain, 'label')
-    ratio = float(np.sum(label == 0)) / np.sum(label==1)
+  ratio <- mean(label==0)
-    param['scale_pos_weight'] = ratio
+  param <- append(param, list(scale_pos_weight = ratio))
-    return (dtrain, dtest, param)
+  return(list(dtrain=dtrain, dtest= dtest, param = param))
 }
 # do cross validation, for each fold
 # the dtrain, dtest, param will be passed into fpreproc
@ -46,17 +53,22 @@ xgb.cv(param, dtrain, num_round, nfold=5,
 # See custom_objective.py
 ##
 print ('running cross validation, with cutomsized loss function')
 def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0-preds)
    return grad, hess
 def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
-param = {'max_depth':2, 'eta':1, 'silent':1} 
+logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
 }
 evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
  return(list(metric = "error", value = err))
 }
 param <- list(max_depth=2,eta=1,silent=1)
 # train with customized objective
 xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
       obj = logregobj, feval=evalerror)
--- a/demo/guide-R/custom_objective.R
+++ b/demo/guide-R/custom_objective.R
@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
+data(agaricus.train)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
+data(agaricus.test)
-iris <- as.matrix(iris)
+
-set.seed(20)
+trainX = agaricus.train$data
-test_ind <- sample(1:nrow(iris),50)
+trainY = agaricus.train$label
-train_ind <- setdiff(1:nrow(iris),test_ind)
+testX = agaricus.test$data
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
+testY = agaricus.test$label
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+
 dtrain <- xgb.DMatrix(trainX, label=trainY)
 dtest <- xgb.DMatrix(testX, label=testY)
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
--- a/demo/guide-R/predict_first_ntree.R
+++ b/demo/guide-R/predict_first_ntree.R
@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
+data(agaricus.train)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
+data(agaricus.test)
-iris <- as.matrix(iris)
+
-set.seed(20)
+trainX = agaricus.train$data
-test_ind <- sample(1:nrow(iris),50)
+trainY = agaricus.train$label
-train_ind <- setdiff(1:nrow(iris),test_ind)
+testX = agaricus.test$data
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
+testY = agaricus.test$label
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+
 dtrain <- xgb.DMatrix(trainX, label=trainY)
 dtest <- xgb.DMatrix(testX, label=testY)
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')