Merge branch 'master' of ssh://github.com/tqchen/xgboost

This commit is contained in:
tqchen 2014-09-06 10:29:42 -07:00
commit 7879db8702
5 changed files with 85 additions and 72 deletions

View File

@ -1,37 +1,32 @@
require(xgboost) require(xgboost)
require(methods) require(methods)
data(iris) data(agaricus.train)
# we use iris data as example dataset data(agaricus.test)
# iris is a dataset with 3 types of iris
# we use agaricus data as example dataset
# we will show how to use xgboost to do binary classification here # we will show how to use xgboost to do binary classification here
# so the class label will be whether the flower is of type setosa
iris[,5] <- as.numeric(iris[,5]=='setosa') trainX = agaricus.train$data
iris <- as.matrix(iris) trainY = agaricus.train$label
set.seed(20) testX = agaricus.test$data
# random split train and test set testY = agaricus.test$label
test_ind <- sample(1:nrow(iris),50)
train_ind <- setdiff(1:nrow(iris),test_ind)
trainX = iris[train_ind,1:4]
trainY = iris[train_ind,5]
testX = iris[train_ind,1:4]
testY = iris[test_ind,5]
#------------------------------------- #-------------------------------------
# this is the basic usage of xgboost # this is the basic usage of xgboost
# you can put matrix in data field # you can put sparse matrix in data field. this is helpful when your data is sparse
# for example, when you use one-hot encoding for feature vectors
bst <- xgboost(data = trainX, label = trainY, max_depth = 1, eta = 1, nround = 2, bst <- xgboost(data = trainX, label = trainY, max_depth = 1, eta = 1, nround = 2,
objective = "binary:logistic") objective = "binary:logistic")
# alternatively, you can put sparse matrix, this is helpful when your data is sparse # alternatively, you can put dense matrix
# for example, when you use one-hot encoding for feature vectors denseX <- as(trainX, "matrix")
sparseX <- as(trainX, "sparseMatrix") bst <- xgboost(data = denseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
bst <- xgboost(data = sparseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
objective = "binary:logistic") objective = "binary:logistic")
# you can also specify data as file path to a LibSVM format input # you can also specify data as file path to a LibSVM format input
# since we do not have libsvm format file for iris, next line is only for illustration # since we do not have libsvm format file for iris, next line is only for illustration
# bst <- xgboost(data = 'iris.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic") # bst <- xgboost(data = 'iris.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5]) dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5]) dtest <- xgb.DMatrix(testX, label=testY)
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic') param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')

View File

@ -1,13 +1,15 @@
require(xgboost) require(xgboost)
data(iris) data(agaricus.train)
iris[,5] <- as.numeric(iris[,5]=='setosa') data(agaricus.test)
iris <- as.matrix(iris)
set.seed(20) trainX = agaricus.train$data
test_ind <- sample(1:nrow(iris),50) trainY = agaricus.train$label
train_ind <- setdiff(1:nrow(iris),test_ind) testX = agaricus.test$data
dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5]) testY = agaricus.test$label
dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
watchlist <- list(eval = dtest, train = dtrain) watchlist <- list(eval = dtest, train = dtrain)

View File

@ -1,38 +1,45 @@
#!/usr/bin/python require(xgboost)
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
### load data in do training data(agaricus.train)
dtrain = xgb.DMatrix('../data/agaricus.txt.train') data(agaricus.test)
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
num_round = 2
print ('running cross validation') trainX = agaricus.train$data
trainY = agaricus.train$label
testX = agaricus.test$data
testY = agaricus.test$label
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
num_round <- 2
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
cat('running cross validation\n')
# do cross validation, this will print result out as # do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value # [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric # std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5, xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed = 0) metrics={'error'}, seed = 0)
print ('running cross validation, disable standard deviation display') cat('running cross validation, disable standard deviation display\n')
# do cross validation, this will print result out as # do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value # [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric # std_value is standard deviation of the metric
# Same cross validation as above, but with the standard deviation display
# switched off. R has no `False` constant (that spelling is Python's); the
# logical literal is `FALSE`.
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 0, show_stdv = FALSE)
print ('running cross validation, with preprocessing function') cat('running cross validation, with preprocessing function\n')
# define the preprocessing function # define the preprocessing function
# used to return the preprocessed training, test data, and parameter # used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc. # we can use this to do weight rescale, etc.
# as an example, we try to set scale_pos_weight # as an example, we try to set scale_pos_weight
# Per-fold preprocessing hook for xgb.cv.
#
# Args:
#   dtrain: training data for the fold; its "label" field is read via getinfo().
#   dtest:  held-out data for the fold; returned unchanged.
#   param:  list of booster parameters.
#
# Returns:
#   list(dtrain, dtest, param), where param$scale_pos_weight is set to the
#   number of negative labels divided by the number of positive labels.
fpreproc <- function(dtrain, dtest, param) {
  label <- getinfo(dtrain, "label")
  # negatives / positives, not the share of negatives: mean(label == 0) would
  # give negatives / total, which under-weights the positive class.
  ratio <- sum(label == 0) / sum(label == 1)
  # Assign rather than append() so an existing scale_pos_weight entry is
  # overwritten instead of being duplicated in the parameter list.
  param[["scale_pos_weight"]] <- ratio
  list(dtrain = dtrain, dtest = dtest, param = param)
}
# do cross validation, for each fold # do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc # the dtrain, dtest, param will be passed into fpreproc
@ -46,17 +53,22 @@ xgb.cv(param, dtrain, num_round, nfold=5,
# See custom_objective.py # See custom_objective.py
## ##
print ('running cross validation, with cutomsized loss function') print ('running cross validation, with cutomsized loss function')
def logregobj(preds, dtrain):
labels = dtrain.get_label()
preds = 1.0 / (1.0 + np.exp(-preds))
grad = preds - labels
hess = preds * (1.0-preds)
return grad, hess
def evalerror(preds, dtrain):
labels = dtrain.get_label()
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
param = {'max_depth':2, 'eta':1, 'silent':1} logregobj <- function(preds, dtrain) {
# Customized objective handed to xgb.cv as `obj`: takes the predictions and
# the data they were made on, and returns list(grad = ..., hess = ...).
labels <- getinfo(dtrain, "label")
# pass the incoming predictions through the logistic function 1 / (1 + exp(-x))
preds <- 1/(1 + exp(-preds))
# grad: transformed prediction minus label, element-wise
grad <- preds - labels
# hess: p * (1 - p) on the transformed prediction, element-wise
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
# Customized evaluation metric handed to xgb.cv as `feval`.
# A prediction counts as the positive class when it is greater than 0; the
# result is the share of rows whose label disagrees with that call.
# Returns list(metric = "error", value = <share of disagreeing rows>).
evalerror <- function(preds, dtrain) {
  truth <- getinfo(dtrain, "label")
  predicted_positive <- preds > 0
  n_wrong <- sum(truth != predicted_positive)
  list(metric = "error", value = as.numeric(n_wrong) / length(truth))
}
param <- list(max_depth=2,eta=1,silent=1)
# train with customized objective # train with customized objective
xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0, xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
obj = logregobj, feval=evalerror) obj = logregobj, feval=evalerror)

View File

@ -1,13 +1,15 @@
require(xgboost) require(xgboost)
data(iris) data(agaricus.train)
iris[,5] <- as.numeric(iris[,5]=='setosa') data(agaricus.test)
iris <- as.matrix(iris)
set.seed(20) trainX = agaricus.train$data
test_ind <- sample(1:nrow(iris),50) trainY = agaricus.train$label
train_ind <- setdiff(1:nrow(iris),test_ind) testX = agaricus.test$data
dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5]) testY = agaricus.test$label
dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
# note: for customized objective function, we leave objective as default # note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction # note: what we are getting is margin value in prediction

View File

@ -1,13 +1,15 @@
require(xgboost) require(xgboost)
data(iris) data(agaricus.train)
iris[,5] <- as.numeric(iris[,5]=='setosa') data(agaricus.test)
iris <- as.matrix(iris)
set.seed(20) trainX = agaricus.train$data
test_ind <- sample(1:nrow(iris),50) trainY = agaricus.train$label
train_ind <- setdiff(1:nrow(iris),test_ind) testX = agaricus.test$data
dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5]) testY = agaricus.test$label
dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic') param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')