Merge branch 'master' of ssh://github.com/tqchen/xgboost

tqchen 2014-09-06 10:29:42 -07:00
commit 7879db8702
5 changed files with 85 additions and 72 deletions

View File

@@ -1,37 +1,32 @@
 require(xgboost)
 require(methods)
-data(iris)
-# we use iris data as example dataset
-# iris is a dataset with 3 types of iris
+data(agaricus.train)
+data(agaricus.test)
+# we use agaricus data as example dataset
 # we will show how to use xgboost to do binary classification here
-# so the class label will be whether the flower is of type setosa
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-# random split train and test set
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-trainX = iris[train_ind,1:4]
-trainY = iris[train_ind,5]
-testX = iris[train_ind,1:4]
-testY = iris[test_ind,5]
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
 #-------------------------------------
 # this is the basic usage of xgboost
-# you can put matrix in data field
+# you can put sparse matrix in data field. this is helpful when your data is sparse
+# for example, when you use one-hot encoding for feature vectors
 bst <- xgboost(data = trainX, label = trainY, max_depth = 1, eta = 1, nround = 2,
                objective = "binary:logistic")
-# alternatively, you can put sparse matrix, this is helpful when your data is sparse
-# for example, when you use one-hot encoding for feature vectors
-sparseX <- as(trainX, "sparseMatrix")
-bst <- xgboost(data = sparseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
+# alternatively, you can put dense matrix
+denseX <- as(trainX, "matrix")
+bst <- xgboost(data = denseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
                objective = "binary:logistic")
 # you can also specify data as file path to a LibSVM format input
-# since we do not have libsvm format file for iris, next line is only for illustration
-# bst <- xgboost(data = 'iris.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
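To see where the rewritten demo ends up, here is a minimal end-to-end sketch under the same setup. It assumes only the agaricus datasets bundled with the xgboost R package; the 0.5 threshold and the error computation are illustrative additions, not lines from the demo itself.

require(xgboost)
data(agaricus.train)
data(agaricus.test)
# agaricus.train$data is a sparse matrix (dgCMatrix); xgboost consumes it directly
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max_depth = 1, eta = 1, nround = 2,
               objective = "binary:logistic")
# binary:logistic yields probabilities, so threshold at 0.5 to get class labels
pred <- predict(bst, agaricus.test$data)
err <- mean(as.numeric(pred > 0.5) != agaricus.test$label)
cat('test error:', err, '\n')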

View File

@@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 watchlist <- list(eval = dtest, train = dtrain)
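The watchlist built here is what drives per-round evaluation output; a short sketch of how it is typically passed to xgb.train (the param list is assumed to match the one defined in the other demos of this commit):

param <- list(max_depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
# each boosting round, xgb.train evaluates every DMatrix in the watchlist and
# prints one result per entry, labeled with the list names (eval, train)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)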

View File

@@ -1,38 +1,45 @@
-#!/usr/bin/python
-import sys
-import numpy as np
-sys.path.append('../../wrapper')
-import xgboost as xgb
+require(xgboost)
 ### load data and do training
-dtrain = xgb.DMatrix('../data/agaricus.txt.train')
-param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
-num_round = 2
-print ('running cross validation')
+data(agaricus.train)
+data(agaricus.test)
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
+num_round <- 2
+param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
+cat('running cross validation\n')
 # do cross validation, this will print result out as
 # [iteration] metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, num_round, nfold=5,
        metrics={'error'}, seed = 0)
-print ('running cross validation, disable standard deviation display')
+cat('running cross validation, disable standard deviation display\n')
 # do cross validation, this will print result out as
 # [iteration] metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, num_round, nfold=5,
-       metrics={'error'}, seed = 0, show_stdv = False)
+       metrics={'error'}, seed = 0, show_stdv = FALSE)
-print ('running cross validation, with preprocessing function')
+cat('running cross validation, with preprocessing function\n')
 # define the preprocessing function
 # used to return the preprocessed training, test data, and parameter
 # we can use this to do weight rescale, etc.
 # as an example, we try to set scale_pos_weight
-def fpreproc(dtrain, dtest, param):
-    label = dtrain.get_label()
-    ratio = float(np.sum(label == 0)) / np.sum(label==1)
-    param['scale_pos_weight'] = ratio
-    return (dtrain, dtest, param)
+fpreproc <- function(dtrain, dtest, param){
+  label <- getinfo(dtrain, 'label')
+  # same negative/positive ratio the python version computes
+  ratio <- sum(label == 0) / sum(label == 1)
+  param <- append(param, list(scale_pos_weight = ratio))
+  return(list(dtrain = dtrain, dtest = dtest, param = param))
+}
 # do cross validation, for each fold
 # the dtrain, dtest, param will be passed into fpreproc
@@ -46,17 +53,22 @@ xgb.cv(param, dtrain, num_round, nfold=5,
 # See custom_objective.py
 ##
-print ('running cross validation, with cutomsized loss function')
+cat('running cross validation, with customized loss function\n')
-def logregobj(preds, dtrain):
-    labels = dtrain.get_label()
-    preds = 1.0 / (1.0 + np.exp(-preds))
-    grad = preds - labels
-    hess = preds * (1.0-preds)
-    return grad, hess
-def evalerror(preds, dtrain):
-    labels = dtrain.get_label()
-    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
-param = {'max_depth':2, 'eta':1, 'silent':1}
+logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1/(1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
+  return(list(grad = grad, hess = hess))
+}
+evalerror <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
+  return(list(metric = "error", value = err))
+}
+param <- list(max_depth=2,eta=1,silent=1)
 # train with customized objective
 xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
        obj = logregobj, feval=evalerror)
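As a quick sanity check on the ported objective: the gradient of the logistic loss with respect to the margin is sigmoid(preds) - labels and the hessian is sigmoid(preds) * (1 - sigmoid(preds)). A toy evaluation, independent of any trained model:

preds <- c(-2, 0, 2)   # raw margin scores, as a custom objective receives them
labels <- c(0, 1, 1)
p <- 1 / (1 + exp(-preds))        # the sigmoid applied inside logregobj
data.frame(grad = p - labels,     # should match logregobj's grad
           hess = p * (1 - p))    # and its hess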

View File

@@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
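Those two notes imply that predict() on a model trained with a custom objective returns raw margin scores rather than probabilities. A hedged sketch of that behavior, assuming logregobj as defined in the cross-validation demo above:

param <- list(max_depth = 2, eta = 1, silent = 1)   # objective left as default
bst <- xgb.train(param, dtrain, nrounds = 2,
                 watchlist = list(train = dtrain), obj = logregobj)
margin <- predict(bst, dtest)    # raw margin values, not probabilities
prob <- 1 / (1 + exp(-margin))   # apply the logistic link manually if needed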

View File

@@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
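For completeness, one plausible way the setup above is driven afterwards; xgb.train with a watchlist is an assumption here, mirroring the other demos in this commit rather than the truncated file itself:

watchlist <- list(eval = dtest, train = dtrain)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
# held-out error, thresholding the predicted probabilities at 0.5
pred <- predict(bst, dtest)
err <- mean(as.numeric(pred > 0.5) != getinfo(dtest, 'label'))
cat('test error:', err, '\n')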