Merge branch 'master' of ssh://github.com/tqchen/xgboost
commit 7879db8702
@@ -1,37 +1,32 @@
 require(xgboost)
 require(methods)
-data(iris)
-# we use iris data as example dataset
-# iris is a dataset with 3 types of iris
+data(agaricus.train)
+data(agaricus.test)
+
+# we use agaricus data as example dataset
 # we will show how to use xgboost to do binary classification here
-# so the class label will be whether the flower is of type setosa
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-# random split train and test set
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-trainX = iris[train_ind,1:4]
-trainY = iris[train_ind,5]
-testX = iris[train_ind,1:4]
-testY = iris[test_ind,5]
+
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
 #-------------------------------------
 # this is the basic usage of xgboost
-# you can put matrix in data field
+# you can put sparse matrix in data field. this is helpful when your data is sparse
+# for example, when you use one-hot encoding for feature vectors
 bst <- xgboost(data = trainX, label = trainY, max_depth = 1, eta = 1, nround = 2,
                objective = "binary:logistic")
-# alternatively, you can put sparse matrix, this is helpful when your data is sparse
-# for example, when you use one-hot encoding for feature vectors
-sparseX <- as(trainX, "sparseMatrix")
-bst <- xgboost(data = sparseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
+# alternatively, you can put dense matrix
+denseX <- as(trainX, "matrix")
+bst <- xgboost(data = denseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
                objective = "binary:logistic")
 
 # you can also specify data as file path to a LibSVM format input
-# since we do not have libsvm format file for iris, next line is only for illustration
-# bst <- xgboost(data = 'iris.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
 
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 
 
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
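To round out the walkthrough, the trained booster can be scored on the held-out data. A minimal sketch, assuming the `bst`, `testX`, and `testY` objects from the new version of the demo above:

# binary:logistic predictions come back as probabilities
pred <- predict(bst, testX)
# threshold at 0.5 and compare with the held-out labels
err <- mean(as.numeric(pred > 0.5) != testY)
print(paste("test-error =", err))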
@@ -1,13 +1,15 @@
 require(xgboost)
 
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 
 
 watchlist <- list(eval = dtest, train = dtrain)
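The watchlist above is consumed by `xgb.train`, which reports the evaluation metric on every DMatrix in the list after each boosting round. A minimal sketch, assuming a `param` list like the one the other demos in this commit use:

param <- list(max_depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
# prints eval-error and train-error once per round
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)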
@@ -1,38 +1,45 @@
-#!/usr/bin/python
-import sys
-import numpy as np
-sys.path.append('../../wrapper')
-import xgboost as xgb
+require(xgboost)
 
 ### load data and do training
-dtrain = xgb.DMatrix('../data/agaricus.txt.train')
-param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
-num_round = 2
+data(agaricus.train)
+data(agaricus.test)
 
-print ('running cross validation')
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
+
+num_round <- 2
+param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
+
+cat('running cross validation\n')
 # do cross validation, this will print result out as
 # [iteration] metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, num_round, nfold=5,
        metrics={'error'}, seed = 0)
 
-print ('running cross validation, disable standard deviation display')
+cat('running cross validation, disable standard deviation display\n')
 # do cross validation, this will print result out as
 # [iteration] metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, num_round, nfold=5,
        metrics={'error'}, seed = 0, show_stdv = False)
 
-print ('running cross validation, with preprocessing function')
+cat('running cross validation, with preprocessing function\n')
 # define the preprocessing function
 # used to return the preprocessed training, test data, and parameter
 # we can use this to do weight rescale, etc.
 # as an example, we try to set scale_pos_weight
-def fpreproc(dtrain, dtest, param):
-    label = dtrain.get_label()
-    ratio = float(np.sum(label == 0)) / np.sum(label==1)
-    param['scale_pos_weight'] = ratio
-    return (dtrain, dtest, param)
+fpreproc <- function(dtrain, dtest, param){
+  label <- getinfo(dtrain, 'label')
+  ratio <- mean(label==0)
+  param <- append(param, list(scale_pos_weight = ratio))
+  return(list(dtrain=dtrain, dtest= dtest, param = param))
+}
 
 
 # do cross validation, for each fold
 # the dtrain, dtest, param will be passed into fpreproc
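Note that the R `fpreproc` sets `scale_pos_weight` to the fraction of negative labels, where the Python version used the negative-to-positive count ratio. A minimal sketch of inspecting what the function returns, using only the `dtrain`, `dtest`, and `param` objects defined in this demo:

# run the preprocessing once by hand to see the parameter it injects
res <- fpreproc(dtrain, dtest, param)
print(res$param$scale_pos_weight)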
@@ -46,17 +53,22 @@ xgb.cv(param, dtrain, num_round, nfold=5,
 # See custom_objective.py
 ##
-print ('running cross validation, with customized loss function')
-def logregobj(preds, dtrain):
-    labels = dtrain.get_label()
-    preds = 1.0 / (1.0 + np.exp(-preds))
-    grad = preds - labels
-    hess = preds * (1.0-preds)
-    return grad, hess
-def evalerror(preds, dtrain):
-    labels = dtrain.get_label()
-    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
-
-param = {'max_depth':2, 'eta':1, 'silent':1}
+logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1/(1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
+  return(list(grad = grad, hess = hess))
+}
+
+evalerror <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
+  return(list(metric = "error", value = err))
+}
+
+param <- list(max_depth=2,eta=1,silent=1)
 # train with customized objective
 xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
        obj = logregobj, feval=evalerror)
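The same objective/evaluation pair also works for plain training, not only cross validation. A minimal sketch, assuming `dtrain`, `dtest`, `num_round`, and the functions above (the watchlist is constructed here just for illustration):

watchlist <- list(eval = dtest, train = dtrain)
# param leaves objective unset, so logregobj supplies gradients and
# evalerror supplies the reported metric
bst <- xgb.train(param, dtrain, num_round, watchlist,
                 obj = logregobj, feval = evalerror)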
@@ -1,13 +1,15 @@
 require(xgboost)
 
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
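Because prediction under a customized objective returns the raw margin, recovering probabilities takes one extra step. A minimal sketch, assuming a booster `bst` trained with `logregobj` on the `dtrain` above:

# margin values, not probabilities, under a custom objective
margin <- predict(bst, dtest)
# apply the logistic transformation ourselves
prob <- 1 / (1 + exp(-margin))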
@@ -1,13 +1,15 @@
 require(xgboost)
 
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 
 
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
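A minimal sketch of driving this setup end to end, assuming only the `param`, `dtrain`, and `dtest` defined above:

bst <- xgb.train(param, dtrain, nrounds = 2)
# error rate on the held-out set
pred <- predict(bst, dtest)
err <- mean(as.numeric(pred > 0.5) != getinfo(dtest, 'label'))
print(paste("test-error =", err))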