From a35d93c7361eacd740842951461701bf1d9c4375 Mon Sep 17 00:00:00 2001
From: hetong
Date: Sat, 6 Sep 2014 10:19:46 -0700
Subject: [PATCH] change data from iris back to mushroom

---
 demo/guide-R/basic_walkthrough.R     | 37 +++++++---------
 demo/guide-R/boost_from_prediction.R | 18 ++++----
 demo/guide-R/cross_validation.R      | 66 ++++++++++++++++------------
 demo/guide-R/custom_objective.R      | 18 ++++----
 demo/guide-R/predict_first_ntree.R   | 18 ++++----
 5 files changed, 85 insertions(+), 72 deletions(-)

diff --git a/demo/guide-R/basic_walkthrough.R b/demo/guide-R/basic_walkthrough.R
index 0c7e677ab..0d057c0d7 100644
--- a/demo/guide-R/basic_walkthrough.R
+++ b/demo/guide-R/basic_walkthrough.R
@@ -1,37 +1,32 @@
 require(xgboost)
 require(methods)
-data(iris)
-# we use iris data as example dataset
-# iris is a dataset with 3 types of iris
+data(agaricus.train)
+data(agaricus.test)
+
+# we use the agaricus (mushroom) data as an example dataset
 # we will show how to use xgboost to do binary classification here
-# so the class label will be whether the flower is of type setosa
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-# random split train and test set
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-trainX = iris[train_ind,1:4]
-trainY = iris[train_ind,5]
-testX = iris[train_ind,1:4]
-testY = iris[test_ind,5]
+
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
 
 #-------------------------------------
 # this is the basic usage of xgboost
-# you can put matrix in data field
+# you can put a sparse matrix in the data field. this is helpful when your data is sparse,
+# for example, when you use one-hot encoding for feature vectors
 bst <- xgboost(data = trainX, label = trainY, max_depth = 1, eta = 1, nround = 2,
                objective = "binary:logistic")
-# alternatively, you can put sparse matrix, this is helpful when your data is sparse
-# for example, when you use one-hot encoding for feature vectors
-sparseX <- as(trainX, "sparseMatrix")
-bst <- xgboost(data = sparseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
+# alternatively, you can put a dense matrix
+denseX <- as(trainX, "matrix")
+bst <- xgboost(data = denseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
                objective = "binary:logistic")
 # you can also specify data as file path to a LibSVM format input
-# since we do not have libsvm format file for iris, next line is only for illustration
-# bst <- xgboost(data = 'iris.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
+# the agaricus data also ships in libsvm format under demo/data, for example:
+# bst <- xgboost(data = '../data/agaricus.txt.train', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
 
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
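For anyone reading this patch who wants to try the updated walkthrough end to end, a minimal sketch follows. It assumes only what the demo itself uses: the agaricus.train and agaricus.test objects bundled with the xgboost R package, and that predict() returns probabilities under objective = "binary:logistic".

require(xgboost)
data(agaricus.train)
data(agaricus.test)
# train a small model on the sparse mushroom features
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
# evaluate on the held-out set
pred <- predict(bst, agaricus.test$data)
err <- mean(as.numeric(pred > 0.5) != agaricus.test$label)
cat("test error:", err, "\n")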
diff --git a/demo/guide-R/boost_from_prediction.R b/demo/guide-R/boost_from_prediction.R
index 69fe4153c..9bf9b5d41 100755
--- a/demo/guide-R/boost_from_prediction.R
+++ b/demo/guide-R/boost_from_prediction.R
@@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 
 watchlist <- list(eval = dtest, train = dtrain)
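This hunk only swaps the input data; the rest of boost_from_prediction.R is unchanged and therefore not shown in the diff. For context, the idea of that demo is to train one model, extract its margin (untransformed) predictions, and let a second model boost from them instead of from zero. A sketch of that flow, assuming the xgb.train / predict(..., outputmargin = TRUE) / setinfo "base_margin" API of the R package and a param list like the one in basic_walkthrough.R:

param <- list(max_depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
# train a first model for one round
bst <- xgb.train(param, dtrain, 1, watchlist)
# margin predictions, i.e. values before the logistic transformation
ptrain <- predict(bst, dtrain, outputmargin = TRUE)
ptest <- predict(bst, dtest, outputmargin = TRUE)
# set them as the starting point for further boosting
setinfo(dtrain, "base_margin", ptrain)
setinfo(dtest, "base_margin", ptest)
# the second model now boosts from the previous predictions
bst2 <- xgb.train(param, dtrain, 1, watchlist)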
diff --git a/demo/guide-R/cross_validation.R b/demo/guide-R/cross_validation.R
index a50586c58..c35974875 100755
--- a/demo/guide-R/cross_validation.R
+++ b/demo/guide-R/cross_validation.R
@@ -1,38 +1,45 @@
-#!/usr/bin/python
-import sys
-import numpy as np
-sys.path.append('../../wrapper')
-import xgboost as xgb
+require(xgboost)
 
-### load data in do training
-dtrain = xgb.DMatrix('../data/agaricus.txt.train')
-param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
-num_round = 2
+data(agaricus.train)
+data(agaricus.test)
 
-print ('running cross validation')
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
+
+num_round <- 2
+param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
+
+cat('running cross validation\n')
 # do cross validation, this will print result out as
 # [iteration] metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed = 0)
 
-print ('running cross validation, disable standard deviation display')
+cat('running cross validation, disable standard deviation display\n')
 # do cross validation, this will print result out as
-# [iteration] metric_name:mean_value+std_value
-# std_value is standard deviation of the metric
+# [iteration] metric_name:mean_value
+# (the standard deviation display is turned off by show_stdv)
 xgb.cv(param, dtrain, num_round, nfold=5,
-       metrics={'error'}, seed = 0, show_stdv = False)
+       metrics={'error'}, seed = 0, show_stdv = FALSE)
 
-print ('running cross validation, with preprocessing function')
+cat('running cross validation, with preprocessing function\n')
 # define the preprocessing function
 # used to return the preprocessed training, test data, and parameter
 # we can use this to do weight rescale, etc.
 # as an example, we try to set scale_pos_weight
-def fpreproc(dtrain, dtest, param):
-    label = dtrain.get_label()
-    ratio = float(np.sum(label == 0)) / np.sum(label==1)
-    param['scale_pos_weight'] = ratio
-    return (dtrain, dtest, param)
+fpreproc <- function(dtrain, dtest, param){
+    label <- getinfo(dtrain, 'label')
+    ratio <- sum(label==0) / sum(label==1)
+    param <- append(param, list(scale_pos_weight = ratio))
+    return(list(dtrain = dtrain, dtest = dtest, param = param))
+}
+
 # do cross validation, for each fold
 # the dtrain, dtest, param will be passed into fpreproc
@@ -46,17 +53,22 @@ xgb.cv(param, dtrain, num_round, nfold=5,
-# See custom_objective.py
+# See custom_objective.R
 ##
-print ('running cross validation, with cutomsized loss function')
+cat('running cross validation, with customized loss function\n')
-def logregobj(preds, dtrain):
-    labels = dtrain.get_label()
-    preds = 1.0 / (1.0 + np.exp(-preds))
-    grad = preds - labels
-    hess = preds * (1.0-preds)
-    return grad, hess
-def evalerror(preds, dtrain):
-    labels = dtrain.get_label()
-    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
-param = {'max_depth':2, 'eta':1, 'silent':1}
+logregobj <- function(preds, dtrain) {
+    labels <- getinfo(dtrain, "label")
+    preds <- 1/(1 + exp(-preds))
+    grad <- preds - labels
+    hess <- preds * (1 - preds)
+    return(list(grad = grad, hess = hess))
+}
+
+evalerror <- function(preds, dtrain) {
+    labels <- getinfo(dtrain, "label")
+    err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
+    return(list(metric = "error", value = err))
+}
+
+param <- list(max_depth=2,eta=1,silent=1)
 # train with customized objective
 xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
        obj = logregobj, feval=evalerror)

diff --git a/demo/guide-R/custom_objective.R b/demo/guide-R/custom_objective.R
index f9dd03754..0e25a254b 100755
--- a/demo/guide-R/custom_objective.R
+++ b/demo/guide-R/custom_objective.R
@@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction

diff --git a/demo/guide-R/predict_first_ntree.R b/demo/guide-R/predict_first_ntree.R
index 7596e66f8..1b90cc86b 100755
--- a/demo/guide-R/predict_first_ntree.R
+++ b/demo/guide-R/predict_first_ntree.R
@@ -1,13 +1,15 @@
 require(xgboost)
-data(iris)
-iris[,5] <- as.numeric(iris[,5]=='setosa')
-iris <- as.matrix(iris)
-set.seed(20)
-test_ind <- sample(1:nrow(iris),50)
-train_ind <- setdiff(1:nrow(iris),test_ind)
-dtrain <- xgb.DMatrix(iris[train_ind,1:4], label=iris[train_ind,5])
-dtest <- xgb.DMatrix(iris[test_ind,1:4], label=iris[test_ind,5])
+data(agaricus.train)
+data(agaricus.test)
+
+trainX = agaricus.train$data
+trainY = agaricus.train$label
+testX = agaricus.test$data
+testY = agaricus.test$label
+
+dtrain <- xgb.DMatrix(trainX, label=trainY)
+dtest <- xgb.DMatrix(testX, label=testY)
 
 param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
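As with the previous files, the patch only touches the data-loading block of predict_first_ntree.R; the remainder of that demo compares predictions made with the whole ensemble against predictions restricted to the first tree. A sketch of that comparison, assuming the classic ntreelimit argument of predict.xgb.Booster:

# train a short ensemble, then predict with all trees vs. only the first
bst <- xgb.train(param, dtrain, 3, list(eval = dtest, train = dtrain))
pred_all <- predict(bst, dtest)
pred_first <- predict(bst, dtest, ntreelimit = 1)
labels <- getinfo(dtest, "label")
cat("error of all trees:", mean(as.numeric(pred_all > 0.5) != labels), "\n")
cat("error of first tree:", mean(as.numeric(pred_first > 0.5) != labels), "\n")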