[R] Remove demos (#10750)
This commit is contained in:
parent 06c4246ff1
commit 12c6b7ceea
@ -1,14 +0,0 @@
basic_walkthrough           Basic feature walkthrough
custom_objective            Customize loss function, and evaluation metric
boost_from_prediction       Boosting from existing prediction
predict_first_ntree         Predicting using first n trees
generalized_linear_model    Generalized Linear Model
cross_validation            Cross validation
create_sparse_matrix        Create Sparse Matrix
predict_leaf_indices        Predicting the corresponding leaves
early_stopping              Early stopping in training
poisson_regression          Poisson regression on count data
tweedie_regression          Tweedie regression
gpu_accelerated             GPU-accelerated tree building algorithms
interaction_constraints     Interaction constraints among features
@ -1,19 +0,0 @@
XGBoost R Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Customize loss function, and evaluation metric](custom_objective.R)
* [Boosting from existing prediction](boost_from_prediction.R)
* [Predicting using first n trees](predict_first_ntree.R)
* [Generalized Linear Model](generalized_linear_model.R)
* [Cross validation](cross_validation.R)
* [Create a sparse matrix from a dense one](create_sparse_matrix.R)
* [Use GPU-accelerated tree building algorithms](gpu_accelerated.R)

Benchmarks
====
* [Starter script for Kaggle Higgs Boson](../../demo/kaggle-higgs)

Notes
====
* Contributions of examples and benchmarks are more than welcome!
* If you would like to share how you use xgboost to solve your problem, send a pull request :)
@ -1,113 +0,0 @@
require(xgboost)
require(methods)

# load the agaricus dataset
# in this example, we aim to predict whether a mushroom is edible
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
train <- agaricus.train
test <- agaricus.test
# the loaded data are stored in a sparseMatrix, and the label is a numeric vector in {0,1}
class(train$label)
class(train$data)

#-------------Basic Training using XGBoost-----------------
# this is the basic usage of xgboost: you can put a matrix in the data field
# note: we are putting in a sparse matrix here; xgboost naturally handles sparse input
# use a sparse matrix when your features are sparse (e.g. when you are using one-hot encoding)
print("Training xgboost with sparseMatrix")
bst <- xgboost(x = train$data, y = factor(train$label, c(0, 1)),
               params = list(max_depth = 2, eta = 1),
               nrounds = 2, nthread = 2)
# alternatively, you can put in a dense matrix, i.e. a basic R matrix
print("Training xgboost with Matrix")
bst <- xgboost(x = as.matrix(train$data), y = factor(train$label, c(0, 1)),
               params = list(max_depth = 2, eta = 1),
               nrounds = 2, nthread = 2)

# you can also put in an xgb.DMatrix object, which stores the label, data and other metadata needed for advanced features
print("Training xgboost with xgb.DMatrix")
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
params <- list(max_depth = 2, eta = 1, nthread = 2, objective = "binary:logistic")
bst <- xgb.train(data = dtrain, params = params, nrounds = 2)

# verbose = 0, 1, 2
print("Train xgboost with verbose 0, no message")
bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 0)
print("Train xgboost with verbose 1, print evaluation metric")
bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 1)
print("Train xgboost with verbose 2, also print information about the tree")
bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 2)

# you can also specify data as a file path to a LIBSVM format input
# since we do not have this file with us, the following line is just for illustration
# bst <- xgboost(data = 'agaricus.train.svm', max_depth = 2, eta = 1, nrounds = 2, objective = "binary:logistic")

#--------------------basic prediction using xgboost--------------
# you can do prediction using the following line
# you can put in a Matrix, sparseMatrix, or xgb.DMatrix
pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))

#-------------------save and load models-------------------------
# save the model to a binary local file
xgb.save(bst, "xgboost.model")
# load the binary model back into R
# the function doesn't take 'nthreads', but the thread count can be set like this:
RhpcBLASctl::omp_set_num_threads(1)
bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))

# save the model to an R raw vector
raw <- xgb.save.raw(bst)
# load the binary model back into R
bst3 <- xgb.load.raw(raw)
pred3 <- predict(bst3, test$data)
# pred3 should be identical to pred
print(paste("sum(abs(pred3-pred))=", sum(abs(pred3 - pred))))

#----------------Advanced features --------------
# to use advanced features, we need to put the data in an xgb.DMatrix
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
dtest <- xgb.DMatrix(data = test$data, label = test$label)
#---------------Using an evaluation set----------------
# 'evals' is a list of xgb.DMatrix objects, each tagged with a name
evals <- list(train = dtrain, test = dtest)
# to train with an evaluation set, use xgb.train, which exposes more advanced features
# the 'evals' argument allows us to monitor the evaluation result on all data in the list
print("Train xgboost using xgb.train with evaluation data")
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, evals = evals,
                 nthread = 2, objective = "binary:logistic")
# we can change the evaluation metric, or use multiple evaluation metrics
print("Train xgboost using xgb.train with evaluation data, watch logloss and error")
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, evals = evals,
                 eval_metric = "error", eval_metric = "logloss",
                 nthread = 2, objective = "binary:logistic")

# an xgb.DMatrix can also be saved using xgb.DMatrix.save
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# to load it back in, simply call xgb.DMatrix on the file
dtrain2 <- xgb.DMatrix("dtrain.buffer")
bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, evals = evals,
                 nthread = 2, objective = "binary:logistic")
# information can be extracted from an xgb.DMatrix using getinfo
label <- getinfo(dtest, "label")
pred <- predict(bst, dtest)
err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
print(paste("test-error=", err))

# you can dump the trees you learned into a text file using xgb.dump
dump_path <- file.path(tempdir(), 'dump.raw.txt')
xgb.dump(bst, dump_path, with_stats = TRUE)
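# --- Illustrative sketch (added here for clarity; not part of the original demo) ---
# Peek at the first few lines of the dump to see the text representation of the
# trees, including split conditions and (because of with_stats) gain/cover values.
print(head(readLines(dump_path)))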

# finally, you can check which features are the most important
print("Most important features (look at column Gain):")
imp_matrix <- xgb.importance(feature_names = colnames(train$data), model = bst)
print(imp_matrix)

# feature importance bar plot by gain
print("Feature importance plot:")
print(xgb.plot.importance(importance_matrix = imp_matrix))
@ -1,26 +0,0 @@
require(xgboost)
# load the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

evals <- list(eval = dtest, train = dtrain)
###
# advanced: start from an initial base prediction
#
print('start running example to start from an initial prediction')
# train xgboost for 1 round
param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic')
bst <- xgb.train(param, dtrain, 1, evals)
# Note: we need the margin value instead of the transformed prediction when setting base_margin
# predicting with outputmargin = TRUE always gives the margin values before the logistic transformation
ptrain <- predict(bst, dtrain, outputmargin = TRUE)
ptest <- predict(bst, dtest, outputmargin = TRUE)
# set the base_margin property of dtrain and dtest
# the base margin is the base prediction we will boost from
setinfo(dtrain, "base_margin", ptrain)
setinfo(dtest, "base_margin", ptest)

print('this is the result of boosting from the initial prediction')
bst <- xgb.train(params = param, data = dtrain, nrounds = 1, evals = evals)
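
# --- Illustrative sketch (added for clarity; not part of the original demo, and the
# --- variable names below are new) ---
# With the same deterministic parameters, one round boosted on top of the stored base
# margin should closely track a model trained for two rounds from scratch.
dtest_fresh <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)  # no base_margin
bst_two_rounds <- xgb.train(params = param,
                            data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
                            nrounds = 2)
margin_continued <- predict(bst, dtest, outputmargin = TRUE)   # includes the stored base margin
margin_two_rounds <- predict(bst_two_rounds, dtest_fresh, outputmargin = TRUE)
print(paste("max abs margin difference:", max(abs(margin_continued - margin_two_rounds))))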
@ -1,117 +0,0 @@
require(xgboost)
require(Matrix)
require(data.table)
if (!require(vcd)) {
  install.packages('vcd') # available on CRAN; used for its dataset with categorical values
  require(vcd)
}
# According to its documentation, XGBoost works only on numbers.
# Sometimes the dataset we have to work on contains categorical data.
# A categorical variable is one which has a fixed number of possible values.
# For example, if for each observation a variable called "Colour" can have only
# "red", "blue" or "green" as its value, it is a categorical variable.
#
# In R, a categorical variable is called a factor.
# Type ?factor in the console for more information.
#
# In this demo we will see how to transform a dense data frame with categorical variables
# into a sparse matrix before analyzing it in XGBoost.
# The method we are going to use is usually called "one-hot encoding".

# load the Arthritis dataset into memory
data(Arthritis)

# create a copy of the dataset with the data.table package
# (data.table is 100% compliant with R data frames, but its syntax is a lot more consistent
# and its performance is really good).
df <- data.table(Arthritis, keep.rownames = FALSE)

# Let's have a look at the data.table
cat("Print the dataset\n")
print(df)

# 2 columns have factor type, one has ordinal type
# (an ordinal variable is a categorical variable whose values can be ordered, here: None > Some > Marked).
cat("Structure of the dataset\n")
str(df)

# Let's add some new categorical features to see if it helps.
# Of course these features are highly correlated with the Age feature.
# Usually that's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features,
# even in the case of highly correlated features.

# For the first feature we create groups of age by rounding the real age.
# Note that we transform it to a factor (categorical data) so the algorithm treats the groups as independent values.
df[, AgeDiscret := as.factor(round(Age / 10, 0))]

# Here is an even stronger simplification of the real age, with an arbitrary split at 30 years old.
# I chose this value based on nothing.
# We will see later whether simplifying the information based on arbitrary values is a good strategy
# (you probably already have an idea of how well it will work!).
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]

# We remove ID as there is nothing to learn from this feature (it would just add some noise, as the dataset is small).
df[, ID := NULL]

# List the different values for the column Treatment: Placebo, Treated.
cat("Values of the categorical feature Treatment\n")
print(levels(df[, Treatment]))

# Next, we will transform the categorical data into dummy variables.
# This method is also called one-hot encoding.
# The purpose is to transform each value of each categorical feature into one binary feature.
#
# For example, the column Treatment will be replaced by two columns, Placebo and Treated.
# Each of them will be binary.
# An observation which had the value Placebo in column Treatment before the transformation will, after the transformation,
# have the value 1 in the new column Placebo and the value 0 in the new column Treated.
#
# The formula Improved ~ . - 1 used below means: transform all categorical features except column Improved to binary values.
# Column Improved is excluded because it will be our output column, the one we want to predict.
sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)

cat("Encoding of the sparse Matrix\n")
print(sparse_matrix)
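
# --- Illustrative sketch (added for clarity; not part of the original demo) ---
# One-hot encoding of a single factor in isolation, to make the transformation above
# easier to see: the Treatment column alone becomes one 0/1 column per level.
print(sparse.model.matrix(~ Treatment - 1, data = df)[1:6, ])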

# Create the output vector (not sparse)
# 1. set, for all rows, the Y column to 0;
# 2. set Y to 1 when Improved == "Marked";
# 3. return the Y column
output_vector <- df[, Y := 0][Improved == "Marked", Y := 1][, Y]

# The following is the same process as in the other demos
cat("Learning...\n")
bst <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = output_vector), max_depth = 9,
                 eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")

importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
print(importance)
# According to the matrix below, the most important feature in this dataset for predicting whether the treatment will work is Age.
# The second most important feature is whether a placebo was received.
# Sex is third.
# Then come our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).

# Do these results make sense?
# Let's run a chi-squared test between each of these features and the outcome.

print(chisq.test(df$Age, df$Y))
# The chi-squared statistic between Age and the illness disappearing is about 35.

print(chisq.test(df$AgeDiscret, df$Y))
# Our first simplification of Age gives a statistic of about 8.

print(chisq.test(df$AgeCat, df$Y))
# The arbitrary split between young and old at 30 years old gives a low statistic of about 2.
# This is a result we might expect: maybe in my mind > 30 years is old (I am 32 and starting to feel old, which may explain it),
# but for the illness we are studying, the age at which one becomes vulnerable is not the same.
# Don't let your "gut" lower the quality of your model. In "data science", there is science :-)

# As you can see, in general, destroying information by simplifying it won't improve your model.
# The chi-squared tests just demonstrate that.
# But in more complex cases, creating a new feature from an existing one that makes the link with the outcome
# more obvious may help the algorithm and improve the model.
# The case studied here is not complex enough to show that; check the Kaggle forums for some challenging datasets.
# However, it is almost always worse when you add arbitrary rules.
# Moreover, you can notice that even though we added new features that are not useful and are highly correlated with
# other features, the boosted tree algorithm was still able to choose the best one, which in this case is Age.
# A linear model may not be as robust in this scenario.
@ -1,51 +0,0 @@
require(xgboost)
# load the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

nrounds <- 2
param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic')

cat('running cross validation\n')
# do cross validation; this will print the result as
# [iteration] metric_name:mean_value+std_value
# std_value is the standard deviation of the metric across folds
xgb.cv(param, dtrain, nrounds, nfold = 5, metrics = 'error')

cat('running cross validation, disable standard deviation display\n')
# do cross validation; this will print the result as
# [iteration] metric_name:mean_value
xgb.cv(param, dtrain, nrounds, nfold = 5,
       metrics = 'error', showsd = FALSE)

###
# you can also do cross validation with a customized loss function
# see custom_objective.R
##
print('running cross validation, with customized loss function')

logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(metric = "error", value = err))
}

param <- list(max_depth = 2, eta = 1,
              objective = logregobj, eval_metric = evalerror)
# train with the customized objective
xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5)

# do cross validation with prediction values for each fold
res <- xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5, prediction = TRUE)
res$evaluation_log
length(res$pred)
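
# --- Illustrative sketch (added for clarity; not part of the original demo) ---
# Assuming res$pred holds the out-of-fold predictions aligned with the rows of dtrain,
# and recalling that with the custom objective they are margin values, thresholding at
# 0 gives an overall cross-validated error estimate.
oof_error <- mean(as.numeric(res$pred > 0) != getinfo(dtrain, 'label'))
print(paste('cross-validated error =', oof_error))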
@ -1,65 +0,0 @@
require(xgboost)
# load the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

# note: for a customized objective function, we leave 'objective' as default
# note: what we get in prediction is the margin value
# you must know what you are doing
evals <- list(eval = dtest, train = dtrain)
num_round <- 2

# user-defined objective function: given the prediction, return the gradient and second-order gradient
# this is the log-likelihood loss
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}

# user-defined evaluation function: return a pair (metric_name, result)
# NOTE: when you use a customized loss function, the default prediction value is the margin,
# which may keep the built-in evaluation metrics from working properly.
# For example, we are doing logistic loss here, so the prediction is the score before the logistic transformation,
# while the built-in evaluation error assumes the input is after the logistic transformation.
# Keep this in mind when you use the customization; you may need to write a customized evaluation function.
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(metric = "error", value = err))
}

param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0,
              objective = logregobj, eval_metric = evalerror)
print('start training with user customized objective')
# training with a customized objective; we can also do step-by-step training,
# simply look at how xgb.train is implemented
bst <- xgb.train(param, dtrain, num_round, evals)

#
# There can be cases where you want additional information
# to be considered besides the properties of the DMatrix you can get via getinfo.
# You can attach additional information to the DMatrix as attributes.

# set a 'label' attribute of dtrain; we use the label as an example, it can be anything
attr(dtrain, 'label') <- getinfo(dtrain, 'label')
# this is a new customized objective, where you can access the things you set;
# the same applies to customized evaluation functions
logregobjattr <- function(preds, dtrain) {
  # now you can access the attribute in the customized function
  labels <- attr(dtrain, 'label')
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0,
              objective = logregobjattr, eval_metric = evalerror)
print('start training with user customized objective, with additional attributes in DMatrix')
# training with a customized objective; we can also do step-by-step training,
# simply look at how xgb.train is implemented
bst <- xgb.train(param, dtrain, num_round, evals)
@ -1,40 +0,0 @@
require(xgboost)
# load the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for a customized objective function, we leave 'objective' as default
# note: what we get in prediction is the margin value
# you must know what you are doing
param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0)
evals <- list(eval = dtest)
num_round <- 20
# user-defined objective function: given the prediction, return the gradient and second-order gradient
# this is the log-likelihood loss
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
# user-defined evaluation function: return a pair (metric_name, result)
# NOTE: when you use a customized loss function, the default prediction value is the margin,
# which may keep the built-in evaluation metrics from working properly.
# For example, we are doing logistic loss here, so the prediction is the score before the logistic transformation,
# while the built-in evaluation error assumes the input is after the logistic transformation.
# Keep this in mind when you use the customization; you may need to write a customized evaluation function.
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(metric = "error", value = err))
}
print('start training with early stopping settings')

bst <- xgb.train(param, dtrain, num_round, evals,
                 objective = logregobj, eval_metric = evalerror, maximize = FALSE,
                 early_stopping_rounds = 3)
bst <- xgb.cv(param, dtrain, num_round, nfold = 5,
              objective = logregobj, eval_metric = evalerror,
              maximize = FALSE, early_stopping_rounds = 3)
@ -1,33 +0,0 @@
require(xgboost)
# load the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we are using a linear model, instead of trees, for our boosters
# you can fit either a linear regression or a logistic regression model
##

# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias, which is the L2 regularizer on the bias term
param <- list(objective = "binary:logistic", booster = "gblinear",
              nthread = 2, alpha = 0.0001, lambda = 1)

# normally, you do not need to set eta (the step size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# and on some problems the parallel updates can affect convergence;
# setting eta to a smaller value, e.g. 0.5, can make the optimization more stable

##
# the rest of the settings are the same
##
evals <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, evals)
ypred <- predict(bst, dtest)
labels <- getinfo(dtest, 'label')
cat('error of preds=', mean(as.numeric(ypred > 0.5) != labels), '\n')
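
# --- Illustrative sketch (added for clarity; not part of the original demo, and the
# --- variable names below are new) ---
# As noted above, a smaller step size can make the parallel coordinate-descent updates
# of the linear booster more stable; here we simply refit with eta = 0.5 and compare.
param_small_eta <- c(param, list(eta = 0.5))
bst_small_eta <- xgb.train(param_small_eta, dtrain, num_round, evals)
ypred_small_eta <- predict(bst_small_eta, dtest)
cat('error of preds (eta = 0.5)=', mean(as.numeric(ypred_small_eta > 0.5) != labels), '\n')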
@ -1,45 +0,0 @@
# An example of using GPU-accelerated tree building algorithms
#
# NOTE: it can only run if you have a CUDA-enabled GPU and the package was
# specially compiled with GPU support.
#
# For the current functionality, see
# https://xgboost.readthedocs.io/en/latest/gpu/index.html
#

library('xgboost')

# Simulate an N x p random matrix with some binomial response dependent on pp columns
set.seed(111)
N <- 1000000
p <- 50
pp <- 25
X <- matrix(runif(N * p), ncol = p)
betas <- 2 * runif(pp) - 1
sel <- sort(sample(p, pp))
m <- X[, sel] %*% betas - 1 + rnorm(N)
y <- rbinom(N, 1, plogis(m))

tr <- sample.int(N, N * 0.75)
dtrain <- xgb.DMatrix(X[tr, ], label = y[tr])
dtest <- xgb.DMatrix(X[-tr, ], label = y[-tr])
evals <- list(train = dtrain, test = dtest)

# An example of running the 'gpu_hist' algorithm, which is
# - similar to 'hist'
# - the fastest option for moderately large datasets
# - current limitations: max_depth < 16, does not implement guided loss
# (Older releases also offered a non-binning GPU algorithm, which was slower and
# more memory-hungry; 'gpu_hist' is the recommended option.)
param <- list(objective = 'reg:logistic', eval_metric = 'auc', subsample = 0.5, nthread = 4,
              max_bin = 64, tree_method = 'gpu_hist')
pt <- proc.time()
bst_gpu <- xgb.train(param, dtrain, evals = evals, nrounds = 50)
proc.time() - pt

# Compare to the 'hist' algorithm:
param$tree_method <- 'hist'
pt <- proc.time()
bst_hist <- xgb.train(param, dtrain, evals = evals, nrounds = 50)
proc.time() - pt
@ -1,113 +0,0 @@
library(xgboost)
library(data.table)

set.seed(1024)

# Function to obtain a list of interactions fitted in trees; requires the maximum depth as input
treeInteractions <- function(input_tree, input_max_depth) {
  ID_merge <- i.id <- i.feature <- NULL  # suppress warning "no visible binding for global variable"

  trees <- data.table::copy(input_tree)  # copy the tree input to prevent overwriting
  if (input_max_depth < 2) return(list())  # no interactions if max depth < 2
  if (nrow(input_tree) == 1) return(list())

  # Attach parent nodes
  for (i in 2:input_max_depth) {
    if (i == 2) trees[, ID_merge := ID] else trees[, ID_merge := get(paste0('parent_', i - 2))]
    parents_left <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = Yes)]
    parents_right <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = No)]

    data.table::setorderv(trees, 'ID_merge')
    data.table::setorderv(parents_left, 'ID_merge')
    data.table::setorderv(parents_right, 'ID_merge')

    trees <- merge(trees, parents_left, by = 'ID_merge', all.x = TRUE)
    trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1))
          := list(i.id, i.feature)]
    trees[, c('i.id', 'i.feature') := NULL]

    trees <- merge(trees, parents_right, by = 'ID_merge', all.x = TRUE)
    trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1))
          := list(i.id, i.feature)]
    trees[, c('i.id', 'i.feature') := NULL]
  }

  # Extract nodes with interactions
  interaction_trees <- trees[!is.na(Split) & !is.na(parent_1), # nolint: object_usage_linter
                             c('Feature', paste0('parent_feat_', 1:(input_max_depth - 1))),
                             with = FALSE]
  interaction_trees_split <- split(interaction_trees, seq_len(nrow(interaction_trees)))
  interaction_list <- lapply(interaction_trees_split, as.character)

  # Remove NAs (no parent interaction)
  interaction_list <- lapply(interaction_list, function(x) x[!is.na(x)])

  # Remove non-interactions (same variable)
  interaction_list <- lapply(interaction_list, unique)  # remove duplicated variables
  interaction_length <- lengths(interaction_list)
  interaction_list <- interaction_list[interaction_length > 1]
  interaction_list <- unique(lapply(interaction_list, sort))
  return(interaction_list)
}

# Generate sample data
x <- list()
for (i in 1:10) {
  x[[i]] <- i * rnorm(1000, 10)
}
x <- as.data.table(x)

y <- -1 * x[, rowSums(.SD)] + x[['V1']] * x[['V2']] + x[['V3']] * x[['V4']] * x[['V5']] +
  rnorm(1000, 0.001) + 3 * sin(x[['V7']])

train <- as.matrix(x)

# Interaction constraint list (column-name form)
interaction_list <- list(c('V1', 'V2'), c('V3', 'V4', 'V5'))

# Convert the interaction constraint list into feature-index form
cols2ids <- function(object, col_names) {
  LUT <- seq_along(col_names) - 1
  names(LUT) <- col_names
  rapply(object, function(x) LUT[x], classes = "character", how = "replace")
}
interaction_list_fid <- cols2ids(interaction_list, colnames(train))
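
# --- Illustrative sketch (added for clarity; not part of the original demo) ---
# After the conversion the constraints refer to zero-based column positions rather
# than column names, which is the form passed to xgb.train below.
print(interaction_list_fid)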

# Fit a model with interaction constraints
bst <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
                 eta = 0.1, nthread = 2, nrounds = 1000,
                 interaction_constraints = interaction_list_fid)

bst_tree <- xgb.model.dt.tree(colnames(train), bst)
bst_interactions <- treeInteractions(bst_tree, 4)
# interactions are constrained to combinations of V1*V2 and V3*V4*V5

# Fit a model without interaction constraints
bst2 <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
                  eta = 0.1, nthread = 2, nrounds = 1000)

bst2_tree <- xgb.model.dt.tree(colnames(train), bst2)
bst2_interactions <- treeInteractions(bst2_tree, 4)  # many more interactions

# Fit a model with both interaction and monotonicity constraints
bst3 <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
                  eta = 0.1, nthread = 2, nrounds = 1000,
                  interaction_constraints = interaction_list_fid,
                  monotone_constraints = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0))

bst3_tree <- xgb.model.dt.tree(colnames(train), bst3)
bst3_interactions <- treeInteractions(bst3_tree, 4)
# interactions are still constrained to combinations of V1*V2 and V3*V4*V5

# Show that the monotonic constraint still applies by checking scores after incrementing V1
x1 <- sort(unique(x[['V1']]))
for (i in seq_along(x1)) {
  testdata <- copy(x[, - ('V1')])
  testdata[['V1']] <- x1[i]
  testdata <- testdata[, paste0('V', 1:10), with = FALSE]
  pred <- predict(bst3, as.matrix(testdata))

  # Should not print anything, thanks to the decreasing monotonic constraint on V1
  if (i > 1) if (any(pred > prev_pred)) print(i)
  prev_pred <- pred
}
@ -1,6 +0,0 @@
require(xgboost)
# model the number of carburetors in mtcars (column 11, 'carb') as count data
data(mtcars)
head(mtcars)
bst <- xgb.train(data = xgb.DMatrix(as.matrix(mtcars[, -11]), label = mtcars[, 11]),
                 objective = 'count:poisson', nrounds = 5)
pred <- predict(bst, as.matrix(mtcars[, -11]))
sqrt(mean((pred - mtcars[, 11]) ^ 2))
@ -1,23 +0,0 @@
require(xgboost)
# load the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic')
evals <- list(eval = dtest, train = dtrain)
nrounds <- 2

# train the model for two rounds
bst <- xgb.train(param, dtrain, nrounds, nthread = 2, evals = evals)
cat('start testing prediction from first n trees\n')
labels <- getinfo(dtest, 'label')

### predict using only the first tree
ypred1 <- predict(bst, dtest, iterationrange = c(1, 1))
# by default, we predict using all the trees
ypred2 <- predict(bst, dtest)

cat('error of ypred1=', mean(as.numeric(ypred1 > 0.5) != labels), '\n')
cat('error of ypred2=', mean(as.numeric(ypred2 > 0.5) != labels), '\n')
@ -1,54 +0,0 @@
require(xgboost)
require(data.table)
require(Matrix)

set.seed(1982)

# load the agaricus dataset
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)

param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic')
nrounds <- 4

# train the model for four rounds
bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)

# Model accuracy without the new features
accuracy.before <- (sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label)
                    / length(agaricus.test$label))

# with predleaf = TRUE, prediction returns the index of the leaf reached in every tree
pred_with_leaf <- predict(bst, dtest, predleaf = TRUE)
head(pred_with_leaf)

create.new.tree.features <- function(model, original.features) {
  pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
  cols <- list()
  for (i in 1:xgb.get.num.boosted.rounds(model)) {
    # max is not the real max, but that is not important for the purpose of adding features
    leaf.id <- sort(unique(pred_with_leaf[, i]))
    cols[[i]] <- factor(x = pred_with_leaf[, i], levels = leaf.id)
  }
  cbind(original.features, sparse.model.matrix(~ . - 1, as.data.frame(cols)))
}

# Convert the leaf indices of the previous model into one-hot encoded features
new.features.train <- create.new.tree.features(bst, agaricus.train$data)
new.features.test <- create.new.tree.features(bst, agaricus.test$data)
colnames(new.features.test) <- colnames(new.features.train)
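
# --- Illustrative sketch (added for clarity; not part of the original demo) ---
# Each boosting round contributes one block of leaf-indicator columns, so the widened
# training matrix has many more columns than the original feature matrix.
print(dim(agaricus.train$data))
print(dim(new.features.train))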

# learning with the new features
new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)

# Model accuracy with the new features
accuracy.after <- (sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label)
                   / length(agaricus.test$label))

# Here the accuracy was already good and is now perfect.
cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now",
          accuracy.after, "!\n"))
@ -1,13 +0,0 @@
# running all scripts in demo folder, removed during packaging.
demo(basic_walkthrough, package = 'xgboost')
demo(custom_objective, package = 'xgboost')
demo(boost_from_prediction, package = 'xgboost')
demo(predict_first_ntree, package = 'xgboost')
demo(generalized_linear_model, package = 'xgboost')
demo(cross_validation, package = 'xgboost')
demo(create_sparse_matrix, package = 'xgboost')
demo(predict_leaf_indices, package = 'xgboost')
demo(early_stopping, package = 'xgboost')
demo(poisson_regression, package = 'xgboost')
demo(tweedie_regression, package = 'xgboost')
#demo(gpu_accelerated, package = 'xgboost') # can only run when built with GPU support
@ -1,49 +0,0 @@
library(xgboost)
library(data.table)
library(cplm)

data(AutoClaim)

# auto insurance dataset analyzed by Yip and Yau (2005)
dt <- data.table(AutoClaim)

# exclude these columns from the model matrix
exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')

# retain the missing values
# NOTE: this dataset comes ready to use out of the box
options(na.action = 'na.pass')
x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = FALSE])
options(na.action = 'na.omit')

# response
y <- dt[, CLM_AMT5]

d_train <- xgb.DMatrix(data = x, label = y, missing = NA)

# the tweedie_variance_power parameter determines the shape of the
# distribution:
# - closer to 1 is more Poisson-like, and the mass
#   is more concentrated near zero
# - closer to 2 is more gamma-like, and the mass spreads to
#   the right with less concentration near zero

params <- list(
  objective = 'reg:tweedie',
  eval_metric = 'rmse',
  tweedie_variance_power = 1.4,
  max_depth = 6,
  eta = 1)

bst <- xgb.train(
  data = d_train,
  params = params,
  maximize = FALSE,
  evals = list(train = d_train),
  nrounds = 20)

var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst)

preds <- predict(bst, d_train)

rmse <- sqrt(mean((y - preds) ^ 2))
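
# --- Illustrative sketch (added for clarity; not part of the original demo, and the
# --- variable names below are new) ---
# The variance power can be treated like any other hyper-parameter; here we refit with
# a value closer to the gamma end of the family and compare the training RMSE.
params_gamma_like <- modifyList(params, list(tweedie_variance_power = 1.7))
bst_gamma_like <- xgb.train(
  data = d_train,
  params = params_gamma_like,
  evals = list(train = d_train),
  nrounds = 20)
preds_gamma_like <- predict(bst_gamma_like, d_train)
sqrt(mean((y - preds_gamma_like) ^ 2))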
@ -45,7 +45,6 @@ def pack_rpackage() -> Path:
    )

    shutil.copytree("R-package", dest)
    os.remove(dest / "demo" / "runall.R")
    # core
    shutil.copytree("src", dest / "src" / "src")
    shutil.copytree("include", dest / "src" / "include")
@ -221,7 +220,6 @@ def test_with_autotools() -> None:
    subprocess.check_call(
        ["R.exe", "-q", "-e", "library(testthat); setwd('tests'); source('testthat.R')"]
    )
    subprocess.check_call(["R.exe", "-q", "-e", "demo(runall, package = 'xgboost')"])


@record_time
@ -296,7 +294,6 @@ def test_with_cmake(args: argparse.Namespace) -> None:
            "library(testthat); setwd('tests'); source('testthat.R')",
        ]
    )
    subprocess.check_call([R, "-q", "-e", "demo(runall, package = 'xgboost')"])


@record_time