From d3bb466026a9fdf0f44e445272f3cff8f6b730c1 Mon Sep 17 00:00:00 2001
From: terrytangyuan
Date: Tue, 8 Sep 2015 10:51:20 -0400
Subject: [PATCH 1/2] ENH/DOC: Added R package demo using caret library to
 train xgbTree model

---
 R-package/demo/caret_wrapper.R | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 R-package/demo/caret_wrapper.R

diff --git a/R-package/demo/caret_wrapper.R b/R-package/demo/caret_wrapper.R
new file mode 100644
index 000000000..5c53c9915
--- /dev/null
+++ b/R-package/demo/caret_wrapper.R
@@ -0,0 +1,32 @@
+# Install the development version of the caret library, which contains the xgboost models.
+devtools::install_github("topepo/caret/pkg/caret")
+require(caret)
+require(xgboost)
+require(data.table)
+require(vcd)
+require(e1071)
+
+# Load the Arthritis dataset into memory.
+data(Arthritis)
+# Create a copy of the dataset with the data.table package (data.table is 100% compliant with R data frames, but its syntax is a lot more consistent and its performance is really good).
+df <- data.table(Arthritis, keep.rownames = FALSE)
+
+# Let's add some new categorical features to see if they help. Of course, these features are highly correlated with the Age feature. Usually that's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features, even in the case of highly correlated features.
+# For the first feature, we create groups of age by rounding the real age. Note that we transform it to a factor (categorical data) so the algorithm treats the groups as independent values.
+df[, AgeDiscret := as.factor(round(Age / 10, 0))]
+
+# Here is an even stronger simplification of the real age, with an arbitrary split at 30 years old. I chose this value arbitrarily. We will see later whether simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
+df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
+
+# We remove ID, as there is nothing to learn from this feature (it would just add noise, since the dataset is small).
+df[, ID := NULL]
+
+#-------------Basic Training using XGBoost in caret Library-----------------
+# Set up control parameters for caret::train.
+# Here we use 10-fold cross-validation repeated twice; note that repeats only takes effect with the "repeatedcv" method.
+fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2)
+# Train an xgbTree model using caret::train.
+model <- train(factor(Improved) ~ ., data = df, method = "xgbTree", trControl = fitControl)
+
+# See the model results.
+print(model)
\ No newline at end of file

From 9ead44531e1784974fb7a719dc07110e9d8cd591 Mon Sep 17 00:00:00 2001
From: terrytangyuan
Date: Tue, 8 Sep 2015 10:54:07 -0400
Subject: [PATCH 2/2] DOC: Added new demo to index

---
 R-package/demo/00Index  | 1 +
 R-package/demo/runall.R | 1 +
 2 files changed, 2 insertions(+)

diff --git a/R-package/demo/00Index b/R-package/demo/00Index
index 0112eb9e1..f3d241470 100644
--- a/R-package/demo/00Index
+++ b/R-package/demo/00Index
@@ -1,4 +1,5 @@
 basic_walkthrough Basic feature walkthrough
+caret_wrapper Train an xgboost model using the caret library
 custom_objective Customize loss function and evaluation metric
 boost_from_prediction Boosting from existing prediction
 predict_first_ntree Predicting using first n trees
diff --git a/R-package/demo/runall.R b/R-package/demo/runall.R
index 7311ec95e..c337f8164 100644
--- a/R-package/demo/runall.R
+++ b/R-package/demo/runall.R
@@ -9,3 +9,4 @@ demo(create_sparse_matrix)
 demo(predict_leaf_indices)
 demo(early_stopping)
 demo(poisson_regression)
+demo(caret_wrapper)