From d3bb466026a9fdf0f44e445272f3cff8f6b730c1 Mon Sep 17 00:00:00 2001
From: terrytangyuan
Date: Tue, 8 Sep 2015 10:51:20 -0400
Subject: [PATCH 1/2] ENH/DOC: Added R package demo using caret library to
 train xgbTree model

---
 R-package/demo/caret_wrapper.R | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 R-package/demo/caret_wrapper.R

diff --git a/R-package/demo/caret_wrapper.R b/R-package/demo/caret_wrapper.R
new file mode 100644
index 000000000..5c53c9915
--- /dev/null
+++ b/R-package/demo/caret_wrapper.R
@@ -0,0 +1,32 @@
+# Install the development version of the caret library, which contains the xgboost models.
+devtools::install_github("topepo/caret/pkg/caret")
+require(caret)
+require(xgboost)
+require(data.table)
+require(vcd)
+require(e1071)
+
+# Load the Arthritis dataset into memory.
+data(Arthritis)
+# Create a copy of the dataset with the data.table package (data.table is 100% compliant with R data frames, but its syntax is a lot more consistent and its performance is really good).
+df <- data.table(Arthritis, keep.rownames = FALSE)
+
+# Let's add some new categorical features to see if they help. Of course, these features are highly correlated with the Age feature. Usually that's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features, even in the case of highly correlated features.
+# For the first feature, we create groups of age by rounding the real age. Note that we transform it to a factor (categorical data) so the algorithm treats the groups as independent values.
+df[, AgeDiscret := as.factor(round(Age / 10, 0))]
+
+# Here is an even stronger simplification of the real age, with an arbitrary split at 30 years old. I chose this value arbitrarily. We will see later whether simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
+df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
+
+# We remove ID, as there is nothing to learn from this feature (it would just add noise, since the dataset is small).
+df[, ID := NULL]
+
+#-------------Basic Training using XGBoost in caret Library-----------------
+# Set up control parameters for caret::train.
+# Here we use 10-fold cross-validation repeated twice; note that repeats only takes effect with the "repeatedcv" method.
+fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2)
+# Train an xgbTree model using caret::train.
+model <- train(factor(Improved) ~ ., data = df, method = "xgbTree", trControl = fitControl)
+
+# See the model results.
+print(model)
\ No newline at end of file

From 9ead44531e1784974fb7a719dc07110e9d8cd591 Mon Sep 17 00:00:00 2001
From: terrytangyuan
Date: Tue, 8 Sep 2015 10:54:07 -0400
Subject: [PATCH 2/2] DOC: Added new demo to index

---
 R-package/demo/00Index  | 1 +
 R-package/demo/runall.R | 1 +
 2 files changed, 2 insertions(+)

diff --git a/R-package/demo/00Index b/R-package/demo/00Index
index 0112eb9e1..f3d241470 100644
--- a/R-package/demo/00Index
+++ b/R-package/demo/00Index
@@ -1,4 +1,5 @@
 basic_walkthrough Basic feature walkthrough
+caret_wrapper Train an xgboost model using the caret library
 custom_objective Customize loss function and evaluation metric
 boost_from_prediction Boosting from existing prediction
 predict_first_ntree Predicting using first n trees
diff --git a/R-package/demo/runall.R b/R-package/demo/runall.R
index 7311ec95e..c337f8164 100644
--- a/R-package/demo/runall.R
+++ b/R-package/demo/runall.R
@@ -9,3 +9,4 @@ demo(create_sparse_matrix)
 demo(predict_leaf_indices)
 demo(early_stopping)
 demo(poisson_regression)
+demo(caret_wrapper)