Merge pull request #2 from dmlc/master

rebase to current dmlc official version
This commit is contained in:
Hongliang Liu 2015-09-13 15:01:22 -07:00
commit cbb52b1d5d
32 changed files with 426 additions and 120 deletions

View File

@ -34,6 +34,7 @@ List of Contributors
* [Zygmunt Zając](https://github.com/zygmuntz)
- Zygmunt is the master behind the early stopping feature frequently used by kagglers.
* [Ajinkya Kale](https://github.com/ajkl)
* [Yuan Tang](https://github.com/terrytangyuan)
* [Boliang Chen](https://github.com/cblsjtu)
* [Vadim Khotilovich](https://github.com/khotilov)
* [Yangqing Men](https://github.com/yanqingmen)

View File

@ -23,7 +23,8 @@ Suggests:
ggplot2 (>= 1.0.0),
DiagrammeR (>= 0.6),
Ckmeans.1d.dp (>= 3.3.1),
vcd (>= 1.3)
vcd (>= 1.3),
testthat
Depends:
R (>= 2.10)
Imports:

View File

@ -103,17 +103,21 @@ xgb.Booster.check <- function(bst, saveraw = TRUE)
## ----the following are low level iterative functions, not needed if
## you do not want to use them ---------------------------------------
# get dmatrix from data, label
xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
xgb.get.DMatrix <- function(data, label = NULL, missing = NULL, weight = NULL) {
inClass <- class(data)
if (inClass == "dgCMatrix" || inClass == "matrix") {
if (is.null(label)) {
stop("xgboost: need label when data is a matrix")
}
dtrain <- xgb.DMatrix(data, label = label)
if (is.null(missing)){
dtrain <- xgb.DMatrix(data, label = label)
} else {
dtrain <- xgb.DMatrix(data, label = label, missing = missing)
}
if (!is.null(weight)){
xgb.setinfo(dtrain, "weight", weight)
}
} else {
if (!is.null(label)) {
warning("xgboost: label will be ignored.")
@ -122,6 +126,9 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
dtrain <- xgb.DMatrix(data)
} else if (inClass == "xgb.DMatrix") {
dtrain <- data
} else if (inClass == "data.frame") {
stop("xgboost only support numerical matrix input,
use 'data.frame' to transform the data.")
} else {
stop("xgboost: Invalid input of data")
}
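
As a quick, hypothetical illustration of what the new `weight` pass-through amounts to (not part of this diff; the data and weights below are only for demonstration), the same effect can be achieved with the exported `xgb.DMatrix`/`setinfo` API:

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
# Illustrative per-row weights; in a real application they come from your problem.
w <- rep(1, length(agaricus.train$label))
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
setinfo(dtrain, "weight", w)  # roughly what the helper above now does internally
```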

View File

@ -72,6 +72,8 @@
#' keeps getting worse consecutively for \code{k} rounds.
#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#' @param save_period save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.
#' @param save_name the name or path of the periodically saved model file.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
@ -120,7 +122,8 @@
#'
xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
obj = NULL, feval = NULL, verbose = 1, print.every.n=1L,
early.stop.round = NULL, maximize = NULL, ...) {
early.stop.round = NULL, maximize = NULL,
save_period = 0, save_name = "xgboost.model", ...) {
dtrain <- data
if (typeof(params) != "list") {
stop("xgb.train: first argument params must be list")
@ -215,6 +218,11 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
}
}
}
if (save_period > 0) {
if (i %% save_period == 0) {
xgb.save(bst, save_name)
}
}
}
bst <- xgb.Booster.check(bst)
if (!is.null(early.stop.round)) {
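
A minimal usage sketch for the new checkpointing arguments (hypothetical file name; the bundled agaricus data is assumed): with `save_period = 5` the booster is written to `save_name` every 5 boosting rounds.

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
param <- list(objective = "binary:logistic", max.depth = 2, eta = 1, nthread = 2)
# Writes "xgboost.model" to the working directory after rounds 5 and 10.
bst <- xgb.train(param, dtrain, nrounds = 10, watchlist = list(train = dtrain),
                 save_period = 5, save_name = "xgboost.model")
```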

View File

@ -31,11 +31,14 @@
#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
#' @param missing Missing is only used when input is a dense matrix; pick a float
#' value that represents the missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param weight a vector indicating the weight for each row of the input.
#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' keeps getting worse consecutively for \code{k} rounds.
#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#' @param save_period save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.
#' @param save_name the name or path of the periodically saved model file.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
@ -56,14 +59,11 @@
#'
#' @export
#'
xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds,
xgboost <- function(data = NULL, label = NULL, missing = NULL, weight = NULL,
params = list(), nrounds,
verbose = 1, print.every.n = 1L, early.stop.round = NULL,
maximize = NULL, ...) {
if (is.null(missing)) {
dtrain <- xgb.get.DMatrix(data, label)
} else {
dtrain <- xgb.get.DMatrix(data, label, missing)
}
maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) {
dtrain <- xgb.get.DMatrix(data, label, missing, weight)
params <- append(params, list(...))
@ -74,7 +74,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(),
}
bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n,
early.stop.round = early.stop.round)
early.stop.round = early.stop.round, maximize = maximize,
save_period = save_period, save_name = save_name)
return(bst)
}
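
And a corresponding sketch for the high-level `xgboost()` wrapper, exercising the new `weight` argument (the weights here are arbitrary and purely illustrative):

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
w <- runif(length(agaricus.train$label))  # arbitrary per-row weights for illustration
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               weight = w, max.depth = 2, eta = 1, nthread = 2, nrounds = 2,
               objective = "binary:logistic")
```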

View File

@ -1,4 +1,5 @@
basic_walkthrough Basic feature walkthrough
caret_wrapper Use xgboost to train in the caret library
custom_objective Customize loss function and evaluation metric
boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees

View File

@ -1,6 +1,7 @@
XGBoost R Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Train an xgboost model with the caret library](caret_wrapper.R)
* [Customize loss function and evaluation metric](custom_objective.R)
* [Boosting from existing prediction](boost_from_prediction.R)
* [Predicting using first n trees](predict_first_ntree.R)

View File

@ -0,0 +1,35 @@
# install development version of caret library that contains xgboost models
devtools::install_github("topepo/caret/pkg/caret")
require(caret)
require(xgboost)
require(data.table)
require(vcd)
require(e1071)
# Load Arthritis dataset in memory.
data(Arthritis)
# Create a copy of the dataset with the data.table package (data.table is 100% compliant with R data frames but its syntax is a lot more consistent and its performance is really good).
df <- data.table(Arthritis, keep.rownames = F)
# Let's add some new categorical features to see if it helps. Of course these features are highly correlated to the Age feature. Usually that's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features, even in the case of highly correlated features.
# For the first feature we create groups of age by rounding the real age. Note that we transform it to a factor (categorical data) so the algorithm treats them as independent values.
df[,AgeDiscret:= as.factor(round(Age/10,0))]
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[,ID:=NULL]
#-------------Basic Training using XGBoost in caret Library-----------------
# Set up control parameters for caret::train
# Here we use repeated 10-fold cross-validation (2 repeats) and random search for tuning hyper-parameters.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random")
# train an xgbTree model using caret::train
model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl)
# Instead of trees for our boosters, you can also fit a linear regression or logistic regression model using xgbLinear
# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl)
# See model results
print(model)
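# One possible follow-up, not part of the original demo (sketch only): once caret has
# selected the tuning parameters, the fitted model predicts like any other caret model.
preds <- predict(model, newdata = df)
table(preds, df$Improved)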

View File

@ -9,3 +9,4 @@ demo(create_sparse_matrix)
demo(predict_leaf_indices)
demo(early_stopping)
demo(poisson_regression)
demo(caret_wrapper)

View File

@ -6,7 +6,8 @@
\usage{
xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
feval = NULL, verbose = 1, print.every.n = 1L,
early.stop.round = NULL, maximize = NULL, ...)
early.stop.round = NULL, maximize = NULL, save_period = 0,
save_name = "xgboost.model", ...)
}
\arguments{
\item{params}{the list of parameters.
@ -87,6 +88,10 @@ keeps getting worse consecutively for \code{k} rounds.}
\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{save_period}{save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.}
\item{save_name}{the name or path of the periodically saved model file.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{

View File

@ -4,9 +4,10 @@
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
\usage{
xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL,
maximize = NULL, ...)
xgboost(data = NULL, label = NULL, missing = NULL, weight = NULL,
params = list(), nrounds, verbose = 1, print.every.n = 1L,
early.stop.round = NULL, maximize = NULL, save_period = 0,
save_name = "xgboost.model", ...)
}
\arguments{
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
@ -18,6 +19,8 @@ if data is local data file or \code{xgb.DMatrix}.}
\item{missing}{Missing is only used when input is a dense matrix; pick a float
value that represents the missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
\item{weight}{a vector indicating the weight for each row of the input.}
\item{params}{the list of parameters.
Commonly used ones are:
@ -51,6 +54,10 @@ keeps getting worse consecutively for \code{k} rounds.}
\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{save_period}{save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.}
\item{save_name}{the name or path of the periodically saved model file.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{

View File

@ -0,0 +1,4 @@
library(testthat)
library(xgboost)
test_check("xgboost")

View File

@ -0,0 +1,33 @@
require(xgboost)
context("basic functions")
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train = agaricus.train
test = agaricus.test
test_that("train and predict", {
bst = xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
pred = predict(bst, test$data)
})
test_that("early stopping", {
res = xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5,
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
early.stop.round = 3, maximize = FALSE)
expect_true(nrow(res)<20)
bst = xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
early.stop.round = 3, maximize = FALSE)
pred = predict(bst, test$data)
})
test_that("save_period", {
bst = xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
save_period = 10, save_name = "xgb.model")
pred = predict(bst, test$data)
})

View File

@ -0,0 +1,47 @@
context('Test models with custom objective')
require(xgboost)
test_that("custom objective works", {
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
param <- list(max.depth=2, eta=1, nthread = 2, silent=1,
objective=logregobj, eval_metric=evalerror)
bst <- xgb.train(param, dtrain, num_round, watchlist)
expect_equal(class(bst), "xgb.Booster")
expect_equal(length(bst$raw), 1064)
attr(dtrain, 'label') <- getinfo(dtrain, 'label')
logregobjattr <- function(preds, dtrain) {
labels <- attr(dtrain, 'label')
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
param <- list(max.depth=2, eta=1, nthread = 2, silent=1,
objective=logregobjattr, eval_metric=evalerror)
bst <- xgb.train(param, dtrain, num_round, watchlist)
expect_equal(class(bst), "xgb.Booster")
expect_equal(length(bst$raw), 1064)
})

View File

@ -0,0 +1,19 @@
context('Test generalized linear models')
require(xgboost)
test_that("glm works", {
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
expect_equal(class(dtrain), "xgb.DMatrix")
expect_equal(class(dtest), "xgb.DMatrix")
param <- list(objective = "binary:logistic", booster = "gblinear",
nthread = 2, alpha = 0.0001, lambda = 1)
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)
ypred <- predict(bst, dtest)
expect_equal(length(getinfo(dtest, 'label')), 1611)
})

View File

@ -0,0 +1,32 @@
context('Test helper functions')
require(xgboost)
require(data.table)
require(Matrix)
require(vcd)
data(Arthritis)
data(agaricus.train, package='xgboost')
df <- data.table(Arthritis, keep.rownames = F)
df[,AgeDiscret:= as.factor(round(Age/10,0))]
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
df[,ID:=NULL]
sparse_matrix = sparse.model.matrix(Improved~.-1, data = df)
output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
test_that("xgb.dump works", {
capture.output(print(xgb.dump(bst)))
})
test_that("xgb.importance works", {
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
expect_equal(dim(importance), c(7, 4))
})
test_that("xgb.plot.tree works", {
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
})

View File

@ -0,0 +1,13 @@
context('Test poisson regression model')
require(xgboost)
test_that("poisson regression works", {
data(mtcars)
bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11],
objective='count:poisson',nrounds=5)
expect_equal(class(bst), "xgb.Booster")
pred = predict(bst,as.matrix(mtcars[,-11]))
expect_equal(length(pred), 32)
sqrt(mean((pred-mtcars[,11])^2))
})

View File

@ -19,7 +19,7 @@ Contents
* [Build Instruction](doc/build.md)
* [Features](#features)
* [Distributed XGBoost](multi-node)
* [Usecases](doc/README.md#highlight-links)
* [Usecases](doc/index.md#highlight-links)
* [Bug Reporting](#bug-reporting)
* [Contributing to XGBoost](#contributing-to-xgboost)
* [Committers and Contributors](CONTRIBUTORS.md)
@ -29,6 +29,7 @@ Contents
What's New
----------
* XGBoost helps Owen Zhang to win the [Avito Context Ad Click competition](https://www.kaggle.com/c/avito-context-ad-clicks). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/08/26/avito-winners-interview-1st-place-owen-zhang/).
* XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance)
Check out the [winning solution](https://github.com/ChenglongChen/Kaggle_CrowdFlower)
* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)

View File

@ -1,5 +1,7 @@
The documentation of xgboost is generated with recommonmark and Sphinx.
You can build it locally by typing "make html" in this folder.
- You will need to rerun the recommonmark script for readthedocs in sphinx_util.
- This was a hack to get the customized parser into readthedocs, hopefully to be removed in future.
- clone https://github.com/tqchen/recommonmark to root
- type make html
Checkout https://recommonmark.readthedocs.org for guide on how to write markdown with extensions used in this doc, such as math formulas and table of content.

View File

@ -5,7 +5,7 @@ Everyone is more than welcome to contribute, it is a great way to make the project better.
The project is maintained by a committee of [committers](../../CONTRIBUTORS.md#comitters) who will review and merge pull requests from contributors.
Contributing Code
=================
-----------------
* The C++ code follows Google C++ style
* We follow numpy style to document our python module
* Tools to precheck codestyle

View File

@ -6,6 +6,10 @@ How to tune parameters
----------------------
See [Parameter Tuning Guide](param_tuning.md)
Description on the model
------------------------
See [Introduction to Boosted Trees](model.md)
I have a big dataset
--------------------

BIN
doc/img/split_find.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 45 KiB

BIN
doc/img/step_fit.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

BIN
doc/img/struct_score.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 73 KiB

View File

@ -19,7 +19,7 @@ The best way to get started to learn xgboost is by the examples. There are three
After you get familiar with the interface, check out the following additional resources
* [Frequently Asked Questions](faq.md)
* [Learning what is in Behind: Introduction to Boosted Trees](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)
* [Learning what is in Behind: Introduction to Boosted Trees](model.md)
* [User Guide](#user-guide) contains a comprehensive list of documents about xgboost.
* [Developer Guide](dev-guide/contribute.md)
@ -38,6 +38,7 @@ are great resources to learn xgboost by real examples. If you think you have som
* [Understanding XGBoost Model on Otto Dataset](../demo/kaggle-otto/understandingXGBoostModel.Rmd) (R package)
- This tutorial teaches you how to use xgboost to compete in the Kaggle Otto challenge.
Highlight Solutions
-------------------
This section is about blog posts, presentations and videos discussing how to use xgboost to solve your interesting problems. If you think something belongs here, send a pull request.
@ -51,8 +52,7 @@ This section is about blogposts, presentation and videos discussing how to use x
User Guide
----------
* [Frequently Asked Questions](faq.md)
* [Introduction to Boosted Trees](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)
* [Introduction to the Model of XGBoost](model.md)
* [Introduction to Boosted Trees](model.md)
* [Using XGBoost in Python](python/python_intro.md)
* [Using XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd)
* [Learning to use XGBoost by Example](../demo)
@ -62,7 +62,6 @@ User Guide
* [Parameters](parameter.md)
* [Notes on Parameter Tuning](param_tuning.md)
Developer Guide
---------------
* [Developer Guide](dev-guide/contribute.md)
@ -70,4 +69,3 @@ Developer Guide
API Reference
-------------
* [Python API Reference](python/python_api.rst)

View File

@ -1,143 +1,188 @@
Introduction to the Model of XGBoost
=========================
Introduction to Boosted Trees
=============================
XGBoost is short for "Extreme Gradient Boosting", where the term "Gradient Boosting" is proposed in the paper _Greedy Function Approximation: A Gradient Boosting Machine_, Friedman. Based on this original model. This is a tutorial on boosted trees, most of content are based on this [slide](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf) by the author of xgboost.
## The Origin
The GBM(boosted trees) has been around for really a while, and there are a lot of materials on the topic. This tutorial tries to explain boosted trees in a self-contained and principled way of supervised learning. We think this explaination is cleaner, more formal, and motivates the variant used in xgboost.
XGBoost is short for "Extreme Gradient Boosting", where the term "Gradient Boosting" is proposed in the paper _Greedy Function Approximation: A Gradient Boosting Machine_, Friedman. Based on this original model, we incoporated several modifications to make it faster and more robust.
Elements of Supervised Learning
-------------------------------
XGBoost is used for supervised learning problems, where we use the training data ``$ x_i $`` to predict a target variable ``$ y_i $``.
Before we dive into trees, let us start by reviewing the basic elements of supervised learning.
## The General Problem
### Model and Parameters
The ***model*** in supervised learning usually refers to the mathematical structure of how the prediction ``$ y_i $`` is made given ``$ x_i $``.
For example, a common model is the *linear model*, where the prediction is given by ``$ \hat{y}_i = \sum_j w_j x_{ij} $``, a linear combination of weighted input features.
The prediction value can have different interpretations, depending on the task.
For example, it can be logistic transformed to get the probability of the positive class in logistic regression, and it can also be used as a ranking score when we want to rank the outputs.
### Supervised Model
The ***parameters*** are the undetermined part that we need to learn from data. In the linear regression problem, the parameters are the coefficients ``$ w $``.
Usually we will use ``$ \Theta $`` to denote the parameters.
XGBoost is used for supervised learning problems, where we use the training data ``$ x_i $`` to predict a target variable ``$ y_i $``. Our model is a mathematical structure that captures the pattern from the training data. Given the structure, we need to learn the best parameters ``$ \Theta $`` in the model.
### Objective Function: Training Loss + Regularization
### Loss Function
Based on different understandings or assumptions about ``$ y_i $``, we can have different problems such as regression, classification, ordering, etc.
We need a way to find the best parameters given the training data. In order to do so, we need to define a so-called ***objective function***,
which measures the performance of the model under a certain set of parameters.
Based on different understandings or assumptions about ``$ y_i $``, we can have different problems such as regression, classification, ordering, etc. To model different problems, we use a so-called `loss function` to describe how good our model's performance is. The function usually takes two parameters: the true value ``$ y_i $`` and the prediction ``$ \hat{y}_i $``. For example, we can use the squared error
```math
l(y_i, \hat{y}_i) = (y_i-\hat{y}_i)^2
```
for a regression problem, and logistic loss function
```math
l(y_i, \hat{y}_i) = y_i\ln (1+e^{-\hat{y}_i}) + (1-y_i)\ln (1+e^{\hat{y}_i})
```
for a classification problem.
### Regularization
Besides the loss, we need to control the complexity of our model. A model achieving a perfect loss score on the training dataset is overfitting it, which means it captures not only the useful patterns, but also the outliers, noise and patterns specific to the training data. Controlling the complexity makes the model focus on the more important and general patterns rather than unnecessary details.
### Optimize the Objective
Combining the loss function and the regularization, we have our objective for the supervised learning model as
A very important fact about objective functions is that they ***must always*** contain two parts: training loss and regularization.
```math
Obj(\Theta) = L(\Theta) + \Omega(\Theta)
```
where ``$ L $`` is the loss function and ``$ \Omega $`` is the regularization term. The first part makes our model accurate, while the second prevents it from overfitting. We want a balance between these two parts when optimizing the objective. The optimization algorithm depends on the structure of our model; the following content introduces the details.
where ``$ L $`` is the training loss function, and ``$ \Omega $`` is the regularization term. The training loss measures how *predictive* our model is on training data.
For example, a commonly used training loss is mean squared error.
## Boosting Trees Model
```math
L(\Theta) = \sum_i (y_i-\hat{y}_i)^2
```
Another commonly used loss function is logistic loss for logistic regression
### Classification and Regression Tree
```math
L(\theta) = \sum_i[ y_i\ln (1+e^{-\hat{y}_i}) + (1-y_i)\ln (1+e^{\hat{y}_i})]
```
The boosting trees model is a set of classification and regression trees. Here's a simple example of such a model:
The ***regularization term*** is what people usually forget to add. The regularization term controls the complexity of the model, which helps us avoid overfitting.
This sounds a bit abstract, so let us consider the problem in the following picture. You are asked to visually *fit* a step function given the input data points
in the upper left corner of the image. Which solution among the three do you think is the best fit?
![Step function](img/step_fit.png)
The answer is marked in red. Please consider whether it looks reasonable to you visually. The general principle is that we want a ***simple*** and ***predictive*** model.
The tradeoff between the two is also referred to as the bias-variance tradeoff in machine learning.
### Why introduce the general principle
The elements introduced above form the basic elements of supervised learning, and they are naturally the building blocks of machine learning toolkits.
For example, you should be able to describe the differences and commonalities between boosted trees and random forests.
Understanding the process in a formalized way also helps us understand the objective we are learning and the reason behind heuristics such as
pruning and smoothing.
Tree Ensemble
-------------
Now that we have introduced the elements of supervised learning, let us get started with real trees.
To begin with, let us first learn about the ***model*** of xgboost: tree ensembles.
The tree ensemble model is a set of classification and regression trees (CART). Here's a simple example of a CART
that classifies whether someone will like computer games.
![CART](img/cart.png)
We classify the members of this family into different leaves, and assign them the score on the corresponding leaf.
A CART is a bit different from decision trees, in which the leaf only contains decision values. In CART, a real score
is associated with each of the leaves, which gives us richer interpretations that go beyond classification.
This also makes the unified optimization step easier, as we will see in a later part of this tutorial.
### Tree Ensemble
However, a single CART model is not so strong in practice. How about predicting with more trees?
Usually, a single tree is not strong enough to be used in practice. What is actually used is the so-called
tree ensemble model, which sums the predictions of multiple trees together.
![TwoCART](img/twocart.png)
Now we are predicting with two trees, by predicting with each tree individually and then summing the scores up. Mathematically, we can write our model in the form
Here is an example of a tree ensemble of two trees. The prediction scores of each individual tree are summed up to get the final score.
If you look at the example, an important fact is that the two trees try to *complement* each other.
Mathematically, we can write our model in the form
```math
\hat{y}_i = \sum_{k=1}^K f_k(x_i), f_k \in F
\hat{y}_i = \sum_{k=1}^K f_k(x_i), f_k \in \mathcal{F}
```
where ``$ K $`` is the number of trees, ``$ f $`` is a function in the functional space ``$ F $``, and ``$ F $`` is the set of all possible CARTs. Therefore our objective to optimize can be written as
where ``$ K $`` is the number of trees, ``$ f $`` is a function in the functional space ``$ \mathcal{F} $``, and ``$ \mathcal{F} $`` is the set of all possible CARTs. Therefore our objective to optimize can be written as
```math
obj(\Theta) = \sum_i^n l(y_i, \hat{y}_i) + \sum_{k=1}^K \Omega(f_k)
```
Now here comes the question: what is the *model* of random forests? It is exactly tree ensembles! So random forests and boosted trees are not different in terms of the model;
the difference is how we train them. This means that if you write a predictive service for tree ensembles, you only need to write one and it will work
for both random forests and boosted trees. One example of why the elements of supervised learning rock.
Tree Boosting
-------------
After introducing the model, let us begin with the real training part. How should we learn the trees?
The answer is, as always for supervised learning models: *define an objective function, and optimize it*!
Assume we have the following objective function (remember it always needs to contain training loss and regularization)
```math
Obj = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{i=1}^t\Omega(f_i) \\
```
### Additive Training
It is not easy to train all the trees at once. Instead, we use the strategy of training them in sequence, so that each time we train one CART and add it to the model. We denote the prediction value at step ``$t$`` by ``$ \hat{y}_i^{(t)} $``, so we have
The first thing we want to ask is: what are the ***parameters*** of trees? You will find that what we need to learn are those functions ``$f_i$``, each containing the structure
of the tree and the leaf scores. This is much harder than the traditional optimization problem where you can simply take the gradient and go.
It is not easy to train all the trees at once.
Instead, we use an additive strategy: fix what we have learned, and add one new tree at a time.
We denote the prediction value at step ``$t$`` by ``$ \hat{y}_i^{(t)} $``, so we have
```math
\hat{y}_i^{(0)} &= 0\\
\hat{y}_i^{(1)} &= f_1(x_i) = \hat{y}_i^{(0)} + f_1(x_i)\\
\hat{y}_i^{(2)} &= f_1(x_i) + f_2(x_i)= \hat{y}_i^{(1)} + f_2(x_i)\\
\dots &\\
&\dots\\
\hat{y}_i^{(t)} &= \sum_{k=1}^t f_k(x_i)= \hat{y}_i^{(t-1)} + f_t(x_i)
```
Which CART do we want at each step? Of course we want to add the one that minimizes our objective.
It remains to ask: which tree do we want at each step? A natural thing is to add the one that optimizes our objective.
```math
Obj^{(t)} & = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{i=1}^t\Omega(f_i) \\
& = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t-1)} + f_t(x_i)) + \Omega(f_t) + constant
```
Let's consider using MSE as our loss function
If we consider using MSE as our loss function, it becomes the following form.
```math
Obj^{(t)} & = \sum_{i=1}^n (y_i - (\hat{y}_i^{(t-1)} + f_t(x_i)))^2 + \sum_{i=1}^t\Omega(f_i) \\
& = \sum_{i=1}^n [2(\hat{y}_i^{(t-1)} - y_i)f_t(x_i) + f_t(x_i)^2] + \Omega(f_t) + constant
```
The form of MSE is friendly, but other loss functions can be tricky to expand. For convenience we take the Taylor expansion of the loss function up to second order
The form of MSE is friendly, with a first order term (usually called the residual) and a quadratic term.
For other losses of interest (for example, logistic loss), it is not so easy to get such a nice form.
So in the general case, we take the Taylor expansion of the loss function up to second order
```math
Obj^{(t)} = \sum_{i=1}^n [l(y_i, \hat{y}_i^{(t-1)}) + g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i)] + \Omega(f_t) + constant
```
where
where the ``$g_i$`` and ``$h_i$`` are defined as
```math
g_i &= \partial_{\hat{y}_i^{(t)}} l(y_i, \hat{y}_i^{(t-1)})\\
h_i &= \partial_{\hat{y}_i^{(t)}}^2 l(y_i, \hat{y}_i^{(t-1)})
```
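For a concrete instance (added for illustration, not in the original text), take the squared error loss ``$ l(y_i, \hat{y}_i) = (y_i-\hat{y}_i)^2 $``; differentiating with respect to ``$ \hat{y}_i^{(t-1)} $`` gives
```math
g_i = 2(\hat{y}_i^{(t-1)} - y_i), \qquad h_i = 2
```
which recovers exactly the residual and quadratic terms we saw in the MSE expansion above.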
So we can remove all the constants at the ``$t$``-th step, and the specific objective is
After we remove all the constants, the specific objective at step ``$t$`` becomes
```math
\sum_{i=1}^n [g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i)] + \Omega(f_t)
```
One of the benefits of this definition is that as long as the loss function has first and second order derivatives, we can optimize every loss function within the same framework.
This becomes our optimization goal for the new tree. One important advantage of this definition is that
it only depends on ``$g_i$`` and ``$h_i$``; this is how xgboost supports customized loss functions.
We can optimize every loss function, including logistic regression and weighted logistic regression, using exactly
the same solver that takes ``$g_i$`` and ``$h_i$`` as input!
### Model Complexity
We have introduced the details of the loss function; next we talk about the regularization term. We want to control the complexity of a tree, so we need to define it first. We define a tree ``$ f(x) $`` as
We have introduced the training step, but wait, there is one important thing, the ***regularization***!
We need to define the complexity of the tree ``$\Omega(f)$``. In order to do so, let us first refine the definition of a tree ``$ f(x) $`` as
```math
f_t(x) = w_{q(x)}, w\in R^T, q:R^d\rightarrow \{1,2,\cdots,T\}
f_t(x) = w_{q(x)}, w \in R^T, q:R^d\rightarrow \{1,2,\cdots,T\} .
```
where ``$ w $`` is the vector of scores on leaves, ``$ q $`` is a function assigning each data point to the corresponding leaf and ``$ T $`` is the number of leaves. In XGBoost, we define the complexity as
Here ``$ w $`` is the vector of scores on leaves, ``$ q $`` is a function assigning each data point to the corresponding leaf and ``$ T $`` is the number of leaves.
In XGBoost, we define the complexity as
```math
\Omega(f) = \gamma T + \frac{1}{2}\lambda \sum_{j=1}^T w_j^2
```
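As a tiny worked example (added for illustration): a tree with ``$ T = 3 $`` leaves and leaf weights ``$ w = (2, 0.1, -1) $`` has
```math
\Omega(f) = 3\gamma + \frac{1}{2}\lambda\,(2^2 + 0.1^2 + (-1)^2) = 3\gamma + \frac{1}{2}\lambda \cdot 5.01
```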
Of course there is more than one way to define the complexity, but this specific one works well in practice. The regularization is one part most tree packages treat
less carefully, or simply ignore. This is because the traditional treatment of tree learning only emphasizes improving impurity, while the complexity control
is left to heuristics. By defining it formally, we get a better idea of what we are learning, and yes, it works well in practice.
It is possible to define other forms of regularization, but this one works well in practice.
### The Structure Score
### The best score on leaf
Now we have the objective value with the ``$ t $``-th tree added:
Here is the magical part of the derivation. After reformulating the tree model, we can write the objective value with the ``$ t $``-th tree as:
```math
Obj^{(t)} &\approx \sum_{i=1}^n [g_i w_{q(x_i)} + \frac{1}{2} h_i w_{q(x_i)}^2] + \gamma T + \frac{1}{2}\lambda \sum_{j=1}^T w_j^2\\
Obj^{(t)} &\approx \sum_{i=1}^n [g_i w_{q(x_i)} + \frac{1}{2} h_i w_{q(x_i)}^2] + \gamma T + \frac{1}{2}\lambda \sum_{j=1}^T w_j^2\\
&= \sum^T_{j=1} [(\sum_{i\in I_j} g_i) w_j + \frac{1}{2} (\sum_{i\in I_j} h_i + \lambda) w_j^2 ] + \gamma T
```
@ -147,42 +192,41 @@ where ``$ I_j = \{i|q(x_i)=j\} $`` is the set of indices of data points assigned
Obj^{(t)} = \sum^T_{j=1} [G_jw_j + \frac{1}{2} (H_j+\lambda) w_j^2] +\gamma T
```
In this equation the ``$ w_j $`` are independent of each other, the form ``$ G_jw_j+\frac{1}{2}(H_j+\lambda)w_j^2 $`` is quadratic, and the best ``$ w_j $`` to minimize it can be solved for in closed form:
In this equation the ``$ w_j $`` are independent of each other, the form ``$ G_jw_j+\frac{1}{2}(H_j+\lambda)w_j^2 $`` is quadratic, and the best ``$ w_j $`` for a given structure ``$q(x)$`` and the best objective reduction we can get are:
```math
w_j^\ast &= -\frac{G_j}{H_j+\lambda}\\
Obj &= -\frac{1}{2} \sum_{j=1}^T \frac{G_j^2}{H_j+\lambda} + \gamma T
w_j^\ast = -\frac{G_j}{H_j+\lambda}\\
Obj^\ast = -\frac{1}{2} \sum_{j=1}^T \frac{G_j^2}{H_j+\lambda} + \gamma T
```
The last equation measures ***how good*** a tree structure ``$q(x)$`` is.
**Therefore, given the parameters, the gradients and the structure of the tree, we know how to set the score on each leaf.**
![Structure Score](img/struct_score.png)
### Learning the tree structure
If all this sounds a bit complicated, let us take a look at the picture and see how the scores can be calculated.
Basically, for a given tree structure, we push the statistics ``$g_i$`` and ``$h_i$`` to the leaves they belong to,
sum the statistics together, and use the formula to calculate how good the tree is.
This score is like the impurity measure in a decision tree, except that it also takes the model complexity into account.
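To make the bookkeeping concrete, here is a small R sketch (not taken from the package; it assumes we already have the gradient statistics and a leaf assignment for every instance) that computes the optimal leaf weights and the structure score:
```r
# g, h: per-instance gradient statistics; leaf: leaf index of each instance.
structure_score <- function(g, h, leaf, lambda = 1, gamma = 0) {
  G <- tapply(g, leaf, sum)   # G_j: sum of g_i over instances in leaf j
  H <- tapply(h, leaf, sum)   # H_j: sum of h_i over instances in leaf j
  w <- -G / (H + lambda)      # optimal leaf weights w_j*
  obj <- -0.5 * sum(G^2 / (H + lambda)) + gamma * length(G)
  list(weights = w, objective = obj)
}
# Toy usage with made-up numbers: four instances falling into two leaves.
structure_score(g = c(0.3, -0.2, 0.5, -0.4), h = c(0.2, 0.2, 0.25, 0.24),
                leaf = c(1, 1, 2, 2))
```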
Our algorithm aims at optimizing the objective, so it also guides us to a good tree structure. We score a structure by the ``$ Obj^{(t)} $`` mentioned just above. Since we can evaluate a tree, ideally we could enumerate all possible trees and pick the best one. In practice this is impossible, so we greedily enumerate trees no deeper than a certain depth.
Specifically we try to split a leaf into two leaves, and the score it gains is
### Learn the tree structure
Now that we have a way to measure how good a tree is, ideally we would enumerate all possible trees and pick the best one.
In practice this is impossible, so we will try to optimize one level of the tree at a time.
Specifically, we try to split a leaf into two leaves, and the score it gains is
```math
Gain = \frac{1}{2} [\frac{G_L^2}{H_L+\lambda}+\frac{G_R^2}{H_R+\lambda}-\frac{(G_L+G_R)^2}{H_L+H_R+\lambda}] - \gamma
Gain = \frac{1}{2} \left[\frac{G_L^2}{H_L+\lambda}+\frac{G_R^2}{H_R+\lambda}-\frac{(G_L+G_R)^2}{H_L+H_R+\lambda}\right] - \gamma
```
This formula can be decomposed into 1) the score on the new left leaf, 2) the score on the new right leaf, 3) the score on the original leaf and 4) the regularization on the additional leaf.
We can see an important fact here: if the gain is smaller than ``$\gamma$``, we would do better not to add that branch. This is exactly the ***pruning*** technique in tree based
models! By using the principles of supervised learning, we can naturally come up with the reason behind these techniques :)
This formula can be decomposed into 1) the score on the new left leaf, 2) the score on the new right leaf, 3) the score on the original leaf and 4) the regularization on the additional leaf.
The ``$\gamma$`` term at the end can be seen as the minimum gain required to make the split worthwhile; in the end, we prune out splits with a negative gain.
For real valued data, we usually want to search for an optimal split. To do so efficiently, we place all the instances in sorted order, like in the following picture.
![Best split](img/split_find.png)
Then a left to right scan is sufficient to calculate the structure score of all possible split solutions, and we can find the best split efficiently.
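A minimal sketch of that scan in R (illustrative only: a single feature, no handling of missing values or tied feature values) shows how prefix sums of ``$g$`` and ``$h$`` turn into the gain formula above:
```r
# x: feature values; g, h: gradient statistics of the current boosting step.
best_split <- function(x, g, h, lambda = 1, gamma = 0) {
  ord <- order(x)
  GL <- cumsum(g[ord]); HL <- cumsum(h[ord])   # left-side prefix sums
  G <- sum(g); H <- sum(h)
  GR <- G - GL; HR <- H - HL                   # right-side statistics
  gain <- 0.5 * (GL^2 / (HL + lambda) + GR^2 / (HR + lambda) -
                 G^2 / (H + lambda)) - gamma
  k <- which.max(head(gain, -1))               # drop the last position (empty right side)
  list(split_after = x[ord][k], gain = gain[k])  # split goes after the k-th sorted value
}
```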
Final words on XGBoost
----------------------
Now that you understand what a boosted tree is, you may ask: where is the introduction to [XGBoost](https://github.com/dmlc/xgboost)?
XGBoost is exactly a tool motivated by the formal principle introduced in this tutorial!
More importantly, it is developed with both deep consideration in terms of ***systems optimization*** and ***principles in machine learning***.
The goal of this library is to push the computation limits of machines to the extreme to provide a ***scalable***, ***portable*** and ***accurate*** library.
Make sure you [try it out](https://github.com/dmlc/xgboost), and most importantly, contribute your piece of wisdom (code, examples, tutorials) to the community!

View File

@ -46,6 +46,10 @@ Parameters for Tree Booster
* colsample_bytree [default=1]
- subsample ratio of columns when constructing each tree.
- range: (0,1]
* lambda [default=1]
- L2 regularization term on weights
* alpha [default=0]
- L1 regularization term on weights
Parameters for Linear Booster
-----------------------------
@ -105,7 +109,7 @@ The following parameters are only used in the console version of xgboost
* task [default=train] options: train, pred, eval, dump
- train: training using data
- pred: making prediction for test:data
- eval: for evaluating statistics specified by eval[name]=filenam
- eval: for evaluating statistics specified by eval[name]=filename
- dump: for dumping the learned model into text format (preliminary)
* model_in [default=NULL]
- path to input model, needed for test, eval, dump, if it is specified in training, xgboost will continue training from the input model

View File

@ -1,2 +0,0 @@
commonmark

View File

@ -6,12 +6,12 @@ import docutils
import subprocess
if os.environ.get('READTHEDOCS', None) == 'True':
subprocess.call('cd ..; rm -rf recommonmark recom;' +
'git clone https://github.com/tqchen/recommonmark;' +
'mv recommonmark/recommonmark recom', shell=True)
subprocess.call('cd ..; rm -rf recommonmark;' +
'git clone https://github.com/tqchen/recommonmark', shell=True)
sys.path.insert(0, os.path.abspath('..'))
from recom import parser, transform
sys.path.insert(0, os.path.abspath('../recommonmark/'))
from recommonmark import parser, transform
MarkdownParser = parser.CommonMarkParser
AutoStructify = transform.AutoStructify

View File

@ -319,7 +319,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
if len(class_probs.shape) > 1:
column_indexes = np.argmax(class_probs, axis=1)
else:
column_indexes = np.repeat(0, data.shape[0])
column_indexes = np.repeat(0, class_probs.shape[0])
column_indexes[class_probs > 0.5] = 1
return self._le.inverse_transform(column_indexes)

View File

@ -1 +1 @@
This folder contains tetstcases for xgboost.
This folder contains testcases for xgboost.

View File

@ -0,0 +1,39 @@
import numpy as np
import xgboost as xgb
dpath = 'demo/data/'
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
def test_glm():
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear', 'alpha': 0.0001, 'lambda': 1 }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
assert isinstance(bst, xgb.core.Booster)
preds = bst.predict(dtest)
labels = dtest.get_label()
err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
assert err < 0.1
def test_custom_objective():
param = {'max_depth':2, 'eta':1, 'silent':1 }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
def logregobj(preds, dtrain):
labels = dtrain.get_label()
preds = 1.0 / (1.0 + np.exp(-preds))
grad = preds - labels
hess = preds * (1.0-preds)
return grad, hess
def evalerror(preds, dtrain):
labels = dtrain.get_label()
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
assert isinstance(bst, xgb.core.Booster)
preds = bst.predict(dtest)
labels = dtest.get_label()
err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
assert err < 0.1