Merge remote-tracking branch 'dmlc/master'

2015-08-05 12:07:41 +02:00
parent 951ba267cf 752cf4c95d
commit 740db8ff02
62 changed files with 1802 additions and 834 deletions
--- a/R-package/.Rbuildignore
+++ b/R-package/.Rbuildignore
@@ -3,3 +3,4 @@
 \.dll$
 ^.*\.Rproj$
 ^\.Rproj\.user$
+README.md
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,16 +1,16 @@
 Package: xgboost
 Type: Package
-Title: eXtreme Gradient Boosting
-Version: 0.4-0
-Date: 2015-05-11
+Title: Extreme Gradient Boosting
+Version: 0.4-2
+Date: 2015-08-01
 Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>, Michael Benesty <michael@benesty.fr>
 Maintainer: Tong He <hetong007@gmail.com>
-Description: Xgboost is short for eXtreme Gradient Boosting, which is an 
-    efficient and scalable implementation of gradient boosting framework. 
-    This package is an R wrapper of xgboost. The package includes efficient 
+Description: Extreme Gradient Boosting, which is an 
+    efficient implementation of gradient boosting framework. 
+    This package is its R interface. The package includes efficient 
    linear model solver and tree learning algorithms. The package can automatically 
-    do parallel computation with OpenMP, and it can be more than 10 times faster
-    than existing gradient boosting packages such as gbm. It supports various
+    do parallel computation on a single machine which could be more than 10 times faster
+    than existing gradient boosting packages. It supports various
    objective functions, including regression, classification and ranking. The
    package is made to be extensible, so that users are also allowed to define
    their own objectives easily.
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -288,7 +288,7 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) {
    }
    ret <- paste(ret, sprintf("%f", mean(stats)), sep="")
    if (showsd) {
-      ret <- paste(ret, sprintf("+%f", sd(stats)), sep="")
+      ret <- paste(ret, sprintf("+%f", stats::sd(stats)), sep="")
    }
  }
  return (ret)
@@ -313,7 +313,7 @@ xgb.createFolds <- function(y, k = 10)
    if(cuts < 2) cuts <- 2
    if(cuts > 5) cuts <- 5
    y <- cut(y,
-             unique(quantile(y, probs = seq(0, 1, length = cuts))),
+             unique(stats::quantile(y, probs = seq(0, 1, length = cuts))),
             include.lowest = TRUE)
  }

--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -240,7 +240,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
    else colnames <- colnamesMean
    
    type <- rep(x = "numeric", times = length(colnames))
-    dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
+    dt <- utils::read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
    split <- str_split(string = history, pattern = "\t")
    
    for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)}
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -134,34 +134,33 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
    allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
  }
  
-  yes <- allTrees[!is.na(Yes),Yes]
-                                                                                      
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
+  yes <- allTrees[!is.na(Yes), Yes]
+  
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"), 
      j = "Yes.Feature", 
-      value = allTrees[ID == yes,Feature])
-
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
+      value = allTrees[ID %in% yes, Feature])
+  
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "Yes.Cover", 
-      value = allTrees[ID == yes,Cover])
-
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
-    j = "Yes.Quality", 
-    value = allTrees[ID == yes,Quality])
+      value = allTrees[ID %in% yes, Cover])
  
-  no <- allTrees[!is.na(No),No]
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
+      j = "Yes.Quality", 
+      value = allTrees[ID %in% yes, Quality])
+  no <- allTrees[!is.na(No), No]
  
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "No.Feature", 
-      value = allTrees[ID == no,Feature])
+      value = allTrees[ID %in% no, Feature])
  
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "No.Cover", 
-      value = allTrees[ID == no,Cover])
+      value = allTrees[ID %in% no, Cover])
  
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"), 
      j = "No.Quality", 
-      value = allTrees[ID == no,Quality])
-        
+      value = allTrees[ID %in% no, Quality])
+  
  allTrees
 }

--- a/R-package/R/xgb.plot.importance.R
+++ b/R-package/R/xgb.plot.importance.R
@@ -33,7 +33,7 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
  if (!"data.table" %in% class(importance_matrix))  {     
    stop("importance_matrix: Should be a data.table.")
  }
-  if (!require(ggplot2, quietly = TRUE)) {
+  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("ggplot2 package is required for plotting the importance", call. = FALSE)
  }
  if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
@@ -46,7 +46,7 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
  clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
  importance_matrix[,"Cluster":=clusters$cluster %>% as.character]
    
-  plot <- ggplot(importance_matrix, aes(x=reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+  geom_bar(aes(fill=Cluster), stat="identity", position="identity") + coord_flip() + xlab("Features") + ylab("Gain") + ggtitle("Feature importance") + theme(plot.title = element_text(lineheight=.9, face="bold"), panel.grid.major.y = element_blank() )
+  plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+  ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() )
  
  return(plot)  
 }
--- a/R-package/README.md
+++ b/R-package/README.md
@@ -1,10 +1,19 @@
 R package for xgboost
 =====================

+[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost)
+[![CRAN Downloads](http://cranlogs.r-pkg.org/badges/xgboost)](http://cran.rstudio.com/web/packages/xgboost/index.html)
+
 Installation
 ------------

-For up-to-date version (which is recommended), please install from github. Windows user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
+We are [on CRAN](https://cran.r-project.org/web/packages/xgboost/index.html) now. For the stable, pre-compiled (for Windows and OS X) version, please install from CRAN:
+
+```r
+install.packages('xgboost')
+```
+
+For the up-to-date version, please install from GitHub. Windows users will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.

 ```r
 devtools::install_github('dmlc/xgboost',subdir='R-package')
@@ -24,10 +33,10 @@ If you face an issue installing the package using  ```devtools::install_github``
 ```
 devtools::install_github('dmlc/xgboost',subdir='R-package')
 Downloading github repo dmlc/xgboost@master
-Error in function (type, msg, asError = TRUE)  : 
+Error in function (type, msg, asError = TRUE)  :
  Peer certificate cannot be authenticated with given CA certificates
 ```
-To get around this you can build the package locally as mentioned [here](https://github.com/dmlc/xgboost/issues/347) - 
+To get around this you can build the package locally as mentioned [here](https://github.com/dmlc/xgboost/issues/347) -
 ```
 1. Clone the current repository and set your workspace to xgboost/R-package/
 2. Run R CMD INSTALL --build . in terminal to get the tarball.
--- a/R-package/demo/custom_objective.R
+++ b/R-package/demo/custom_objective.R
@@ -33,7 +33,7 @@ evalerror <- function(preds, dtrain) {
  return(list(metric = "error", value = err))
 }

-param <- list(max.depth=2,eta=1,nthread = 2, silent=1, 
+param <- list(max.depth=2, eta=1, nthread = 2, silent=1, 
              objective=logregobj, eval_metric=evalerror)
 print ('start training with user customized objective')
 # training with customized objective, we can also do step by step training
@@ -57,9 +57,9 @@ logregobjattr <- function(preds, dtrain) {
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
 }
-
+param <- list(max.depth=2, eta=1, nthread = 2, silent=1, 
+              objective=logregobjattr, eval_metric=evalerror)
 print ('start training with user customized objective, with additional attributes in DMatrix')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist, 
-                 objective=logregobj, eval_metric=evalerror)
+bst <- xgb.train(param, dtrain, num_round, watchlist)
--- a/R-package/src/xgboost_R.cpp
+++ b/R-package/src/xgboost_R.cpp
@@ -32,14 +32,14 @@ extern "C" {
 bool CheckNAN(double v) {
  return ISNAN(v);
 }
-bool LogGamma(double v) {
+double LogGamma(double v) {
  return lgammafn(v);
 }
 }  // namespace utils

 namespace random {
 void Seed(unsigned seed) {
-  warning("parameter seed is ignored, please set random seed using set.seed");
+  //  warning("parameter seed is ignored, please set random seed using set.seed");
 }
 double Uniform(void) {
  return unif_rand();
--- a/R-package/vignettes/xgboostPresentation.Rmd
+++ b/R-package/vignettes/xgboostPresentation.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "Xgboost presentation"
-output: 
+output:
  rmarkdown::html_vignette:
    css: vignette.css
    number_sections: yes
@@ -16,7 +16,7 @@ vignette: >
 Introduction
 ============

-**Xgboost** is short for e**X**treme **G**radient **Boost**ing package. 
+**Xgboost** is short for e**X**treme **G**radient **Boost**ing package.

 The purpose of this Vignette is to show you how to use **Xgboost** to build a model and make predictions.

@@ -25,9 +25,9 @@ It is an efficient and scalable implementation of gradient boosting framework by
 - *linear* model ;
 - *tree learning* algorithm.

-It supports various objective functions, including *regression*, *classification* and *ranking*. The package is made to be extendible, so that users are also allowed to define their own objective functions easily. 
+It supports various objective functions, including *regression*, *classification* and *ranking*. The package is made to be extendible, so that users are also allowed to define their own objective functions easily.

-It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions. 
+It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions.

 It has several features:

@@ -64,7 +64,7 @@ Formerly available versions can be obtained from the CRAN [archive](http://cran.
 Learning
 ========

-For the purpose of this tutorial we will load **Xgboost** package.
+For the purpose of this tutorial we will load **XGBoost** package.

 ```{r libLoading, results='hold', message=F, warning=F}
 require(xgboost)
@@ -73,7 +73,7 @@ require(xgboost)
 Dataset presentation
 --------------------

-In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the the same as you will use on in your every day life :-). 
+In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the same as you will use in your everyday life :-).

 Mushroom data is cited from UCI Machine Learning Repository. @Bache+Lichman:2013.

@@ -85,7 +85,7 @@ We will load the `agaricus` datasets embedded with the package and will link the
 The datasets are already split in:

 * `train`: will be used to build the model ;
-* `test`: will be used to assess the quality of our model. 
+* `test`: will be used to assess the quality of our model.

 Why *split* the dataset in two parts?

@@ -115,7 +115,7 @@ dim(train$data)
 dim(test$data)
 ```

-This dataset is very small to not make the **R** package too heavy, however **Xgboost** is built to manage huge dataset very efficiently.
+This dataset is very small to not make the **R** package too heavy, however **XGBoost** is built to manage huge dataset very efficiently.

 As seen below, the `data` are stored in a `dgCMatrix` which is a *sparse* matrix and `label` vector is a `numeric` vector (`{0,1}`):

@@ -124,7 +124,7 @@ class(train$data)[1]
 class(train$label)
 ```

-Basic Training using Xgboost
+Basic Training using XGBoost
 ----------------------------

 This step is the most critical part of the process for the quality of our model.
@@ -160,7 +160,7 @@ bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth

 #### xgb.DMatrix

-**Xgboost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be usefull for the most advanced features we will discover later.
+**XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later.

 ```{r trainingDmatrix, message=F, warning=F}
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
@@ -169,7 +169,7 @@ bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround

 #### Verbose option

-**Xgboost** has severa features to help you to view how the learning progress internally. The purpose is to help you to set the best parameters, which is the key of your model quality.
+**XGBoost** has several features to help you view how the learning progresses internally. The purpose is to help you set the best parameters, which is the key to your model quality.

 One of the simplest way to see the training progress is to set the `verbose` option (see below for more advanced technics).

@@ -188,7 +188,7 @@ bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, o
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 2)
 ```

-Basic prediction using Xgboost
+Basic prediction using XGBoost
 ==============================

 Perform the prediction
@@ -211,7 +211,7 @@ These numbers doesn't look like *binary classification* `{0,1}`. We need to perf
 Transform the regression in a binary classification
 ---------------------------------------------------

-The only thing that **Xgboost** does is a *regression*. **Xgboost** is using `label` vector to build its *regression* model.
+The only thing that **XGBoost** does is a *regression*. **XGBoost** is using `label` vector to build its *regression* model.

 How can we use a *regression* model to perform a binary classification?

@@ -240,7 +240,7 @@ Steps explanation:
 2. `probabilityVectorPreviouslyComputed != test$label` computes the vector of error between true data and computed probabilities ;
 3. `mean(vectorOfErrors)` computes the *average error* itself.

-The most important thing to remember is that **to do a classification, you just do a regression to the** `label` **and then apply a threshold**. 
+The most important thing to remember is that **to do a classification, you just do a regression to the** `label` **and then apply a threshold**.

 *Multiclass* classification works in a similar way.

@@ -269,7 +269,7 @@ Both `xgboost` (simple) and `xgb.train` (advanced) functions train models.

 One of the special feature of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a time when having too many rounds lead to an overfitting. You can see this feature as a cousin of cross-validation method. The following technics will help you to avoid overfitting or optimizing the learning time in stopping it as soon as possible.

-One way to measure progress in learning of a model is to provide to **Xgboost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
+One way to measure progress in learning of a model is to provide to **XGBoost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.

 > in some way it is similar to what we have done above with the average error. The main difference is that below it was after building the model, and now it is during the construction that we measure errors.

@@ -281,7 +281,7 @@ watchlist <- list(train=dtrain, test=dtest)
 bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
 ```

-**Xgboost** has computed at each round the same average error metric than seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset. 
+**XGBoost** has computed at each round the same average error metric as seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.

 Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.

@@ -298,13 +298,13 @@ bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchli
 Linear boosting
 ---------------

-Until know, all the learnings we have performed were based on boosting trees. **Xgboost** implements a second algorithm, based on linear boosting. The only difference with previous command is `booster = "gblinear"` parameter (and removing `eta` parameter).
+Until now, all the learnings we have performed were based on boosting trees. **XGBoost** implements a second algorithm, based on linear boosting. The only difference from the previous command is the `booster = "gblinear"` parameter (and the removal of the `eta` parameter).

 ```{r linearBoosting, message=F, warning=F}
 bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
 ```

-In this specific case, *linear boosting* gets sligtly better performance metrics than decision trees based algorithm. 
+In this specific case, *linear boosting* gets slightly better performance metrics than the decision-tree-based algorithm.

 In simple cases, it will happem because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better to catch a non linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use.

@@ -340,7 +340,7 @@ print(paste("test-error=", err))
 View feature importance/influence from the learnt model
 -------------------------------------------------------

-Feature importance is similar to R gbm package's relative influence (rel.inf). 
+Feature importance is similar to R gbm package's relative influence (rel.inf).

 ```
 importance_matrix <- xgb.importance(model = bst)
@@ -370,7 +370,7 @@ Save and load models

 May be your dataset is big, and it takes time to train a model on it? May be you are not a big fan of loosing time in redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.

-Hopefully for you, **Xgboost** implements such functions.
+Hopefully for you, **XGBoost** implements such functions.

 ```{r saveModel, message=F, warning=F}
 # save model to binary local file
@@ -397,7 +397,7 @@ file.remove("./xgboost.model")

 > result is `0`? We are good!

-In some very specific cases, like when you want to pilot **Xgboost** from `caret` package, you will want to save the model as a *R* binary vector. See below how to do it.
+In some very specific cases, like when you want to pilot **XGBoost** from the `caret` package, you will want to save the model as an *R* binary vector. See below how to do it.

 ```{r saveLoadRBinVectorModel, message=F, warning=F}
 # save model to R's raw vector
@@ -412,9 +412,9 @@ pred3 <- predict(bst3, test$data)

 # pred2 should be identical to pred
 print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
-``` 
+```

-> Again `0`? It seems that `Xgboost` works pretty well!
+> Again `0`? It seems that `XGBoost` works pretty well!

 References
 ==========