commit 58aa1129ea
parent b9a9d2bf45
@@ -87,8 +87,8 @@ xgb.Booster.check <- function(bst, saveraw = TRUE) {
 #' @param ... Parameters passed to \code{predict.xgb.Booster}
 #'
 #' @details
-#' Note that \code{ntreelimit} is not necesserily equal to the number of boosting iterations
-#' and it is not necesserily equal to the number of trees in a model.
+#' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
+#' and it is not necessarily equal to the number of trees in a model.
 #' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
 #' But for multiclass classification, there are multiple trees per iteration,
 #' but \code{ntreelimit} limits the number of boosting iterations.

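As a usage illustration of the `ntreelimit` note above (not part of this commit; the iris-based setup and parameter values are assumed for the example), the limit counts boosting iterations rather than individual trees in a multiclass model:

```r
library(xgboost)

# Illustrative multiclass setup: 3 classes, so each boosting iteration grows 3 trees.
x <- as.matrix(iris[, 1:4])
y <- as.integer(iris$Species) - 1

bst <- xgboost(data = x, label = y, nrounds = 10,
               objective = "multi:softprob", num_class = 3,
               max_depth = 3, verbose = 0)

# ntreelimit = 5 restricts prediction to the first 5 boosting iterations,
# i.e. 5 * 3 = 15 trees, not the first 5 trees.
pred <- predict(bst, x, ntreelimit = 5)
```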
@@ -242,7 +242,7 @@ predict.xgb.Booster.handle <- function(object, ...) {
 #' (from R or any other interface).
 #' In contrast, any R-attribute assigned to an R-object of \code{xgb.Booster} class
 #' would not be saved by \code{xgb.save} because an xgboost model is an external memory object
-#' and its serialization is handled extrnally.
+#' and its serialization is handled externally.
 #' Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't
 #' change the value of that parameter for a model.
 #' Use \code{\link{xgb.parameters<-}} to set or change model parameters.

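A minimal sketch of the distinction described in that help text, assuming the `xgb.attr<-` and `xgb.parameters<-` setters from the R package (the attribute names and values here are made up for illustration):

```r
library(xgboost)
data(agaricus.train, package = "xgboost")

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nrounds = 2, objective = "binary:logistic",
               max_depth = 2, eta = 1, verbose = 0)

# A plain R attribute lives only in the R object and is not kept by xgb.save/xgb.load:
attr(bst, "my_note") <- "not persisted"

# xgb.attr<- stores the attribute inside the booster, so it survives serialization:
xgb.attr(bst, "my_note") <- "persisted with the model"

# Model parameters are changed through xgb.parameters<-, not through attributes:
xgb.parameters(bst) <- list(eta = 0.1)
```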
@@ -5,7 +5,7 @@
 #' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
 #' @param model generated by the \code{xgb.train} function.
 #' @param data the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
-#' @param label the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
+#' @param label the label vector used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
 #' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.
 #'
 #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.

@@ -14,7 +14,7 @@
 #' This function is for both linear and tree models.
 #'
 #' \code{data.table} is returned by the function.
-#' The columns are :
+#' The columns are:
 #' \itemize{
 #' \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
 #' \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models);

@@ -33,7 +33,7 @@
 #'
 #' Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
 #'
-#' If you need to remember one thing only: until you want to leave us early, don't eat a mushroom which has no odor :-)
+#' If you need to remember only one thing: unless you want to leave us early, don't eat a mushroom which has no odor :-)
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')

@@ -94,7 +94,7 @@ xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, labe
 d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split])
 apply(c & d, 2, . %>% target %>% sum) -> vec
 
-result <- result[, "RealCover" := as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo := NULL]
+result <- result[, "RealCover" := as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][, MissingNo := NULL]
 }
 }
 result

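For context on the `xgb.importance` documentation and code touched above, a short usage sketch following the agaricus example the help page itself refers to (illustrative only, not part of this commit):

```r
library(xgboost)
data(agaricus.train, package = "xgboost")

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nrounds = 2, objective = "binary:logistic",
               max_depth = 2, eta = 1, verbose = 0)

# Feature names can be extracted from the sparse matrix used for training.
imp <- xgb.importance(feature_names = colnames(agaricus.train$data), model = bst)
head(imp)  # per-feature gain/weight summary as described in the help text

# Passing data and label additionally triggers the co-occurence computation
# (the RealCover columns built in the code block above).
imp_co <- xgb.importance(feature_names = colnames(agaricus.train$data), model = bst,
                         data = agaricus.train$data, label = agaricus.train$label)
```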
@@ -20,7 +20,7 @@ xgb.load <- function(modelfile) {
 stop("xgb.load: modelfile cannot be NULL")
 
 handle <- xgb.Booster(modelfile = modelfile)
-# re-use modelfile if it is raw so we donot need to serialize
+# re-use modelfile if it is raw so we do not need to serialize
 if (typeof(modelfile) == "raw") {
 bst <- xgb.handleToBooster(handle, modelfile)
 } else {

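The fixed comment refers to the raw-vector path of `xgb.load`. A small sketch of both load paths, assuming the `xgb.save`/`xgb.save.raw` helpers from the same package (file name chosen arbitrarily):

```r
library(xgboost)
data(agaricus.train, package = "xgboost")

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nrounds = 2, objective = "binary:logistic", verbose = 0)

# Load from a file path...
xgb.save(bst, "xgb.model")
bst_file <- xgb.load("xgb.model")

# ...or from a raw vector; here the raw data can be reused directly,
# which is what the corrected comment is about.
raw_model <- xgb.save.raw(bst)
bst_raw <- xgb.load(raw_model)
```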
@@ -86,7 +86,7 @@
 #' @param save_period when it is non-NULL, model is saved to disk after every \code{save_period} rounds,
 #' 0 means save at the end. The saving is handled by the \code{\link{cb.save.model}} callback.
 #' @param save_name the name or path for periodically saved model file.
-#' @param xgb_model a previously built model to continue the trainig from.
+#' @param xgb_model a previously built model to continue the training from.
 #' Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a
 #' file with a previously saved model.
 #' @param callbacks a list of callback functions to perform various task during boosting.

@@ -110,7 +110,7 @@ This parameter is passed to the \code{\link{cb.early.stop}} callback.}
 
 \item{save_name}{the name or path for periodically saved model file.}
 
-\item{xgb_model}{a previously built model to continue the trainig from.
+\item{xgb_model}{a previously built model to continue the training from.
 Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a
 file with a previously saved model.}
 

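To illustrate the `xgb_model` parameter whose description is fixed above, a minimal continued-training sketch (round counts and parameter values are arbitrary):

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

params <- list(objective = "binary:logistic", max_depth = 2, eta = 1)

# Train for a few rounds, then continue training from the existing booster.
bst1 <- xgb.train(params, dtrain, nrounds = 2)
bst2 <- xgb.train(params, dtrain, nrounds = 2, xgb_model = bst1)

# Per the documentation, xgb_model may also be raw model data
# or the name of a previously saved model file.
```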
@@ -16,8 +16,8 @@
 
 XGBoost is an optimized distributed gradient boosting library designed to be highly ***efficient***, ***flexible*** and ***portable***.
 It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework.
-XGBoost provides a parallel tree boosting(also known as GBDT, GBM) that solve many data science problems in a fast and accurate way.
-The same code runs on major distributed environment(Hadoop, SGE, MPI) and can solve problems beyond billions of examples.
+XGBoost provides a parallel tree boosting (also known as GBDT, GBM) that solve many data science problems in a fast and accurate way.
+The same code runs on major distributed environment (Hadoop, SGE, MPI) and can solve problems beyond billions of examples.
 
 What's New
 ----------

@@ -29,7 +29,7 @@ What's New
 Ask a Question
 --------------
 * For reporting bugs please use the [xgboost/issues](https://github.com/dmlc/xgboost/issues) page.
-* For generic questions for to share your experience using xgboost please use the [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/)
+* For generic questions or to share your experience using XGBoost please use the [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/)
 
 Help to Make XGBoost Better
 ---------------------------

@@ -417,14 +417,14 @@ The case studied here is not enough complex to show that. Check [Kaggle website]
 
 Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age.
 
-Linear model may not be that smart in this scenario.
+Linear models may not be that smart in this scenario.
 
 Special Note: What about Random Forests™?
 -----------------------------------------
 
 As you may know, [Random Forests™](http://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family.
 
-Both trains several decision trees for one dataset. The *main* difference is that in Random Forests™, trees are independent and in boosting, the tree `N+1` focus its learning on the loss (<=> what has not been well modeled by the tree `N`).
+Both train several decision trees for one dataset. The *main* difference is that in Random Forests™, trees are independent and in boosting, the tree `N+1` focus its learning on the loss (<=> what has not been well modeled by the tree `N`).
 
 This difference have an impact on a corner case in feature importance analysis: the *correlated features*.
 

@@ -1,7 +1,7 @@
-The document of xgboost is generated with recommonmark and sphinx.
+The documentation of xgboost is generated with recommonmark and sphinx.
 
 You can build it locally by typing "make html" in this folder.
 - clone https://github.com/tqchen/recommonmark to root
 - type make html
 
-Checkout https://recommonmark.readthedocs.org for guide on how to write markdown with extensions used in this doc, such as math formulas and table of content.
+Checkout https://recommonmark.readthedocs.org for guide on how to write markdown with extensions used in this doc, such as math formulas and table of content.

@@ -56,7 +56,7 @@ Yes, xgboost implements LambdaMART. Checkout the objective section in [parameter
 
 How to deal with Missing Value
 ------------------------------
-xgboost support missing value by default
+xgboost supports missing value by default.
 
 
 Slightly different result between runs

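A small sketch of the missing-value handling mentioned in that FAQ entry, using the R interface (toy data invented for the example; `missing = NA` is the default anyway and is shown only for emphasis):

```r
library(xgboost)

# A toy matrix with missing entries; xgboost treats NA as missing by default
# and learns a default direction for missing values at each split.
x <- matrix(c(1, NA, 3,
              4,  5, NA,
              7,  8, 9,
              NA, 2, 1), ncol = 3, byrow = TRUE)
y <- c(0, 1, 0, 1)

dtrain <- xgb.DMatrix(x, label = y, missing = NA)
bst <- xgb.train(list(objective = "binary:logistic", max_depth = 2),
                 dtrain, nrounds = 2)
```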
@@ -1,6 +1,6 @@
 # Get Started with XGBoost
 
-This is a quick started tutorial showing snippets for you to quickly try out xgboost
+This is a quick start tutorial showing snippets for you to quickly try out xgboost
 on the demo dataset on a binary classification task.
 
 ## Links to Helpful Other Resources

@@ -8,7 +8,7 @@ Before running XGboost, we must set three types of parameters: general parameter
 
 Parameters in R Package
 -----------------------
-In R-package, you can use .(dot) to replace under score in the parameters, for example, you can use max.depth as max_depth. The underscore parameters are also valid in R.
+In R-package, you can use .(dot) to replace underscore in the parameters, for example, you can use max.depth as max_depth. The underscore parameters are also valid in R.
 
 General Parameters
 ------------------

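A quick illustration of the dot-versus-underscore naming described in that paragraph (not part of the patch; dataset and values are the package's usual demo data):

```r
library(xgboost)
data(agaricus.train, package = "xgboost")

# Both spellings are accepted by the R package; dots are translated to underscores.
bst1 <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                nrounds = 2, objective = "binary:logistic", max.depth = 2, verbose = 0)
bst2 <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                nrounds = 2, objective = "binary:logistic", max_depth = 2, verbose = 0)
```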
@@ -29,13 +29,13 @@ Parameters for Tree Booster
 - step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinks the feature weights to make the boosting process more conservative.
 - range: [0,1]
 * gamma [default=0]
-- minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
+- minimum loss reduction required to make a further partition on a leaf node of the tree. The larger, the more conservative the algorithm will be.
 - range: [0,∞]
 * max_depth [default=6]
-- maximum depth of a tree, increase this value will make model more complex / likely to be overfitting.
+- maximum depth of a tree, increase this value will make the model more complex / likely to be overfitting.
 - range: [1,∞]
 * min_child_weight [default=1]
-- minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.
+- minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.
 - range: [0,∞]
 * max_delta_step [default=0]
 - Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might help control the update

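As a compact reference for the tree-booster parameters listed in this hunk, a minimal R sketch with the documented defaults (the comments paraphrase the descriptions above; values are illustrative):

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

params <- list(
  objective        = "binary:logistic",
  eta              = 0.3,  # step size shrinkage, range [0,1]
  gamma            = 0,    # minimum loss reduction to make a further partition
  max_depth        = 6,    # deeper trees make the model more complex
  min_child_weight = 1,    # minimum sum of instance weight (hessian) in a child
  max_delta_step   = 0     # 0 means no constraint on the weight estimation step
)
bst <- xgb.train(params, dtrain, nrounds = 10)
```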
@@ -85,7 +85,7 @@ Additional parameters for Dart Booster
 * normalize_type [default="tree"]
 - type of normalization algorithm.
 - "tree": new trees have the same weight of each of dropped trees.
-- weight of new trees are 1 / (k + learnig_rate)
+- weight of new trees are 1 / (k + learning_rate)
 - dropped trees are scaled by a factor of k / (k + learning_rate)
 - "forest": new trees have the same weight of sum of dropped trees (forest).
 - weight of new trees are 1 / (1 + learning_rate)

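For context, a small sketch of selecting the DART booster and the `normalize_type` option described above (parameter choices such as `rate_drop = 0.1` are assumptions for illustration, not defaults from this hunk):

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

params <- list(
  booster        = "dart",
  objective      = "binary:logistic",
  normalize_type = "tree",  # new tree weighted as 1 / (k + learning_rate), per the doc above
  rate_drop      = 0.1
)
bst <- xgb.train(params, dtrain, nrounds = 10)
```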
@@ -105,10 +105,10 @@ Parameters for Linear Booster
 * alpha [default=0]
 - L1 regularization term on weights, increase this value will make model more conservative.
 * lambda_bias
-- L2 regularization term on bias, default 0(no L1 reg on bias because it is not important)
+- L2 regularization term on bias, default 0 (no L1 reg on bias because it is not important)
 
 Parameters for Tweedie Regression
------------------------------
+---------------------------------
 * tweedie_variance_power [default=1.5]
 - Parameter that controls the variance of the tweedie distribution. Set closer to 2 to shift towards a gamma distribution and closer to 1 to shift towards a poisson distribution.
 

@@ -132,7 +132,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
 - the initial prediction score of all instances, global bias
 - for sufficient number of iterations, changing this value will not have too much effect.
 * eval_metric [ default according to objective ]
-- evaluation metrics for validation data, a default metric will be assigned according to objective( rmse for regression, and error for classification, mean average precision for ranking )
+- evaluation metrics for validation data, a default metric will be assigned according to objective (rmse for regression, and error for classification, mean average precision for ranking )
 - User can add multiple evaluation metrics, for python user, remember to pass the metrics in as list of parameters pairs instead of map, so that latter 'eval_metric' won't override previous one
 - The choices are listed below:
 - "rmse": [root mean square error](http://en.wikipedia.org/wiki/Root_mean_square_error)

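To show the `eval_metric` behavior described above from the R side (a sketch on the demo data; the watchlist names are arbitrary):

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest  <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

# Without eval_metric the default for a classification objective is "error";
# setting it explicitly overrides that default for every dataset in the watchlist.
params <- list(objective = "binary:logistic", eval_metric = "auc")
bst <- xgb.train(params, dtrain, nrounds = 5,
                 watchlist = list(train = dtrain, test = dtest))
```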
@@ -163,12 +163,12 @@ The following parameters are only used in the console version of xgboost
 * test:data
 - The path of test data to do prediction
 * save_period [default=0]
-- the period to save the model, setting save_period=10 means that for every 10 rounds XGBoost will save the model, setting it to 0 means not save any model during training.
+- the period to save the model, setting save_period=10 means that for every 10 rounds XGBoost will save the model, setting it to 0 means not saving any model during the training.
 * task [default=train] options: train, pred, eval, dump
 - train: training using data
 - pred: making prediction for test:data
 - eval: for evaluating statistics specified by eval[name]=filename
-- dump: for dump the learned model into text format(preliminary)
+- dump: for dump the learned model into text format (preliminary)
 * model_in [default=NULL]
 - path to input model, needed for test, eval, dump, if it is specified in training, xgboost will continue training from the input model
 * model_out [default=NULL]

@@ -94,7 +94,7 @@ struct LearnerTrainParam
 .add_enum("auto", 0)
 .add_enum("col", 1)
 .add_enum("row", 2)
-.describe("Data split mode for distributed trainig. ");
+.describe("Data split mode for distributed training.");
 DMLC_DECLARE_FIELD(tree_method).set_default(0)
 .add_enum("auto", 0)
 .add_enum("approx", 1)

@@ -137,10 +137,10 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
 DMLC_DECLARE_FIELD(cache_opt).set_default(true).describe(
 "EXP Param: Cache aware optimization.");
 DMLC_DECLARE_FIELD(silent).set_default(false).describe(
-"Do not print information during trainig.");
+"Do not print information during training.");
 DMLC_DECLARE_FIELD(monotone_constraints)
 .set_default(std::vector<int>())
-.describe("Constraint of variable monotinicity");
+.describe("Constraint of variable monotonicity");
 // add alias of parameters
 DMLC_DECLARE_ALIAS(reg_lambda, lambda);
 DMLC_DECLARE_ALIAS(reg_alpha, alpha);