From 9f5889f1e385623d473fa9b7cd588fd69cb9c584 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Wed, 4 Feb 2015 23:59:53 +0100 Subject: [PATCH 01/26] new included feature in dt.tree function --- R-package/NAMESPACE | 1 + R-package/R/xgb.model.dt.tree.R | 7 ++++++- R-package/man/xgb.model.dt.tree.Rd | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index fab1546a2..7d9c64563 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -41,6 +41,7 @@ importFrom(ggplot2,ylab) importFrom(magrittr,"%>%") importFrom(magrittr,add) importFrom(magrittr,not) +importFrom(stringr,str_detect) importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_match) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 373f29403..42ca8237b 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -14,6 +14,7 @@ #' @importFrom stringr str_split #' @importFrom stringr str_extract #' @importFrom stringr str_trim +#' @importFrom stringr str_detect #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. @@ -37,6 +38,8 @@ #' \item \code{Quality}: it's the gain related to the split in this specific node ; #' \item \code{Cover}: metric to measure the number of observation affected by the split ; #' \item \code{Tree}: ID of the tree. It is included in the main ID ; +#' \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; +#' \item \code{Included}: \code{boolean} value which indicates if this value has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}) ; #' } #' #' @examples @@ -158,6 +161,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), j = "No.Quality", value = allTrees[ID == no,Quality]) + + allTrees[,"Included":=F][ID == allTrees[!is.na(Yes), Yes], Included:=T][str_detect(ID, "-0$"), Included:=T] allTrees } @@ -165,4 +170,4 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... -globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence")) \ No newline at end of file +globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence", "Included")) \ No newline at end of file diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 069e7ad77..31910cc49 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -39,6 +39,8 @@ The content of the \code{data.table} is organised that way: \item \code{Quality}: it's the gain related to the split in this specific node ; \item \code{Cover}: metric to measure the number of observation affected by the split ; \item \code{Tree}: ID of the tree. 
It is included in the main ID ; + \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; + \item \code{Included}: \code{boolean} value which indicates if this value has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}) ; } } \examples{ From 92652bffa1b42f21764c3eb19b82fa229aa409a3 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Thu, 5 Feb 2015 00:01:13 +0100 Subject: [PATCH 02/26] wording --- R-package/R/xgb.model.dt.tree.R | 2 +- R-package/man/xgb.model.dt.tree.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 42ca8237b..39d51d942 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -39,7 +39,7 @@ #' \item \code{Cover}: metric to measure the number of observation affected by the split ; #' \item \code{Tree}: ID of the tree. It is included in the main ID ; #' \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; -#' \item \code{Included}: \code{boolean} value which indicates if this value has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}) ; +#' \item \code{Included}: \code{boolean} value which indicates if this feature has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}) ; #' } #' #' @examples diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 31910cc49..105724471 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -40,7 +40,7 @@ The content of the \code{data.table} is organised that way: \item \code{Cover}: metric to measure the number of observation affected by the split ; \item \code{Tree}: ID of the tree. It is included in the main ID ; \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; - \item \code{Included}: \code{boolean} value which indicates if this value has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}) ; + \item \code{Included}: \code{boolean} value which indicates if this feature has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}) ; } } \examples{ From b7526671ba43ec327447817e1e037861fed3b54a Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Thu, 5 Feb 2015 00:03:39 +0100 Subject: [PATCH 03/26] wording --- R-package/R/xgb.model.dt.tree.R | 2 +- R-package/man/xgb.model.dt.tree.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 39d51d942..acdcdbdfa 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -39,7 +39,7 @@ #' \item \code{Cover}: metric to measure the number of observation affected by the split ; #' \item \code{Tree}: ID of the tree. It is included in the main ID ; #' \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; -#' \item \code{Included}: \code{boolean} value which indicates if this feature has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}) ; +#' \item \code{Included}: \code{boolean} value which indicates if this feature has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}). 
By convention stem feature is always included ; #' } #' #' @examples diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 105724471..8b9eb6a13 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -40,7 +40,7 @@ The content of the \code{data.table} is organised that way: \item \code{Cover}: metric to measure the number of observation affected by the split ; \item \code{Tree}: ID of the tree. It is included in the main ID ; \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; - \item \code{Included}: \code{boolean} value which indicates if this feature has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}) ; + \item \code{Included}: \code{boolean} value which indicates if this feature has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}). By convention stem feature is always included ; } } \examples{ From 68290546ca4fdda3f240ebb6529805924a64ec03 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Thu, 5 Feb 2015 09:53:21 +0100 Subject: [PATCH 04/26] simplidied included column computation --- R-package/R/xgb.model.dt.tree.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index acdcdbdfa..c88c16989 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -129,8 +129,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*") coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") - dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID] - + dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID][,"Included":=F][ID == yesBranch, Included:=T][1, Included:=T] + allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } @@ -161,9 +161,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), j = "No.Quality", value = allTrees[ID == no,Quality]) - - allTrees[,"Included":=F][ID == allTrees[!is.na(Yes), Yes], Included:=T][str_detect(ID, "-0$"), Included:=T] - + allTrees } From a82a942cd60216559aabe3c950c5200bca21cf9e Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Thu, 5 Feb 2015 17:25:37 +0100 Subject: [PATCH 05/26] add importance feature sign --- R-package/R/xgb.importance.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 094171382..51cf2a258 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -73,7 +73,7 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N } treeDump <- function(feature_names, text){ - result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = 
Feature][,`:=`(Gain=Gain/sum(Gain),Cover=Cover/sum(Cover),Frequence=Frequence/sum(Frequence))][order(-Gain)] + result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N, Included = sum(Included)), by = Feature][,`:=`(Gain=Gain/sum(Gain),Cover=Cover/sum(Cover), Frequence = Frequence/sum(Frequence), Included = Included/Frequence)][,Gain:= ifelse(Included >= 0.5, Gain, -Gain)][order(-Gain)] result } From 85186a2e55a91f08a49cee332a25968375cfb185 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 6 Feb 2015 11:44:09 +0100 Subject: [PATCH 06/26] remove buggy feature --- R-package/R/xgb.importance.R | 2 +- R-package/R/xgb.model.dt.tree.R | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 51cf2a258..710143fa1 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -73,7 +73,7 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N } treeDump <- function(feature_names, text){ - result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N, Included = sum(Included)), by = Feature][,`:=`(Gain=Gain/sum(Gain),Cover=Cover/sum(Cover), Frequence = Frequence/sum(Frequence), Included = Included/Frequence)][,Gain:= ifelse(Included >= 0.5, Gain, -Gain)][order(-Gain)] + result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = Feature][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))] result } diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index c88c16989..9c570c44b 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -39,7 +39,6 @@ #' \item \code{Cover}: metric to measure the number of observation affected by the split ; #' \item \code{Tree}: ID of the tree. It is included in the main ID ; #' \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; -#' \item \code{Included}: \code{boolean} value which indicates if this feature has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}). 
By convention stem feature is always included ; #' } #' #' @examples @@ -129,7 +128,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*") coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") - dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID][,"Included":=F][ID == yesBranch, Included:=T][1, Included:=T] + dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID] allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } @@ -168,4 +167,4 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... -globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence", "Included")) \ No newline at end of file +globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence")) \ No newline at end of file From 85739c537dbfdb516192fbb11990ce35523dbdd5 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sat, 7 Feb 2015 23:40:49 +0100 Subject: [PATCH 07/26] new doc --- R-package/R/xgboost.R | 73 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 12 deletions(-) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index c72c4d5b0..b3c76af9d 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -6,20 +6,69 @@ #' \code{xgb.DMatrix}. #' @param label the response variable. User should not set this field, #' if data is local data file or \code{xgb.DMatrix}. -#' @param params the list of parameters. Commonly used ones are: +#' @param params the list of parameters. +#' +#' General Parameters +#' #' \itemize{ -#' \item \code{objective} objective function, common ones are -#' \itemize{ -#' \item \code{reg:linear} linear regression -#' \item \code{binary:logistic} logistic regression for classification -#' } -#' \item \code{eta} step size of each boosting step -#' \item \code{max.depth} maximum depth of the tree -#' \item \code{nthread} number of thread used in training, if not set, all threads are used +#' \item \code{booster} which booster to use, can be gbtree or gblinear. (default=gbtree) +#' \item \code{silent} 0 (default) means printing running messages, 1 means silent mode. +#' \item \code{nthread} number of parallel threads used to run xgboost. Default to maximum number of threads available if not set. +#' \item \code{num_pbuffer} size of prediction buffer, normally set to number of training instances. The buffers are used to save the prediction results of last boosting step. Default: set automatically by xgboost, no need to be set by user +#' \item \code{num_feature} feature dimension used in boosting, set to maximum dimension of the feature. Default: set automatically by xgboost, no need to be set by user. 
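As context for the treeDump() aggregation that patches 05 and 06 rework, here is a standalone sketch of the Gain/Cover/Frequence computation on a toy node table. Column names follow xgb.model.dt.tree; every value below is invented.

library(data.table)
nodes <- data.table(
  Feature = c("odor", "odor", "cap-shape", "Leaf", "Leaf"),
  Quality = c(4000.5, 120.2, 800.0, 1.7, -1.9),
  Cover   = c(1628, 420, 768, 220, 180)
)
# Sum per feature (leaves excluded), then normalise each column so it sums
# to 1, mirroring the treeDump() helper; the final order() is only for display.
imp <- nodes[Feature != "Leaf",
             .(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N),
             by = Feature]
imp[, `:=`(Gain = Gain / sum(Gain),
           Cover = Cover / sum(Cover),
           Frequence = Frequence / sum(Frequence))][order(-Gain)]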
#' } -#' -#' See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for -#' further details. See also demo/ for walkthrough example in R. +#' +#' Booster Parameters +#' +#' 1. Parameter for Tree Booster +#' +#' \itemize{ +#' \item \code{eta} step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinkage the feature weights to make the boosting process more conservative. Default: 0.3 +#' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. +#' \item \code{max_depth} maximum depth of a tree. Default: 6 +#' \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 +#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1. +#' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1. +#' } +#' +#' 2. Parameter for Linear Booster +#' +#' \itemize{ +#' \item \code{lambda} L2 regularization term on weights. Default: 0 +#' \item \code{lambda_bias} L2 regularization term on bias. Default: 0 +#' \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0 +#' } +#' +#' Task Parameters +#' +#' +#' \itemize{ +#' \item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below: +#' \itemize{ +#' \item \code{reg:linear} linear regression (Default). +#' \item \code{reg:logistic} logistic regression. +#' \item \code{binary:logistic} logistic regression for binary classification. Output probability. +#' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. +#' \item \code{multi:softmax} set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes). +#' \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class. +#' \item \code{rank:pairwise} set XGBoost to do ranking task by minimizing the pairwise loss +#' } +#' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 +#' \item \code{eval_metric} evaluation metrics for validation data, a default metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). Default according to objective. The choices are listed below: +#' \itemize{ +#' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} +#' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} +#' \item \code{error} Binary classification error rate. It is calculated as (wrong cases)/(all cases). 
For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. +#' \item \code{merror} Multiclass classification error rate. It is calculated as (wrong cases)/(all cases). +#' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. +#' \item \code{ndcg} Normalized Discounted Cumulative Gain. \url{http://en.wikipedia.org/wiki/NDCG} +#' } +#' \item \code{map} Mean average precision. \url{http://en.wikipedia.org/wiki/Mean_average_precision#'Mean_average_precision} +#' \item \code{ndcg@n} and \code{map@n} n can be assigned as an integer to cut off the top positions in the lists for evaluation. +#' \item \code{ndcg-}, \code{map-}, \code{ndcg@n-}, \code{map@n-} In XGBoost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric XGBoost will evaluate these score as 0 to be consistent under some conditions. Training repeatively. +#' \item \code{seed} random number seed. Default: 0 +#' } +#' #' @param nrounds the max number of iterations #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print #' information of performance. If 2, xgboost will print information of both From 75f205b0b1f88db58168a3658e83cde617711f36 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sat, 7 Feb 2015 23:53:55 +0100 Subject: [PATCH 08/26] fix documentation --- R-package/R/xgboost.R | 29 ++++++------- R-package/man/xgb.model.dt.tree.Rd | 1 - R-package/man/xgboost.Rd | 69 +++++++++++++++++++++++++----- 3 files changed, 72 insertions(+), 27 deletions(-) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index b3c76af9d..4e1861ae4 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -8,30 +8,30 @@ #' if data is local data file or \code{xgb.DMatrix}. #' @param params the list of parameters. #' -#' General Parameters +#' 1. General Parameters #' #' \itemize{ -#' \item \code{booster} which booster to use, can be gbtree or gblinear. (default=gbtree) -#' \item \code{silent} 0 (default) means printing running messages, 1 means silent mode. +#' \item \code{booster} which booster to use, can be gbtree or gblinear. Default: gbtree +#' \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0 #' \item \code{nthread} number of parallel threads used to run xgboost. Default to maximum number of threads available if not set. #' \item \code{num_pbuffer} size of prediction buffer, normally set to number of training instances. The buffers are used to save the prediction results of last boosting step. Default: set automatically by xgboost, no need to be set by user #' \item \code{num_feature} feature dimension used in boosting, set to maximum dimension of the feature. Default: set automatically by xgboost, no need to be set by user. #' } #' -#' Booster Parameters +#' 2. Booster Parameters #' -#' 1. Parameter for Tree Booster +#' 2.1. Parameter for Tree Booster #' #' \itemize{ #' \item \code{eta} step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinkage the feature weights to make the boosting process more conservative. Default: 0.3 #' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. 
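To make the parameter lists above concrete, a minimal training call, assuming the bundled agaricus data used throughout the package examples (the tree-booster settings are spelled out at their documented defaults, plus an explicit objective):

library(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               params = list(booster = "gbtree", eta = 0.3, gamma = 0,
                             max_depth = 6, min_child_weight = 1,
                             subsample = 1, colsample_bytree = 1,
                             objective = "binary:logistic"),
               nrounds = 2)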
#' \item \code{max_depth} maximum depth of a tree. Default: 6 #' \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 -#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1. -#' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1. +#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1 +#' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 #' } #' -#' 2. Parameter for Linear Booster +#' 2.2. Parameter for Linear Booster #' #' \itemize{ #' \item \code{lambda} L2 regularization term on weights. Default: 0 @@ -39,8 +39,7 @@ #' \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0 #' } #' -#' Task Parameters -#' +#' 3. Task Parameters #' #' \itemize{ #' \item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below: @@ -51,21 +50,21 @@ #' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. #' \item \code{multi:softmax} set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes). #' \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class. -#' \item \code{rank:pairwise} set XGBoost to do ranking task by minimizing the pairwise loss +#' \item \code{rank:pairwise} set XGBoost to do ranking task by minimizing the pairwise loss. #' } #' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 #' \item \code{eval_metric} evaluation metrics for validation data, a default metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). Default according to objective. The choices are listed below: #' \itemize{ #' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} #' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} -#' \item \code{error} Binary classification error rate. It is calculated as (wrong cases)/(all cases). For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. -#' \item \code{merror} Multiclass classification error rate. It is calculated as (wrong cases)/(all cases). +#' \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. 
For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. +#' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. #' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. #' \item \code{ndcg} Normalized Discounted Cumulative Gain. \url{http://en.wikipedia.org/wiki/NDCG} #' } #' \item \code{map} Mean average precision. \url{http://en.wikipedia.org/wiki/Mean_average_precision#'Mean_average_precision} -#' \item \code{ndcg@n} and \code{map@n} n can be assigned as an integer to cut off the top positions in the lists for evaluation. -#' \item \code{ndcg-}, \code{map-}, \code{ndcg@n-}, \code{map@n-} In XGBoost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric XGBoost will evaluate these score as 0 to be consistent under some conditions. Training repeatively. +#' \item \code{ndcg@@n} and \code{map@@n} n can be assigned as an integer to cut off the top positions in the lists for evaluation. +#' \item \code{ndcg-}, \code{map-}, \code{ndcg@@n-}, \code{map@@n-} In XGBoost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric XGBoost will evaluate these score as 0 to be consistent under some conditions. Training repeatively. #' \item \code{seed} random number seed. Default: 0 #' } #' diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 8b9eb6a13..604607209 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -40,7 +40,6 @@ The content of the \code{data.table} is organised that way: \item \code{Cover}: metric to measure the number of observation affected by the split ; \item \code{Tree}: ID of the tree. It is included in the main ID ; \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; - \item \code{Included}: \code{boolean} value which indicates if this feature has been pointed by a Yes branch (\code{True}) or a No branch (\code{False}). By convention stem feature is always included ; } } \examples{ diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 39364c64e..57ca5e469 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -17,20 +17,67 @@ if data is local data file or \code{xgb.DMatrix}.} \item{missing}{Missing is only used when input is dense matrix, pick a float value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} -\item{params}{the list of parameters. Commonly used ones are: +\item{params}{the list of parameters. + +1. General Parameters + \itemize{ - \item \code{objective} objective function, common ones are - \itemize{ - \item \code{reg:linear} linear regression - \item \code{binary:logistic} logistic regression for classification - } - \item \code{eta} step size of each boosting step - \item \code{max.depth} maximum depth of the tree - \item \code{nthread} number of thread used in training, if not set, all threads are used + \item \code{booster} which booster to use, can be gbtree or gblinear. Default: gbtree + \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0 + \item \code{nthread} number of parallel threads used to run xgboost. 
Default to maximum number of threads available if not set. + \item \code{num_pbuffer} size of prediction buffer, normally set to number of training instances. The buffers are used to save the prediction results of last boosting step. Default: set automatically by xgboost, no need to be set by user + \item \code{num_feature} feature dimension used in boosting, set to maximum dimension of the feature. Default: set automatically by xgboost, no need to be set by user. } - See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for - further details. See also demo/ for walkthrough example in R.} +2. Booster Parameters + +2.1. Parameter for Tree Booster + +\itemize{ + \item \code{eta} step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinkage the feature weights to make the boosting process more conservative. Default: 0.3 + \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. + \item \code{max_depth} maximum depth of a tree. Default: 6 + \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 + \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1 + \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 +} + +2.2. Parameter for Linear Booster + +\itemize{ + \item \code{lambda} L2 regularization term on weights. Default: 0 + \item \code{lambda_bias} L2 regularization term on bias. Default: 0 + \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0 +} + +3. Task Parameters + +\itemize{ +\item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below: + \itemize{ + \item \code{reg:linear} linear regression (Default). + \item \code{reg:logistic} logistic regression. + \item \code{binary:logistic} logistic regression for binary classification. Output probability. + \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. + \item \code{multi:softmax} set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes). + \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class. + \item \code{rank:pairwise} set XGBoost to do ranking task by minimizing the pairwise loss. + } + \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 + \item \code{eval_metric} evaluation metrics for validation data, a default metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). Default according to objective. 
The choices are listed below: + \itemize{ + \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} + \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} + \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. + \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. + \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. + \item \code{ndcg} Normalized Discounted Cumulative Gain. \url{http://en.wikipedia.org/wiki/NDCG} + } + \item \code{map} Mean average precision. \url{http://en.wikipedia.org/wiki/Mean_average_precision#'Mean_average_precision} + \item \code{ndcg@n} and \code{map@n} n can be assigned as an integer to cut off the top positions in the lists for evaluation. + \item \code{ndcg-}, \code{map-}, \code{ndcg@n-}, \code{map@n-} In XGBoost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric XGBoost will evaluate these score as 0 to be consistent under some conditions. Training repeatively. + \item \code{seed} random number seed. Default: 0 +}} \item{nrounds}{the max number of iterations} From 12b0e8e6d583b1e2d19b9e903a146a1cc19e09d4 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sat, 7 Feb 2015 23:57:48 +0100 Subject: [PATCH 09/26] small doc fix --- R-package/man/xgboost.Rd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 57ca5e469..6a5e72264 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -22,7 +22,7 @@ value that represents missing value. Sometime a data use 0 or other extreme valu 1. General Parameters \itemize{ - \item \code{booster} which booster to use, can be gbtree or gblinear. Default: gbtree + \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree} \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0 \item \code{nthread} number of parallel threads used to run xgboost. Default to maximum number of threads available if not set. \item \code{num_pbuffer} size of prediction buffer, normally set to number of training instances. The buffers are used to save the prediction results of last boosting step. Default: set automatically by xgboost, no need to be set by user @@ -38,7 +38,7 @@ value that represents missing value. Sometime a data use 0 or other extreme valu \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. \item \code{max_depth} maximum depth of a tree. Default: 6 \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. 
Default: 1 - \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1 + \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1 \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 } @@ -59,9 +59,9 @@ value that represents missing value. Sometime a data use 0 or other extreme valu \item \code{reg:logistic} logistic regression. \item \code{binary:logistic} logistic regression for binary classification. Output probability. \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. - \item \code{multi:softmax} set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes). + \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes). \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class. - \item \code{rank:pairwise} set XGBoost to do ranking task by minimizing the pairwise loss. + \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. } \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 \item \code{eval_metric} evaluation metrics for validation data, a default metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). Default according to objective. The choices are listed below: @@ -75,7 +75,7 @@ value that represents missing value. Sometime a data use 0 or other extreme valu } \item \code{map} Mean average precision. \url{http://en.wikipedia.org/wiki/Mean_average_precision#'Mean_average_precision} \item \code{ndcg@n} and \code{map@n} n can be assigned as an integer to cut off the top positions in the lists for evaluation. - \item \code{ndcg-}, \code{map-}, \code{ndcg@n-}, \code{map@n-} In XGBoost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric XGBoost will evaluate these score as 0 to be consistent under some conditions. Training repeatively. + \item \code{ndcg-}, \code{map-}, \code{ndcg@n-}, \code{map@n-} In xgboost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric xgboost will evaluate these score as 0 to be consistent under some conditions. Training repeatively. \item \code{seed} random number seed. Default: 0 }} From 9d89441e38a23f02429fe388229e737a5661dde4 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sat, 7 Feb 2015 23:58:09 +0100 Subject: [PATCH 10/26] small doc fix --- R-package/R/xgboost.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 4e1861ae4..d60c58d5f 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -11,7 +11,7 @@ #' 1. General Parameters #' #' \itemize{ -#' \item \code{booster} which booster to use, can be gbtree or gblinear. 
Default: gbtree +#' \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree} #' \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0 #' \item \code{nthread} number of parallel threads used to run xgboost. Default to maximum number of threads available if not set. #' \item \code{num_pbuffer} size of prediction buffer, normally set to number of training instances. The buffers are used to save the prediction results of last boosting step. Default: set automatically by xgboost, no need to be set by user @@ -27,7 +27,7 @@ #' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. #' \item \code{max_depth} maximum depth of a tree. Default: 6 #' \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 -#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1 +#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1 #' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 #' } #' @@ -48,9 +48,9 @@ #' \item \code{reg:logistic} logistic regression. #' \item \code{binary:logistic} logistic regression for binary classification. Output probability. #' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. -#' \item \code{multi:softmax} set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes). +#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes). #' \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class. -#' \item \code{rank:pairwise} set XGBoost to do ranking task by minimizing the pairwise loss. +#' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. #' } #' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 #' \item \code{eval_metric} evaluation metrics for validation data, a default metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). Default according to objective. The choices are listed below: @@ -64,7 +64,7 @@ #' } #' \item \code{map} Mean average precision. \url{http://en.wikipedia.org/wiki/Mean_average_precision#'Mean_average_precision} #' \item \code{ndcg@@n} and \code{map@@n} n can be assigned as an integer to cut off the top positions in the lists for evaluation. 
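A side note on the doubled at-signs in the R source above: roxygen2 reads a single @ as the start of a tag, so a literal @ inside documentation text must be escaped as @@, and the generated .Rd shows a single @ again. That is why the man page says ndcg@n while xgboost.R says ndcg@@n. A minimal illustration (hypothetical function, shown only for the escaping):

#' Truncated ranking metrics are written \code{ndcg@@n}, e.g. \code{ndcg@@5}.
#' @param x ignored.
dummy <- function(x) NULL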
-#' \item \code{ndcg-}, \code{map-}, \code{ndcg@@n-}, \code{map@@n-} In XGBoost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric XGBoost will evaluate these score as 0 to be consistent under some conditions. Training repeatively. +#' \item \code{ndcg-}, \code{map-}, \code{ndcg@@n-}, \code{map@@n-} In xgboost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric xgboost will evaluate these score as 0 to be consistent under some conditions. Training repeatively. #' \item \code{seed} random number seed. Default: 0 #' } #' From 29b5312428dbdb3a4deddec2852d84115de9f4c7 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sun, 8 Feb 2015 00:02:53 +0100 Subject: [PATCH 11/26] remove not required dependency --- R-package/R/xgb.model.dt.tree.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 9c570c44b..42b3657b0 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -14,7 +14,6 @@ #' @importFrom stringr str_split #' @importFrom stringr str_extract #' @importFrom stringr str_trim -#' @importFrom stringr str_detect #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. From 76e24fdd36648da21c46176c214df5481e40c52b Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sun, 8 Feb 2015 22:46:29 +0100 Subject: [PATCH 12/26] documentation simplification --- R-package/NAMESPACE | 1 - R-package/R/xgboost.R | 43 +++++++++++++++++++--------------------- R-package/man/xgboost.Rd | 33 ++++++++++++++---------------- 3 files changed, 35 insertions(+), 42 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 7d9c64563..fab1546a2 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -41,7 +41,6 @@ importFrom(ggplot2,ylab) importFrom(magrittr,"%>%") importFrom(magrittr,add) importFrom(magrittr,not) -importFrom(stringr,str_detect) importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_match) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index d60c58d5f..3d5ad1d89 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -13,9 +13,6 @@ #' \itemize{ #' \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree} #' \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0 -#' \item \code{nthread} number of parallel threads used to run xgboost. Default to maximum number of threads available if not set. -#' \item \code{num_pbuffer} size of prediction buffer, normally set to number of training instances. The buffers are used to save the prediction results of last boosting step. Default: set automatically by xgboost, no need to be set by user -#' \item \code{num_feature} feature dimension used in boosting, set to maximum dimension of the feature. Default: set automatically by xgboost, no need to be set by user. #' } #' #' 2. 
Booster Parameters @@ -53,7 +50,24 @@ #' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. #' } #' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 -#' \item \code{eval_metric} evaluation metrics for validation data, a default metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). Default according to objective. The choices are listed below: +#' \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. +#' } +#' +#' @param nrounds the max number of iterations +#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print +#' information of performance. If 2, xgboost will print information of both +#' performance and construction progress information +#' @param missing Missing is only used when input is dense matrix, pick a float +#' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values. +#' @param ... other parameters to pass to \code{params}. +#' +#' @details +#' This is the modeling function for xgboost. +#' +#' Parallelization is automatically enabled if OpenMP is present. +#' Number of threads can also be manually specified via "nthread" parameter. +#' +#' \code{eval_metric} is set automatically by xgboost but can be overriden by parameter. Below is provided the list of different metric optimized by xgboost to help you to understand how it works inside. It should not be overriden until you have a real reason to do so. #' \itemize{ #' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} #' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} @@ -62,25 +76,8 @@ #' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. #' \item \code{ndcg} Normalized Discounted Cumulative Gain. \url{http://en.wikipedia.org/wiki/NDCG} #' } -#' \item \code{map} Mean average precision. \url{http://en.wikipedia.org/wiki/Mean_average_precision#'Mean_average_precision} -#' \item \code{ndcg@@n} and \code{map@@n} n can be assigned as an integer to cut off the top positions in the lists for evaluation. -#' \item \code{ndcg-}, \code{map-}, \code{ndcg@@n-}, \code{map@@n-} In xgboost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric xgboost will evaluate these score as 0 to be consistent under some conditions. Training repeatively. -#' \item \code{seed} random number seed. Default: 0 -#' } -#' -#' @param nrounds the max number of iterations -#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print -#' information of performance. If 2, xgboost will print information of both -#' performance and construction progress information -#' @param missing Missing is only used when input is dense matrix, pick a float -#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. -#' @param ... other parameters to pass to \code{params}. -#' -#' @details -#' This is the modeling function for xgboost. -#' -#' Parallelization is automatically enabled if OpenMP is present. 
-#' Number of threads can also be manually specified via "nthread" parameter +#' +#' More parameters are available in the Wiki \url{https://github.com/tqchen/xgboost/wiki/Parameters} #' #' @examples #' data(agaricus.train, package='xgboost') diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 6a5e72264..8c8a384f4 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -15,7 +15,7 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), if data is local data file or \code{xgb.DMatrix}.} \item{missing}{Missing is only used when input is dense matrix, pick a float -value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} +value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.} \item{params}{the list of parameters. @@ -24,9 +24,6 @@ value that represents missing value. Sometime a data use 0 or other extreme valu \itemize{ \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree} \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0 - \item \code{nthread} number of parallel threads used to run xgboost. Default to maximum number of threads available if not set. - \item \code{num_pbuffer} size of prediction buffer, normally set to number of training instances. The buffers are used to save the prediction results of last boosting step. Default: set automatically by xgboost, no need to be set by user - \item \code{num_feature} feature dimension used in boosting, set to maximum dimension of the feature. Default: set automatically by xgboost, no need to be set by user. } 2. Booster Parameters @@ -64,19 +61,7 @@ value that represents missing value. Sometime a data use 0 or other extreme valu \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. } \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 - \item \code{eval_metric} evaluation metrics for validation data, a default metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). Default according to objective. The choices are listed below: - \itemize{ - \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} - \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} - \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. - \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. - \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. - \item \code{ndcg} Normalized Discounted Cumulative Gain. \url{http://en.wikipedia.org/wiki/NDCG} - } - \item \code{map} Mean average precision. \url{http://en.wikipedia.org/wiki/Mean_average_precision#'Mean_average_precision} - \item \code{ndcg@n} and \code{map@n} n can be assigned as an integer to cut off the top positions in the lists for evaluation. 
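For the record, the first two metrics in the list above reduce to one-liners; these are plain restatements of the linked definitions, not package code (y is a vector of 0/1 labels, p the predicted probabilities):

rmse    <- function(y, p) sqrt(mean((y - p)^2))
logloss <- function(y, p) -mean(y * log(p) + (1 - y) * log(1 - p))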
- \item \code{ndcg-}, \code{map-}, \code{ndcg@n-}, \code{map@n-} In xgboost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric xgboost will evaluate these score as 0 to be consistent under some conditions. Training repeatively. - \item \code{seed} random number seed. Default: 0 + \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. }} \item{nrounds}{the max number of iterations} @@ -94,7 +79,19 @@ A simple interface for xgboost in R This is the modeling function for xgboost. Parallelization is automatically enabled if OpenMP is present. -Number of threads can also be manually specified via "nthread" parameter +Number of threads can also be manually specified via "nthread" parameter. + +\code{eval_metric} is set automatically by xgboost but can be overriden by parameter. Below is provided the list of different metric optimized by xgboost to help you to understand how it works inside. It should not be overriden until you have a real reason to do so. + \itemize{ + \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} + \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} + \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. + \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. + \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. + \item \code{ndcg} Normalized Discounted Cumulative Gain. \url{http://en.wikipedia.org/wiki/NDCG} + } + +More parameters are available in the Wiki \url{https://github.com/tqchen/xgboost/wiki/Parameters} } \examples{ data(agaricus.train, package='xgboost') From a45497e6f337c4d99726193132c19a943a77def9 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sun, 8 Feb 2015 22:46:59 +0100 Subject: [PATCH 13/26] add web address --- R-package/R/xgboost.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 3d5ad1d89..3b01868f9 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -77,7 +77,7 @@ #' \item \code{ndcg} Normalized Discounted Cumulative Gain. \url{http://en.wikipedia.org/wiki/NDCG} #' } #' -#' More parameters are available in the Wiki \url{https://github.com/tqchen/xgboost/wiki/Parameters} +#' More parameters are available in the Wiki \url{https://github.com/tqchen/xgboost/wiki/Parameters}. 
#' #' @examples #' data(agaricus.train, package='xgboost') From 092288325090e9c97987a43ae6f390737aa37444 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Mon, 9 Feb 2015 17:20:21 +0100 Subject: [PATCH 14/26] Optimization in dump function (replaced some regular R function by data.table) --- R-package/R/xgb.dump.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index a0938ded1..7cb1c524a 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -46,12 +46,15 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { stop("fmap: argument must be type character (when provided)") } - result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost") + dt <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost") %>% fread + + setnames(dt, "Content") if(is.null(fname)) { - return(str_split(result, "\n") %>% unlist %>% str_replace("^\t+","") %>% Filter(function(x) x != "", .)) + result <- dt[Content != "0"][,Content := str_replace(Content, "^\t+", "")][Content != ""][,paste(Content)] + return(result) } else { - result %>% str_split("\n") %>% unlist %>% Filter(function(x) x != "", .) %>% writeLines(fname) + result <- dt[Content != "0"][Content != ""][,paste(Content)] %>% writeLines(fname) return(TRUE) } } From 3971323203fc3e0c841b91d303a7bfdbb2a5d4fe Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Mon, 9 Feb 2015 18:01:14 +0100 Subject: [PATCH 15/26] fix bug --- R-package/NAMESPACE | 1 + R-package/R/xgb.dump.R | 8 ++++++-- R-package/man/xgboost.Rd | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index fab1546a2..cce209811 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -25,6 +25,7 @@ importFrom(data.table,":=") importFrom(data.table,as.data.table) importFrom(data.table,copy) importFrom(data.table,data.table) +importFrom(data.table,fread) importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(data.table,setnames) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 7cb1c524a..cc22bb3a7 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -3,8 +3,10 @@ #' Save a xgboost model to text file. Could be parsed later. #' #' @importFrom magrittr %>% -#' @importFrom stringr str_split #' @importFrom stringr str_replace +#' @importFrom data.table fread +#' @importFrom data.table := +#' @importFrom data.table setnames #' @param model the model object. #' @param fname the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector. #' @param fmap feature map file representing the type of feature. @@ -46,7 +48,9 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { stop("fmap: argument must be type character (when provided)") } - dt <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost") %>% fread + longString <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost") + + dt <- fread(paste(longString, collapse = ""), sep = "\n", header = F) setnames(dt, "Content") diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 8c8a384f4..59480e36c 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -91,7 +91,7 @@ Number of threads can also be manually specified via "nthread" parameter. 
\item \code{ndcg} Normalized Discounted Cumulative Gain. \url{http://en.wikipedia.org/wiki/NDCG} } -More parameters are available in the Wiki \url{https://github.com/tqchen/xgboost/wiki/Parameters} +More parameters are available in the Wiki \url{https://github.com/tqchen/xgboost/wiki/Parameters}. } \examples{ data(agaricus.train, package='xgboost') From f4b454d6ddbb7b8fa91266bd126140c044dab42f Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Mon, 9 Feb 2015 21:34:53 +0100 Subject: [PATCH 16/26] fix some warning in Cran check --- R-package/R/xgb.dump.R | 8 ++++---- R-package/man/xgb.save.raw.Rd | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 R-package/man/xgb.save.raw.Rd diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index cc22bb3a7..edeb03b5f 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -52,13 +52,13 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { dt <- fread(paste(longString, collapse = ""), sep = "\n", header = F) - setnames(dt, "Content") + setnames(dt, "Lines") if(is.null(fname)) { - result <- dt[Content != "0"][,Content := str_replace(Content, "^\t+", "")][Content != ""][,paste(Content)] + result <- dt[Lines != "0"][, Lines := str_replace(Lines, "^\t+", "")][Lines != ""][, paste(Lines)] return(result) } else { - result <- dt[Content != "0"][Content != ""][,paste(Content)] %>% writeLines(fname) + result <- dt[Lines != "0"][Lines != ""][, paste(Lines)] %>% writeLines(fname) return(TRUE) } } @@ -66,4 +66,4 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... 
-globalVariables(".") \ No newline at end of file +globalVariables(c("Lines", ".")) \ No newline at end of file diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd new file mode 100644 index 000000000..15ec30636 --- /dev/null +++ b/R-package/man/xgb.save.raw.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/xgb.save.raw.R +\name{xgb.save.raw} +\alias{xgb.save.raw} +\title{Save xgboost model to R's raw vector, +user can call xgb.load to load the model back from raw vector} +\usage{ +xgb.save.raw(model) +} +\arguments{ +\item{model}{the model object.} +} +\description{ +Save xgboost model from xgboost or xgb.train +} +\examples{ +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') +train <- agaricus.train +test <- agaricus.test +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, + eta = 1, nround = 2,objective = "binary:logistic") +raw <- xgb.save(bst) +bst <- xgb.load(raw) +pred <- predict(bst, test$data) +} + From eecfd015fac1f14a3aa4277448de1534df7403af Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Mon, 9 Feb 2015 21:37:31 +0100 Subject: [PATCH 17/26] Update CK.means version --- R-package/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 0e62ac423..de91c5edc 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -29,4 +29,4 @@ Imports: stringr (>= 0.6.2), DiagrammeR (>= 0.4), ggplot2 (>= 1.0.0), - Ckmeans.1d.dp (>= 3.02) + Ckmeans.1d.dp (>= 3.3.0) From 8c16491b4239476d6a4ad843d3c2f37eb942ada2 Mon Sep 17 00:00:00 2001 From: Tong He Date: Mon, 9 Feb 2015 13:31:21 -0800 Subject: [PATCH 18/26] Update xgb.save.raw.R --- R-package/R/xgb.save.raw.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index 11fc44470..d8ed6f526 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -12,7 +12,7 @@ #' test <- agaricus.test #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, #' eta = 1, nround = 2,objective = "binary:logistic") -#' raw <- xgb.save(bst) +#' raw <- xgb.save.raw(bst) #' bst <- xgb.load(raw) #' pred <- predict(bst, test$data) #' @export From ea5860d574126400933d0dd06f754932da99dd72 Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 9 Feb 2015 13:43:32 -0800 Subject: [PATCH 19/26] fix save.raw doc --- R-package/man/xgb.save.raw.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd index 15ec30636..f169a3d3d 100644 --- a/R-package/man/xgb.save.raw.Rd +++ b/R-package/man/xgb.save.raw.Rd @@ -20,7 +20,7 @@ train <- agaricus.train test <- agaricus.test bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,objective = "binary:logistic") -raw <- xgb.save(bst) +raw <- xgb.save.raw(bst) bst <- xgb.load(raw) pred <- predict(bst, test$data) } From 5b611c355e61546b7c70d4f6a86b387e50a2b17e Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 9 Feb 2015 15:51:24 -0800 Subject: [PATCH 20/26] add handle and raw structure to xgb.Booster --- R-package/R/predict.xgb.Booster.R | 17 +++++++++++++++-- R-package/R/utils.R | 8 ++++---- R-package/R/xgb.dump.R | 9 ++++++++- R-package/R/xgb.load.R | 6 +++++- R-package/R/xgb.save.R | 5 ++++- R-package/R/xgb.save.raw.R | 8 ++++---- R-package/R/xgb.train.R | 9 ++++++--- 7 files changed, 46 insertions(+), 16 deletions(-) 
diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R index 1e458e708..033bfab84 100644 --- a/R-package/R/predict.xgb.Booster.R +++ b/R-package/R/predict.xgb.Booster.R @@ -1,4 +1,7 @@ -setClass("xgb.Booster") +setClass("xgb.Booster.handle") +setClass("xgb.Booster", + slots = c(handle = "xgb.Booster.handle", + raw = "raw")) #' Predict method for eXtreme Gradient Boosting model #' @@ -30,6 +33,16 @@ setClass("xgb.Booster") setMethod("predict", signature = "xgb.Booster", definition = function(object, newdata, missing = NULL, outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) { + if (class(object) != "xgb.Booster"){ + stop("predict: model in prediction must be of class xgb.Booster") + } else { + if (is.null(object$handle)) { + object$handle <- xgb.load(object$raw) + } else { + if (is.null(object$raw)) + object$raw <- xgb.save.raw(object$handle) + } + } if (class(newdata) != "xgb.DMatrix") { if (is.null(missing)) { newdata <- xgb.DMatrix(newdata) @@ -51,7 +64,7 @@ setMethod("predict", signature = "xgb.Booster", if (predleaf) { option <- option + 2 } - ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(option), + ret <- .Call("XGBoosterPredict_R", object$handle, newdata, as.integer(option), as.integer(ntreelimit), PACKAGE = "xgboost") if (predleaf){ len <- getinfo(newdata, "nrow") diff --git a/R-package/R/utils.R b/R-package/R/utils.R index b0c7f15ac..fb3f59957 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -65,7 +65,7 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { stop("xgb.Booster: modelfile must be character or raw vector") } } - return(structure(handle, class = "xgb.Booster")) + return(structure(handle, class = "xgb.Booster.handle")) } ## ----the following are low level iteratively function, not needed if @@ -102,7 +102,7 @@ xgb.numrow <- function(dmat) { } # iteratively update booster with customized statistics xgb.iter.boost <- function(booster, dtrain, gpair) { - if (class(booster) != "xgb.Booster") { + if (class(booster) != "xgb.Booster.handle") { stop("xgb.iter.update: first argument must be type xgb.Booster") } if (class(dtrain) != "xgb.DMatrix") { @@ -115,7 +115,7 @@ xgb.iter.boost <- function(booster, dtrain, gpair) { # iteratively update booster with dtrain xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) { - if (class(booster) != "xgb.Booster") { + if (class(booster) != "xgb.Booster.handle") { stop("xgb.iter.update: first argument must be type xgb.Booster") } if (class(dtrain) != "xgb.DMatrix") { @@ -135,7 +135,7 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) { # iteratively evaluate one iteration xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = FALSE) { - if (class(booster) != "xgb.Booster") { + if (class(booster) != "xgb.Booster.handle") { stop("xgb.eval: first argument must be type xgb.Booster") } if (typeof(watchlist) != "list") { diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index edeb03b5f..1f73eed2e 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -40,6 +40,13 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { if (class(model) != "xgb.Booster") { stop("model: argument must be type xgb.Booster") + } else { + if (is.null(model$handle)) { + model$handle <- xgb.load(model$raw) + } else { + if (is.null(model$raw)) + model$raw <- xgb.save.raw(model$handle) + } } if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) { stop("fname: argument 
must be type character (when provided)") @@ -48,7 +55,7 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { stop("fmap: argument must be type character (when provided)") } - longString <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost") + longString <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with.stats), PACKAGE = "xgboost") dt <- fread(paste(longString, collapse = ""), sep = "\n", header = F) diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index af87e2b3c..87247b4a9 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -19,5 +19,9 @@ xgb.load <- function(modelfile) { if (is.null(modelfile)) stop("xgb.load: modelfile cannot be NULL") - xgb.Booster(modelfile = modelfile) + bst <- list(handle = NULL,raw = NULL) + class(bst) <- 'xgb.Booster' + bst$handle <- xgb.Booster(modelfile = modelfile) + bst$raw <- xgb.save.raw(bst$handle) + return(bst) } diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index 2a250a9af..0fecddfb5 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -22,7 +22,10 @@ xgb.save <- function(model, fname) { stop("xgb.save: fname must be character") } if (class(model) == "xgb.Booster") { - .Call("XGBoosterSaveModel_R", model, fname, PACKAGE = "xgboost") + if (is.null(model$handle)) { + model$handle <- xgb.load(model$raw) + } + .Call("XGBoosterSaveModel_R", model$handle, fname, PACKAGE = "xgboost") return(TRUE) } stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index d8ed6f526..91f7075bd 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -17,11 +17,11 @@ #' pred <- predict(bst, test$data) #' @export #' -xgb.save.raw <- function(model) { - if (class(model) == "xgb.Booster") { - raw <- .Call("XGBoosterModelToRaw_R", model, PACKAGE = "xgboost") +xgb.save.raw <- function(handle) { + if (class(handle) == "xgb.Booster.handle") { + raw <- .Call("XGBoosterModelToRaw_R", handle, PACKAGE = "xgboost") return(raw) } - stop("xgb.raw: the input must be xgb.Booster. Use xgb.DMatrix.save to save + stop("xgb.raw: the input must be xgb.Booster.handle. 
Use xgb.DMatrix.save to save xgb.DMatrix object.") } diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 06c39d76c..c6d29e6e3 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -86,13 +86,16 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), } params = append(params, list(...)) - bst <- xgb.Booster(params, append(watchlist, dtrain)) + bst <- list(handle = NULL,raw = NULL) + class(bst) <- 'xgb.Booster' + bst$handle <- xgb.Booster(params, append(watchlist, dtrain)) for (i in 1:nrounds) { - succ <- xgb.iter.update(bst, dtrain, i - 1, obj) + succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj) if (length(watchlist) != 0) { - msg <- xgb.iter.eval(bst, watchlist, i - 1, feval) + msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval) cat(paste(msg, "\n", sep="")) } } + bst$raw <- xgb.save.raw(bst$handle) return(bst) } From f7c838ffaa138aed9c87ca941e8582eef4a20b30 Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 9 Feb 2015 16:16:11 -0800 Subject: [PATCH 21/26] fix bugs --- R-package/R/utils.R | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index fb3f59957..bcbde36d1 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -102,28 +102,42 @@ xgb.numrow <- function(dmat) { } # iteratively update booster with customized statistics xgb.iter.boost <- function(booster, dtrain, gpair) { - if (class(booster) != "xgb.Booster.handle") { + if (class(booster) != "xgb.Booster") { stop("xgb.iter.update: first argument must be type xgb.Booster") + } else { + if (is.null(booster$handle)) { + booster$handle <- xgb.load(booster$raw) + } else { + if (is.null(booster$raw)) + booster$raw <- xgb.save.raw(booster$handle) + } } if (class(dtrain) != "xgb.DMatrix") { stop("xgb.iter.update: second argument must be type xgb.DMatrix") } - .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, + .Call("XGBoosterBoostOneIter_R", booster$handle, dtrain, gpair$grad, gpair$hess, PACKAGE = "xgboost") return(TRUE) } # iteratively update booster with dtrain xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) { - if (class(booster) != "xgb.Booster.handle") { + if (class(booster) != "xgb.Booster") { stop("xgb.iter.update: first argument must be type xgb.Booster") + } else { + if (is.null(booster$handle)) { + booster$handle <- xgb.load(booster$raw) + } else { + if (is.null(booster$raw)) + booster$raw <- xgb.save.raw(booster$handle) + } } if (class(dtrain) != "xgb.DMatrix") { stop("xgb.iter.update: second argument must be type xgb.DMatrix") } if (is.null(obj)) { - .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, + .Call("XGBoosterUpdateOneIter_R", booster$handle, as.integer(iter), dtrain, PACKAGE = "xgboost") } else { pred <- predict(booster, dtrain) From 0aef62dabceba9b2b6442d86dd767dfc0b798442 Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 9 Feb 2015 16:25:00 -0800 Subject: [PATCH 22/26] fix with new predict --- R-package/R/predict.xgb.Booster.R | 2 +- R-package/R/predict.xgb.Booster.handle.R | 16 +++++++++++++++ R-package/R/utils.R | 26 ++++++------------------ R-package/R/xgb.load.R | 1 + 4 files changed, 24 insertions(+), 21 deletions(-) create mode 100644 R-package/R/predict.xgb.Booster.handle.R diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R index 033bfab84..b1c3c10ca 100644 --- a/R-package/R/predict.xgb.Booster.R +++ b/R-package/R/predict.xgb.Booster.R @@ -77,4 +77,4 @@ 
setMethod("predict", signature = "xgb.Booster", } return(ret) }) - + diff --git a/R-package/R/predict.xgb.Booster.handle.R b/R-package/R/predict.xgb.Booster.handle.R new file mode 100644 index 000000000..05cbf891e --- /dev/null +++ b/R-package/R/predict.xgb.Booster.handle.R @@ -0,0 +1,16 @@ +setClass("xgb.Booster.handle") + +setMethod("predict", signature = "xgb.Booster.handle", + definition = function(object, ...) { + if (class(object) != "xgb.Booster.handle"){ + stop("predict: model in prediction must be of class xgb.Booster.handle") + } + + bst <- list(handle = object,raw = NULL) + class(bst) <- 'xgb.Booster' + bst$raw <- xgb.save.raw(bst$handle) + + ret = predict(bst, ...) + return(ret) +}) + diff --git a/R-package/R/utils.R b/R-package/R/utils.R index bcbde36d1..5093382d4 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -102,42 +102,28 @@ xgb.numrow <- function(dmat) { } # iteratively update booster with customized statistics xgb.iter.boost <- function(booster, dtrain, gpair) { - if (class(booster) != "xgb.Booster") { - stop("xgb.iter.update: first argument must be type xgb.Booster") - } else { - if (is.null(booster$handle)) { - booster$handle <- xgb.load(booster$raw) - } else { - if (is.null(booster$raw)) - booster$raw <- xgb.save.raw(booster$handle) - } + if (class(booster) != "xgb.Booster.handle") { + stop("xgb.iter.update: first argument must be type xgb.Booster.handle") } if (class(dtrain) != "xgb.DMatrix") { stop("xgb.iter.update: second argument must be type xgb.DMatrix") } - .Call("XGBoosterBoostOneIter_R", booster$handle, dtrain, gpair$grad, gpair$hess, + .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE = "xgboost") return(TRUE) } # iteratively update booster with dtrain xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) { - if (class(booster) != "xgb.Booster") { - stop("xgb.iter.update: first argument must be type xgb.Booster") - } else { - if (is.null(booster$handle)) { - booster$handle <- xgb.load(booster$raw) - } else { - if (is.null(booster$raw)) - booster$raw <- xgb.save.raw(booster$handle) - } + if (class(booster) != "xgb.Booster.handle") { + stop("xgb.iter.update: first argument must be type xgb.Booster.handle") } if (class(dtrain) != "xgb.DMatrix") { stop("xgb.iter.update: second argument must be type xgb.DMatrix") } if (is.null(obj)) { - .Call("XGBoosterUpdateOneIter_R", booster$handle, as.integer(iter), dtrain, + .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, PACKAGE = "xgboost") } else { pred <- predict(booster, dtrain) diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index 87247b4a9..264176952 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -19,6 +19,7 @@ xgb.load <- function(modelfile) { if (is.null(modelfile)) stop("xgb.load: modelfile cannot be NULL") + bst <- list(handle = NULL,raw = NULL) class(bst) <- 'xgb.Booster' bst$handle <- xgb.Booster(modelfile = modelfile) From 4c25600d2a8beb09e3a9b5f9f5e54c1e3df941c4 Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 9 Feb 2015 17:28:48 -0800 Subject: [PATCH 23/26] fix segfault and add two function for handle and booster --- R-package/R/predict.xgb.Booster.R | 7 +------ R-package/R/predict.xgb.Booster.handle.R | 6 +++--- R-package/R/utils.R | 20 ++++++++++++++++++++ R-package/R/xgb.dump.R | 7 +------ R-package/R/xgb.load.R | 7 +++---- R-package/R/xgb.save.R | 4 +--- R-package/R/xgb.train.R | 7 +++---- 7 files changed, 32 insertions(+), 26 deletions(-) diff --git a/R-package/R/predict.xgb.Booster.R 
b/R-package/R/predict.xgb.Booster.R index b1c3c10ca..52c40df9b 100644 --- a/R-package/R/predict.xgb.Booster.R +++ b/R-package/R/predict.xgb.Booster.R @@ -36,12 +36,7 @@ setMethod("predict", signature = "xgb.Booster", if (class(object) != "xgb.Booster"){ stop("predict: model in prediction must be of class xgb.Booster") } else { - if (is.null(object$handle)) { - object$handle <- xgb.load(object$raw) - } else { - if (is.null(object$raw)) - object$raw <- xgb.save.raw(object$handle) - } + object <- xgb.Booster.check(object, saveraw = FALSE) } if (class(newdata) != "xgb.DMatrix") { if (is.null(missing)) { diff --git a/R-package/R/predict.xgb.Booster.handle.R b/R-package/R/predict.xgb.Booster.handle.R index 05cbf891e..a38aeb64e 100644 --- a/R-package/R/predict.xgb.Booster.handle.R +++ b/R-package/R/predict.xgb.Booster.handle.R @@ -6,9 +6,9 @@ setMethod("predict", signature = "xgb.Booster.handle", stop("predict: model in prediction must be of class xgb.Booster.handle") } - bst <- list(handle = object,raw = NULL) - class(bst) <- 'xgb.Booster' - bst$raw <- xgb.save.raw(bst$handle) + bst <- xgb.handleToBooster(object) + # Avoid save a handle without update + # bst$raw <- xgb.save.raw(object) ret = predict(bst, ...) return(ret) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 5093382d4..bff6dd0e8 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -68,6 +68,26 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { return(structure(handle, class = "xgb.Booster.handle")) } +# convert xgb.Booster.handle to xgb.Booster +xgb.handleToBooster <- function(handle) +{ + bst <- list(handle = handle, raw = NULL) + class(bst) <- "xgb.Booster" + return(bst) +} + +# Check whether an xgb.Booster object is complete +xgb.Booster.check <- function(bst, saveraw = TRUE) +{ + if (is.null(bst$handle)) { + bst$handle <- xgb.load(bst$raw) + } else { + if (is.null(bst$raw) && saveraw) + bst$raw <- xgb.save.raw(bst$handle) + } + return(bst) +} + ## ----the following are low level iteratively function, not needed if ## you do not want to use them --------------------------------------- # get dmatrix from data, label diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 1f73eed2e..fa5fe4149 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -41,12 +41,7 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { if (class(model) != "xgb.Booster") { stop("model: argument must be type xgb.Booster") } else { - if (is.null(model$handle)) { - model$handle <- xgb.load(model$raw) - } else { - if (is.null(model$raw)) - model$raw <- xgb.save.raw(model$handle) - } + model <- xgb.Booster.check(model) } if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) { stop("fname: argument must be type character (when provided)") diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index 264176952..33d440530 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -20,9 +20,8 @@ xgb.load <- function(modelfile) { if (is.null(modelfile)) stop("xgb.load: modelfile cannot be NULL") - bst <- list(handle = NULL,raw = NULL) - class(bst) <- 'xgb.Booster' - bst$handle <- xgb.Booster(modelfile = modelfile) - bst$raw <- xgb.save.raw(bst$handle) + handle <- xgb.Booster(modelfile = modelfile) + bst <- xgb.handleToBooster(handle) + bst <- xgb.Booster.check(bst) return(bst) } diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index 0fecddfb5..59c5d2ecd 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ 
-22,9 +22,7 @@ xgb.save <- function(model, fname) { stop("xgb.save: fname must be character") } if (class(model) == "xgb.Booster") { - if (is.null(model$handle)) { - model$handle <- xgb.load(model$raw) - } + model <- xgb.Booster.check(model) .Call("XGBoosterSaveModel_R", model$handle, fname, PACKAGE = "xgboost") return(TRUE) } diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index c6d29e6e3..250ba2fbf 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -86,9 +86,8 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), } params = append(params, list(...)) - bst <- list(handle = NULL,raw = NULL) - class(bst) <- 'xgb.Booster' - bst$handle <- xgb.Booster(params, append(watchlist, dtrain)) + handle <- xgb.Booster(params, append(watchlist, dtrain)) + bst <- xgb.handleToBooster(handle) for (i in 1:nrounds) { succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj) if (length(watchlist) != 0) { @@ -96,6 +95,6 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), cat(paste(msg, "\n", sep="")) } } - bst$raw <- xgb.save.raw(bst$handle) + bst <- xgb.Booster.check(bst) return(bst) } From 47b5cf5148e5eb933e5a798a8a3dc20b87378844 Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 9 Feb 2015 17:35:50 -0800 Subject: [PATCH 24/26] fix save.raw --- R-package/R/xgb.save.raw.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index 91f7075bd..ee217aa85 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -18,6 +18,9 @@ #' @export #' xgb.save.raw <- function(handle) { + if (class(handle) == "xgb.Booster"){ + handle <- handle$handle + } if (class(handle) == "xgb.Booster.handle") { raw <- .Call("XGBoosterModelToRaw_R", handle, PACKAGE = "xgboost") return(raw) From 25f508e43e4c2393fbdda38196886bd766df7d2f Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 9 Feb 2015 17:48:52 -0800 Subject: [PATCH 25/26] update doc, resolve warnings --- R-package/R/predict.xgb.Booster.R | 1 - R-package/R/predict.xgb.Booster.handle.R | 9 +++++++-- R-package/R/xgb.save.raw.R | 10 +++++----- .../man/predict-xgb.Booster.handle-method.Rd | 18 ++++++++++++++++++ 4 files changed, 30 insertions(+), 8 deletions(-) create mode 100644 R-package/man/predict-xgb.Booster.handle-method.Rd diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R index 52c40df9b..c5e1046eb 100644 --- a/R-package/R/predict.xgb.Booster.R +++ b/R-package/R/predict.xgb.Booster.R @@ -1,4 +1,3 @@ -setClass("xgb.Booster.handle") setClass("xgb.Booster", slots = c(handle = "xgb.Booster.handle", raw = "raw")) diff --git a/R-package/R/predict.xgb.Booster.handle.R b/R-package/R/predict.xgb.Booster.handle.R index a38aeb64e..3a09e02de 100644 --- a/R-package/R/predict.xgb.Booster.handle.R +++ b/R-package/R/predict.xgb.Booster.handle.R @@ -1,5 +1,12 @@ setClass("xgb.Booster.handle") +#' Predict method for eXtreme Gradient Boosting model handle +#' +#' Predicted values based on xgb.Booster.handle object. +#' +#' @param object Object of class "xgb.Boost.handle" +#' @param ... Parameters pass to \code{predict.xgb.Booster} +#' setMethod("predict", signature = "xgb.Booster.handle", definition = function(object, ...) { if (class(object) != "xgb.Booster.handle"){ @@ -7,8 +14,6 @@ setMethod("predict", signature = "xgb.Booster.handle", } bst <- xgb.handleToBooster(object) - # Avoid save a handle without update - # bst$raw <- xgb.save.raw(object) ret = predict(bst, ...) 
return(ret) diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index ee217aa85..7f3a2df21 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -17,12 +17,12 @@ #' pred <- predict(bst, test$data) #' @export #' -xgb.save.raw <- function(handle) { - if (class(handle) == "xgb.Booster"){ - handle <- handle$handle +xgb.save.raw <- function(model) { + if (class(model) == "xgb.Booster"){ + model <- model$handle } - if (class(handle) == "xgb.Booster.handle") { - raw <- .Call("XGBoosterModelToRaw_R", handle, PACKAGE = "xgboost") + if (class(model) == "xgb.Booster.handle") { + raw <- .Call("XGBoosterModelToRaw_R", model, PACKAGE = "xgboost") return(raw) } stop("xgb.raw: the input must be xgb.Booster.handle. Use xgb.DMatrix.save to save diff --git a/R-package/man/predict-xgb.Booster.handle-method.Rd b/R-package/man/predict-xgb.Booster.handle-method.Rd new file mode 100644 index 000000000..cc9ba29f9 --- /dev/null +++ b/R-package/man/predict-xgb.Booster.handle-method.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/predict.xgb.Booster.handle.R +\docType{methods} +\name{predict,xgb.Booster.handle-method} +\alias{predict,xgb.Booster.handle-method} +\title{Predict method for eXtreme Gradient Boosting model handle} +\usage{ +\S4method{predict}{xgb.Booster.handle}(object, ...) +} +\arguments{ +\item{object}{Object of class "xgb.Boost.handle"} + +\item{...}{Parameters pass to \code{predict.xgb.Booster}} +} +\description{ +Predicted values based on xgb.Booster.handle object. +} + From 7f3dc7cf7e779b70c788ade2e847c01fef3e12a1 Mon Sep 17 00:00:00 2001 From: hetong007 Date: Mon, 9 Feb 2015 18:38:23 -0800 Subject: [PATCH 26/26] fix warnings --- R-package/R/predict.xgb.Booster.R | 1 + R-package/R/predict.xgb.Booster.handle.R | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R index c5e1046eb..52c40df9b 100644 --- a/R-package/R/predict.xgb.Booster.R +++ b/R-package/R/predict.xgb.Booster.R @@ -1,3 +1,4 @@ +setClass("xgb.Booster.handle") setClass("xgb.Booster", slots = c(handle = "xgb.Booster.handle", raw = "raw")) diff --git a/R-package/R/predict.xgb.Booster.handle.R b/R-package/R/predict.xgb.Booster.handle.R index 3a09e02de..685318f12 100644 --- a/R-package/R/predict.xgb.Booster.handle.R +++ b/R-package/R/predict.xgb.Booster.handle.R @@ -1,5 +1,3 @@ -setClass("xgb.Booster.handle") - #' Predict method for eXtreme Gradient Boosting model handle #' #' Predicted values based on xgb.Booster.handle object.
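
Taken together, these patches give the round trip below. The sketch is assembled from the examples already present in the series (agaricus data, xgb.save.raw(), xgb.load()) and introduces no new API:

library(xgboost)
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
# xgboost()/xgb.train() now return a list(handle, raw) of class xgb.Booster
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nround = 2, objective = "binary:logistic")
raw <- xgb.save.raw(bst)   # also accepts a bare xgb.Booster.handle
bst2 <- xgb.load(raw)      # rebuilds a complete xgb.Booster from raw bytes
pred <- predict(bst2, test$data)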