Merge branch 'master' into unity
commit 9695c51ce1

.gitignore (vendored)
@@ -44,6 +44,7 @@ Debug
*dump
*save
*csv
.Rproj.user
*.cpage.col
*.cpage
xgboost
@@ -51,3 +52,4 @@ xgboost.mpi
xgboost.mock
train*
rabit
R-package/DESCRIPTION
@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: eXtreme Gradient Boosting
Version: 0.3-2
Date: 2014-08-23
Version: 0.3-3
Date: 2014-12-28
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>
Maintainer: Tong He <hetong007@gmail.com>
Description: This package is an R wrapper of xgboost, which is short for eXtreme
@@ -21,4 +21,9 @@ Depends:
    R (>= 2.10)
Imports:
    Matrix (>= 1.1-0),
    methods
    methods,
    data.table (>= 1.9),
    magrittr (>= 1.5),
    stringr,
    DiagrammeR,
    vcd
R-package/NAMESPACE
@@ -1,4 +1,4 @@
# Generated by roxygen2 (4.0.1): do not edit by hand
# Generated by roxygen2 (4.1.0): do not edit by hand

export(getinfo)
export(setinfo)
@@ -7,7 +7,10 @@ export(xgb.DMatrix)
export(xgb.DMatrix.save)
export(xgb.cv)
export(xgb.dump)
export(xgb.importance)
export(xgb.load)
export(xgb.model.dt.tree)
export(xgb.plot.tree)
export(xgb.save)
export(xgb.train)
export(xgboost)
@@ -15,3 +18,20 @@ exportMethods(predict)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgeMatrix)
importFrom(DiagrammeR,DiagrammeR)
importFrom(data.table,":=")
importFrom(data.table,as.data.table)
importFrom(data.table,copy)
importFrom(data.table,data.table)
importFrom(data.table,rbindlist)
importFrom(data.table,set)
importFrom(data.table,setnames)
importFrom(magrittr,"%>%")
importFrom(magrittr,add)
importFrom(magrittr,not)
importFrom(stringr,str_extract)
importFrom(stringr,str_extract_all)
importFrom(stringr,str_match)
importFrom(stringr,str_replace)
importFrom(stringr,str_split)
importFrom(stringr,str_trim)
R-package/R/xgb.cv.R
@@ -1,7 +1,18 @@
#' Cross Validation
#'
#' The cross validation function of xgboost
#'
#'
#' @importFrom data.table data.table
#' @importFrom data.table as.data.table
#' @importFrom magrittr %>%
#' @importFrom data.table :=
#' @importFrom data.table rbindlist
#' @importFrom stringr str_extract_all
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_replace
#' @importFrom stringr str_match
#'
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#'   \item \code{objective} objective function, common ones are
@@ -20,7 +31,7 @@
#' @param nrounds the max number of iterations
#' @param nfold number of folds used
#' @param label optional field, used when data is a Matrix
#' @param showsd boolean, whether to show the standard deviation of the cross validation
#' @param showsd \code{boolean}, whether to show the standard deviation of the cross validation
#' @param metrics list of evaluation metrics to be used in cross validation,
#'   when it is not specified, the evaluation metric is chosen according to the objective function.
#'   Possible options are:
@@ -36,8 +47,13 @@
#' @param feval customized evaluation function. Returns
#'   \code{list(metric='metric-name', value='metric-value')} with given
#'   prediction and dtrain,
#' @param missing only used when the input is a dense matrix; pick a float
#'   value that represents a missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param verbose \code{boolean}, print the statistics during the process.
#' @param ... other parameters to pass to \code{params}.
#'
#' @return A \code{data.table} with each mean and standard deviation stat for the training set and test set.
#'
#' @details
#' This is the cross validation function for xgboost
#'
@@ -51,10 +67,11 @@
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"),
#'                   "max.depth"=3, "eta"=1, "objective"="binary:logistic")
#' print(history)
#' @export
#'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
                   showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, ...) {
                   showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, verbose = T, ...) {
  if (typeof(params) != "list") {
    stop("xgb.cv: first argument params must be list")
  }
@@ -73,18 +90,30 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
  }

  folds <- xgb.cv.mknfold(dtrain, nfold, params)
  history <- list()
  history <- c()
  for (i in 1:nrounds) {
    msg <- list()
    for (k in 1:nfold) {
      fd <- folds[[k]]
      succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
      msg[[k]] <- strsplit(xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval),
                           "\t")[[1]]
      msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
    }
    ret <- xgb.cv.aggcv(msg, showsd)
    history <- append(history, ret)
    cat(paste(ret, "\n", sep=""))
    history <- c(history, ret)
    if (verbose) paste(ret, "\n", sep="") %>% cat
  }
  return (TRUE)
}

  colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":", "") %>% str_replace("-", ".")
  colnamesMean <- paste(colnames, "mean")
  colnamesStd <- paste(colnames, "std")

  colnames <- c()
  for (i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])

  type <- rep(x = "numeric", times = length(colnames))
  dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
  split <- str_split(string = history, pattern = "\t")

  for (line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)}
  dt
}
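With this change xgb.cv no longer just prints each round: it returns a data.table of per-round mean/std statistics. A minimal usage sketch follows; the exact column names are built from the evaluation log at run time, so they are inspected rather than assumed:

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

# one row per boosting round, a mean and a std column per metric and per set
history <- xgb.cv(data = dtrain, nround = 10, nfold = 5, metrics = list("rmse"),
                  "max.depth" = 3, "eta" = 1, "objective" = "binary:logistic")

# column names come from the evaluation log; look them up before indexing
print(names(history))
test_mean_col <- grep("^test", names(history), value = TRUE)[1]  # assumed: first test column is the metric mean
best_round <- which.min(history[[test_mean_col]])
print(best_round)
```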
R-package/R/xgb.dump.R
@@ -2,14 +2,24 @@
#'
#' Save an xgboost model to a text file. It could be parsed later.
#'
#' @importFrom magrittr %>%
#' @importFrom stringr str_split
#' @importFrom stringr str_replace
#' @param model the model object.
#' @param fname the name of the binary file.
#' @param fname the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.
#' @param fmap feature map file representing the type of feature.
#'   Detailed description could be found at
#'   \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
#'   See demo/ for a walkthrough example in R, and
#'   \url{https://github.com/tqchen/xgboost/blob/master/demo/data/featmap.txt}
#'   for an example of the format.
#' @param with.stats whether to dump statistics of the splits.
#'   When this option is on, the model dump comes with two additional statistics:
#'   gain is the approximate loss function gain we get in each split;
#'   cover is the sum of the second order gradient in each node.
#'
#' @return
#' If fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@@ -18,16 +28,30 @@
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#'                eta = 1, nround = 2, objective = "binary:logistic")
#' xgb.dump(bst, 'xgb.model.dump')
#' # save the model in file 'xgb.model.dump'
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' # print the model without saving it to a file
#' print(xgb.dump(bst))
#' @export
#'
xgb.dump <- function(model, fname, fmap = "") {
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats = FALSE) {
  if (class(model) != "xgb.Booster") {
    stop("xgb.dump: first argument must be type xgb.Booster")
    stop("model: argument must be type xgb.Booster")
  }
  if (typeof(fname) != "character") {
    stop("xgb.dump: second argument must be type character")
  if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) {
    stop("fname: argument must be type character (when provided)")
  }
  .Call("XGBoosterDumpModel_R", model, fname, fmap, PACKAGE = "xgboost")
  return(TRUE)
}
  if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1)) {
    stop("fmap: argument must be type character (when provided)")
  }

  result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost")

  if (is.null(fname)) {
    return(str_split(result, "\n") %>% unlist %>% str_replace("^\t+", "") %>% Filter(function(x) x != "", .))
  } else {
    result %>% str_split("\n") %>% unlist %>% Filter(function(x) x != "", .) %>% writeLines(fname)
    return(TRUE)
  }
}
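The rewritten xgb.dump therefore works in two modes. A short sketch of the difference, assuming `bst` is the booster trained in the examples above:

```r
# without fname: returns the dump as a character vector, one element per line
dump_lines <- xgb.dump(bst, with.stats = TRUE)
head(dump_lines)

# with fname: writes the same lines to disk and returns TRUE
xgb.dump(bst, fname = "dump.raw.txt", with.stats = TRUE)
```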
R-package/R/xgb.importance.R (new file)
@@ -0,0 +1,82 @@
#' Show importance of features in a model
#'
#' Read an xgboost model text dump.
#' Can be a tree or linear model (text dumps of linear models are only supported in the dev version of \code{Xgboost} for now).
#'
#' @importFrom data.table data.table
#' @importFrom data.table setnames
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. The model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
#' @param model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#'
#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree models) in the model.
#'
#' @details
#' This is the function to understand the trained model (and, through your model, your data).
#'
#' Results are returned for both linear and tree models.
#'
#' A \code{data.table} is returned by the function.
#' There are 3 columns:
#' \itemize{
#'   \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
#'   \item \code{Gain} contribution of each feature to the model. For boosted tree models, each gain of each feature of each tree is taken into account, then averaged per feature to give a vision of the entire model. The highest percentage means the most important feature to predict the \code{label} used for the training ;
#'   \item \code{Cover} metric of the number of observations related to this feature (only available for tree models) ;
#'   \item \code{Weight} percentage representing the relative number of times a feature has been taken into trees. \code{Gain} should be preferred to search the most important feature. For boosted linear models, this column has no meaning.
#' }
#'
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#'
#' #Both datasets are lists with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#' test <- agaricus.test
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#'                eta = 1, nround = 2, objective = "binary:logistic")
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.importance(agaricus.test$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL){
  if (!class(feature_names) %in% c("character", "NULL")) {
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature names. Look at this function documentation to see where to get feature names.")
  }

  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
    stop("filename_dump: Has to be a path to the model dump file.")
  }

  if (!class(model) %in% c("xgb.Booster", "NULL")) {
    stop("model: Has to be an object of class xgb.Booster model generated by the xgb.train function.")
  }

  if (is.null(model)) {
    text <- readLines(filename_dump)
  } else {
    text <- xgb.dump(model = model, with.stats = T)
  }

  if (text[2] == "bias:") {
    result <- linearDump(feature_names, text)
  } else {
    result <- treeDump(feature_names, text = text)
  }
  result
}

treeDump <- function(feature_names, text){
  result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature != "Leaf", .(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = Feature][, `:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))][order(-Gain)]

  result
}

linearDump <- function(feature_names, text){
  which(text == "weight:") %>% {a = . + 1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .)
}
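The grouping treeDump performs can be illustrated on a hand-made node table (toy numbers, not real model output), using the same data.table idiom as above:

```r
library(data.table)

# stand-in for xgb.model.dt.tree output: one row per node
nodes <- data.table(Feature = c("f1", "f2", "f1", "Leaf"),
                    Quality = c(10, 4, 6, 2),   # per-split gain
                    Cover   = c(50, 30, 20, 10))

# aggregate per feature, then normalise each statistic so it sums to 1
imp <- nodes[Feature != "Leaf",
             .(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N),
             by = Feature][, `:=`(Gain = Gain / sum(Gain),
                                  Cover = Cover / sum(Cover),
                                  Frequence = Frequence / sum(Frequence))][order(-Gain)]
print(imp)  # f1: Gain 0.8 (16/20), f2: Gain 0.2
```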
R-package/R/xgb.model.dt.tree.R (new file)
@@ -0,0 +1,163 @@
#' Convert tree model dump to data.table
#'
#' Read a tree model text dump and return a data.table.
#'
#' @importFrom data.table data.table
#' @importFrom data.table set
#' @importFrom data.table rbindlist
#' @importFrom data.table copy
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @importFrom magrittr not
#' @importFrom magrittr add
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. The model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#' @param text dump generated by the \code{xgb.dump} function. Avoids the creation of a dump file. The model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param n_first_tree limit the parsing to the n first trees. If \code{NULL}, all trees of the model are parsed. Performance can be low for huge models.
#'
#' @return A \code{data.table} of the features used in the model with their gain, cover and a few other details.
#'
#' @details
#' General function to convert a text dump of a tree model to a \code{data.table}. The purpose is to help the user to explore the model and get a better understanding of it.
#'
#' The content of the \code{data.table} is organised this way:
#'
#' \itemize{
#'   \item \code{ID}: unique identifier of a node ;
#'   \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
#'   \item \code{Split}: value of the chosen feature at which the split is operated ;
#'   \item \code{Yes}: ID of the next node in the branch when the split condition is met ;
#'   \item \code{No}: ID of the next node in the branch when the split condition is not met ;
#'   \item \code{Missing}: ID of the next node in the branch for observations where the feature used for the split is not provided ;
#'   \item \code{Quality}: the gain related to the split in this specific node ;
#'   \item \code{Cover}: metric to measure the number of observations affected by the split ;
#'   \item \code{Tree}: ID of the tree. It is included in the main ID ;
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both datasets are lists with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#'                eta = 1, nround = 2, objective = "binary:logistic")
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], filename_dump = 'xgb.model.dump')
#'
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){

  if (!class(feature_names) %in% c("character", "NULL")) {
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature names. Look at this function documentation to see where to get feature names.")
  }
  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
    stop("filename_dump: path to the model doesn't exist.")
  } else if (is.null(filename_dump) && is.null(model) && is.null(text)) {
    stop("filename_dump & model & text: no path to a dump model, no model and no text dump have been provided.")
  }

  if (!class(model) %in% c("xgb.Booster", "NULL")) {
    stop("model: Has to be an object of class xgb.Booster model generated by the xgb.train function.")
  }

  if (!class(text) %in% c("character", "NULL")) {
    stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
  }

  if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
    stop("n_first_tree: Has to be a numeric vector of size 1.")
  }

  if (!is.null(model)) {
    text <- xgb.dump(model = model, with.stats = T)
  } else if (!is.null(filename_dump)) {
    text <- readLines(filename_dump) %>% str_trim(side = "both")
  }

  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)

  extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist

  n_round <- min(length(position) - 1, n_first_tree)

  addTreeId <- function(x, i) paste(i, x, sep = "-")

  allTrees <- data.table()

  for (i in 1:n_round) {

    tree <- text[(position[i] + 1):(position[i + 1] - 1)]

    # avoid a tree made of a leaf only (no split)
    if (length(tree) < 2) next

    treeID <- i - 1

    notLeaf <- str_match(tree, "leaf") %>% is.na
    leaf <- notLeaf %>% not %>% tree[.]
    branch <- notLeaf %>% tree[.]
    idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
    idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
    featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric
    if (!is.null(feature_names)) {
      featureBranch <- feature_names[featureBranch + 1]
    }
    featureLeaf <- rep("Leaf", length(leaf))
    splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "")
    splitLeaf <- rep(NA, length(leaf))
    yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
    yesLeaf <- rep(NA, length(leaf))
    noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID)
    noLeaf <- rep(NA, length(leaf))
    missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)
    missingLeaf <- rep(NA, length(leaf))
    qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*")
    qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*")
    coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
    coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
    dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][, Tree := treeID]

    allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
  }

  yes <- allTrees[!is.na(Yes), Yes]

  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "Yes.Feature",
      value = allTrees[ID == yes, Feature])

  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "Yes.Cover",
      value = allTrees[ID == yes, Cover])

  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "Yes.Quality",
      value = allTrees[ID == yes, Quality])

  no <- allTrees[!is.na(No), No]

  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "No.Feature",
      value = allTrees[ID == no, Feature])

  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "No.Cover",
      value = allTrees[ID == no, Cover])

  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
      j = "No.Quality",
      value = allTrees[ID == no, Quality])

  allTrees
}
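The whole conversion rests on regular expressions over dump lines such as `0:[f29<2.5] yes=1,no=2,missing=1,gain=4000.53,cover=1628.25` (a hand-written sample; real values come from xgb.dump with with.stats = T). A minimal sketch of the extract() idea:

```r
library(stringr)
library(magrittr)

line <- "0:[f29<2.5] yes=1,no=2,missing=1,gain=4000.53,cover=1628.25"

# same helper as above: grab "key=value" and keep the numeric part
extract <- function(x, pattern) str_extract(x, pattern) %>%
  str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist

extract(line, "gain=\\d*\\.*\\d*")   # 4000.53
extract(line, "cover=\\d*\\.*\\d*")  # 1628.25
str_extract(line, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "")  # "29"
```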
R-package/R/xgb.plot.tree.R (new file)
@@ -0,0 +1,88 @@
#' Plot a boosted tree model
#'
#' Read a tree model text dump.
#' Plotting only works for boosted tree models (not linear models).
#'
#' @importFrom data.table data.table
#' @importFrom data.table set
#' @importFrom data.table rbindlist
#' @importFrom data.table :=
#' @importFrom data.table copy
#' @importFrom magrittr %>%
#' @importFrom magrittr not
#' @importFrom magrittr add
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_trim
#' @importFrom DiagrammeR DiagrammeR
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. The model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). It is also possible to provide a model directly (see the \code{model} argument).
#' @param model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#' @param CSSstyle a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
#' @param width the width of the diagram in pixels.
#' @param height the height of the diagram in pixels.
#'
#' @return A \code{DiagrammeR} of the model.
#'
#' @details
#'
#' The content of each node is organised this way:
#'
#' \itemize{
#'   \item \code{feature} value ;
#'   \item \code{cover}: the sum of the second order gradient of the training data classified to the leaf; if it is square loss, this simply corresponds to the number of instances in that branch. The deeper in the tree a node is, the lower this metric will be ;
#'   \item \code{gain}: metric of the importance of the node in the model.
#' }
#'
#' Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
#' It uses the \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both datasets are lists with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#'                eta = 1, nround = 2, objective = "binary:logistic")
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
#'
xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){

  if (!(class(CSSstyle) %in% c("character", "NULL") && length(CSSstyle) <= 1)) {
    stop("style: Has to be a character vector of size 1.")
  }

  if (!class(model) %in% c("xgb.Booster", "NULL")) {
    stop("model: Has to be an object of class xgb.Booster model generated by the xgb.train function.")
  }

  if (is.null(model)) {
    allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
  } else {
    allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
  }

  allTrees[Feature != "Leaf", yesPath := paste(ID, "(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]

  allTrees[Feature != "Leaf", noPath := paste(ID, "(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]

  if (is.null(CSSstyle)) {
    CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
  }

  yes <- allTrees[Feature != "Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")

  no <- allTrees[Feature != "Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")

  path <- allTrees[Feature != "Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", ., collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";")
  DiagrammeR(path, width, height)
}
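Under the hood the plot is just a Mermaid graph definition handed to DiagrammeR. A hand-written two-node sketch in the same format the function assembles (toy labels; the `tree-node` IDs mirror what addTreeId produces, and whether a given Mermaid version accepts them is an assumption):

```r
library(DiagrammeR)

spec <- paste(
  "graph LR",
  "0-0(f29<br/>Cover: 1628<br/>Gain: 4000)-->|< 2.5|0-1>Leaf]",
  "0-0(f29)-->|>= 2.5|0-2>Leaf]",
  "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px",
  "class 0-1 greenNode",
  sep = ";")
DiagrammeR(spec)
```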
R-package/R/xgboost.R
@@ -24,6 +24,8 @@
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#'   information of performance. If 2, xgboost will print information of both
#'   performance and construction progress information
#' @param missing only used when the input is a dense matrix; pick a float
#'   value that represents a missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
@@ -74,7 +76,7 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(),
#'
#' \itemize{
#'   \item \code{label} the label for each record
#'   \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
#'   \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
#' }
#'
#' @references
@@ -101,7 +103,7 @@ NULL
#'
#' \itemize{
#'   \item \code{label} the label for each record
#'   \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
#'   \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
#' }
#'
#' @references
@@ -116,5 +118,5 @@ NULL
#' @name agaricus.test
#' @usage data(agaricus.test)
#' @format A list containing a label vector, and a dgCMatrix object with 1611
#' rows and 127 variables
#' rows and 126 variables
NULL
R-package/README.md
@@ -6,7 +6,7 @@ For up-to-date version(which is recommended), please install from github. Window
```r
require(devtools)
install_github('xgboost','tqchen',subdir='R-package')
install_github('tqchen/xgboost',subdir='R-package')
```

For stable version on CRAN, please run
R-package/demo/00Index
@@ -4,3 +4,4 @@ boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees
generalized_linear_model Generalized Linear Model
cross_validation Cross validation
create_sparse_matrix

R-package/demo/README.md
@@ -6,6 +6,7 @@ XGBoost R Feature Walkthrough
* [Predicting using first n trees](predict_first_ntree.R)
* [Generalized Linear Model](generalized_linear_model.R)
* [Cross validation](cross_validation.R)
* [Create a sparse matrix from a dense one](create_sparse_matrix.R)

Benchmarks
====

R-package/demo/basic_walkthrough.R
@@ -88,6 +88,9 @@ pred <- predict(bst, dtest)
err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
print(paste("test-error=", err))

# Finally, you can dump the tree you learned using xgb.dump into a text file
xgb.dump(bst, "dump.raw.txt")
# You can dump the tree you learned using xgb.dump into a text file
xgb.dump(bst, "dump.raw.txt", with.stats = T)

# Finally, you can check which features are the most important.
print("Most important features (look at column Gain):")
print(xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt"))
R-package/demo/create_sparse_matrix.R (new file)
@@ -0,0 +1,89 @@
require(xgboost)
require(Matrix)
require(data.table)
require(vcd) # Available on CRAN. Used for its dataset with categorical values.

# According to its documentation, Xgboost works only on numbers.
# Sometimes the dataset we have to work on has categorical data.
# A categorical variable is one which has a fixed number of values. For example, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable.
#
# In R, a categorical variable is called a Factor.
# Type ?factor in the console for more information.
#
# In this demo we will see how to transform a dense dataframe with categorical variables to a sparse matrix before analyzing it in Xgboost.
# The method we are going to see is usually called "one hot encoding".

# Load the Arthritis dataset in memory.
data(Arthritis)

# Create a copy of the dataset with the data.table package (data.table is 100% compliant with R dataframes but its syntax is a lot more consistent and its performance is really good).
df <- data.table(Arthritis, keep.rownames = F)

# Let's have a look at the data.table
cat("Print the dataset\n")
print(df)

# 2 columns have factor type, one has ordinal type (an ordinal variable is a categorical variable with values which can be ordered, here: None > Some > Marked).
cat("Structure of the dataset\n")
str(df)

# Let's add some new categorical features to see if it helps. Of course these features are highly correlated to the Age feature. Usually it's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.

# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treats them as independent values.
df[, AgeDiscret := as.factor(round(Age/10, 0))]

# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]

# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[, ID := NULL]

# List the different values for the column Treatment: Placebo, Treated.
cat("Values of the categorical feature Treatment\n")
print(levels(df[, Treatment]))

# Next step, we will transform the categorical data to dummy variables.
# This method is also called one hot encoding.
# The purpose is to transform each value of each categorical feature into one binary feature.
#
# For example, the column Treatment will be replaced by two columns, Placebo and Treated. Each of them will be binary. An observation which had the value Placebo in column Treatment before the transformation will have, after the transformation, the value 1 in the new column Placebo and the value 0 in the new column Treated.
#
# The formula Improved~.-1 used below means transform all categorical features but column Improved to binary values.
# Column Improved is excluded because it will be our output column, the one we want to predict.
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)

cat("Encoding of the sparse Matrix\n")
print(sparse_matrix)

# Create the output vector (not sparse)
# 1. Set, for all rows, field in Y column to 0;
# 2. set Y to 1 when Improved == Marked;
# 3. Return Y column
output_vector <- df[, Y := 0][Improved == "Marked", Y := 1][, Y]

# What follows is the same process as in the other demos
cat("Learning...\n")
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
               eta = 1, nround = 10, objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump', with.stats = T)

# sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
print(importance)
# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is Age. The second most important feature is having received a placebo or not. The sex comes third. Then come our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).

# Do these results make sense?
# Let's check some Chi2 between each of these features and the outcome.

print(chisq.test(df$Age, df$Y))
# The Chi2 statistic between Age and the illness disappearing is 35.

print(chisq.test(df$AgeDiscret, df$Y))
# Our first simplification of Age gives a statistic of 8.

print(chisq.test(df$AgeCat, df$Y))
# The arbitrary split I did between young and old at 30 years old has a low statistic of 2. It's a result we may expect: maybe in my mind being > 30 years old means being old (I am 32 and starting to feel old, this may explain it), but for the illness we are studying, the age at which one is vulnerable is not the same. Don't let your "gut" lower the quality of your model. In "data science", there is science :-)

# As you can see, in general destroying information by simplifying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on an existing one which makes the link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not complex enough to show that. Check the Kaggle forum for some challenging datasets.
# However it's almost always worse when you add some arbitrary rules.
# Moreover, you can notice that even though we have added some new features which are not useful and highly correlated with other features, the boosting tree algorithm has been able to choose the best one, which in this case is Age. A linear model may not be that strong in these scenarios.

R-package/man/agaricus.test.Rd
@@ -1,10 +1,11 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
\docType{data}
\name{agaricus.test}
\alias{agaricus.test}
\title{Test part from Mushroom Data Set}
\format{A list containing a label vector, and a dgCMatrix object with 1611
rows and 127 variables}
rows and 126 variables}
\usage{
data(agaricus.test)
}
@@ -17,7 +18,7 @@ This data set includes the following fields:

\itemize{
  \item \code{label} the label for each record
  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
}
}
\references{

R-package/man/agaricus.train.Rd
@@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
\docType{data}
\name{agaricus.train}
\alias{agaricus.train}
@@ -17,7 +18,7 @@ This data set includes the following fields:

\itemize{
  \item \code{label} the label for each record
  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 127 columns.
  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
}
}
\references{

R-package/man/getinfo.Rd
@@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/getinfo.xgb.DMatrix.R
\docType{methods}
\name{getinfo}
\alias{getinfo}
@@ -12,9 +13,9 @@ getinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}

\item{name}{the name of the field to get}

\item{...}{other parameters}

\item{name}{the name of the field to get}
}
\description{
Get information of an xgb.DMatrix object

R-package/man/predict-xgb.Booster-method.Rd
@@ -1,11 +1,12 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/predict.xgb.Booster.R
\docType{methods}
\name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method}
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\S4method{predict}{xgb.Booster}(object, newdata, outputmargin = FALSE,
  ntreelimit = NULL)
\S4method{predict}{xgb.Booster}(object, newdata, missing = NULL,
  outputmargin = FALSE, ntreelimit = NULL)
}
\arguments{
\item{object}{Object of class "xgb.Booster"}

R-package/man/setinfo.Rd
@@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/setinfo.xgb.DMatrix.R
\docType{methods}
\name{setinfo}
\alias{setinfo}
@@ -12,11 +13,11 @@ setinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}

\item{...}{other parameters}

\item{name}{the name of the field to get}

\item{info}{the specific field of information to set}

\item{...}{other parameters}
}
\description{
Set information of an xgb.DMatrix object

R-package/man/slice.Rd
@@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/slice.xgb.DMatrix.R
\docType{methods}
\name{slice}
\alias{slice}
@@ -13,9 +14,9 @@ slice(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}

\item{idxset}{an integer vector of indices of rows needed}

\item{...}{other parameters}

\item{idxset}{an integer vector of indices of rows needed}
}
\description{
Get a new DMatrix containing the specified rows of

R-package/man/xgb.DMatrix.Rd
@@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Construct xgb.DMatrix object}

R-package/man/xgb.DMatrix.save.Rd
@@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.save.R
\name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file}

R-package/man/xgb.cv.Rd
@@ -1,10 +1,12 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.cv.R
\name{xgb.cv}
\alias{xgb.cv}
\title{Cross Validation}
\usage{
xgb.cv(params = list(), data, nrounds, nfold, label = NULL, showsd = TRUE,
  metrics = list(), obj = NULL, feval = NULL, ...)
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
  missing = NULL, showsd = TRUE, metrics = list(), obj = NULL,
  feval = NULL, verbose = T, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
@@ -30,7 +32,9 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, showsd = TRUE,

\item{label}{optional field, used when data is a Matrix}

\item{showsd}{boolean, whether to show the standard deviation of the cross validation}
\item{missing}{only used when the input is a dense matrix; pick a float value that represents a missing value}

\item{showsd}{\code{boolean}, whether to show the standard deviation of the cross validation}

\item{metrics}{list of evaluation metrics to be used in cross validation,
when it is not specified, the evaluation metric is chosen according to the objective function.
@@ -50,8 +54,13 @@ gradient with given prediction and dtrain,}
\code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain,}

\item{verbose}{\code{boolean}, print the statistics during the process.}

\item{...}{other parameters to pass to \code{params}.}
}
\value{
A \code{data.table} with each mean and standard deviation stat for the training set and test set.
}
\description{
The cross validation function of xgboost
}
@@ -68,5 +77,6 @@ data(agaricus.train, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"),
                  "max.depth"=3, "eta"=1, "objective"="binary:logistic")
print(history)
}

R-package/man/xgb.dump.Rd
@@ -1,21 +1,30 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.dump.R
\name{xgb.dump}
\alias{xgb.dump}
\title{Save xgboost model to text file}
\usage{
xgb.dump(model, fname, fmap = "")
xgb.dump(model = NULL, fname = NULL, fmap = "", with.stats = FALSE)
}
\arguments{
\item{model}{the model object.}

\item{fname}{the name of the binary file.}
\item{fname}{the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.}

\item{fmap}{feature map file representing the type of feature.
Detailed description could be found at
\url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
See demo/ for a walkthrough example in R, and
\url{https://github.com/tqchen/xgboost/blob/master/demo/data/featmap.txt}
for an example of the format.}
Detailed description could be found at
\url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
See demo/ for a walkthrough example in R, and
\url{https://github.com/tqchen/xgboost/blob/master/demo/data/featmap.txt}
for an example of the format.}

\item{with.stats}{whether to dump statistics of the splits.
When this option is on, the model dump comes with two additional statistics:
gain is the approximate loss function gain we get in each split;
cover is the sum of the second order gradient in each node.}
}
\value{
If fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
}
\description{
Save an xgboost model to a text file. It could be parsed later.
@@ -27,6 +36,10 @@ train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nround = 2, objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump')
# save the model in file 'xgb.model.dump'
xgb.dump(bst, 'xgb.model.dump', with.stats = T)

# print the model without saving it to a file
print(xgb.dump(bst))
}
R-package/man/xgb.importance.Rd (new file)
@@ -0,0 +1,52 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.importance.R
\name{xgb.importance}
\alias{xgb.importance}
\title{Show importance of features in a model}
\usage{
xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.}

\item{filename_dump}{the path to the text file storing the model. The model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}

\item{model}{generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
}
\value{
A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree models) in the model.
}
\description{
Read an xgboost model text dump.
Can be a tree or linear model (text dumps of linear models are only supported in the dev version of \code{Xgboost} for now).
}
\details{
This is the function to understand the trained model (and, through your model, your data).

Results are returned for both linear and tree models.

A \code{data.table} is returned by the function.
There are 3 columns:
\itemize{
  \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
  \item \code{Gain} contribution of each feature to the model. For boosted tree models, each gain of each feature of each tree is taken into account, then averaged per feature to give a vision of the entire model. The highest percentage means the most important feature to predict the \code{label} used for the training ;
  \item \code{Cover} metric of the number of observations related to this feature (only available for tree models) ;
  \item \code{Weight} percentage representing the relative number of times a feature has been taken into trees. \code{Gain} should be preferred to search the most important feature. For boosted linear models, this column has no meaning.
}
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')

#Both datasets are lists with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
test <- agaricus.test

bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nround = 2, objective = "binary:logistic")

#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.importance(agaricus.test$data@Dimnames[[2]], model = bst)
}

R-package/man/xgb.load.Rd
@@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.load.R
\name{xgb.load}
\alias{xgb.load}
\title{Load xgboost model from binary file}
R-package/man/xgb.model.dt.tree.Rd (new file)
@@ -0,0 +1,58 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.model.dt.tree.R
\name{xgb.model.dt.tree}
\alias{xgb.model.dt.tree}
\title{Convert tree model dump to data.table}
\usage{
xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
  model = NULL, text = NULL, n_first_tree = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.}

\item{filename_dump}{the path to the text file storing the model. The model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}

\item{model}{generated by the \code{xgb.train} function. Avoids the creation of a dump file.}

\item{text}{dump generated by the \code{xgb.dump} function. Avoids the creation of a dump file. The model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}

\item{n_first_tree}{limit the parsing to the n first trees. If \code{NULL}, all trees of the model are parsed. Performance can be low for huge models.}
}
\value{
A \code{data.table} of the features used in the model with their gain, cover and a few other details.
}
\description{
Read a tree model text dump and return a data.table.
}
\details{
General function to convert a text dump of a tree model to a \code{data.table}. The purpose is to help the user to explore the model and get a better understanding of it.

The content of the \code{data.table} is organised this way:

\itemize{
  \item \code{ID}: unique identifier of a node ;
  \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
  \item \code{Split}: value of the chosen feature at which the split is operated ;
  \item \code{Yes}: ID of the next node in the branch when the split condition is met ;
  \item \code{No}: ID of the next node in the branch when the split condition is not met ;
  \item \code{Missing}: ID of the next node in the branch for observations where the feature used for the split is not provided ;
  \item \code{Quality}: the gain related to the split in this specific node ;
  \item \code{Cover}: metric to measure the number of observations affected by the split ;
  \item \code{Tree}: ID of the tree. It is included in the main ID ;
}
}
\examples{
data(agaricus.train, package='xgboost')

#Both datasets are lists with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train

bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nround = 2, objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump', with.stats = T)

#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], filename_dump = 'xgb.model.dump')
}
R-package/man/xgb.plot.tree.Rd (new file)
@@ -0,0 +1,53 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.plot.tree.R
\name{xgb.plot.tree}
\alias{xgb.plot.tree}
\title{Plot a boosted tree model}
\usage{
xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
  n_first_tree = NULL, CSSstyle = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.}

\item{filename_dump}{the path to the text file storing the model. The model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). A model can also be provided directly (see the \code{model} argument).}

\item{model}{a model generated by the \code{xgb.train} function. Avoids the creation of a dump file.}

\item{n_first_tree}{limit the plot to the first n trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}

\item{CSSstyle}{a \code{character} vector storing a CSS style to customize the appearance of nodes. See the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
}
\value{
A \code{DiagrammeR} plot of the model.
}
\description{
Read a tree model text dump and plot it.
Plotting only works for boosted tree models (not linear models).
}
\details{
The content of each node is organised as follows:

\itemize{
\item \code{feature} value;
\item \code{cover}: the sum of the second order gradients of the training data classified to the leaf; for square loss, this simply corresponds to the number of instances in that branch. The deeper a node is in the tree, the lower this metric will be;
\item \code{gain}: a metric measuring the importance of the node in the model.
}

Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
The \href{https://github.com/knsv/mermaid/}{Mermaid} library is used for this purpose.
}
\examples{
data(agaricus.train, package='xgboost')

#Both datasets are lists with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one-hot encoding format.
train <- agaricus.train

bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nround = 2, objective = "binary:logistic")

#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
}
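
For larger models, the documented n_first_tree argument keeps the diagram readable. A short sketch under the same assumptions as the example above (model 'bst' already trained):

```r
# Restrict the plot to the first tree only; with n_first_tree = NULL
# every tree of the model would be drawn.
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst,
              n_first_tree = 1)
```
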
@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.save.R
\name{xgb.save}
\alias{xgb.save}
\title{Save xgboost model to binary file}

@ -1,4 +1,5 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.train.R
\name{xgb.train}
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}
@ -1,10 +1,11 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
\name{xgboost}
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
\usage{
xgboost(data = NULL, label = NULL, params = list(), nrounds,
  verbose = 1, ...)
xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
  nrounds, verbose = 1, ...)
}
\arguments{
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
@ -12,6 +13,8 @@ xgboost(data = NULL, label = NULL, params = list(), nrounds,

\item{label}{the response variable. User should not set this field,}

\item{missing}{Missing is only used when input is dense matrix, pick a float}

\item{params}{the list of parameters. Commonly used ones are:
\itemize{
\item \code{objective} objective function, common ones are
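
The new missing argument concerns dense inputs only: it names the float that stands for an absent value. A hedged sketch of the new signature; the -999 placeholder and the densified matrix are illustrative assumptions, not part of this commit:

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')

# Densify the sparse example data and pretend -999 marks missing entries.
X <- as.matrix(agaricus.train$data)
X[1, 1] <- -999

bst.dense <- xgboost(data = X, label = agaricus.train$label,
                     missing = -999, max.depth = 2, eta = 1,
                     nrounds = 2, objective = "binary:logistic")
```
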
@ -3,6 +3,7 @@
#include <utility>
#include <cstring>
#include <cstdio>
#include <sstream>
#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
#include "src/utils/omp.h"
@ -242,10 +243,10 @@ extern "C" {
    for (int i = 0; i < len; ++i) {
      vec_sptr.push_back(vec_names[i].c_str());
    }
    _WrapperEnd();
    return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
                                         asInteger(iter),
                                         BeginPtr(vec_dmats), BeginPtr(vec_sptr), len));
    _WrapperEnd();
  }
  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit) {
    _WrapperBegin();
@ -273,18 +274,21 @@ extern "C" {
    XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
    _WrapperEnd();
  }
  void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
  SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
    _WrapperBegin();
    bst_ulong olen;
    const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
                                          CHAR(asChar(fmap)),
                                          &olen);
    FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
    for (size_t i = 0; i < olen; ++i) {
      fprintf(fo, "booster[%u]:\n", static_cast<unsigned>(i));
      fprintf(fo, "%s", res[i]);
                                          CHAR(asChar(fmap)),
                                          asInteger(with_stats),
                                          &olen);
    SEXP out = PROTECT(allocVector(STRSXP, olen));
    for (size_t i = 0; i < olen; ++i) {
      stringstream stream;
      stream << "booster["<<i<<"]\n" << res[i];
      SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
    }
    fclose(fo);
    _WrapperEnd();
    UNPROTECT(1);
    return out;
  }
}
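
With this change the wrapper returns the dump to R as a character vector instead of writing a file in C++, so the R side controls formatting and output. A sketch of the resulting R-level behaviour, reusing the 'bst' model from the Rd examples above; the sample output line is indicative only:

```r
# Dump with split statistics; the new with_stats flag flows through
# XGBoosterDumpModel_R down to the C API.
xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)

head(readLines('xgb.model.dump'))
# lines look roughly like:
#   0:[f28<0.5] yes=1,no=2,missing=1,gain=4000.53,cover=1628.25
```
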
@ -128,11 +128,11 @@ extern "C" {
   */
  void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
  /*!
   * \brief dump model into text file
   * \brief dump model into a string
   * \param handle handle
   * \param fname file name of model that can be dumped into
   * \param fmap name to fmap can be empty string
   * \param with_stats whether dump statistics of splits
   */
  void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap);
  SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
}
#endif  // XGBOOST_WRAPPER_R_H_
15
README.md
@ -19,7 +19,8 @@ Learning about the model: [Introduction to Boosted Trees](http://homes.cs.washin

What's New
=====

* XGBoost wins [Tradeshift Text Classification](https://kaggle2.blob.core.windows.net/forum-message-attachments/60041/1813/TradeshiftTextClassification.pdf?sv=2012-02-12&se=2015-01-02T13%3A55%3A16Z&sr=b&sp=r&sig=5MHvyjCLESLexYcvbSRFumGQXCS7MVmfdBIY3y01tMk%3D)
* XGBoost wins [HEP meets ML Award in Higgs Boson Challenge](http://atlas.ch/news/2014/machine-learning-wins-the-higgs-challenge.html)
* Thanks to Bing Xu, [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl) allows you to use xgboost from Julia
* See the updated [demo folder](demo) for a feature walkthrough
* Thanks to Tong He, the new [R package](R-package) is available
@ -43,6 +44,18 @@ Build
  - Alternatively, you can upgrade your compiler to compile the multi-threaded version
* Windows (VS 2010): see the [windows](windows) folder
  - In principle, you add all the cpp files in the Makefile to the project, and build
* OS X:
  - For users who want OpenMP support using [Homebrew](http://brew.sh/), run ```brew update``` (this ensures you install gcc-4.9 or above) and ```brew install gcc```. Once it is installed, edit [Makefile](Makefile/) by replacing:
  ```
  export CC = gcc
  export CXX = g++
  ```
  with
  ```
  export CC = gcc-4.9
  export CXX = g++-4.9
  ```
  Then run ```bash build.sh``` normally.

Version
======

@ -1,8 +1,8 @@
XGBoost Examples
====
This folder contains the all example codes using xgboost.
This folder contains all the code examples using xgboost.

* Contribution of exampls, benchmarks is more than welcomed!
* Contribution of examples, benchmarks is more than welcome!
* If you like to share how you use xgboost to solve your problem, send a pull request:)

Features Walkthrough
@ -12,7 +12,7 @@ This is a list of short codes introducing different functionalities of xgboost a
[python](guide-python/basic_walkthrough.py)
[R](../R-package/demo/basic_walkthrough.R)
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
* Cutomize loss function, and evaluation metric
* Customize loss function, and evaluation metric
[python](guide-python/custom_objective.py)
[R](../R-package/demo/custom_objective.R)
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/custom_objective.jl)
@ -45,3 +45,4 @@ Basic Examples by Tasks
Benchmarks
====
* [Starter script for Kaggle Higgs Boson](kaggle-higgs)
* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution)
@ -447,7 +447,6 @@ class TreeModel {
         << ",missing=" << nodes[nid].cdefault();
    }
    if (with_stats) {
      fo << ' ';
      stat(nid).Print(fo, false);
    }
    fo << '\n';

@ -470,9 +469,9 @@ struct RTreeNodeStat {
  /*! \brief print information of current stats to fo */
  inline void Print(std::stringstream &fo, bool is_leaf) const {
    if (!is_leaf) {
      fo << "gain=" << loss_chg << ",cover=" << sum_hess;
      fo << ",gain=" << loss_chg << ",cover=" << sum_hess;
    } else {
      fo << "cover=" << sum_hess;
      fo << ",cover=" << sum_hess;
    }
  }
};
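
The comma prefix (replacing the old space separator) makes each node line a uniform comma-separated key=value list, which is easy to split downstream. A parsing sketch, assuming the 'xgb.model.dump' file from earlier; the regular expression is illustrative, not the package's own parser:

```r
library(stringr)

dump_lines <- readLines('xgb.model.dump')

# Pull the gain out of every split-node line; leaves have no gain= key
# and yield NA.
gains <- as.numeric(str_match(dump_lines, "gain=([-+0-9.e]+)")[, 2])
summary(gains[!is.na(gains)])
```
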
@ -1,25 +0,0 @@
export CC = gcc
export CXX = g++
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp

# specify tensor path
BIN = xgcombine_buffer
OBJ =
.PHONY: clean all

all: $(BIN) $(OBJ)
export LDFLAGS= -pthread -lm

xgcombine_buffer : xgcombine_buffer.cpp

$(BIN) :
	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)

$(OBJ) :
	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

install:
	cp -f -r $(BIN) $(INSTALL_PATH)

clean:
	$(RM) $(OBJ) $(BIN) *~
@ -1,248 +0,0 @@
/*!
 * a tool to combine different set of features into binary buffer
 * not well organized code, but does it's job
 * \author Tianqi Chen: tianqi.tchen@gmail.com
 */
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE

#include <cstdio>
#include <cstring>
#include <ctime>
#include <cmath>
#include "../src/io/simple_dmatrix-inl.hpp"
#include "../src/io/io.cpp"
#include "../src/utils/utils.h"

using namespace xgboost;
using namespace xgboost::io;

// header in dataset
struct Header{
  FILE *fi;
  int tmp_num;
  int base;
  int num_feat;
  // whether it's dense format
  bool is_dense;
  bool warned;

  Header( void ){ this->warned = false; this->is_dense = false; }

  inline void CheckBase( unsigned findex ){
    if( findex >= (unsigned)num_feat && ! warned ) {
      fprintf( stderr, "warning:some feature exceed bound, num_feat=%d\n", num_feat );
      warned = true;
    }
  }
};


inline int norm( std::vector<Header> &vec, int base = 0 ){
  int n = base;
  for( size_t i = 0; i < vec.size(); i ++ ){
    if( vec[i].is_dense ) vec[i].num_feat = 1;
    vec[i].base = n; n += vec[i].num_feat;
  }
  return n;
}

inline void vclose( std::vector<Header> &vec ){
  for( size_t i = 0; i < vec.size(); i ++ ){
    fclose( vec[i].fi );
  }
}

inline int readnum( std::vector<Header> &vec ){
  int n = 0;
  for( size_t i = 0; i < vec.size(); i ++ ){
    if( !vec[i].is_dense ){
      utils::Assert( fscanf( vec[i].fi, "%d", &vec[i].tmp_num ) == 1, "load num" );
      n += vec[i].tmp_num;
    }else{
      n ++;
    }
  }
  return n;
}

inline void vskip( std::vector<Header> &vec ){
  for( size_t i = 0; i < vec.size(); i ++ ){
    if( !vec[i].is_dense ){
      utils::Assert( fscanf( vec[i].fi, "%*d%*[^\n]\n" ) >= 0, "sparse" );
    }else{
      utils::Assert( fscanf( vec[i].fi, "%*f\n" ) >= 0, "dense" );
    }
  }
}

class DataLoader: public DMatrixSimple {
 public:
  // whether to do node and edge feature renormalization
  int rescale;
  int linelimit;
 public:
  FILE *fp, *fwlist, *fgroup, *fweight;
  std::vector<Header> fheader;
  DataLoader( void ){
    rescale = 0;
    linelimit = -1;
    fp = NULL; fwlist = NULL; fgroup = NULL; fweight = NULL;
  }
 private:
  inline void Load( std::vector<SparseBatch::Entry> &feats, std::vector<Header> &vec ){
    SparseBatch::Entry e;
    for( size_t i = 0; i < vec.size(); i ++ ){
      if( !vec[i].is_dense ) {
        for( int j = 0; j < vec[i].tmp_num; j ++ ){
          utils::Assert( fscanf ( vec[i].fi, "%u:%f", &e.index, &e.fvalue ) == 2, "Error when load feat" );
          vec[i].CheckBase( e.index );
          e.index += vec[i].base;
          feats.push_back(e);
        }
      }else{
        utils::Assert( fscanf ( vec[i].fi, "%f", &e.fvalue ) == 1, "load feat" );
        e.index = vec[i].base;
        feats.push_back(e);
      }
    }
  }
  inline void DoRescale( std::vector<SparseBatch::Entry> &vec ){
    double sum = 0.0;
    for( size_t i = 0; i < vec.size(); i ++ ){
      sum += vec[i].fvalue * vec[i].fvalue;
    }
    sum = sqrt( sum );
    for( size_t i = 0; i < vec.size(); i ++ ){
      vec[i].fvalue /= sum;
    }
  }
 public:
  // basically we are loading all the data inside
  inline void Load( void ){
    this->Clear();
    float label, weight = 0.0f;

    unsigned ngleft = 0, ngacc = 0;
    if( fgroup != NULL ){
      info.group_ptr.clear();
      info.group_ptr.push_back(0);
    }

    while( fscanf( fp, "%f", &label ) == 1 ){
      if( ngleft == 0 && fgroup != NULL ){
        utils::Assert( fscanf( fgroup, "%u", &ngleft ) == 1, "group" );
      }
      if( fweight != NULL ){
        utils::Assert( fscanf( fweight, "%f", &weight ) == 1, "weight" );
      }

      ngleft -= 1; ngacc += 1;

      int pass = 1;
      if( fwlist != NULL ){
        utils::Assert( fscanf( fwlist, "%u", &pass ) ==1, "pass" );
      }
      if( pass == 0 ){
        vskip( fheader ); ngacc -= 1;
      }else{
        const int nfeat = readnum( fheader );

        std::vector<SparseBatch::Entry> feats;

        // pairs
        this->Load( feats, fheader );
        utils::Assert( feats.size() == (unsigned)nfeat, "nfeat" );
        if( rescale != 0 ) this->DoRescale( feats );
        // push back data :)
        this->info.labels.push_back( label );
        // push back weight if any
        if( fweight != NULL ){
          this->info.weights.push_back( weight );
        }
        this->AddRow( feats );
      }
      if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
        info.group_ptr.push_back( info.group_ptr.back() + ngacc );
        utils::Assert( info.group_ptr.back() == info.num_row(), "group size must match num rows" );
        ngacc = 0;
      }
      // linelimit
      if( linelimit >= 0 ) {
        if( -- linelimit <= 0 ) break;
      }
    }
    if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
      info.group_ptr.push_back( info.group_ptr.back() + ngacc );
      utils::Assert( info.group_ptr.back() == info.num_row(), "group size must match num rows" );
    }
  }

};

const char *folder = "features";

int main( int argc, char *argv[] ){
  if( argc < 3 ){
    printf("Usage:xgcombine_buffer <inname> <outname> [options] -f [features] -fd [densefeatures]\n" \
           "options: -rescale -linelimit -fgroup <groupfilename> -wlist <whitelistinstance>\n");
    return 0;
  }

  DataLoader loader;
  time_t start = time( NULL );

  int mode = 0;
  for( int i = 3; i < argc; i ++ ){
    if( !strcmp( argv[i], "-f") ){
      mode = 0; continue;
    }
    if( !strcmp( argv[i], "-fd") ){
      mode = 2; continue;
    }
    if( !strcmp( argv[i], "-rescale") ){
      loader.rescale = 1; continue;
    }
    if( !strcmp( argv[i], "-wlist") ){
      loader.fwlist = utils::FopenCheck( argv[ ++i ], "r" ); continue;
    }
    if( !strcmp( argv[i], "-fgroup") ){
      loader.fgroup = utils::FopenCheck( argv[ ++i ], "r" ); continue;
    }
    if( !strcmp( argv[i], "-fweight") ){
      loader.fweight = utils::FopenCheck( argv[ ++i ], "r" ); continue;
    }
    if( !strcmp( argv[i], "-linelimit") ){
      loader.linelimit = atoi( argv[ ++i ] ); continue;
    }

    char name[ 256 ];
    sprintf( name, "%s/%s.%s", folder, argv[1], argv[i] );
    Header h;
    h.fi = utils::FopenCheck( name, "r" );

    if( mode == 2 ){
      h.is_dense = true; h.num_feat = 1;
      loader.fheader.push_back( h );
    }else{
      utils::Assert( fscanf( h.fi, "%d", &h.num_feat ) == 1, "num feat" );
      switch( mode ){
        case 0: loader.fheader.push_back( h ); break;
        default: ;
      }
    }
  }
  loader.fp = utils::FopenCheck( argv[1], "r" );

  printf("num_features=%d\n", norm( loader.fheader ) );
  printf("start creating buffer...\n");
  loader.Load();
  io::SaveDataMatrix(loader, argv[2]);
  // close files
  fclose( loader.fp );
  if( loader.fwlist != NULL ) fclose( loader.fwlist );
  if( loader.fgroup != NULL ) fclose( loader.fgroup );
  vclose( loader.fheader );
  printf("all generation end, %lu sec used\n", (unsigned long)(time(NULL) - start) );
  return 0;
}
@ -382,13 +382,15 @@ class Booster:
            None
        """
        xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname.encode('utf-8')) )
    def dump_model(self, fo, fmap=''):
    def dump_model(self, fo, fmap='', with_stats = False):
        """dump model into text file
        Args:
            fo: string
                file name to be dumped
            fmap: string, optional
                file name of feature map names
            with_stats: bool, optional
                whether output statistics of the split
        Returns:
            None
        """
@ -397,16 +399,18 @@ class Booster:
            need_close = True
        else:
            need_close = False
        ret = self.get_dump(fmap)
        ret = self.get_dump(fmap, with_stats)
        for i in range(len(ret)):
            fo.write('booster[%d]:\n' %i)
            fo.write( ret[i] )
        if need_close:
            fo.close()
    def get_dump(self, fmap=''):
    def get_dump(self, fmap='', with_stats=False):
        """get dump of model as list of strings """
        length = ctypes.c_ulong()
        sarr = xglib.XGBoosterDumpModel(self.handle, ctypes.c_char_p(fmap.encode('utf-8')), ctypes.byref(length))
        sarr = xglib.XGBoosterDumpModel(self.handle,
                                        ctypes.c_char_p(fmap.encode('utf-8')),
                                        int(with_stats), ctypes.byref(length))
        res = []
        for i in range(length.value):
            res.append( str(sarr[i]) )
@ -295,11 +295,11 @@ extern "C"{
  void XGBoosterSaveModel(const void *handle, const char *fname) {
    static_cast<const Booster*>(handle)->SaveModel(fname);
  }
  const char** XGBoosterDumpModel(void *handle, const char *fmap, bst_ulong *len){
  const char** XGBoosterDumpModel(void *handle, const char *fmap, int with_stats, bst_ulong *len){
    utils::FeatMap featmap;
    if (strlen(fmap) != 0) {
      featmap.LoadText(fmap);
    }
    return static_cast<Booster*>(handle)->GetModelDump(featmap, false, len);
    return static_cast<Booster*>(handle)->GetModelDump(featmap, with_stats != 0, len);
  }
}
@ -228,11 +228,12 @@ extern "C" {
   * \brief dump model, return array of strings representing model dump
   * \param handle handle
   * \param fmap name to fmap can be empty string
   * \param with_stats whether to dump with statistics
   * \param out_len length of output array
   * \return char *data[], representing dump of each model
   */
  XGB_DLL const char **XGBoosterDumpModel(void *handle, const char *fmap,
                                          bst_ulong *out_len);
                                          int with_stats, bst_ulong *out_len);
#ifdef __cplusplus
}
#endif