From 901904b5357750c6cebaf13f7eb9162bcdd01029 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Thu, 1 Jan 2015 13:50:05 +0100 Subject: [PATCH 1/6] linear text dump model --- src/gbm/gblinear-inl.hpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp index 624f15c28..473914b6e 100644 --- a/src/gbm/gblinear-inl.hpp +++ b/src/gbm/gblinear-inl.hpp @@ -8,6 +8,7 @@ */ #include #include +#include #include #include "./gbm.h" #include "../tree/updater.h" @@ -134,11 +135,24 @@ class GBLinear : public IGradBooster { } } } - virtual std::vector DumpModel(const utils::FeatMap& fmap, int option) { - utils::Error("gblinear does not support dump model"); - return std::vector(); - } + virtual std::vector DumpModel(const utils::FeatMap& fmap, int option) { + std::stringstream fo(""); + fo << "bias:\n"; + for (int i = 0; i < model.param.num_output_group; ++i) { + fo << model.bias()[i] << std::endl; + } + fo << "weight:\n"; + for (int i = 0; i < model.param.num_output_group; ++i) { + for (int j = 0; j v; + v.push_back(fo.str()); + return v; + } + protected: inline void Pred(const RowBatch::Inst &inst, float *preds) { for (int gid = 0; gid < model.param.num_output_group; ++gid) { From 5e5500d6d3234f1d4ff8d61275df2afd7fbf894a Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Thu, 1 Jan 2015 13:50:28 +0100 Subject: [PATCH 2/6] rewording --- R-package/demo/create_sparse_matrix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R index cf0fcac4d..4060d1c48 100644 --- a/R-package/demo/create_sparse_matrix.R +++ b/R-package/demo/create_sparse_matrix.R @@ -70,7 +70,7 @@ xgb.dump(bst, 'xgb.model.dump', with.stats = T) # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix. importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump') print(importance) -# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that there contribution is very low. +# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column). # Does these results make sense? # Let's check some Chi2 between each of these features and the outcome. From 34aaeff3d9199c2fcf1b281e818a39e0cf185825 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Thu, 1 Jan 2015 14:57:48 +0100 Subject: [PATCH 3/6] small documentation change --- R-package/R/xgb.importance.R | 3 ++- R-package/man/xgb.importance.Rd | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index b2e60bed7..2071680d3 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -2,7 +2,6 @@ #' #' Read a xgboost model text dump. #' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now). -#' Return a data.table of the features used in the model with their average gain (and their weight for boosted tree model) in the model. #' #' @importFrom data.table data.table #' @importFrom magrittr %>% @@ -11,6 +10,8 @@ #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}). #' +#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. +#' #' @details #' This is the function to understand the model trained (and through your model, your data). #' diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 883819993..a7a71cefc 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -11,10 +11,12 @@ xgb.importance(feature_names = NULL, filename_dump = NULL) \item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).} } +\value{ +A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. +} \description{ Read a xgboost model text dump. Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now). -Return a data.table of the features used in the model with their average gain (and their weight for boosted tree model) in the model. } \details{ This is the function to understand the model trained (and through your model, your data). From a524a51a06f979e5ea4d6a14e5c6368c88549dd0 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Thu, 1 Jan 2015 16:05:43 +0100 Subject: [PATCH 4/6] return history as data.table for cross validation + documentation --- R-package/NAMESPACE | 1 + R-package/R/xgb.cv.R | 23 ++++++++++++++++++----- R-package/man/xgb.cv.Rd | 3 +++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 1714d2044..7e0bfa8ac 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -18,5 +18,6 @@ importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgeMatrix) importFrom(data.table,":=") importFrom(data.table,data.table) +importFrom(data.table,rbindlist) importFrom(magrittr,"%>%") importFrom(stringr,str_extract) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 02870b772..3a9fd9b86 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -1,7 +1,12 @@ #' Cross Validation #' #' The cross valudation function of xgboost -#' +#' +#' @importFrom data.table data.table +#' @importFrom magrittr %>% +#' @importFrom data.table := +#' @importFrom data.table rbindlist +#' @importFrom stringr str_extract #' @param params the list of parameters. Commonly used ones are: #' \itemize{ #' \item \code{objective} objective function, common ones are @@ -40,6 +45,8 @@ # value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. #' @param ... other parameters to pass to \code{params}. #' +#' @return a \code{data.table} with each mean and standard deviation stat for training set and test set. +#' #' @details #' This is the cross validation function for xgboost #' @@ -88,9 +95,15 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = history <- c(history, ret) cat(paste(ret, "\n", sep="")) } - return (history) + + dt <- data.table(train_rmse_mean=numeric(), train_rmse_std=numeric(), train_auc_mean=numeric(), train_auc_std=numeric(), test_rmse_mean=numeric(), test_rmse_std=numeric(), test_auc_mean=numeric(), test_auc_std=numeric()) + + split = str_split(string = history, pattern = "\t") + for(line in split){ + dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .;rbindlist(list(dt, vec), use.names = F, fill = F)} + } + dt } -xgb.cv.strip.numeric <- function(x) { - as.numeric(strsplit(regmatches(x, regexec("test-(.*):(.*)$", x))[[1]][3], "\\+")[[1]]) -} + + diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 271182625..19f04ee79 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -56,6 +56,9 @@ prediction and dtrain,} \item{...}{other parameters to pass to \code{params}.} } +\value{ +a \code{data.table} with each mean and standard deviation stat for training set and test set. +} \description{ The cross valudation function of xgboost } From 8bbe45eed26f3a8fd87b574dd7c5eb4fce6c3fc1 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Thu, 1 Jan 2015 16:09:03 +0100 Subject: [PATCH 5/6] fix some missing imports --- R-package/NAMESPACE | 2 ++ R-package/R/xgb.cv.R | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 7e0bfa8ac..5c9e19932 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -21,3 +21,5 @@ importFrom(data.table,data.table) importFrom(data.table,rbindlist) importFrom(magrittr,"%>%") importFrom(stringr,str_extract) +importFrom(stringr,str_extract_all) +importFrom(stringr,str_split) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 3a9fd9b86..a5be567bc 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -6,7 +6,9 @@ #' @importFrom magrittr %>% #' @importFrom data.table := #' @importFrom data.table rbindlist -#' @importFrom stringr str_extract +#' @importFrom stringr str_extract_all +#' @importFrom stringr str_split +#' #' @param params the list of parameters. Commonly used ones are: #' \itemize{ #' \item \code{objective} objective function, common ones are From 4d0d65837d14b5b688a1e88509df2129513a5b91 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Thu, 1 Jan 2015 22:43:23 +0100 Subject: [PATCH 6/6] parse history first line to guess which columns are required --- R-package/NAMESPACE | 4 ++++ R-package/R/xgb.cv.R | 20 +++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 5c9e19932..bd12fc7ec 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -17,9 +17,13 @@ import(methods) importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgeMatrix) importFrom(data.table,":=") +importFrom(data.table,as.data.table) importFrom(data.table,data.table) importFrom(data.table,rbindlist) importFrom(magrittr,"%>%") importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) +importFrom(stringr,str_match) +importFrom(stringr,str_replace) +importFrom(stringr,str_replace_all) importFrom(stringr,str_split) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index a5be567bc..c2e73e202 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -3,11 +3,15 @@ #' The cross valudation function of xgboost #' #' @importFrom data.table data.table +#' @importFrom data.table as.data.table #' @importFrom magrittr %>% #' @importFrom data.table := #' @importFrom data.table rbindlist #' @importFrom stringr str_extract_all #' @importFrom stringr str_split +#' @importFrom stringr str_replace_all +#' @importFrom stringr str_replace +#' @importFrom stringr str_match #' #' @param params the list of parameters. Commonly used ones are: #' \itemize{ @@ -98,14 +102,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = cat(paste(ret, "\n", sep="")) } - dt <- data.table(train_rmse_mean=numeric(), train_rmse_std=numeric(), train_auc_mean=numeric(), train_auc_std=numeric(), test_rmse_mean=numeric(), test_rmse_std=numeric(), test_auc_mean=numeric(), test_auc_std=numeric()) + colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace_all("-", ".") + + colnamesMean <- paste(colnames, "mean") + colnamesStd <- paste(colnames, "std") + colnames <- c() + for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i]) + + type <- rep(x = "numeric", times = length(colnames)) + + dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table split = str_split(string = history, pattern = "\t") for(line in split){ dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .;rbindlist(list(dt, vec), use.names = F, fill = F)} } dt -} - - - +} \ No newline at end of file