Merge pull request #132 from pommedeterresautee/master

Return history as data.table for cross validation + bring back linear model dump to master + other fixes
Tianqi Chen 2015-01-02 17:06:24 +08:00
commit 61df646eed
7 changed files with 66 additions and 14 deletions

View File

@@ -17,6 +17,13 @@ import(methods)
 importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(data.table,":=")
+importFrom(data.table,as.data.table)
 importFrom(data.table,data.table)
+importFrom(data.table,rbindlist)
 importFrom(magrittr,"%>%")
 importFrom(stringr,str_extract)
+importFrom(stringr,str_extract_all)
+importFrom(stringr,str_match)
+importFrom(stringr,str_replace)
+importFrom(stringr,str_replace_all)
+importFrom(stringr,str_split)

View File

@@ -1,7 +1,18 @@
 #' Cross Validation
 #'
 #' The cross validation function of xgboost
 #'
+#' @importFrom data.table data.table
+#' @importFrom data.table as.data.table
+#' @importFrom magrittr %>%
+#' @importFrom data.table :=
+#' @importFrom data.table rbindlist
+#' @importFrom stringr str_extract_all
+#' @importFrom stringr str_split
+#' @importFrom stringr str_replace_all
+#' @importFrom stringr str_replace
+#' @importFrom stringr str_match
+#'
 #' @param params the list of parameters. Commonly used ones are:
 #' \itemize{
 #'   \item \code{objective} objective function, common ones are
@@ -40,6 +51,8 @@
 # value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.
 #' @param ... other parameters to pass to \code{params}.
 #'
+#' @return a \code{data.table} with the mean and standard deviation of each metric for the training and test sets.
+#'
 #' @details
 #' This is the cross validation function for xgboost
 #'
@@ -88,9 +101,21 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     history <- c(history, ret)
     cat(paste(ret, "\n", sep=""))
   }
-  return (history)
-}
-xgb.cv.strip.numeric <- function(x) {
-  as.numeric(strsplit(regmatches(x, regexec("test-(.*):(.*)$", x))[[1]][3], "\\+")[[1]])
-}
+  colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":", "") %>% str_replace_all("-", ".")
+  colnamesMean <- paste(colnames, "mean")
+  colnamesStd <- paste(colnames, "std")
+  colnames <- c()
+  for (i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
+  type <- rep(x = "numeric", times = length(colnames))
+  dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
+  split <- str_split(string = history, pattern = "\t")
+  for (line in split) {
+    dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)}
+  }
+  dt
+}
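
For reference, a minimal sketch of what the new code does to one logged evaluation line, and of the changed return value. The sample line and its numbers are invented, and the xgb.cv call assumes the agaricus demo data shipped with the package:

library(data.table)
library(magrittr)
library(stringr)

# One cross validation log line: a round marker, then tab-separated "name:mean+std" fields.
line <- "[0]\ttrain-error:0.046522+0.001280\ttest-error:0.055556+0.005893"
fields <- str_split(line, "\t")[[1]] %>% .[2:length(.)]   # drop the "[0]" round marker
cols <- fields %>% str_extract(".*:") %>% str_replace(":", "") %>% str_replace_all("-", ".")
vals <- fields %>% str_extract_all("\\d.\\d*") %>% unlist %>% as.numeric
cols  # "train.error" "test.error", later expanded into "... mean" and "... std" columns
vals  # 0.046522 0.001280 0.055556 0.005893, one mean/std pair per metric

# With this change, xgb.cv returns the assembled table instead of a character vector:
data(agaricus.train, package = "xgboost")
res <- xgb.cv(params = list(objective = "binary:logistic"),
              data = agaricus.train$data, label = agaricus.train$label,
              nrounds = 3, nfold = 5)
class(res)  # "data.table" "data.frame"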

View File

@@ -2,7 +2,6 @@
 #'
 #' Read an xgboost model text dump.
 #' Can be tree or linear model (text dump of linear models is only supported in the dev version of \code{Xgboost} for now).
-#' Return a data.table of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 #'
 #' @importFrom data.table data.table
 #' @importFrom magrittr %>%
@@ -11,6 +10,8 @@
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.
 #' @param filename_dump the path to the text file storing the model. The model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
 #'
+#' @return A \code{data.table} of the features used in the model, with their average gain (and, for boosted tree models, their weight).
+#'
 #' @details
 #' This is the function to understand the model trained (and through your model, your data).
 #'
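
A hedged usage sketch of the documented return value (bst and sparse_matrix are assumed to come from the demo below):

# Dump the trained model with per-node statistics, then summarise it per feature.
xgb.dump(bst, "xgb.model.dump", with.stats = TRUE)
importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]],
                             filename_dump = "xgb.model.dump")
print(importance)  # one row per feature used, with its average gain (plus weight for tree models)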

View File

@@ -70,7 +70,7 @@ xgb.dump(bst, 'xgb.model.dump', with.stats = T)
 # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
 importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
 print(importance)
-# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that there contribution is very low.
+# According to the matrix below, the most important feature in this dataset for predicting whether the treatment will work is Age. The second most important is whether a placebo was received. Sex is third. Then come our generated features (AgeDiscret); we can see that their contribution is very low (Gain column).
 # Do these results make sense?
 # Let's check some Chi2 tests between each of these features and the outcome.
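
The chi-squared check announced in that last comment could be written as follows (a sketch only; the data frame df and its column names are assumed, not taken from the actual demo code):

# Association between one feature and the outcome; repeat for each feature of interest.
print(chisq.test(df$Age, df$Y))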

View File

@@ -56,6 +56,9 @@ prediction and dtrain,}
 \item{...}{other parameters to pass to \code{params}.}
 }
+\value{
+a \code{data.table} with the mean and standard deviation of each metric for the training and test sets.
+}
 \description{
 The cross validation function of xgboost
 }

View File

@@ -11,10 +11,12 @@ xgb.importance(feature_names = NULL, filename_dump = NULL)
 \item{filename_dump}{the path to the text file storing the model. The model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
 }
+\value{
+A \code{data.table} of the features used in the model, with their average gain (and, for boosted tree models, their weight).
+}
 \description{
 Read an xgboost model text dump.
 Can be tree or linear model (text dump of linear models is only supported in the dev version of \code{Xgboost} for now).
-Return a data.table of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 }
 \details{
 This is the function to understand the model trained (and through your model, your data).

View File

@@ -8,6 +8,7 @@
  */
 #include <vector>
 #include <string>
+#include <sstream>
 #include <algorithm>
 #include "./gbm.h"
 #include "../tree/updater.h"
@@ -134,11 +135,24 @@ class GBLinear : public IGradBooster {
       }
     }
   }
-  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
-    utils::Error("gblinear does not support dump model");
-    return std::vector<std::string>();
-  }
+  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
+    std::stringstream fo("");
+    fo << "bias:\n";
+    for (int i = 0; i < model.param.num_output_group; ++i) {
+      fo << model.bias()[i] << std::endl;
+    }
+    fo << "weight:\n";
+    for (int i = 0; i < model.param.num_output_group; ++i) {
+      for (int j = 0; j < model.param.num_feature; ++j) {
+        fo << model[i][j] << std::endl;
+      }
+    }
+    std::vector<std::string> v;
+    v.push_back(fo.str());
+    return v;
+  }
  protected:
   inline void Pred(const RowBatch::Inst &inst, float *preds) {
     for (int gid = 0; gid < model.param.num_output_group; ++gid) {
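
For reference, with a single output group and three features, the text produced by this new DumpModel would have the following shape (numbers invented): the bias value(s) first, then one weight per feature and output group.

bias:
0.5
weight:
0.0132
-0.0214
0.0078

This is the linear model dump the PR title refers to bringing back, and the format that the linear-model path of xgb.importance above is meant to read.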