Merge pull request #132 from pommedeterresautee/master
Return history as data.table for cross validation + bring back linear model dump to master + other fixes
commit 61df646eed
@@ -17,6 +17,13 @@ import(methods)
 importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(data.table,":=")
+importFrom(data.table,as.data.table)
 importFrom(data.table,data.table)
+importFrom(data.table,rbindlist)
 importFrom(magrittr,"%>%")
 importFrom(stringr,str_extract)
+importFrom(stringr,str_extract_all)
+importFrom(stringr,str_match)
+importFrom(stringr,str_replace)
+importFrom(stringr,str_replace_all)
+importFrom(stringr,str_split)
@@ -2,6 +2,17 @@
 #'
 #' The cross validation function of xgboost
 #'
+#' @importFrom data.table data.table
+#' @importFrom data.table as.data.table
+#' @importFrom magrittr %>%
+#' @importFrom data.table :=
+#' @importFrom data.table rbindlist
+#' @importFrom stringr str_extract_all
+#' @importFrom stringr str_split
+#' @importFrom stringr str_replace_all
+#' @importFrom stringr str_replace
+#' @importFrom stringr str_match
+#'
 #' @param params the list of parameters. Commonly used ones are:
 #' \itemize{
 #' \item \code{objective} objective function, common ones are
@@ -40,6 +51,8 @@
 #' value that represents a missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
 #' @param ... other parameters to pass to \code{params}.
 #'
+#' @return a \code{data.table} with each mean and standard deviation stat for training set and test set.
+#'
 #' @details
 #' This is the cross validation function for xgboost
 #'
@@ -88,9 +101,21 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     history <- c(history, ret)
     cat(paste(ret, "\n", sep=""))
   }
-  return (history)
-}
-
-xgb.cv.strip.numeric <- function(x) {
-  as.numeric(strsplit(regmatches(x, regexec("test-(.*):(.*)$", x))[[1]][3], "\\+")[[1]])
+
+  colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace_all("-", ".")
+  colnamesMean <- paste(colnames, "mean")
+  colnamesStd <- paste(colnames, "std")
+  colnames <- c()
+  for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
+
+  type <- rep(x = "numeric", times = length(colnames))
+
+  dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
+
+  split = str_split(string = history, pattern = "\t")
+
+  for(line in split){
+    dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)}
+  }
+  dt
 }
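
For context, here is a minimal standalone sketch (not part of the commit) of what the new parsing does, assuming evaluation lines of the form "[round]\tname:mean+std\t..."; the sample history vector and the names metrics, cols and rows are illustrative only, and rbindlist/setnames are used here in place of the incremental read.table trick for brevity.

library(data.table)
library(magrittr)
library(stringr)

# Hypothetical history, in the format printed during cross validation.
history <- c("[0]\ttrain-error:0.0465+0.0012\ttest-error:0.0483+0.0027",
             "[1]\ttrain-error:0.0233+0.0008\ttest-error:0.0254+0.0019")

# Metric names come from the first line: drop the round counter, keep the
# "name:" prefix of each field, and turn "-" into "." to form column names.
metrics <- str_split(history[1], pattern = "\t")[[1]] %>%
  .[2:length(.)] %>%
  str_extract(".*:") %>%
  str_replace(":", "") %>%
  str_replace_all("-", ".")
cols <- as.vector(rbind(paste(metrics, "mean"), paste(metrics, "std")))

# One row per round, with alternating mean/std values parsed from "mean+std".
rows <- lapply(str_split(history, pattern = "\t"), function(line) {
  line[2:length(line)] %>%
    str_extract_all(pattern = "\\d+\\.\\d*") %>%
    unlist %>% as.numeric %>% as.list
})
dt <- rbindlist(rows, use.names = FALSE)
setnames(dt, cols)
print(dt)
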
@@ -2,7 +2,6 @@
 #'
 #' Read a xgboost model text dump.
 #' Can be tree or linear model (text dump of linear models is only supported in dev version of \code{Xgboost} for now).
-#' Return a data.table of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 #'
 #' @importFrom data.table data.table
 #' @importFrom magrittr %>%
@@ -11,6 +10,8 @@
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
 #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
 #'
+#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+#'
 #' @details
 #' This is the function to understand the model trained (and through your model, your data).
 #'
@@ -70,7 +70,7 @@ xgb.dump(bst, 'xgb.model.dump', with.stats = T)
 # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
 importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
 print(importance)
-# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that there contribution is very low.
+# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
 
 # Do these results make sense?
 # Let's check some Chi2 between each of these features and the outcome.
@@ -56,6 +56,9 @@ prediction and dtrain,}
 
 \item{...}{other parameters to pass to \code{params}.}
 }
+\value{
+a \code{data.table} with each mean and standard deviation stat for training set and test set.
+}
 \description{
 The cross validation function of xgboost
 }
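
As a usage note (not part of the commit), a call along the following lines should now return a data.table rather than a character vector of log lines; the agaricus data shipped with the package and the parameter values are illustrative assumptions.

require(xgboost)
data(agaricus.train, package = "xgboost")
cv <- xgb.cv(params = list(objective = "binary:logistic", eta = 1, max.depth = 2),
             data = agaricus.train$data, label = agaricus.train$label,
             nrounds = 3, nfold = 5)
class(cv)  # "data.table" "data.frame"
print(cv)  # one row per round, with mean and std columns for each evaluation metric
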
@@ -11,10 +11,12 @@ xgb.importance(feature_names = NULL, filename_dump = NULL)
 
 \item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
 }
+\value{
+A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+}
 \description{
 Read a xgboost model text dump.
 Can be tree or linear model (text dump of linear models is only supported in dev version of \code{Xgboost} for now).
-Return a data.table of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 }
 \details{
 This is the function to understand the model trained (and through your model, your data).
@@ -8,6 +8,7 @@
 */
 #include <vector>
 #include <string>
+#include <sstream>
 #include <algorithm>
 #include "./gbm.h"
 #include "../tree/updater.h"
@@ -134,9 +135,22 @@ class GBLinear : public IGradBooster {
       }
     }
   }
 
   virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
-    utils::Error("gblinear does not support dump model");
-    return std::vector<std::string>();
+    std::stringstream fo("");
+    fo << "bias:\n";
+    for (int i = 0; i < model.param.num_output_group; ++i) {
+      fo << model.bias()[i] << std::endl;
+    }
+    fo << "weight:\n";
+    for (int i = 0; i < model.param.num_output_group; ++i) {
+      for (int j = 0; j < model.param.num_feature; ++j) {
+        fo << model[i][j] << std::endl;
+      }
+    }
+    std::vector<std::string> v;
+    v.push_back(fo.str());
+    return v;
   }
+
  protected:
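
To illustrate the restored linear dump (again, not part of the commit): per the code above, the dump text contains a "bias:" section with one value per output group, followed by a "weight:" section with one value per (group, feature) pair. A hedged sketch of exercising it from R, assuming a build that includes this change and using the agaricus data for illustration:

require(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nrounds = 10, objective = "binary:logistic", booster = "gblinear")
xgb.dump(bst, 'linear.model.dump')
# The dump text should contain a "bias:" section followed by a "weight:" section.
cat(readLines('linear.model.dump'), sep = "\n")
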