From 901904b5357750c6cebaf13f7eb9162bcdd01029 Mon Sep 17 00:00:00 2001
From: El Potaeto <pommedeterresautee@msn.com>
Date: Thu, 1 Jan 2015 13:50:05 +0100
Subject: [PATCH 1/6] linear text dump model

---
 src/gbm/gblinear-inl.hpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp
index 624f15c28..473914b6e 100644
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear-inl.hpp
@@ -8,6 +8,7 @@
  */
 #include <vector>
 #include <string>
+#include <sstream>
 #include <algorithm>
 #include "./gbm.h"
 #include "../tree/updater.h"
@@ -134,11 +135,24 @@ class GBLinear : public IGradBooster {
       }
     }
   }
-  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
-    utils::Error("gblinear does not support dump model");
-    return std::vector<std::string>();
-  }
 
+ virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
+    // Text dump of the linear model: a "bias:" section with one value per output
+    // group, then a "weight:" section with one value per line. The whole dump is
+    // returned as a single string; fmap and option are not used here.
+    std::ostringstream fo;
+    fo << "bias:\n";
+    for (int gid = 0; gid < model.param.num_output_group; ++gid) {
+      fo << model.bias()[gid] << '\n';  // '\n' instead of std::endl: no per-line flush
+    }
+    fo << "weight:\n";
+    // NOTE(review): confirm model[i][j] index order against the model's operator[].
+    for (int i = 0; i < model.param.num_output_group; ++i) {
+      for (int j = 0; j < model.param.num_feature; ++j) fo << model[i][j] << '\n';
+    }
+    return std::vector<std::string>(1, fo.str());
+ }
+  
  protected:
   inline void Pred(const RowBatch::Inst &inst, float *preds) {
     for (int gid = 0; gid < model.param.num_output_group; ++gid) {

From 5e5500d6d3234f1d4ff8d61275df2afd7fbf894a Mon Sep 17 00:00:00 2001
From: El Potaeto <pommedeterresautee@msn.com>
Date: Thu, 1 Jan 2015 13:50:28 +0100
Subject: [PATCH 2/6] rewording

---
 R-package/demo/create_sparse_matrix.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R
index cf0fcac4d..4060d1c48 100644
--- a/R-package/demo/create_sparse_matrix.R
+++ b/R-package/demo/create_sparse_matrix.R
@@ -70,7 +70,7 @@ xgb.dump(bst, 'xgb.model.dump', with.stats = T)
 # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
 importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
 print(importance)
-# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that there contribution is very low.
+# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
 
 # Does these results make sense?
 # Let's check some Chi2 between each of these features and the outcome.

From 34aaeff3d9199c2fcf1b281e818a39e0cf185825 Mon Sep 17 00:00:00 2001
From: El Potaeto <pommedeterresautee@msn.com>
Date: Thu, 1 Jan 2015 14:57:48 +0100
Subject: [PATCH 3/6] small documentation change

---
 R-package/R/xgb.importance.R    | 3 ++-
 R-package/man/xgb.importance.Rd | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index b2e60bed7..2071680d3 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -2,7 +2,6 @@
 #' 
 #' Read a xgboost model text dump. 
 #' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
-#' Return a data.table of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 #' 
 #' @importFrom data.table data.table
 #' @importFrom magrittr %>%
@@ -11,6 +10,8 @@
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
 #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
 #'
+#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+#'
 #' @details 
 #' This is the function to understand the model trained (and through your model, your data).
 #' 
diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd
index 883819993..a7a71cefc 100644
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -11,10 +11,12 @@ xgb.importance(feature_names = NULL, filename_dump = NULL)
 
 \item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
 }
+\value{
+A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+}
 \description{
 Read a xgboost model text dump.
 Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
-Return a data.table of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 }
 \details{
 This is the function to understand the model trained (and through your model, your data).

From a524a51a06f979e5ea4d6a14e5c6368c88549dd0 Mon Sep 17 00:00:00 2001
From: El Potaeto <pommedeterresautee@msn.com>
Date: Thu, 1 Jan 2015 16:05:43 +0100
Subject: [PATCH 4/6] return history as data.table for cross validation +
 documentation

---
 R-package/NAMESPACE     |  1 +
 R-package/R/xgb.cv.R    | 23 ++++++++++++++++++-----
 R-package/man/xgb.cv.Rd |  3 +++
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 1714d2044..7e0bfa8ac 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -18,5 +18,6 @@ importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(data.table,":=")
 importFrom(data.table,data.table)
+importFrom(data.table,rbindlist)
 importFrom(magrittr,"%>%")
 importFrom(stringr,str_extract)
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index 02870b772..3a9fd9b86 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -1,7 +1,12 @@
 #' Cross Validation
 #' 
 #' The cross valudation function of xgboost
-#'
+#' 
+#' @importFrom data.table data.table
+#' @importFrom magrittr %>%
+#' @importFrom data.table :=
+#' @importFrom data.table rbindlist
+#' @importFrom stringr str_extract
 #' @param params the list of parameters. Commonly used ones are:
 #' \itemize{
 #'   \item \code{objective} objective function, common ones are
@@ -40,6 +45,8 @@
 #     value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
 #' @param ... other parameters to pass to \code{params}.
 #' 
+#' @return a \code{data.table} with each mean and standard deviation stat for training set and test set.
+#' 
 #' @details 
 #' This is the cross validation function for xgboost
 #'
@@ -88,9 +95,15 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     history <- c(history, ret)
     cat(paste(ret, "\n", sep=""))
   }
-  return (history)
+  
+  dt <- data.table(train_rmse_mean=numeric(), train_rmse_std=numeric(), train_auc_mean=numeric(), train_auc_std=numeric(), test_rmse_mean=numeric(), test_rmse_std=numeric(), test_auc_mean=numeric(), test_auc_std=numeric())
+  
+  split = str_split(string = history, pattern = "\t")
+  for(line in split){
+    dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .;rbindlist(list(dt, vec), use.names = F, fill = F)}
+  }
+  dt
 }
 
-xgb.cv.strip.numeric <- function(x) {
-  as.numeric(strsplit(regmatches(x, regexec("test-(.*):(.*)$", x))[[1]][3], "\\+")[[1]])
-}
+
+
diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd
index 271182625..19f04ee79 100644
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -56,6 +56,9 @@ prediction and dtrain,}
 
 \item{...}{other parameters to pass to \code{params}.}
 }
+\value{
+a \code{data.table} with each mean and standard deviation stat for training set and test set.
+}
 \description{
 The cross valudation function of xgboost
 }

From 8bbe45eed26f3a8fd87b574dd7c5eb4fce6c3fc1 Mon Sep 17 00:00:00 2001
From: El Potaeto <pommedeterresautee@msn.com>
Date: Thu, 1 Jan 2015 16:09:03 +0100
Subject: [PATCH 5/6] fix some missing imports

---
 R-package/NAMESPACE  | 2 ++
 R-package/R/xgb.cv.R | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 7e0bfa8ac..5c9e19932 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -21,3 +21,5 @@ importFrom(data.table,data.table)
 importFrom(data.table,rbindlist)
 importFrom(magrittr,"%>%")
 importFrom(stringr,str_extract)
+importFrom(stringr,str_extract_all)
+importFrom(stringr,str_split)
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index 3a9fd9b86..a5be567bc 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -6,7 +6,9 @@
 #' @importFrom magrittr %>%
 #' @importFrom data.table :=
 #' @importFrom data.table rbindlist
-#' @importFrom stringr str_extract
+#' @importFrom stringr str_extract_all
+#' @importFrom stringr str_split
+#' 
 #' @param params the list of parameters. Commonly used ones are:
 #' \itemize{
 #'   \item \code{objective} objective function, common ones are

From 4d0d65837d14b5b688a1e88509df2129513a5b91 Mon Sep 17 00:00:00 2001
From: El Potaeto <pommedeterresautee@msn.com>
Date: Thu, 1 Jan 2015 22:43:23 +0100
Subject: [PATCH 6/6] parse history first line to guess which columns are
 required

---
 R-package/NAMESPACE  |  4 ++++
 R-package/R/xgb.cv.R | 20 +++++++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 5c9e19932..bd12fc7ec 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -17,9 +17,13 @@ import(methods)
 importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(data.table,":=")
+importFrom(data.table,as.data.table)
 importFrom(data.table,data.table)
 importFrom(data.table,rbindlist)
 importFrom(magrittr,"%>%")
 importFrom(stringr,str_extract)
 importFrom(stringr,str_extract_all)
+importFrom(stringr,str_match)
+importFrom(stringr,str_replace)
+importFrom(stringr,str_replace_all)
 importFrom(stringr,str_split)
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index a5be567bc..c2e73e202 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -3,11 +3,15 @@
 #' The cross valudation function of xgboost
 #' 
 #' @importFrom data.table data.table
+#' @importFrom data.table as.data.table
 #' @importFrom magrittr %>%
 #' @importFrom data.table :=
 #' @importFrom data.table rbindlist
 #' @importFrom stringr str_extract_all
 #' @importFrom stringr str_split
+#' @importFrom stringr str_replace_all
+#' @importFrom stringr str_replace
+#' @importFrom stringr str_match
 #' 
 #' @param params the list of parameters. Commonly used ones are:
 #' \itemize{
@@ -98,14 +102,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     cat(paste(ret, "\n", sep=""))
   }
   
-  dt <- data.table(train_rmse_mean=numeric(), train_rmse_std=numeric(), train_auc_mean=numeric(), train_auc_std=numeric(), test_rmse_mean=numeric(), test_rmse_std=numeric(), test_auc_mean=numeric(), test_auc_std=numeric())
+  colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace_all("-", ".")
+  
+  colnamesMean <- paste(colnames, "mean")
+  colnamesStd <- paste(colnames, "std")
+  colnames <- c()
+  for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
+  
+  type <- rep(x = "numeric", times = length(colnames))
+  
+  dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
   
   split = str_split(string = history, pattern = "\t")
   for(line in split){
     dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .;rbindlist(list(dt, vec), use.names = F, fill = F)}
   }
   dt
-}
-
-
-
+}
\ No newline at end of file