From f84884431049faeafffccb05c88624920b984eca Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 25 Jan 2015 10:05:47 -0800 Subject: [PATCH 01/13] better warning at multiclass, fix cran check --- src/io/page_fmatrix-inl.hpp | 2 +- src/learner/learner-inl.hpp | 13 ++++++++++--- src/learner/objective-inl.hpp | 3 ++- src/utils/matrix_csr.h | 1 - 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp index 0527da827..44cb9abdc 100644 --- a/src/io/page_fmatrix-inl.hpp +++ b/src/io/page_fmatrix-inl.hpp @@ -339,7 +339,7 @@ class FMatrixPage : public IFMatrix { } if (ktop % 100000 == 0) { utils::Printf("\r \r"); - utils::Printf("InitCol: %lu rows ", static_cast(ktop)); + utils::Printf("InitCol: %lu rows ", static_cast(ktop)); } } } diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index 630f8fa20..616cf03e9 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -227,14 +227,19 @@ class BoostLearner : public rabit::ISerializable { */ inline void SaveModel(const char *fname) const { FILE *fp; + bool use_stdout = false;; +#ifndef XGBOOST_STRICT_CXX98_ if (!strcmp(fname, "stdout")) { fp = stdout; - } else { + use_stdout = true; + } else +#endif + { fp = utils::FopenCheck(fname, "wb"); } utils::FileStream fo(fp); std::string header; - if (save_base64 != 0|| fp == stdout) { + if (save_base64 != 0|| use_stdout) { fo.Write("bs64\t", 5); utils::Base64OutStream bout(fp); this->SaveModel(bout); @@ -243,7 +248,9 @@ class BoostLearner : public rabit::ISerializable { fo.Write("binf", 4); this->SaveModel(fo); } - if (fp != stdout) fclose(fp); + if (!use_stdout) { + fclose(fp); + } } /*! 
* \brief check if data matrix is ready to be used by training, diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp index 7702774f9..9887e7a05 100644 --- a/src/learner/objective-inl.hpp +++ b/src/learner/objective-inl.hpp @@ -206,7 +206,8 @@ class SoftmaxMultiClassObj : public IObjFunction { Softmax(&rec); const unsigned j = i % nstep; int label = static_cast(info.labels[j]); - utils::Check(label < nclass, "SoftmaxMultiClassObj: label exceed num_class"); + utils::Check(label >= 0 && label < nclass, + "SoftmaxMultiClassObj: label must be in [0, num_class)"); const float wt = info.GetWeight(j); for (int k = 0; k < nclass; ++k) { float p = rec[k]; diff --git a/src/utils/matrix_csr.h b/src/utils/matrix_csr.h index bc9479cc3..14e0667ee 100644 --- a/src/utils/matrix_csr.h +++ b/src/utils/matrix_csr.h @@ -208,7 +208,6 @@ struct SparseCSRFileBuilder { fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType)); } } - printf("CSV::begin_dat=%lu\n", begin_data); } protected: inline void WriteBuffer(void) { From 8971f0ff5051580a1c7cdda3682cec91d323c1f3 Mon Sep 17 00:00:00 2001 From: Tong He Date: Sun, 25 Jan 2015 10:21:24 -0800 Subject: [PATCH 02/13] Update xgboost.R --- R-package/R/xgboost.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 02a554f68..39e047e8e 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -24,8 +24,7 @@ #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print #' information of performance. If 2, xgboost will print information of both #' performance and construction progress information -#' @param missing Missing is only used when input is dense matrix, pick a float -# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. +#' @param missing Missing is only used when input is dense matrix, pick a float value that represents missing value. 
Sometime a data use 0 or other extreme value to represents missing values. #' @param ... other parameters to pass to \code{params}. #' #' @details From 33101d5cade2c5d8f4a152127584af8213278a89 Mon Sep 17 00:00:00 2001 From: hetong Date: Sun, 25 Jan 2015 10:31:48 -0800 Subject: [PATCH 03/13] edit document --- R-package/NAMESPACE | 2 +- R-package/man/agaricus.test.Rd | 3 +-- R-package/man/agaricus.train.Rd | 3 +-- R-package/man/getinfo.Rd | 7 +++---- R-package/man/predict-xgb.Booster-method.Rd | 3 +-- R-package/man/setinfo.Rd | 7 +++---- R-package/man/slice.Rd | 7 +++---- R-package/man/xgb.DMatrix.Rd | 3 +-- R-package/man/xgb.DMatrix.save.Rd | 3 +-- R-package/man/xgb.cv.Rd | 3 +-- R-package/man/xgb.dump.Rd | 3 +-- R-package/man/xgb.importance.Rd | 3 +-- R-package/man/xgb.load.Rd | 3 +-- R-package/man/xgb.model.dt.tree.Rd | 3 +-- R-package/man/xgb.plot.tree.Rd | 3 +-- R-package/man/xgb.save.Rd | 3 +-- R-package/man/xgb.train.Rd | 3 +-- R-package/man/xgboost.Rd | 7 +++---- 18 files changed, 26 insertions(+), 43 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index d29ad7a18..12225c966 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -1,4 +1,4 @@ -# Generated by roxygen2 (4.1.0): do not edit by hand +# Generated by roxygen2 (4.0.1): do not edit by hand export(getinfo) export(setinfo) diff --git a/R-package/man/agaricus.test.Rd b/R-package/man/agaricus.test.Rd index 556425379..c050d3ecd 100644 --- a/R-package/man/agaricus.test.Rd +++ b/R-package/man/agaricus.test.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgboost.R +% Generated by roxygen2 (4.0.1): do not edit by hand \docType{data} \name{agaricus.test} \alias{agaricus.test} diff --git a/R-package/man/agaricus.train.Rd b/R-package/man/agaricus.train.Rd index 879b3d5df..02571cf54 100644 --- a/R-package/man/agaricus.train.Rd +++ b/R-package/man/agaricus.train.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by 
hand -% Please edit documentation in R/xgboost.R +% Generated by roxygen2 (4.0.1): do not edit by hand \docType{data} \name{agaricus.train} \alias{agaricus.train} diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index 37e0ad0be..23e3adc84 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/getinfo.xgb.DMatrix.R +% Generated by roxygen2 (4.0.1): do not edit by hand \docType{methods} \name{getinfo} \alias{getinfo} @@ -13,9 +12,9 @@ getinfo(object, ...) \arguments{ \item{object}{Object of class "xgb.DMatrix"} -\item{...}{other parameters} - \item{name}{the name of the field to get} + +\item{...}{other parameters} } \description{ Get information of an xgb.DMatrix object diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd index d8da7975e..66cdfc36c 100644 --- a/R-package/man/predict-xgb.Booster-method.Rd +++ b/R-package/man/predict-xgb.Booster-method.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/predict.xgb.Booster.R +% Generated by roxygen2 (4.0.1): do not edit by hand \docType{methods} \name{predict,xgb.Booster-method} \alias{predict,xgb.Booster-method} diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd index 4ed262b46..7ea992110 100644 --- a/R-package/man/setinfo.Rd +++ b/R-package/man/setinfo.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/setinfo.xgb.DMatrix.R +% Generated by roxygen2 (4.0.1): do not edit by hand \docType{methods} \name{setinfo} \alias{setinfo} @@ -13,11 +12,11 @@ setinfo(object, ...) 
\arguments{ \item{object}{Object of class "xgb.DMatrix"} -\item{...}{other parameters} - \item{name}{the name of the field to get} \item{info}{the specific field of information to set} + +\item{...}{other parameters} } \description{ Set information of an xgb.DMatrix object diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index a7812e886..a749aa8ff 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/slice.xgb.DMatrix.R +% Generated by roxygen2 (4.0.1): do not edit by hand \docType{methods} \name{slice} \alias{slice} @@ -14,9 +13,9 @@ slice(object, ...) \arguments{ \item{object}{Object of class "xgb.DMatrix"} -\item{...}{other parameters} - \item{idxset}{a integer vector of indices of rows needed} + +\item{...}{other parameters} } \description{ Get a new DMatrix containing the specified rows of diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 86000220f..227fb515f 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgb.DMatrix.R +% Generated by roxygen2 (4.0.1): do not edit by hand \name{xgb.DMatrix} \alias{xgb.DMatrix} \title{Contruct xgb.DMatrix object} diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd index 6bbc277b3..803de912b 100644 --- a/R-package/man/xgb.DMatrix.save.Rd +++ b/R-package/man/xgb.DMatrix.save.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgb.DMatrix.save.R +% Generated by roxygen2 (4.0.1): do not edit by hand \name{xgb.DMatrix.save} \alias{xgb.DMatrix.save} \title{Save xgb.DMatrix object to binary file} diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 0867134ae..12aaf9bf1 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -1,5 +1,4 @@ 
-% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgb.cv.R +% Generated by roxygen2 (4.0.1): do not edit by hand \name{xgb.cv} \alias{xgb.cv} \title{Cross Validation} diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 7958a72e8..d1968217b 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgb.dump.R +% Generated by roxygen2 (4.0.1): do not edit by hand \name{xgb.dump} \alias{xgb.dump} \title{Save xgboost model to text file} diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 1b2946729..1588639b4 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgb.importance.R +% Generated by roxygen2 (4.0.1): do not edit by hand \name{xgb.importance} \alias{xgb.importance} \title{Show importance of features in a model} diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 433b38c79..d2c5d94b6 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgb.load.R +% Generated by roxygen2 (4.0.1): do not edit by hand \name{xgb.load} \alias{xgb.load} \title{Load xgboost model from binary file} diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index f91a2afe9..51c965970 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgb.model.dt.tree.R +% Generated by roxygen2 (4.0.1): do not edit by hand \name{xgb.model.dt.tree} \alias{xgb.model.dt.tree} \title{Convert tree model dump to data.table} diff --git a/R-package/man/xgb.plot.tree.Rd 
b/R-package/man/xgb.plot.tree.Rd index 8aec827ec..dc95dfec0 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgb.plot.tree.R +% Generated by roxygen2 (4.0.1): do not edit by hand \name{xgb.plot.tree} \alias{xgb.plot.tree} \title{Plot a boosted tree model} diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index ded444446..0ccdf13da 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgb.save.R +% Generated by roxygen2 (4.0.1): do not edit by hand \name{xgb.save} \alias{xgb.save} \title{Save xgboost model to binary file} diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 58ef94135..a05e2eeb9 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgb.train.R +% Generated by roxygen2 (4.0.1): do not edit by hand \name{xgb.train} \alias{xgb.train} \title{eXtreme Gradient Boosting Training} diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 21b1ad220..dec97f239 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -1,5 +1,4 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/xgboost.R +% Generated by roxygen2 (4.0.1): do not edit by hand \name{xgboost} \alias{xgboost} \title{eXtreme Gradient Boosting (Tree) library} @@ -13,8 +12,6 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), \item{label}{the response variable. User should not set this field,} -\item{missing}{Missing is only used when input is dense matrix, pick a float} - \item{params}{the list of parameters. 
Commonly used ones are: \itemize{ \item \code{objective} objective function, common ones are @@ -36,6 +33,8 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), information of performance. If 2, xgboost will print information of both performance and construction progress information} +\item{missing}{Missing is only used when input is dense matrix, pick a float value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} + \item{...}{other parameters to pass to \code{params}.} } \description{ From f75387f7016f5b21b8d10ec97204475615eec9c9 Mon Sep 17 00:00:00 2001 From: hetong Date: Sun, 25 Jan 2015 10:37:11 -0800 Subject: [PATCH 04/13] update document --- R-package/R/predict.xgb.Booster.R | 4 ++-- R-package/R/xgb.DMatrix.R | 2 +- R-package/R/xgb.cv.R | 4 ++-- R-package/R/xgboost.R | 5 +++-- R-package/man/predict-xgb.Booster-method.Rd | 3 ++- R-package/man/xgb.DMatrix.Rd | 3 ++- R-package/man/xgb.cv.Rd | 3 ++- R-package/man/xgboost.Rd | 6 ++++-- 8 files changed, 18 insertions(+), 12 deletions(-) diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R index 49f1ad4f0..1e458e708 100644 --- a/R-package/R/predict.xgb.Booster.R +++ b/R-package/R/predict.xgb.Booster.R @@ -7,8 +7,8 @@ setClass("xgb.Booster") #' @param object Object of class "xgb.Boost" #' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or #' \code{xgb.DMatrix}. -#' @param missing Missing is only used when input is dense matrix, pick a float -# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. +#' @param missing Missing is only used when input is dense matrix, pick a float +#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. 
#' @param outputmargin whether the prediction should be shown in the original #' value of sum of functions, when outputmargin=TRUE, the prediction is #' untransformed margin value. In logistic regression, outputmargin=T will diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index b7a5a9897..8c3ea80bc 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -6,7 +6,7 @@ #' indicating the data file. #' @param info a list of information of the xgb.DMatrix object #' @param missing Missing is only used when input is dense matrix, pick a float -# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. +#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. # #' @param ... other information to pass to \code{info}. #' diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 2129b418a..ed088df52 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -32,7 +32,7 @@ #' @param nfold number of folds used #' @param label option field, when data is Matrix #' @param missing Missing is only used when input is dense matrix, pick a float -# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. +#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. #' @param prediction A logical value indicating whether to return the prediction vector. #' @param showsd \code{boolean}, whether show standard deviation of cross validation #' @param metrics, list of evaluation metrics to be used in corss validation, @@ -134,4 +134,4 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... 
-globalVariables(".") \ No newline at end of file +globalVariables(".") diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 39e047e8e..c72c4d5b0 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -5,7 +5,7 @@ #' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or #' \code{xgb.DMatrix}. #' @param label the response variable. User should not set this field, -# if data is local data file or \code{xgb.DMatrix}. +#' if data is local data file or \code{xgb.DMatrix}. #' @param params the list of parameters. Commonly used ones are: #' \itemize{ #' \item \code{objective} objective function, common ones are @@ -24,7 +24,8 @@ #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print #' information of performance. If 2, xgboost will print information of both #' performance and construction progress information -#' @param missing Missing is only used when input is dense matrix, pick a float value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. +#' @param missing Missing is only used when input is dense matrix, pick a float +#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. #' @param ... other parameters to pass to \code{params}. #' #' @details diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd index 66cdfc36c..204a8167f 100644 --- a/R-package/man/predict-xgb.Booster-method.Rd +++ b/R-package/man/predict-xgb.Booster-method.Rd @@ -13,7 +13,8 @@ \item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.} -\item{missing}{Missing is only used when input is dense matrix, pick a float} +\item{missing}{Missing is only used when input is dense matrix, pick a float +value that represents missing value. 
Sometime a data use 0 or other extreme value to represents missing values.} \item{outputmargin}{whether the prediction should be shown in the original value of sum of functions, when outputmargin=TRUE, the prediction is diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 227fb515f..31efde687 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -11,7 +11,8 @@ indicating the data file.} \item{info}{a list of information of the xgb.DMatrix object} -\item{missing}{Missing is only used when input is dense matrix, pick a float} +\item{missing}{Missing is only used when input is dense matrix, pick a float +value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} \item{...}{other information to pass to \code{info}.} } diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 12aaf9bf1..149ec392f 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -31,7 +31,8 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, \item{label}{option field, when data is Matrix} -\item{missing}{Missing is only used when input is dense matrix, pick a float} +\item{missing}{Missing is only used when input is dense matrix, pick a float +value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} \item{prediction}{A logical value indicating whether to return the prediction vector.} diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index dec97f239..035eec9e7 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -10,7 +10,8 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.} -\item{label}{the response variable. User should not set this field,} +\item{label}{the response variable. 
User should not set this field, +if data is local data file or \code{xgb.DMatrix}.} \item{params}{the list of parameters. Commonly used ones are: \itemize{ @@ -33,7 +34,8 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), information of performance. If 2, xgboost will print information of both performance and construction progress information} -\item{missing}{Missing is only used when input is dense matrix, pick a float value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} +\item{missing}{Missing is only used when input is dense matrix, pick a float +value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} \item{...}{other parameters to pass to \code{params}.} } From 5188bad8733e578fe9849fbd2ae89e9aee4bc1bc Mon Sep 17 00:00:00 2001 From: hetong007 Date: Sun, 25 Jan 2015 14:16:46 -0800 Subject: [PATCH 05/13] fix cv attr --- R-package/R/slice.xgb.DMatrix.R | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/R-package/R/slice.xgb.DMatrix.R b/R-package/R/slice.xgb.DMatrix.R index 419170a66..b70a8ee92 100644 --- a/R-package/R/slice.xgb.DMatrix.R +++ b/R-package/R/slice.xgb.DMatrix.R @@ -28,6 +28,18 @@ setMethod("slice", signature = "xgb.DMatrix", if (class(object) != "xgb.DMatrix") { stop("slice: first argument dtrain must be xgb.DMatrix") } - ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost") + ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, + PACKAGE = "xgboost") + + attr_list <- attributes(object) + nr <- xgb.numrow(object) + len <- sapply(attr_list,length) + ind <- which(len==nr) + if (length(ind)>0) { + nms <- names(attr_list)[ind] + for (i in 1:length(ind)) { + attr(ret,nms[i]) <- attr(object,nms[i])[idxset] + } + } return(structure(ret, class = "xgb.DMatrix")) }) From 4266827105300ec3237b0081810b827b6e796c72 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 26 Jan 
2015 09:04:34 -0800 Subject: [PATCH 06/13] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d74292118..949a10ba6 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ xgboost: eXtreme Gradient Boosting ====== An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. -It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree. +It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also also distributed and scale to even larger data. Contributors: https://github.com/tqchen/xgboost/graphs/contributors From 97e058dbd725ca5197c5ff913af07bebec7039d8 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 26 Jan 2015 09:04:55 -0800 Subject: [PATCH 07/13] Update README.md --- multi-node/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multi-node/README.md b/multi-node/README.md index b94cc2c77..0beb55de7 100644 --- a/multi-node/README.md +++ b/multi-node/README.md @@ -1,6 +1,6 @@ Distributed XGBoost ====== -This folder contains information of Distributed XGBoost. +This folder contains information of Distributed XGBoost (Distributed GBDT). 
* The distributed version is built on Rabit:[Reliable Allreduce and Broadcast Library](https://github.com/tqchen/rabit) - Rabit is a portable library that provides fault-tolerance for Allreduce calls for distributed machine learning From c34367b2077bdee159efecdbe115e6eb5fd28677 Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 26 Jan 2015 10:27:44 -0800 Subject: [PATCH 08/13] add msd --- .gitignore | 2 ++ demo/.gitignore | 1 + demo/yearpredMSD/csv2libsvm.py | 14 ++++++++++++++ demo/yearpredMSD/runexp.sh | 20 ++++++++++++++++++++ 4 files changed, 37 insertions(+) create mode 100644 demo/.gitignore create mode 100755 demo/yearpredMSD/csv2libsvm.py create mode 100755 demo/yearpredMSD/runexp.sh diff --git a/.gitignore b/.gitignore index d454c6d1d..ee5928043 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,5 @@ Debug *save *csv .Rproj.user +xgboost +xgboost.mock diff --git a/demo/.gitignore b/demo/.gitignore new file mode 100644 index 000000000..e52797d15 --- /dev/null +++ b/demo/.gitignore @@ -0,0 +1 @@ +*.libsvm \ No newline at end of file diff --git a/demo/yearpredMSD/csv2libsvm.py b/demo/yearpredMSD/csv2libsvm.py new file mode 100755 index 000000000..d7c1d15c1 --- /dev/null +++ b/demo/yearpredMSD/csv2libsvm.py @@ -0,0 +1,14 @@ +#!/usr/bin/python +import sys + +if len(sys.argv) < 3: + print 'Usage: ' + print 'convert a all numerical csv to libsvm' + +fo = open(sys.argv[2], 'w') +for l in open(sys.argv[1]): + arr = l.split(',') + fo.write('%s' % arr[0]) + for i in xrange(len(arr) - 1): + fo.write(' %d:%s' % (i, arr[i+1])) +fo.close() diff --git a/demo/yearpredMSD/runexp.sh b/demo/yearpredMSD/runexp.sh new file mode 100755 index 000000000..fa75b837e --- /dev/null +++ b/demo/yearpredMSD/runexp.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +if [ -f YearPredictionMSD.txt ] +then + echo "use existing data to run experiment" +else + echo "getting data from uci, make sure you are connected to internet" + wget 
https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip + unzip YearPredictionMSD.txt.zip +fi +echo "start making data.." +# map feature using indicator encoding, also produce featmap.txt +python csv2libsvm.py YearPredictionMSD.txt yearpredMSD.libsvm +head -n 463715 yearpredMSD.libsvm > yearpredMSD.libsvm.train +tail -n 51630 yearpredMSD.libsvm > yearpredMSD.libsvm.test +echo "finish making the data" +../../xgboost yearpredMSD.conf + + + \ No newline at end of file From e72174f0f86e69576427536ace371c9477b2d738 Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 26 Jan 2015 10:29:34 -0800 Subject: [PATCH 09/13] add readme --- demo/yearpredMSD/README.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 demo/yearpredMSD/README.md diff --git a/demo/yearpredMSD/README.md b/demo/yearpredMSD/README.md new file mode 100644 index 000000000..989a1062c --- /dev/null +++ b/demo/yearpredMSD/README.md @@ -0,0 +1,6 @@ +Demonstrating how to use XGBoost on [Year Prediction task of Million Song Dataset](https://archive.ics.uci.edu/ml/datasets/YearPredictionMSD) + +1. 
Run runexp.sh +```bash +./runexp.sh +``` From a264bc3969ce6b6f9d4b1716b4efe690b6bf319b Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 26 Jan 2015 10:30:12 -0800 Subject: [PATCH 10/13] ok --- demo/yearpredMSD/runexp.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/demo/yearpredMSD/runexp.sh b/demo/yearpredMSD/runexp.sh index fa75b837e..8853c3f20 100755 --- a/demo/yearpredMSD/runexp.sh +++ b/demo/yearpredMSD/runexp.sh @@ -15,6 +15,3 @@ head -n 463715 yearpredMSD.libsvm > yearpredMSD.libsvm.train tail -n 51630 yearpredMSD.libsvm > yearpredMSD.libsvm.test echo "finish making the data" ../../xgboost yearpredMSD.conf - - - \ No newline at end of file From deb4983273f944e6e4dddaa837ec46fa0a6a7ede Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 26 Jan 2015 10:40:04 -0800 Subject: [PATCH 11/13] ok --- demo/yearpredMSD/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/demo/yearpredMSD/README.md b/demo/yearpredMSD/README.md index 989a1062c..3fe35056a 100644 --- a/demo/yearpredMSD/README.md +++ b/demo/yearpredMSD/README.md @@ -4,3 +4,6 @@ Demonstrating how to use XGBoost on [Year Prediction task of Million Song Datase ```bash ./runexp.sh ``` + +You can also use the script to prepare LIBSVM format, and run the [Distributed Version](../../multi-node). +Note, though, that normally a single machine is enough for a dataset at this scale; use the distributed version for larger-scale datasets. 
From 3e0fba392d34771ad0f1d7eaf4cb79d8eefaae55 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 27 Jan 2015 16:29:52 -0800 Subject: [PATCH 12/13] fix the integer overflow --- R-package/src/xgboost_R.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index b4757542d..f1958c709 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -71,8 +71,8 @@ extern "C" { SEXP missing) { _WrapperBegin(); SEXP dim = getAttrib(mat, R_DimSymbol); - int nrow = INTEGER(dim)[0]; - int ncol = INTEGER(dim)[1]; + bst_ulong nrow = static_cast(INTEGER(dim)[0]); + bst_ulong ncol = static_cast(INTEGER(dim)[1]); double *din = REAL(mat); std::vector data(nrow * ncol); #pragma omp parallel for schedule(static) From 16db3ce6207f174ed6065a566794b80eb522deb9 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 27 Jan 2015 16:31:53 -0800 Subject: [PATCH 13/13] quick fix --- R-package/src/xgboost_R.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index f1958c709..aa17b30cc 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -76,8 +76,8 @@ extern "C" { double *din = REAL(mat); std::vector data(nrow * ncol); #pragma omp parallel for schedule(static) - for (int i = 0; i < nrow; ++i) { - for (int j = 0; j < ncol; ++j) { + for (bst_omp_uint i = 0; i < nrow; ++i) { + for (bst_ulong j = 0; j < ncol; ++j) { data[i * ncol +j] = din[i + nrow * j]; } }