Merge remote-tracking branch 'upstream/master'

This commit is contained in:
El Potaeto 2015-01-28 10:13:58 +01:00
commit e35a9f4822
35 changed files with 117 additions and 68 deletions

1
.gitignore vendored
View File

@ -54,3 +54,4 @@ train*
rabit rabit
.Rbuildignore .Rbuildignore
R-package.Rproj R-package.Rproj

View File

@ -1,4 +1,4 @@
# Generated by roxygen2 (4.1.0): do not edit by hand # Generated by roxygen2 (4.0.1): do not edit by hand
export(getinfo) export(getinfo)
export(setinfo) export(setinfo)

View File

@ -8,7 +8,7 @@ setClass("xgb.Booster")
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or #' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}. #' \code{xgb.DMatrix}.
#' @param missing Missing is only used when input is dense matrix, pick a float #' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. #' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param outputmargin whether the prediction should be shown in the original #' @param outputmargin whether the prediction should be shown in the original
#' value of sum of functions, when outputmargin=TRUE, the prediction is #' value of sum of functions, when outputmargin=TRUE, the prediction is
#' untransformed margin value. In logistic regression, outputmargin=T will #' untransformed margin value. In logistic regression, outputmargin=T will

View File

@ -28,6 +28,18 @@ setMethod("slice", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") { if (class(object) != "xgb.DMatrix") {
stop("slice: first argument dtrain must be xgb.DMatrix") stop("slice: first argument dtrain must be xgb.DMatrix")
} }
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost") ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset,
PACKAGE = "xgboost")
attr_list <- attributes(object)
nr <- xgb.numrow(object)
len <- sapply(attr_list,length)
ind <- which(len==nr)
if (length(ind)>0) {
nms <- names(attr_list)[ind]
for (i in 1:length(ind)) {
attr(ret,nms[i]) <- attr(object,nms[i])[idxset]
}
}
return(structure(ret, class = "xgb.DMatrix")) return(structure(ret, class = "xgb.DMatrix"))
}) })

View File

@ -6,7 +6,7 @@
#' indicating the data file. #' indicating the data file.
#' @param info a list of information of the xgb.DMatrix object #' @param info a list of information of the xgb.DMatrix object
#' @param missing Missing is only used when input is dense matrix, pick a float #' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. #' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
# #
#' @param ... other information to pass to \code{info}. #' @param ... other information to pass to \code{info}.
#' #'

View File

@ -32,7 +32,7 @@
#' @param nfold number of folds used #' @param nfold number of folds used
#' @param label option field, when data is Matrix #' @param label option field, when data is Matrix
#' @param missing Missing is only used when input is dense matrix, pick a float #' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. #' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param prediction A logical value indicating whether to return the prediction vector. #' @param prediction A logical value indicating whether to return the prediction vector.
#' @param showsd \code{boolean}, whether show standard deviation of cross validation #' @param showsd \code{boolean}, whether show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in cross validation, #' @param metrics, list of evaluation metrics to be used in cross validation,

View File

@ -5,7 +5,7 @@
#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or #' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}. #' \code{xgb.DMatrix}.
#' @param label the response variable. User should not set this field, #' @param label the response variable. User should not set this field,
# if data is local data file or \code{xgb.DMatrix}. #' if data is local data file or \code{xgb.DMatrix}.
#' @param params the list of parameters. Commonly used ones are: #' @param params the list of parameters. Commonly used ones are:
#' \itemize{ #' \itemize{
#' \item \code{objective} objective function, common ones are #' \item \code{objective} objective function, common ones are
@ -25,7 +25,7 @@
#' information of performance. If 2, xgboost will print information of both #' information of performance. If 2, xgboost will print information of both
#' performance and construction progress information #' performance and construction progress information
#' @param missing Missing is only used when input is dense matrix, pick a float #' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. #' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param ... other parameters to pass to \code{params}. #' @param ... other parameters to pass to \code{params}.
#' #'
#' @details #' @details

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgboost.R
\docType{data} \docType{data}
\name{agaricus.test} \name{agaricus.test}
\alias{agaricus.test} \alias{agaricus.test}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgboost.R
\docType{data} \docType{data}
\name{agaricus.train} \name{agaricus.train}
\alias{agaricus.train} \alias{agaricus.train}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/getinfo.xgb.DMatrix.R
\docType{methods} \docType{methods}
\name{getinfo} \name{getinfo}
\alias{getinfo} \alias{getinfo}
@ -13,9 +12,9 @@ getinfo(object, ...)
\arguments{ \arguments{
\item{object}{Object of class "xgb.DMatrix"} \item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{name}{the name of the field to get} \item{name}{the name of the field to get}
\item{...}{other parameters}
} }
\description{ \description{
Get information of an xgb.DMatrix object Get information of an xgb.DMatrix object

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/predict.xgb.Booster.R
\docType{methods} \docType{methods}
\name{predict,xgb.Booster-method} \name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method} \alias{predict,xgb.Booster-method}
@ -14,7 +13,8 @@
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or \item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.} \code{xgb.DMatrix}.}
\item{missing}{Missing is only used when input is dense matrix, pick a float} \item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{outputmargin}{whether the prediction should be shown in the original \item{outputmargin}{whether the prediction should be shown in the original
value of sum of functions, when outputmargin=TRUE, the prediction is value of sum of functions, when outputmargin=TRUE, the prediction is

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/setinfo.xgb.DMatrix.R
\docType{methods} \docType{methods}
\name{setinfo} \name{setinfo}
\alias{setinfo} \alias{setinfo}
@ -13,11 +12,11 @@ setinfo(object, ...)
\arguments{ \arguments{
\item{object}{Object of class "xgb.DMatrix"} \item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{name}{the name of the field to get} \item{name}{the name of the field to get}
\item{info}{the specific field of information to set} \item{info}{the specific field of information to set}
\item{...}{other parameters}
} }
\description{ \description{
Set information of an xgb.DMatrix object Set information of an xgb.DMatrix object

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/slice.xgb.DMatrix.R
\docType{methods} \docType{methods}
\name{slice} \name{slice}
\alias{slice} \alias{slice}
@ -14,9 +13,9 @@ slice(object, ...)
\arguments{ \arguments{
\item{object}{Object of class "xgb.DMatrix"} \item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{idxset}{a integer vector of indices of rows needed} \item{idxset}{a integer vector of indices of rows needed}
\item{...}{other parameters}
} }
\description{ \description{
Get a new DMatrix containing the specified rows of Get a new DMatrix containing the specified rows of

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{xgb.DMatrix} \name{xgb.DMatrix}
\alias{xgb.DMatrix} \alias{xgb.DMatrix}
\title{Construct xgb.DMatrix object} \title{Construct xgb.DMatrix object}
@ -12,7 +11,8 @@ indicating the data file.}
\item{info}{a list of information of the xgb.DMatrix object} \item{info}{a list of information of the xgb.DMatrix object}
\item{missing}{Missing is only used when input is dense matrix, pick a float} \item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{...}{other information to pass to \code{info}.} \item{...}{other information to pass to \code{info}.}
} }

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.save.R
\name{xgb.DMatrix.save} \name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save} \alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file} \title{Save xgb.DMatrix object to binary file}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgb.cv.R
\name{xgb.cv} \name{xgb.cv}
\alias{xgb.cv} \alias{xgb.cv}
\title{Cross Validation} \title{Cross Validation}
@ -32,7 +31,8 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
\item{label}{option field, when data is Matrix} \item{label}{option field, when data is Matrix}
\item{missing}{Missing is only used when input is dense matrix, pick a float} \item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{prediction}{A logical value indicating whether to return the prediction vector.} \item{prediction}{A logical value indicating whether to return the prediction vector.}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgb.dump.R
\name{xgb.dump} \name{xgb.dump}
\alias{xgb.dump} \alias{xgb.dump}
\title{Save xgboost model to text file} \title{Save xgboost model to text file}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgb.importance.R
\name{xgb.importance} \name{xgb.importance}
\alias{xgb.importance} \alias{xgb.importance}
\title{Show importance of features in a model} \title{Show importance of features in a model}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgb.load.R
\name{xgb.load} \name{xgb.load}
\alias{xgb.load} \alias{xgb.load}
\title{Load xgboost model from binary file} \title{Load xgboost model from binary file}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgb.model.dt.tree.R
\name{xgb.model.dt.tree} \name{xgb.model.dt.tree}
\alias{xgb.model.dt.tree} \alias{xgb.model.dt.tree}
\title{Convert tree model dump to data.table} \title{Convert tree model dump to data.table}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgb.plot.tree.R
\name{xgb.plot.tree} \name{xgb.plot.tree}
\alias{xgb.plot.tree} \alias{xgb.plot.tree}
\title{Plot a boosted tree model} \title{Plot a boosted tree model}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgb.save.R
\name{xgb.save} \name{xgb.save}
\alias{xgb.save} \alias{xgb.save}
\title{Save xgboost model to binary file} \title{Save xgboost model to binary file}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgb.train.R
\name{xgb.train} \name{xgb.train}
\alias{xgb.train} \alias{xgb.train}
\title{eXtreme Gradient Boosting Training} \title{eXtreme Gradient Boosting Training}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand % Generated by roxygen2 (4.0.1): do not edit by hand
% Please edit documentation in R/xgboost.R
\name{xgboost} \name{xgboost}
\alias{xgboost} \alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library} \title{eXtreme Gradient Boosting (Tree) library}
@ -11,9 +10,8 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.} \code{xgb.DMatrix}.}
\item{label}{the response variable. User should not set this field,} \item{label}{the response variable. User should not set this field,
if data is local data file or \code{xgb.DMatrix}.}
\item{missing}{Missing is only used when input is dense matrix, pick a float}
\item{params}{the list of parameters. Commonly used ones are: \item{params}{the list of parameters. Commonly used ones are:
\itemize{ \itemize{
@ -36,6 +34,9 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
information of performance. If 2, xgboost will print information of both information of performance. If 2, xgboost will print information of both
performance and construction progress information} performance and construction progress information}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{...}{other parameters to pass to \code{params}.} \item{...}{other parameters to pass to \code{params}.}
} }
\description{ \description{

View File

@ -71,13 +71,13 @@ extern "C" {
SEXP missing) { SEXP missing) {
_WrapperBegin(); _WrapperBegin();
SEXP dim = getAttrib(mat, R_DimSymbol); SEXP dim = getAttrib(mat, R_DimSymbol);
int nrow = INTEGER(dim)[0]; bst_ulong nrow = static_cast<bst_ulong>(INTEGER(dim)[0]);
int ncol = INTEGER(dim)[1]; bst_ulong ncol = static_cast<bst_ulong>(INTEGER(dim)[1]);
double *din = REAL(mat); double *din = REAL(mat);
std::vector<float> data(nrow * ncol); std::vector<float> data(nrow * ncol);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) { for (bst_omp_uint i = 0; i < nrow; ++i) {
for (int j = 0; j < ncol; ++j) { for (bst_ulong j = 0; j < ncol; ++j) {
data[i * ncol +j] = din[i + nrow * j]; data[i * ncol +j] = din[i + nrow * j];
} }
} }

View File

@ -1,7 +1,7 @@
xgboost: eXtreme Gradient Boosting xgboost: eXtreme Gradient Boosting
====== ======
An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree. It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also be distributed and scale to even larger data.
Contributors: https://github.com/tqchen/xgboost/graphs/contributors Contributors: https://github.com/tqchen/xgboost/graphs/contributors

1
demo/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
*.libsvm

View File

@ -0,0 +1,9 @@
Demonstrating how to use XGBoost on [Year Prediction task of Million Song Dataset](https://archive.ics.uci.edu/ml/datasets/YearPredictionMSD)
1. Run runexp.sh
```bash
./runexp.sh
```
You can also use the script to prepare LIBSVM format, and run the [Distributed Version](../../multi-node).
Note that normally a single machine is enough for a dataset at this scale; use the distributed version for larger-scale datasets.

14
demo/yearpredMSD/csv2libsvm.py Executable file
View File

@ -0,0 +1,14 @@
#!/usr/bin/python
import sys
def csv2libsvm(csv_path, libsvm_path):
    """Convert an all-numerical CSV file to LIBSVM format.

    The first column of every row is written as the label; each remaining
    column is written as ``index:value`` with indices starting at 0.  Fields
    are copied verbatim, so the line terminator carried by the last field of
    a row also terminates the corresponding output line.

    csv_path    -- path of the CSV file to read
    libsvm_path -- path of the LIBSVM file to write (overwritten if present)
    """
    # The with-statement closes both files even when a read or write fails.
    with open(csv_path) as fi, open(libsvm_path, 'w') as fo:
        for line in fi:
            arr = line.split(',')
            fo.write('%s' % arr[0])
            for i in range(len(arr) - 1):
                fo.write(' %d:%s' % (i, arr[i + 1]))


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('Usage: <csv> <libsvm>')
        print('convert a all numerical csv to libsvm')
        # Without both paths there is nothing to convert; stop here instead
        # of falling through to an IndexError on sys.argv[2].
        sys.exit(1)
    csv2libsvm(sys.argv[1], sys.argv[2])

17
demo/yearpredMSD/runexp.sh Executable file
View File

@ -0,0 +1,17 @@
#!/bin/bash
# Fetch and unpack the dataset unless the extracted file is already present.
if [ -f YearPredictionMSD.txt ]
then
    echo "use existing data to run experiment"
else
    echo "getting data from uci, make sure you are connected to internet"
    # Every later step needs the extracted file, so stop when the download
    # or the extraction fails instead of continuing with missing data.
    wget https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip || exit 1
    unzip YearPredictionMSD.txt.zip || exit 1
fi
echo "start making data.."
# convert the all-numerical csv into LIBSVM format
python csv2libsvm.py YearPredictionMSD.txt yearpredMSD.libsvm || exit 1
# the first 463715 lines form the training file, the last 51630 lines the test file
head -n 463715 yearpredMSD.libsvm > yearpredMSD.libsvm.train
tail -n 51630 yearpredMSD.libsvm > yearpredMSD.libsvm.test
echo "finish making the data"
# run xgboost with the settings in yearpredMSD.conf
../../xgboost yearpredMSD.conf

View File

@ -1,6 +1,6 @@
Distributed XGBoost Distributed XGBoost
====== ======
This folder contains information of Distributed XGBoost. This folder contains information of Distributed XGBoost (Distributed GBDT).
* The distributed version is built on Rabit:[Reliable Allreduce and Broadcast Library](https://github.com/tqchen/rabit) * The distributed version is built on Rabit:[Reliable Allreduce and Broadcast Library](https://github.com/tqchen/rabit)
- Rabit is a portable library that provides fault-tolerance for Allreduce calls for distributed machine learning - Rabit is a portable library that provides fault-tolerance for Allreduce calls for distributed machine learning

View File

@ -227,14 +227,19 @@ class BoostLearner : public rabit::ISerializable {
*/ */
inline void SaveModel(const char *fname) const { inline void SaveModel(const char *fname) const {
FILE *fp; FILE *fp;
bool use_stdout = false;;
#ifndef XGBOOST_STRICT_CXX98_
if (!strcmp(fname, "stdout")) { if (!strcmp(fname, "stdout")) {
fp = stdout; fp = stdout;
} else { use_stdout = true;
} else
#endif
{
fp = utils::FopenCheck(fname, "wb"); fp = utils::FopenCheck(fname, "wb");
} }
utils::FileStream fo(fp); utils::FileStream fo(fp);
std::string header; std::string header;
if (save_base64 != 0|| fp == stdout) { if (save_base64 != 0|| use_stdout) {
fo.Write("bs64\t", 5); fo.Write("bs64\t", 5);
utils::Base64OutStream bout(fp); utils::Base64OutStream bout(fp);
this->SaveModel(bout); this->SaveModel(bout);
@ -243,7 +248,9 @@ class BoostLearner : public rabit::ISerializable {
fo.Write("binf", 4); fo.Write("binf", 4);
this->SaveModel(fo); this->SaveModel(fo);
} }
if (fp != stdout) fclose(fp); if (!use_stdout) {
fclose(fp);
}
} }
/*! /*!
* \brief check if data matrix is ready to be used by training, * \brief check if data matrix is ready to be used by training,

View File

@ -206,7 +206,8 @@ class SoftmaxMultiClassObj : public IObjFunction {
Softmax(&rec); Softmax(&rec);
const unsigned j = i % nstep; const unsigned j = i % nstep;
int label = static_cast<int>(info.labels[j]); int label = static_cast<int>(info.labels[j]);
utils::Check(label < nclass, "SoftmaxMultiClassObj: label exceed num_class"); utils::Check(label >= 0 && label < nclass,
"SoftmaxMultiClassObj: label must be in [0, num_class)");
const float wt = info.GetWeight(j); const float wt = info.GetWeight(j);
for (int k = 0; k < nclass; ++k) { for (int k = 0; k < nclass; ++k) {
float p = rec[k]; float p = rec[k];

View File

@ -208,7 +208,6 @@ struct SparseCSRFileBuilder {
fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType)); fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
} }
} }
printf("CSV::begin_dat=%lu\n", begin_data);
} }
protected: protected:
inline void WriteBuffer(void) { inline void WriteBuffer(void) {