Merge remote-tracking branch 'upstream/master'

This commit is contained in:
El Potaeto 2015-01-28 10:13:58 +01:00
commit e35a9f4822
35 changed files with 117 additions and 68 deletions

1
.gitignore vendored
View File

@ -54,3 +54,4 @@ train*
rabit
.Rbuildignore
R-package.Rproj

View File

@ -1,4 +1,4 @@
# Generated by roxygen2 (4.1.0): do not edit by hand
# Generated by roxygen2 (4.0.1): do not edit by hand
export(getinfo)
export(setinfo)

View File

@ -7,8 +7,8 @@ setClass("xgb.Booster")
#' @param object Object of class "xgb.Booster"
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param outputmargin whether the prediction should be shown in the original
#' value of sum of functions, when outputmargin=TRUE, the prediction is
#' untransformed margin value. In logistic regression, outputmargin=T will

View File

@ -28,6 +28,18 @@ setMethod("slice", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") {
stop("slice: first argument dtrain must be xgb.DMatrix")
}
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset,
PACKAGE = "xgboost")
attr_list <- attributes(object)
nr <- xgb.numrow(object)
len <- sapply(attr_list,length)
ind <- which(len==nr)
if (length(ind)>0) {
nms <- names(attr_list)[ind]
for (i in 1:length(ind)) {
attr(ret,nms[i]) <- attr(object,nms[i])[idxset]
}
}
return(structure(ret, class = "xgb.DMatrix"))
})

View File

@ -6,7 +6,7 @@
#' indicating the data file.
#' @param info a list of information of the xgb.DMatrix object
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' value that represents missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#
#' @param ... other information to pass to \code{info}.
#'

View File

@ -32,7 +32,7 @@
#' @param nfold number of folds used
#' @param label option field, when data is Matrix
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' value that represents missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param prediction A logical value indicating whether to return the prediction vector.
#' @param showsd \code{boolean}, whether show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in cross validation,
@ -134,4 +134,4 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(".")
globalVariables(".")

View File

@ -5,7 +5,7 @@
#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param label the response variable. User should not set this field,
# if data is local data file or \code{xgb.DMatrix}.
#' if data is local data file or \code{xgb.DMatrix}.
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#' \item \code{objective} objective function, common ones are
@ -24,8 +24,8 @@
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress information
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{data}
\name{agaricus.test}
\alias{agaricus.test}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{data}
\name{agaricus.train}
\alias{agaricus.train}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/getinfo.xgb.DMatrix.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{getinfo}
\alias{getinfo}
@ -13,9 +12,9 @@ getinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{name}{the name of the field to get}
\item{...}{other parameters}
}
\description{
Get information of an xgb.DMatrix object

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/predict.xgb.Booster.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method}
@ -14,7 +13,8 @@
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
\item{missing}{Missing is only used when input is dense matrix, pick a float}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{outputmargin}{whether the prediction should be shown in the original
value of sum of functions, when outputmargin=TRUE, the prediction is

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/setinfo.xgb.DMatrix.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{setinfo}
\alias{setinfo}
@ -13,11 +12,11 @@ setinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{name}{the name of the field to get}
\item{info}{the specific field of information to set}
\item{...}{other parameters}
}
\description{
Set information of an xgb.DMatrix object

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/slice.xgb.DMatrix.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{slice}
\alias{slice}
@ -14,9 +13,9 @@ slice(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{idxset}{a integer vector of indices of rows needed}
\item{...}{other parameters}
}
\description{
Get a new DMatrix containing the specified rows of

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Contruct xgb.DMatrix object}
@ -12,7 +11,8 @@ indicating the data file.}
\item{info}{a list of information of the xgb.DMatrix object}
\item{missing}{Missing is only used when input is dense matrix, pick a float}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{...}{other information to pass to \code{info}.}
}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.save.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.cv.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.cv}
\alias{xgb.cv}
\title{Cross Validation}
@ -32,7 +31,8 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
\item{label}{option field, when data is Matrix}
\item{missing}{Missing is only used when input is dense matrix, pick a float}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{prediction}{A logical value indicating whether to return the prediction vector.}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.dump.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.dump}
\alias{xgb.dump}
\title{Save xgboost model to text file}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.importance.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.importance}
\alias{xgb.importance}
\title{Show importance of features in a model}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.load.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.load}
\alias{xgb.load}
\title{Load xgboost model from binary file}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.model.dt.tree.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.model.dt.tree}
\alias{xgb.model.dt.tree}
\title{Convert tree model dump to data.table}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.plot.tree.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.plot.tree}
\alias{xgb.plot.tree}
\title{Plot a boosted tree model}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.save.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.save}
\alias{xgb.save}
\title{Save xgboost model to binary file}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.train.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.train}
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgboost}
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
@ -11,9 +10,8 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
\item{label}{the response variable. User should not set this field,}
\item{missing}{Missing is only used when input is dense matrix, pick a float}
\item{label}{the response variable. User should not set this field,
if data is local data file or \code{xgb.DMatrix}.}
\item{params}{the list of parameters. Commonly used ones are:
\itemize{
@ -36,6 +34,9 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
information of performance. If 2, xgboost will print information of both
performance and construction progress information}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{

View File

@ -71,13 +71,13 @@ extern "C" {
SEXP missing) {
_WrapperBegin();
SEXP dim = getAttrib(mat, R_DimSymbol);
int nrow = INTEGER(dim)[0];
int ncol = INTEGER(dim)[1];
bst_ulong nrow = static_cast<bst_ulong>(INTEGER(dim)[0]);
bst_ulong ncol = static_cast<bst_ulong>(INTEGER(dim)[1]);
double *din = REAL(mat);
std::vector<float> data(nrow * ncol);
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
for (int j = 0; j < ncol; ++j) {
for (bst_omp_uint i = 0; i < nrow; ++i) {
for (bst_ulong j = 0; j < ncol; ++j) {
data[i * ncol +j] = din[i + nrow * j];
}
}

View File

@ -1,7 +1,7 @@
xgboost: eXtreme Gradient Boosting
======
An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree.
It implements machine learning algorithms under the gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also be distributed and scale to even larger data.
Contributors: https://github.com/tqchen/xgboost/graphs/contributors

1
demo/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
*.libsvm

View File

@ -0,0 +1,9 @@
Demonstrating how to use XGBoost on [Year Prediction task of Million Song Dataset](https://archive.ics.uci.edu/ml/datasets/YearPredictionMSD)
1. Run runexp.sh
```bash
./runexp.sh
```
You can also use the script to prepare LIBSVM format, and run the [Distributed Version](../../multi-node).
Note that normally you only need a single machine for a dataset at this scale; use the distributed version for larger-scale datasets.

14
demo/yearpredMSD/csv2libsvm.py Executable file
View File

@ -0,0 +1,14 @@
#!/usr/bin/python
"""Convert an all-numerical CSV file to LIBSVM format.

Usage: csv2libsvm.py <csv> <libsvm>

The first column of every row is written as the label; each remaining
column becomes an ``index:value`` pair, with indices starting at 0.
"""
import sys


def convert(csv_path, libsvm_path):
    """Write the rows of ``csv_path`` to ``libsvm_path`` in LIBSVM format.

    Fields are split on ',' and copied verbatim, so the line terminator
    carried by the last field of each row also terminates the output row.
    """
    with open(csv_path) as fi, open(libsvm_path, 'w') as fo:
        for line in fi:
            arr = line.split(',')
            fo.write('%s' % arr[0])
            for i in range(len(arr) - 1):
                fo.write(' %d:%s' % (i, arr[i + 1]))


def main(argv):
    """Run the converter on ``argv`` and return a process exit status."""
    if len(argv) < 3:
        print('Usage: <csv> <libsvm>')
        print('convert a all numerical csv to libsvm')
        # Without both paths there is nothing to convert; report failure
        # instead of falling through to an IndexError on argv[2].
        return 1
    convert(argv[1], argv[2])
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

17
demo/yearpredMSD/runexp.sh Executable file
View File

@ -0,0 +1,17 @@
#!/bin/bash
# Run the YearPredictionMSD demo end to end:
#   1. download and unpack the dataset unless YearPredictionMSD.txt exists,
#   2. convert the CSV to LIBSVM format with csv2libsvm.py,
#   3. split the result into train/test files by line count,
#   4. run xgboost with yearpredMSD.conf.

# Abort on the first failing step so later steps never run on missing or
# partially written data (e.g. after a failed download).
set -e

if [ -f YearPredictionMSD.txt ]
then
    echo "use existing data to run experiment"
else
    echo "getting data from uci, make sure you are connected to internet"
    wget https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip
    unzip YearPredictionMSD.txt.zip
fi
echo "start making data.."
# convert the all-numerical csv into LIBSVM format
python csv2libsvm.py YearPredictionMSD.txt yearpredMSD.libsvm
# the first 463715 lines become the training set, the last 51630 the test set
head -n 463715 yearpredMSD.libsvm > yearpredMSD.libsvm.train
tail -n 51630 yearpredMSD.libsvm > yearpredMSD.libsvm.test
echo "finish making the data"
../../xgboost yearpredMSD.conf

View File

@ -1,6 +1,6 @@
Distributed XGBoost
======
This folder contains information of Distributed XGBoost.
This folder contains information of Distributed XGBoost (Distributed GBDT).
* The distributed version is built on Rabit:[Reliable Allreduce and Broadcast Library](https://github.com/tqchen/rabit)
- Rabit is a portable library that provides fault-tolerance for Allreduce calls for distributed machine learning

View File

@ -339,7 +339,7 @@ class FMatrixPage : public IFMatrix {
}
if (ktop % 100000 == 0) {
utils::Printf("\r \r");
utils::Printf("InitCol: %lu rows ", static_cast<unsigned long>(ktop));
utils::Printf("InitCol: %lu rows ", static_cast<unsigned long>(ktop));
}
}
}

View File

@ -227,14 +227,19 @@ class BoostLearner : public rabit::ISerializable {
*/
inline void SaveModel(const char *fname) const {
FILE *fp;
bool use_stdout = false;;
#ifndef XGBOOST_STRICT_CXX98_
if (!strcmp(fname, "stdout")) {
fp = stdout;
} else {
use_stdout = true;
} else
#endif
{
fp = utils::FopenCheck(fname, "wb");
}
utils::FileStream fo(fp);
std::string header;
if (save_base64 != 0|| fp == stdout) {
if (save_base64 != 0|| use_stdout) {
fo.Write("bs64\t", 5);
utils::Base64OutStream bout(fp);
this->SaveModel(bout);
@ -243,7 +248,9 @@ class BoostLearner : public rabit::ISerializable {
fo.Write("binf", 4);
this->SaveModel(fo);
}
if (fp != stdout) fclose(fp);
if (!use_stdout) {
fclose(fp);
}
}
/*!
* \brief check if data matrix is ready to be used by training,

View File

@ -206,7 +206,8 @@ class SoftmaxMultiClassObj : public IObjFunction {
Softmax(&rec);
const unsigned j = i % nstep;
int label = static_cast<int>(info.labels[j]);
utils::Check(label < nclass, "SoftmaxMultiClassObj: label exceed num_class");
utils::Check(label >= 0 && label < nclass,
"SoftmaxMultiClassObj: label must be in [0, num_class)");
const float wt = info.GetWeight(j);
for (int k = 0; k < nclass; ++k) {
float p = rec[k];

View File

@ -208,7 +208,6 @@ struct SparseCSRFileBuilder {
fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
}
}
printf("CSV::begin_dat=%lu\n", begin_data);
}
protected:
inline void WriteBuffer(void) {