Merge remote-tracking branch 'upstream/master'
This commit is contained in:
commit
e35a9f4822
1
.gitignore
vendored
1
.gitignore
vendored
@ -54,3 +54,4 @@ train*
|
||||
rabit
|
||||
.Rbuildignore
|
||||
R-package.Rproj
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
# Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
# Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
|
||||
export(getinfo)
|
||||
export(setinfo)
|
||||
|
||||
@ -7,8 +7,8 @@ setClass("xgb.Booster")
|
||||
#' @param object Object of class "xgb.Boost"
|
||||
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
|
||||
#' \code{xgb.DMatrix}.
|
||||
#' @param missing Missing is only used when input is dense matrix, pick a float
|
||||
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
|
||||
#' @param missing Missing is only used when input is dense matrix, pick a float
|
||||
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
|
||||
#' @param outputmargin whether the prediction should be shown in the original
|
||||
#' value of sum of functions, when outputmargin=TRUE, the prediction is
|
||||
#' untransformed margin value. In logistic regression, outputmargin=T will
|
||||
|
||||
@ -28,6 +28,18 @@ setMethod("slice", signature = "xgb.DMatrix",
|
||||
if (class(object) != "xgb.DMatrix") {
|
||||
stop("slice: first argument dtrain must be xgb.DMatrix")
|
||||
}
|
||||
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
|
||||
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset,
|
||||
PACKAGE = "xgboost")
|
||||
|
||||
attr_list <- attributes(object)
|
||||
nr <- xgb.numrow(object)
|
||||
len <- sapply(attr_list,length)
|
||||
ind <- which(len==nr)
|
||||
if (length(ind)>0) {
|
||||
nms <- names(attr_list)[ind]
|
||||
for (i in 1:length(ind)) {
|
||||
attr(ret,nms[i]) <- attr(object,nms[i])[idxset]
|
||||
}
|
||||
}
|
||||
return(structure(ret, class = "xgb.DMatrix"))
|
||||
})
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
#' indicating the data file.
|
||||
#' @param info a list of information of the xgb.DMatrix object
|
||||
#' @param missing Missing is only used when input is dense matrix, pick a float
|
||||
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
|
||||
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
|
||||
#
|
||||
#' @param ... other information to pass to \code{info}.
|
||||
#'
|
||||
|
||||
@ -32,7 +32,7 @@
|
||||
#' @param nfold number of folds used
|
||||
#' @param label option field, when data is Matrix
|
||||
#' @param missing Missing is only used when input is dense matrix, pick a float
|
||||
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
|
||||
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
|
||||
#' @param prediction A logical value indicating whether to return the prediction vector.
|
||||
#' @param showsd \code{boolean}, whether show standard deviation of cross validation
|
||||
#' @param metrics, list of evaluation metrics to be used in cross validation,
|
||||
@ -134,4 +134,4 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
||||
# Avoid error messages during CRAN check.
|
||||
# The reason is that these variables are never declared
|
||||
# They are mainly column names inferred by Data.table...
|
||||
globalVariables(".")
|
||||
globalVariables(".")
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
|
||||
#' \code{xgb.DMatrix}.
|
||||
#' @param label the response variable. User should not set this field,
|
||||
# if data is local data file or \code{xgb.DMatrix}.
|
||||
#' if data is local data file or \code{xgb.DMatrix}.
|
||||
#' @param params the list of parameters. Commonly used ones are:
|
||||
#' \itemize{
|
||||
#' \item \code{objective} objective function, common ones are
|
||||
@ -24,8 +24,8 @@
|
||||
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
|
||||
#' information of performance. If 2, xgboost will print information of both
|
||||
#' performance and construction progress information
|
||||
#' @param missing Missing is only used when input is dense matrix, pick a float
|
||||
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
|
||||
#' @param missing Missing is only used when input is dense matrix, pick a float
|
||||
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
|
||||
#' @param ... other parameters to pass to \code{params}.
|
||||
#'
|
||||
#' @details
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgboost.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\docType{data}
|
||||
\name{agaricus.test}
|
||||
\alias{agaricus.test}
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgboost.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\docType{data}
|
||||
\name{agaricus.train}
|
||||
\alias{agaricus.train}
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/getinfo.xgb.DMatrix.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\docType{methods}
|
||||
\name{getinfo}
|
||||
\alias{getinfo}
|
||||
@ -13,9 +12,9 @@ getinfo(object, ...)
|
||||
\arguments{
|
||||
\item{object}{Object of class "xgb.DMatrix"}
|
||||
|
||||
\item{...}{other parameters}
|
||||
|
||||
\item{name}{the name of the field to get}
|
||||
|
||||
\item{...}{other parameters}
|
||||
}
|
||||
\description{
|
||||
Get information of an xgb.DMatrix object
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/predict.xgb.Booster.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\docType{methods}
|
||||
\name{predict,xgb.Booster-method}
|
||||
\alias{predict,xgb.Booster-method}
|
||||
@ -14,7 +13,8 @@
|
||||
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
|
||||
\code{xgb.DMatrix}.}
|
||||
|
||||
\item{missing}{Missing is only used when input is dense matrix, pick a float}
|
||||
\item{missing}{Missing is only used when input is dense matrix, pick a float
|
||||
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
|
||||
|
||||
\item{outputmargin}{whether the prediction should be shown in the original
|
||||
value of sum of functions, when outputmargin=TRUE, the prediction is
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/setinfo.xgb.DMatrix.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\docType{methods}
|
||||
\name{setinfo}
|
||||
\alias{setinfo}
|
||||
@ -13,11 +12,11 @@ setinfo(object, ...)
|
||||
\arguments{
|
||||
\item{object}{Object of class "xgb.DMatrix"}
|
||||
|
||||
\item{...}{other parameters}
|
||||
|
||||
\item{name}{the name of the field to get}
|
||||
|
||||
\item{info}{the specific field of information to set}
|
||||
|
||||
\item{...}{other parameters}
|
||||
}
|
||||
\description{
|
||||
Set information of an xgb.DMatrix object
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/slice.xgb.DMatrix.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\docType{methods}
|
||||
\name{slice}
|
||||
\alias{slice}
|
||||
@ -14,9 +13,9 @@ slice(object, ...)
|
||||
\arguments{
|
||||
\item{object}{Object of class "xgb.DMatrix"}
|
||||
|
||||
\item{...}{other parameters}
|
||||
|
||||
\item{idxset}{a integer vector of indices of rows needed}
|
||||
|
||||
\item{...}{other parameters}
|
||||
}
|
||||
\description{
|
||||
Get a new DMatrix containing the specified rows of
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.DMatrix.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\name{xgb.DMatrix}
|
||||
\alias{xgb.DMatrix}
|
||||
\title{Contruct xgb.DMatrix object}
|
||||
@ -12,7 +11,8 @@ indicating the data file.}
|
||||
|
||||
\item{info}{a list of information of the xgb.DMatrix object}
|
||||
|
||||
\item{missing}{Missing is only used when input is dense matrix, pick a float}
|
||||
\item{missing}{Missing is only used when input is dense matrix, pick a float
|
||||
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
|
||||
|
||||
\item{...}{other information to pass to \code{info}.}
|
||||
}
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.DMatrix.save.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\name{xgb.DMatrix.save}
|
||||
\alias{xgb.DMatrix.save}
|
||||
\title{Save xgb.DMatrix object to binary file}
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.cv.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\name{xgb.cv}
|
||||
\alias{xgb.cv}
|
||||
\title{Cross Validation}
|
||||
@ -32,7 +31,8 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
|
||||
|
||||
\item{label}{option field, when data is Matrix}
|
||||
|
||||
\item{missing}{Missing is only used when input is dense matrix, pick a float}
|
||||
\item{missing}{Missing is only used when input is dense matrix, pick a float
|
||||
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
|
||||
|
||||
\item{prediction}{A logical value indicating whether to return the prediction vector.}
|
||||
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.dump.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\name{xgb.dump}
|
||||
\alias{xgb.dump}
|
||||
\title{Save xgboost model to text file}
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.importance.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\name{xgb.importance}
|
||||
\alias{xgb.importance}
|
||||
\title{Show importance of features in a model}
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.load.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\name{xgb.load}
|
||||
\alias{xgb.load}
|
||||
\title{Load xgboost model from binary file}
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.model.dt.tree.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\name{xgb.model.dt.tree}
|
||||
\alias{xgb.model.dt.tree}
|
||||
\title{Convert tree model dump to data.table}
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.plot.tree.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\name{xgb.plot.tree}
|
||||
\alias{xgb.plot.tree}
|
||||
\title{Plot a boosted tree model}
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.save.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\name{xgb.save}
|
||||
\alias{xgb.save}
|
||||
\title{Save xgboost model to binary file}
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.train.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\name{xgb.train}
|
||||
\alias{xgb.train}
|
||||
\title{eXtreme Gradient Boosting Training}
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgboost.R
|
||||
% Generated by roxygen2 (4.0.1): do not edit by hand
|
||||
\name{xgboost}
|
||||
\alias{xgboost}
|
||||
\title{eXtreme Gradient Boosting (Tree) library}
|
||||
@ -11,9 +10,8 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
|
||||
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
|
||||
\code{xgb.DMatrix}.}
|
||||
|
||||
\item{label}{the response variable. User should not set this field,}
|
||||
|
||||
\item{missing}{Missing is only used when input is dense matrix, pick a float}
|
||||
\item{label}{the response variable. User should not set this field,
|
||||
if data is local data file or \code{xgb.DMatrix}.}
|
||||
|
||||
\item{params}{the list of parameters. Commonly used ones are:
|
||||
\itemize{
|
||||
@ -36,6 +34,9 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
|
||||
information of performance. If 2, xgboost will print information of both
|
||||
performance and construction progress information}
|
||||
|
||||
\item{missing}{Missing is only used when input is dense matrix, pick a float
|
||||
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
|
||||
|
||||
\item{...}{other parameters to pass to \code{params}.}
|
||||
}
|
||||
\description{
|
||||
|
||||
@ -71,13 +71,13 @@ extern "C" {
|
||||
SEXP missing) {
|
||||
_WrapperBegin();
|
||||
SEXP dim = getAttrib(mat, R_DimSymbol);
|
||||
int nrow = INTEGER(dim)[0];
|
||||
int ncol = INTEGER(dim)[1];
|
||||
bst_ulong nrow = static_cast<bst_ulong>(INTEGER(dim)[0]);
|
||||
bst_ulong ncol = static_cast<bst_ulong>(INTEGER(dim)[1]);
|
||||
double *din = REAL(mat);
|
||||
std::vector<float> data(nrow * ncol);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int i = 0; i < nrow; ++i) {
|
||||
for (int j = 0; j < ncol; ++j) {
|
||||
for (bst_omp_uint i = 0; i < nrow; ++i) {
|
||||
for (bst_ulong j = 0; j < ncol; ++j) {
|
||||
data[i * ncol +j] = din[i + nrow * j];
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
xgboost: eXtreme Gradient Boosting
|
||||
======
|
||||
An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
|
||||
It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree.
|
||||
It implements machine learning algorithms under the gradient boosting framework, including generalized linear models and gradient boosted regression trees (GBDT). XGBoost can also be distributed and scale to even larger data.
|
||||
|
||||
Contributors: https://github.com/tqchen/xgboost/graphs/contributors
|
||||
|
||||
|
||||
1
demo/.gitignore
vendored
Normal file
1
demo/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
*.libsvm
|
||||
9
demo/yearpredMSD/README.md
Normal file
9
demo/yearpredMSD/README.md
Normal file
@ -0,0 +1,9 @@
|
||||
Demonstrating how to use XGBoost on [Year Prediction task of Million Song Dataset](https://archive.ics.uci.edu/ml/datasets/YearPredictionMSD)
|
||||
|
||||
1. Run runexp.sh
|
||||
```bash
|
||||
./runexp.sh
|
||||
```
|
||||
|
||||
You can also use the script to prepare LIBSVM format, and run the [Distributed Version](../../multi-node).
|
||||
Note that you normally only need a single machine for a dataset at this scale; use the distributed version for larger-scale datasets.
|
||||
14
demo/yearpredMSD/csv2libsvm.py
Executable file
14
demo/yearpredMSD/csv2libsvm.py
Executable file
@ -0,0 +1,14 @@
|
||||
#!/usr/bin/python
|
||||
import sys
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print 'Usage: <csv> <libsvm>'
|
||||
print 'convert a all numerical csv to libsvm'
|
||||
|
||||
fo = open(sys.argv[2], 'w')
|
||||
for l in open(sys.argv[1]):
|
||||
arr = l.split(',')
|
||||
fo.write('%s' % arr[0])
|
||||
for i in xrange(len(arr) - 1):
|
||||
fo.write(' %d:%s' % (i, arr[i+1]))
|
||||
fo.close()
|
||||
17
demo/yearpredMSD/runexp.sh
Executable file
17
demo/yearpredMSD/runexp.sh
Executable file
@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ -f YearPredictionMSD.txt ]
|
||||
then
|
||||
echo "use existing data to run experiment"
|
||||
else
|
||||
echo "getting data from uci, make sure you are connected to internet"
|
||||
wget https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip
|
||||
unzip YearPredictionMSD.txt.zip
|
||||
fi
|
||||
echo "start making data.."
|
||||
# map feature using indicator encoding, also produce featmap.txt
|
||||
python csv2libsvm.py YearPredictionMSD.txt yearpredMSD.libsvm
|
||||
head -n 463715 yearpredMSD.libsvm > yearpredMSD.libsvm.train
|
||||
tail -n 51630 yearpredMSD.libsvm > yearpredMSD.libsvm.test
|
||||
echo "finish making the data"
|
||||
../../xgboost yearpredMSD.conf
|
||||
@ -1,6 +1,6 @@
|
||||
Distributed XGBoost
|
||||
======
|
||||
This folder contains information of Distributed XGBoost.
|
||||
This folder contains information about Distributed XGBoost (Distributed GBDT).
|
||||
|
||||
* The distributed version is built on Rabit:[Reliable Allreduce and Broadcast Library](https://github.com/tqchen/rabit)
|
||||
- Rabit is a portable library that provides fault-tolerance for Allreduce calls for distributed machine learning
|
||||
|
||||
@ -339,7 +339,7 @@ class FMatrixPage : public IFMatrix {
|
||||
}
|
||||
if (ktop % 100000 == 0) {
|
||||
utils::Printf("\r \r");
|
||||
utils::Printf("InitCol: %lu rows ", static_cast<unsigned long>(ktop));
|
||||
utils::Printf("InitCol: %lu rows ", static_cast<unsigned long>(ktop));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -227,14 +227,19 @@ class BoostLearner : public rabit::ISerializable {
|
||||
*/
|
||||
inline void SaveModel(const char *fname) const {
|
||||
FILE *fp;
|
||||
bool use_stdout = false;;
|
||||
#ifndef XGBOOST_STRICT_CXX98_
|
||||
if (!strcmp(fname, "stdout")) {
|
||||
fp = stdout;
|
||||
} else {
|
||||
use_stdout = true;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
fp = utils::FopenCheck(fname, "wb");
|
||||
}
|
||||
utils::FileStream fo(fp);
|
||||
std::string header;
|
||||
if (save_base64 != 0|| fp == stdout) {
|
||||
if (save_base64 != 0|| use_stdout) {
|
||||
fo.Write("bs64\t", 5);
|
||||
utils::Base64OutStream bout(fp);
|
||||
this->SaveModel(bout);
|
||||
@ -243,7 +248,9 @@ class BoostLearner : public rabit::ISerializable {
|
||||
fo.Write("binf", 4);
|
||||
this->SaveModel(fo);
|
||||
}
|
||||
if (fp != stdout) fclose(fp);
|
||||
if (!use_stdout) {
|
||||
fclose(fp);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief check if data matrix is ready to be used by training,
|
||||
|
||||
@ -206,7 +206,8 @@ class SoftmaxMultiClassObj : public IObjFunction {
|
||||
Softmax(&rec);
|
||||
const unsigned j = i % nstep;
|
||||
int label = static_cast<int>(info.labels[j]);
|
||||
utils::Check(label < nclass, "SoftmaxMultiClassObj: label exceed num_class");
|
||||
utils::Check(label >= 0 && label < nclass,
|
||||
"SoftmaxMultiClassObj: label must be in [0, num_class)");
|
||||
const float wt = info.GetWeight(j);
|
||||
for (int k = 0; k < nclass; ++k) {
|
||||
float p = rec[k];
|
||||
|
||||
@ -208,7 +208,6 @@ struct SparseCSRFileBuilder {
|
||||
fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
|
||||
}
|
||||
}
|
||||
printf("CSV::begin_dat=%lu\n", begin_data);
|
||||
}
|
||||
protected:
|
||||
inline void WriteBuffer(void) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user