Merge branch 'master' into unity

Commit e18a4fc5b6 by tqchen, 2014-08-30 15:01:52 -07:00
15 changed files with 140 additions and 55 deletions

View File

@@ -11,7 +11,7 @@ xgboost-0.2x
 * Weighted samples instances
 * Initial version of pairwise rank
-xgboost-unity
+xgboost-0.3
 =====
 * Faster tree construction module
 - Allows subsample columns during tree construction via ```bst:col_samplebytree=ratio```

View File

@@ -6,7 +6,7 @@ Date: 2014-08-23
 Author: Tianqi Chen, Tong He
 Maintainer: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>
 Description: xgboost
-License: See LICENSE file in the project root of xgboost.
+License: file LICENSE
 URL: https://github.com/tqchen/xgboost
 BugReports: https://github.com/tqchen/xgboost/issues
 Depends:

View File: R-package/LICENSE (new file)

@@ -0,0 +1,13 @@
+Copyright (c) 2014 by Tianqi Chen and Contributors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.

View File

@@ -11,7 +11,7 @@ setClass('xgb.DMatrix')
 #' data(iris)
 #' iris[,5] <- as.numeric(iris[,5])
 #' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
-#' labels <- getinfo(dtest, "label")
+#' labels <- getinfo(dtrain, "label")
 #' @export
 #'
 getinfo <- function(object, ...){

View File

@@ -21,6 +21,7 @@ xgb.save <- function(model, fname) {
     .Call("XGBoosterSaveModel_R", model, fname, PACKAGE = "xgboost")
     return(TRUE)
   }
-  stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
+  stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save
+       xgb.DMatrix object.")
   return(FALSE)
 }
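The new message splits the two save paths: `xgb.save` persists a trained `Booster`, while `xgb.DMatrix.save` (named in the new error text) persists data. A minimal sketch of the intended usage, following the vignette's iris example; the `nrounds` argument and exact `xgboost()` signature are assumptions here and may differ slightly in this version of the package:

```r
library(xgboost)

data(iris)
iris[, 5] <- as.numeric(iris[, 5])
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = iris[, 5])

# Train a small booster, mirroring the vignette's iris example
# (argument names are illustrative only).
bst <- xgboost(as.matrix(iris[, 1:4]), as.numeric(iris[, 5]), nrounds = 2)

xgb.save(bst, "model.save")                # Booster -> xgb.save
xgb.DMatrix.save(dtrain, "dtrain.buffer")  # xgb.DMatrix -> xgb.DMatrix.save
```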

View File: R-package/demo/00Index (new file)

@@ -0,0 +1 @@
+demo R code for xgboost usages on agaricus data

View File

@@ -17,6 +17,6 @@ Get information of an xgb.DMatrix object
 data(iris)
 iris[,5] <- as.numeric(iris[,5])
 dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
-labels <- getinfo(dtest, "label")
+labels <- getinfo(dtrain, "label")
 }
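The same `dtest` → `dtrain` fix lands in both the roxygen source above and this generated man page: the example previously queried an object it never constructed. A runnable version of the corrected example; the `stopifnot` check is added here purely for illustration:

```r
library(xgboost)

data(iris)
iris[, 5] <- as.numeric(iris[, 5])
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = iris[, 5])

# getinfo now reads back from the object that was actually built.
labels <- getinfo(dtrain, "label")
stopifnot(length(labels) == nrow(iris))  # one label per training row
```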

View File

@@ -2,13 +2,11 @@
 PKGROOT=../../
 # _*_ mode: Makefile; _*_
 CXX=`R CMD config CXX`
-CFLAGS=`R CMD config CFLAGS`
+TCFLAGS=`R CMD config CFLAGS`
 # expose these flags to R CMD SHLIB
 PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT) $(SHLIB_OPENMP_CFLAGS)
 PKG_CPPFLAGS+= $(SHLIB_PTHREAD_FLAGS)
-XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
-PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
+XGBFLAG= $(TCFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
 
 ifeq ($(no_omp),1)
@@ -26,7 +24,7 @@ xgboost_io.o: $(PKGROOT)/src/io/io.cpp
 xgboost_gbm.o: $(PKGROOT)/src/gbm/gbm.cpp
 xgboost_updater.o: $(PKGROOT)/src/tree/updater.cpp
 
 $(CXXOBJ) :
 	$(CXX) -c $(XGBFLAG) -o $@ $(firstword $(filter %.cpp %.c, $^) )
 
 clean:

View File

@@ -2,11 +2,11 @@
 PKGROOT=../../
 # _*_ mode: Makefile; _*_
 CXX=`Rcmd config CXX`
-CFLAGS=`Rcmd config CFLAGS`
+TCFLAGS=`Rcmd config CFLAGS`
 # expose these flags to R CMD SHLIB
 PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT) $(SHLIB_OPENMP_CFLAGS)
 PKG_CPPFLAGS+= $(SHLIB_PTHREAD_FLAGS)
-XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
+XGBFLAG= -O3 -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
 PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
 
 ifeq ($(no_omp),1)
@@ -21,12 +21,13 @@ all: $(SHLIB)
 $(SHLIB): $(OBJECTS)
 
 xgboost_wrapper.o: $(PKGROOT)/wrapper/xgboost_wrapper.cpp
 xgboost_io.o: $(PKGROOT)/src/io/io.cpp
 xgboost_gbm.o: $(PKGROOT)/src/gbm/gbm.cpp
 xgboost_updater.o: $(PKGROOT)/src/tree/updater.cpp
 
 $(CXXOBJ) :
-	$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+	$(CXX) -c $(XGBFLAG) -o $@ $(firstword $(filter %.cpp %.c, $^) )
 
 clean:
 	rm -rf *.so *.o *~ *.dll

View File

@@ -1,39 +1,63 @@
 \documentclass{article}
-\RequirePackage{url}
-\usepackage{natbib}
-\usepackage{graphics}
-\usepackage{amsmath}
 \usepackage{hyperref}
-\usepackage{indentfirst}
-\usepackage[utf8]{inputenc}
+\RequirePackage{amsmath}
+\RequirePackage{natbib}
+\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry}
 
-\DeclareMathOperator{\var}{var}
-\DeclareMathOperator{\cov}{cov}
-
-% \VignetteIndexEntry{xgboost}
+\makeatletter
+% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting}
+%\VignetteKeywords{xgboost, gbm, gradient boosting machines}
+%\VignettePackage{xgboost}
+% \VignetteEngine{knitr::knitr}
+\makeatother
 
 \begin{document}
+%\SweaveOpts{concordance=TRUE}
 
-<<foo,include=FALSE,echo=FALSE>>=
-options(keep.source = TRUE, width = 60)
-foo <- packageDescription("xgboost")
+<<knitropts,echo=FALSE,message=FALSE>>=
+if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE)
 @
-
-\title{xgboost Package Example (Version \Sexpr{foo$Version})}
-\author{Tong He}
-\maketitle
+%
+<<prelim,echo=FALSE>>=
+xgboost.version = '0.3-0'
+@
+%
+\begin{center}
+\vspace*{6\baselineskip}
+\rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt}
+\rule{\textwidth}{0.4pt}\\[2\baselineskip]
+{\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip]
+\rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt}
+\rule{\textwidth}{1.6pt}\\[2\baselineskip]
+{\Large Tianqi Chen, Tong He}\\[\baselineskip]
+{\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip]
+{\large \today}\par
+\vfill
+\end{center}
+\thispagestyle{empty}
+\clearpage
+
+\setcounter{page}{1}
 
 \section{Introduction}
 
-This is an example of using the \verb@xgboost@ package in R.
+This is an introductory document of using the \verb@xgboost@ package in R.
 
-\verb@xgboost@ is short for eXtreme Gradient Boosting (Tree). It supports
-regression and classification analysis on different types of input datasets.
-
-Comparing to \verb@gbm@ in R, it has several features:
+\verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an efficient
+and scalable implementation of gradient boosting framework by \citep{friedman2001greedy}.
+The package includes efficient linear model solver and tree learning algorithm.
+It supports various objective functions, including regression, classification
+and ranking. The package is made to be extendible, so that user are also allowed
+to define there own objectives easily. It has several features:
 \begin{enumerate}
   \item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
-    Windows and Linux, with openmp.}
+    Windows and Linux, with openmp. It is generally over 10 times faster than
+    \verb@gbm@.}
   \item{Input Type: }{\verb@xgboost@ takes several types of input data:}
   \begin{itemize}
     \item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@}
@@ -41,15 +65,15 @@ Comparing to \verb@gbm@ in R, it has several features:
     \item{Data File: }{Local data files}
     \item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.}
   \end{itemize}
-  \item{Regularization: }{\verb@xgboost@ supports regularization for
-    $L_1,L_2$ term on weights and $L_2$ term on bias.}
+  \item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster
+    and linear booster, and is optimized for sparse input.}
   \item{Customization: }{\verb@xgboost@ supports customized objective function
     and evaluation function}
   \item{Performance: }{\verb@xgboost@ has better performance on several different
-    datasets. Its rising popularity and fame in different Kaggle competitions
-    is the evidence.}
+    datasets.}
 \end{enumerate}
 
 \section{Example with iris}
 
 In this section, we will illustrate some common usage of \verb@xgboost@.
@@ -62,7 +86,6 @@ bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]),
 xgb.save(bst, 'model.save')
 bst = xgb.load('model.save')
 pred <- predict(bst, as.matrix(iris[,1:4]))
-hist(pred)
 @
 
 \verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
@@ -93,7 +116,8 @@ booster[1]:
 \end{verbatim}
 
 It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
-It speeds up \verb@xgboost@.
+It speeds up \verb@xgboost@, and is needed for advanced features such as
+training from initial prediction value, weighted training instance.
 
 We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
 <<xgb.DMatrix>>=
@@ -119,7 +143,7 @@ is more flexible than \verb@xgboost@, but it requires users to read the document
 a bit more carefully.
 
 \verb@xgb.train@ only accept a \verb@xgb.DMatrix@ object as its input, while it
-supports some additional features as custom objective and evaluation functions.
+supports advanced features as custom objective and evaluation functions.
 
 <<Customized loss function>>=
 logregobj <- function(preds, dtrain) {
@@ -149,14 +173,14 @@ objective function.
 We also have \verb@slice@ for row extraction. It is useful in
 cross-validation.
 
+For a walkthrough demo, please see \verb@R-package/demo/demo.R@ for further
+details.
+
 \section{The Higgs Boson competition}
 
 We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs
 Boson Machine Learning Challenge}.
 
-Our result reaches 3.60 with a single model. This results stands in the top 30%
-of the competition.
-
 Here are the instructions to make a submission
 \begin{enumerate}
   \item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets}
@@ -169,5 +193,23 @@ Here are the instructions to make a submission
   and submit your result.
 \end{enumerate}
 
+We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script}
+to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@.
+The training set contains 350000 records and 30 features.
+
+\verb@xgboost@ can automatically do parallel computation. On a machine with Intel
+i7-4700MQ and 24GB memories, we found that \verb@xgboost@ costs about 35 seconds, which is about 20 times faster
+than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was
+still about two times faster than \verb@gbm@.
+
+Meanwhile, the result from \verb@xgboost@ reaches
+\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a
+single model. This results stands in the
+\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
+competition.
+
+\bibliographystyle{jss}
+\nocite{*} % list uncited references
+\bibliography{xgboost}
+
 \end{document}
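The vignette only excerpts its customized-objective chunk here (`logregobj <- function(preds, dtrain) {`). For reference, a sketch of a complete log-loss objective and a matching error metric in the shape `xgb.train` expects; the gradient and hessian follow the standard logistic-loss derivation, and everything beyond the `preds`/`dtrain` signature shown in the hunk is an assumption of this sketch:

```r
# Customized objective: binary logistic loss. xgb.train calls this with the
# current raw predictions and the training xgb.DMatrix, and expects the
# first- and second-order gradients back.
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds  <- 1 / (1 + exp(-preds))   # raw margin -> probability
  grad   <- preds - labels          # d(logloss)/d(margin)
  hess   <- preds * (1 - preds)     # d^2(logloss)/d(margin)^2
  list(grad = grad, hess = hess)
}

# Customized evaluation: classification error at a margin threshold of 0.
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- mean(as.numeric(preds > 0) != labels)
  list(metric = "error", value = err)
}
```

These would be handed to `xgb.train` as its objective and evaluation arguments; the exact argument names of that call are not shown in this hunk.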

View File

@ -0,0 +1,20 @@
@article{friedman2001greedy,
title={Greedy function approximation: a gradient boosting machine},
author={Friedman, Jerome H},
journal={Annals of Statistics},
pages={1189--1232},
year={2001},
publisher={JSTOR}
}
@article{friedman2000additive,
title={Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors)},
author={Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert and others},
journal={The annals of statistics},
volume={28},
number={2},
pages={337--407},
year={2000},
publisher={Institute of Mathematical Statistics}
}

View File

@@ -33,7 +33,7 @@ Build
 Version
 ======
-* This version is named xgboost-unity, the code has been refactored from 0.2x to be cleaner and more flexibility
+* This version xgboost-0.3, the code has been refactored from 0.2x to be cleaner and more flexibility
 * This version of xgboost is not compatible with 0.2x, due to huge amount of changes in code structure
   - This means the model and buffer file of previous version can not be loaded in xgboost-unity
 * For legacy 0.2x code, refer to [Here](https://github.com/tqchen/xgboost/releases/tag/v0.22)

View File

@@ -105,7 +105,9 @@ class DMatrixSimple : public DataMatrix {
     if (!silent) {
       printf("%lux%lu matrix with %lu entries is loaded from %s\n",
-             info.num_row(), info.num_col(), row_data_.size(), fname);
+             static_cast<unsigned long>(info.num_row()),
+             static_cast<unsigned long>(info.num_col()),
+             static_cast<unsigned long>(row_data_.size()), fname);
     }
     fclose(file);
     // try to load in additional file
@@ -155,7 +157,9 @@ class DMatrixSimple : public DataMatrix {
     if (!silent) {
       printf("%lux%lu matrix with %lu entries is loaded",
-             info.num_row(), info.num_col(), row_data_.size());
+             static_cast<unsigned long>(info.num_row()),
+             static_cast<unsigned long>(info.num_col()),
+             static_cast<unsigned long>(row_data_.size()));
       if (fname != NULL) {
         printf(" from %s\n", fname);
       } else {
@@ -183,9 +187,12 @@ class DMatrixSimple : public DataMatrix {
     if (!silent) {
       printf("%lux%lu matrix with %lu entries is saved to %s\n",
-             info.num_row(), info.num_col(), row_data_.size(), fname);
+             static_cast<unsigned long>(info.num_row()),
+             static_cast<unsigned long>(info.num_col()),
+             static_cast<unsigned long>(row_data_.size()), fname);
       if (info.group_ptr.size() != 0) {
-        printf("data contains %lu groups\n", info.group_ptr.size()-1);
+        printf("data contains %u groups\n",
+               static_cast<unsigned>(info.group_ptr.size()-1));
       }
     }
   }

View File

@@ -98,7 +98,8 @@ struct MetaInfo {
       group_ptr.push_back(group_ptr.back()+nline);
     }
     if (!silent) {
-      printf("%lu groups are loaded from %s\n", group_ptr.size()-1, fname);
+      printf("%u groups are loaded from %s\n",
+             static_cast<unsigned>(group_ptr.size()-1), fname);
     }
     fclose(fi);
     return true;

View File

@@ -66,10 +66,11 @@ class BoostLearner {
       snprintf(str_temp, sizeof(str_temp), "%u", num_feature);
       this->SetParam("bst:num_feature", str_temp);
     }
-    snprintf(str_temp, sizeof(str_temp), "%lu", buffer_size);
+    snprintf(str_temp, sizeof(str_temp), "%lu",
+             static_cast<unsigned long>(buffer_size));
     this->SetParam("num_pbuffer", str_temp);
     if (!silent) {
-      printf("buffer_size=%ld\n", buffer_size);
+      printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
     }
   }
   /*!
/*! /*!