Merge branch 'master' into unity
This commit is contained in:
commit
e18a4fc5b6
@ -11,7 +11,7 @@ xgboost-0.2x
|
||||
* Weighted samples instances
|
||||
* Initial version of pairwise rank
|
||||
|
||||
xgboost-unity
|
||||
xgboost-0.3
|
||||
=====
|
||||
* Faster tree construction module
|
||||
- Allows subsample columns during tree construction via ```bst:col_samplebytree=ratio```
|
||||
|
||||
@ -6,7 +6,7 @@ Date: 2014-08-23
|
||||
Author: Tianqi Chen, Tong He
|
||||
Maintainer: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>
|
||||
Description: xgboost
|
||||
License: See LICENSE file in the project root of xgboost.
|
||||
License: file LICENSE
|
||||
URL: https://github.com/tqchen/xgboost
|
||||
BugReports: https://github.com/tqchen/xgboost/issues
|
||||
Depends:
|
||||
|
||||
13
R-package/LICENSE
Normal file
13
R-package/LICENSE
Normal file
@ -0,0 +1,13 @@
|
||||
Copyright (c) 2014 by Tianqi Chen and Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
@ -11,7 +11,7 @@ setClass('xgb.DMatrix')
|
||||
#' data(iris)
|
||||
#' iris[,5] <- as.numeric(iris[,5])
|
||||
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
|
||||
#' labels <- getinfo(dtest, "label")
|
||||
#' labels <- getinfo(dtrain, "label")
|
||||
#' @export
|
||||
#'
|
||||
getinfo <- function(object, ...){
|
||||
|
||||
@ -21,6 +21,7 @@ xgb.save <- function(model, fname) {
|
||||
.Call("XGBoosterSaveModel_R", model, fname, PACKAGE = "xgboost")
|
||||
return(TRUE)
|
||||
}
|
||||
stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
|
||||
stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save
|
||||
xgb.DMatrix object.")
|
||||
return(FALSE)
|
||||
}
|
||||
|
||||
1
R-package/demo/00Index
Normal file
1
R-package/demo/00Index
Normal file
@ -0,0 +1 @@
|
||||
demo R code for xgboost usages on agaricus data
|
||||
@ -17,6 +17,6 @@ Get information of an xgb.DMatrix object
|
||||
data(iris)
|
||||
iris[,5] <- as.numeric(iris[,5])
|
||||
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
|
||||
labels <- getinfo(dtest, "label")
|
||||
labels <- getinfo(dtrain, "label")
|
||||
}
|
||||
|
||||
|
||||
@ -2,13 +2,11 @@
|
||||
PKGROOT=../../
|
||||
# _*_ mode: Makefile; _*_
|
||||
CXX=`R CMD config CXX`
|
||||
CFLAGS=`R CMD config CFLAGS`
|
||||
TCFLAGS=`R CMD config CFLAGS`
|
||||
# expose these flags to R CMD SHLIB
|
||||
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT) $(SHLIB_OPENMP_CFLAGS)
|
||||
PKG_CPPFLAGS+= $(SHLIB_PTHREAD_FLAGS)
|
||||
XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
|
||||
|
||||
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
|
||||
XGBFLAG= $(TCFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
|
||||
|
||||
ifeq ($(no_omp),1)
|
||||
PKG_CPPFLAGS += -DDISABLE_OPENMP
|
||||
@ -26,7 +24,7 @@ xgboost_io.o: $(PKGROOT)/src/io/io.cpp
|
||||
xgboost_gbm.o: $(PKGROOT)/src/gbm/gbm.cpp
|
||||
xgboost_updater.o: $(PKGROOT)/src/tree/updater.cpp
|
||||
|
||||
$(CXXOBJ) :
|
||||
$(CXXOBJ) :
|
||||
$(CXX) -c $(XGBFLAG) -o $@ $(firstword $(filter %.cpp %.c, $^) )
|
||||
|
||||
clean:
|
||||
|
||||
@ -2,11 +2,11 @@
|
||||
PKGROOT=../../
|
||||
# _*_ mode: Makefile; _*_
|
||||
CXX=`Rcmd config CXX`
|
||||
CFLAGS=`Rcmd config CFLAGS`
|
||||
TCFLAGS=`Rcmd config CFLAGS`
|
||||
# expose these flags to R CMD SHLIB
|
||||
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT) $(SHLIB_OPENMP_CFLAGS)
|
||||
PKG_CPPFLAGS+= $(SHLIB_PTHREAD_FLAGS)
|
||||
XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
|
||||
XGBFLAG= -O3 -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
|
||||
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
|
||||
|
||||
ifeq ($(no_omp),1)
|
||||
@ -21,12 +21,13 @@ all: $(SHLIB)
|
||||
$(SHLIB): $(OBJECTS)
|
||||
|
||||
xgboost_wrapper.o: $(PKGROOT)/wrapper/xgboost_wrapper.cpp
|
||||
xgboost_io.o: $(PKGROOT)/src/io/io.cpp
|
||||
xgboost_io.o: $(PKGROOT)/src/io/io.cpp
|
||||
xgboost_gbm.o: $(PKGROOT)/src/gbm/gbm.cpp
|
||||
xgboost_updater.o: $(PKGROOT)/src/tree/updater.cpp
|
||||
|
||||
$(CXXOBJ) :
|
||||
$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
|
||||
$(CXXOBJ) :
|
||||
$(CXX) -c $(XGBFLAG) -o $@ $(firstword $(filter %.cpp %.c, $^) )
|
||||
|
||||
clean:
|
||||
rm -rf *.so *.o *~ *.dll
|
||||
|
||||
|
||||
@ -1,39 +1,63 @@
|
||||
\documentclass{article}
|
||||
|
||||
\usepackage{natbib}
|
||||
\usepackage{graphics}
|
||||
\usepackage{amsmath}
|
||||
\RequirePackage{url}
|
||||
\usepackage{hyperref}
|
||||
\usepackage{indentfirst}
|
||||
\usepackage[utf8]{inputenc}
|
||||
\RequirePackage{amsmath}
|
||||
\RequirePackage{natbib}
|
||||
\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry}
|
||||
|
||||
\DeclareMathOperator{\var}{var}
|
||||
\DeclareMathOperator{\cov}{cov}
|
||||
|
||||
% \VignetteIndexEntry{xgboost}
|
||||
\makeatletter
|
||||
% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting}
|
||||
%\VignetteKeywords{xgboost, gbm, gradient boosting machines}
|
||||
%\VignettePackage{xgboost}
|
||||
% \VignetteEngine{knitr::knitr}
|
||||
\makeatother
|
||||
|
||||
\begin{document}
|
||||
%\SweaveOpts{concordance=TRUE}
|
||||
|
||||
<<foo,include=FALSE,echo=FALSE>>=
|
||||
options(keep.source = TRUE, width = 60)
|
||||
foo <- packageDescription("xgboost")
|
||||
<<knitropts,echo=FALSE,message=FALSE>>=
|
||||
if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE)
|
||||
@
|
||||
|
||||
\title{xgboost Package Example (Version \Sexpr{foo$Version})}
|
||||
\author{Tong He}
|
||||
\maketitle
|
||||
%
|
||||
<<prelim,echo=FALSE>>=
|
||||
xgboost.version = '0.3-0'
|
||||
@
|
||||
%
|
||||
|
||||
\begin{center}
|
||||
\vspace*{6\baselineskip}
|
||||
\rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt}
|
||||
\rule{\textwidth}{0.4pt}\\[2\baselineskip]
|
||||
{\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip]
|
||||
\rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt}
|
||||
\rule{\textwidth}{1.6pt}\\[2\baselineskip]
|
||||
{\Large Tianqi Chen, Tong He}\\[\baselineskip]
|
||||
{\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip]
|
||||
{\large \today}\par
|
||||
\vfill
|
||||
\end{center}
|
||||
|
||||
\thispagestyle{empty}
|
||||
|
||||
\clearpage
|
||||
|
||||
\setcounter{page}{1}
|
||||
|
||||
\section{Introduction}
|
||||
|
||||
This is an example of using the \verb@xgboost@ package in R.
|
||||
This is an introductory document of using the \verb@xgboost@ package in R.
|
||||
|
||||
\verb@xgboost@ is short for eXtreme Gradient Boosting (Tree). It supports
|
||||
regression and classification analysis on different types of input datasets.
|
||||
|
||||
Comparing to \verb@gbm@ in R, it has several features:
|
||||
\verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an efficient
|
||||
and scalable implementation of the gradient boosting framework by \citep{friedman2001greedy}.
|
||||
The package includes an efficient linear model solver and a tree learning algorithm.
|
||||
It supports various objective functions, including regression, classification
|
||||
and ranking. The package is made to be extendible, so that users are also allowed
|
||||
to define their own objectives easily. It has several features:
|
||||
\begin{enumerate}
|
||||
\item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
|
||||
Windows and Linux, with openmp.}
|
||||
Windows and Linux, with openmp. It is generally over 10 times faster than
|
||||
\verb@gbm@.}
|
||||
\item{Input Type: }{\verb@xgboost@ takes several types of input data:}
|
||||
\begin{itemize}
|
||||
\item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@}
|
||||
@ -41,15 +65,15 @@ Comparing to \verb@gbm@ in R, it has several features:
|
||||
\item{Data File: }{Local data files}
|
||||
\item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.}
|
||||
\end{itemize}
|
||||
\item{Regularization: }{\verb@xgboost@ supports regularization for
|
||||
$L_1,L_2$ term on weights and $L_2$ term on bias.}
|
||||
\item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster
|
||||
and linear booster, and is optimized for sparse input.}
|
||||
\item{Customization: }{\verb@xgboost@ supports customized objective function
|
||||
and evaluation function}
|
||||
\item{Performance: }{\verb@xgboost@ has better performance on several different
|
||||
datasets. Its rising popularity and fame in different Kaggle competitions
|
||||
is the evidence.}
|
||||
datasets.}
|
||||
\end{enumerate}
|
||||
|
||||
|
||||
\section{Example with iris}
|
||||
|
||||
In this section, we will illustrate some common usage of \verb@xgboost@.
|
||||
@ -62,7 +86,6 @@ bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]),
|
||||
xgb.save(bst, 'model.save')
|
||||
bst = xgb.load('model.save')
|
||||
pred <- predict(bst, as.matrix(iris[,1:4]))
|
||||
hist(pred)
|
||||
@
|
||||
|
||||
\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
|
||||
@ -93,7 +116,8 @@ booster[1]:
|
||||
\end{verbatim}
|
||||
|
||||
It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
|
||||
It speeds up \verb@xgboost@.
|
||||
It speeds up \verb@xgboost@, and is needed for advanced features such as
|
||||
training from an initial prediction value and weighted training instances.
|
||||
|
||||
We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
|
||||
<<xgb.DMatrix>>=
|
||||
@ -119,7 +143,7 @@ is more flexible than \verb@xgboost@, but it requires users to read the document
|
||||
a bit more carefully.
|
||||
|
||||
\verb@xgb.train@ only accepts a \verb@xgb.DMatrix@ object as its input, while it
|
||||
supports some additional features as custom objective and evaluation functions.
|
||||
supports advanced features such as custom objective and evaluation functions.
|
||||
|
||||
<<Customized loss function>>=
|
||||
logregobj <- function(preds, dtrain) {
|
||||
@ -149,14 +173,14 @@ objective function.
|
||||
We also have \verb@slice@ for row extraction. It is useful in
|
||||
cross-validation.
|
||||
|
||||
For a walkthrough demo, please see \verb@R-package/demo/demo.R@ for further
|
||||
details.
|
||||
|
||||
\section{The Higgs Boson competition}
|
||||
|
||||
We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs
|
||||
Boson Machine Learning Challenge}.
|
||||
|
||||
Our result reaches 3.60 with a single model. This results stands in the top 30%
|
||||
of the competition.
|
||||
|
||||
Here are the instructions to make a submission
|
||||
\begin{enumerate}
|
||||
\item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets}
|
||||
@ -169,5 +193,23 @@ Here are the instructions to make a submission
|
||||
and submit your result.
|
||||
\end{enumerate}
|
||||
|
||||
We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script}
|
||||
to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@.
|
||||
The training set contains 350000 records and 30 features.
|
||||
|
||||
\verb@xgboost@ can automatically do parallel computation. On a machine with Intel
|
||||
i7-4700MQ and 24GB of memory, we found that \verb@xgboost@ takes about 35 seconds, which is about 20 times faster
|
||||
than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was
|
||||
still about two times faster than \verb@gbm@.
|
||||
|
||||
Meanwhile, the result from \verb@xgboost@ reaches
|
||||
\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a
|
||||
single model. This result stands in the
|
||||
\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
|
||||
competition.
|
||||
|
||||
\bibliographystyle{jss}
|
||||
\nocite{*} % list uncited references
|
||||
\bibliography{xgboost}
|
||||
|
||||
\end{document}
|
||||
20
R-package/vignettes/xgboost.bib
Normal file
20
R-package/vignettes/xgboost.bib
Normal file
@ -0,0 +1,20 @@
|
||||
@article{friedman2001greedy,
|
||||
title={Greedy function approximation: a gradient boosting machine},
|
||||
author={Friedman, Jerome H},
|
||||
journal={Annals of Statistics},
|
||||
pages={1189--1232},
|
||||
year={2001},
|
||||
publisher={JSTOR}
|
||||
}
|
||||
|
||||
@article{friedman2000additive,
|
||||
title={Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors)},
|
||||
author={Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert and others},
|
||||
journal={The annals of statistics},
|
||||
volume={28},
|
||||
number={2},
|
||||
pages={337--407},
|
||||
year={2000},
|
||||
publisher={Institute of Mathematical Statistics}
|
||||
}
|
||||
|
||||
@ -33,7 +33,7 @@ Build
|
||||
|
||||
Version
|
||||
======
|
||||
* This version is named xgboost-unity, the code has been refactored from 0.2x to be cleaner and more flexibility
|
||||
* This version is xgboost-0.3; the code has been refactored from 0.2x to be cleaner and more flexible
|
||||
* This version of xgboost is not compatible with 0.2x, due to the huge amount of changes in code structure
|
||||
- This means the model and buffer file of previous version can not be loaded in xgboost-unity
|
||||
* For legacy 0.2x code, refer to [Here](https://github.com/tqchen/xgboost/releases/tag/v0.22)
|
||||
|
||||
@ -105,7 +105,9 @@ class DMatrixSimple : public DataMatrix {
|
||||
|
||||
if (!silent) {
|
||||
printf("%lux%lu matrix with %lu entries is loaded from %s\n",
|
||||
info.num_row(), info.num_col(), row_data_.size(), fname);
|
||||
static_cast<unsigned long>(info.num_row()),
|
||||
static_cast<unsigned long>(info.num_col()),
|
||||
static_cast<unsigned long>(row_data_.size()), fname);
|
||||
}
|
||||
fclose(file);
|
||||
// try to load in additional file
|
||||
@ -155,7 +157,9 @@ class DMatrixSimple : public DataMatrix {
|
||||
|
||||
if (!silent) {
|
||||
printf("%lux%lu matrix with %lu entries is loaded",
|
||||
info.num_row(), info.num_col(), row_data_.size());
|
||||
static_cast<unsigned long>(info.num_row()),
|
||||
static_cast<unsigned long>(info.num_col()),
|
||||
static_cast<unsigned long>(row_data_.size()));
|
||||
if (fname != NULL) {
|
||||
printf(" from %s\n", fname);
|
||||
} else {
|
||||
@ -183,9 +187,12 @@ class DMatrixSimple : public DataMatrix {
|
||||
|
||||
if (!silent) {
|
||||
printf("%lux%lu matrix with %lu entries is saved to %s\n",
|
||||
info.num_row(), info.num_col(), row_data_.size(), fname);
|
||||
static_cast<unsigned long>(info.num_row()),
|
||||
static_cast<unsigned long>(info.num_col()),
|
||||
static_cast<unsigned long>(row_data_.size()), fname);
|
||||
if (info.group_ptr.size() != 0) {
|
||||
printf("data contains %lu groups\n", info.group_ptr.size()-1);
|
||||
printf("data contains %u groups\n",
|
||||
static_cast<unsigned>(info.group_ptr.size()-1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -98,7 +98,8 @@ struct MetaInfo {
|
||||
group_ptr.push_back(group_ptr.back()+nline);
|
||||
}
|
||||
if (!silent) {
|
||||
printf("%lu groups are loaded from %s\n", group_ptr.size()-1, fname);
|
||||
printf("%u groups are loaded from %s\n",
|
||||
static_cast<unsigned>(group_ptr.size()-1), fname);
|
||||
}
|
||||
fclose(fi);
|
||||
return true;
|
||||
|
||||
@ -66,10 +66,11 @@ class BoostLearner {
|
||||
snprintf(str_temp, sizeof(str_temp), "%u", num_feature);
|
||||
this->SetParam("bst:num_feature", str_temp);
|
||||
}
|
||||
snprintf(str_temp, sizeof(str_temp), "%lu", buffer_size);
|
||||
snprintf(str_temp, sizeof(str_temp), "%lu",
|
||||
static_cast<unsigned long>(buffer_size));
|
||||
this->SetParam("num_pbuffer", str_temp);
|
||||
if (!silent) {
|
||||
printf("buffer_size=%ld\n", buffer_size);
|
||||
printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
|
||||
}
|
||||
}
|
||||
/*!
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user