diff --git a/CHANGES.md b/CHANGES.md index 62d21c21a..027a077c6 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -11,7 +11,7 @@ xgboost-0.2x * Weighted samples instances * Initial version of pairwise rank -xgboost-unity +xgboost-0.3 ===== * Faster tree construction module - Allows subsample columns during tree construction via ```bst:col_samplebytree=ratio``` diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 16a007c0a..7d60143bd 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -6,7 +6,7 @@ Date: 2014-08-23 Author: Tianqi Chen, Tong He Maintainer: Tianqi Chen , Tong He Description: xgboost -License: See LICENSE file in the project root of xgboost. +License: file LICENSE URL: https://github.com/tqchen/xgboost BugReports: https://github.com/tqchen/xgboost/issues Depends: diff --git a/R-package/LICENSE b/R-package/LICENSE new file mode 100644 index 000000000..b9f38c38a --- /dev/null +++ b/R-package/LICENSE @@ -0,0 +1,13 @@ +Copyright (c) 2014 by Tianqi Chen and Contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/R-package/R/getinfo.xgb.DMatrix.R b/R-package/R/getinfo.xgb.DMatrix.R index 4fa8d58fa..5b438049c 100644 --- a/R-package/R/getinfo.xgb.DMatrix.R +++ b/R-package/R/getinfo.xgb.DMatrix.R @@ -11,7 +11,7 @@ setClass('xgb.DMatrix') #' data(iris) #' iris[,5] <- as.numeric(iris[,5]) #' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) -#' labels <- getinfo(dtest, "label") +#' labels <- getinfo(dtrain, "label") #' @export #' getinfo <- function(object, ...){ diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index a3a3ca6a9..64add0ca9 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -21,6 +21,7 @@ xgb.save <- function(model, fname) { .Call("XGBoosterSaveModel_R", model, fname, PACKAGE = "xgboost") return(TRUE) } - stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster") + stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save + xgb.DMatrix object.") return(FALSE) } diff --git a/R-package/demo/00Index b/R-package/demo/00Index new file mode 100644 index 000000000..2ca4abd32 --- /dev/null +++ b/R-package/demo/00Index @@ -0,0 +1 @@ +demo R code for xgboost usages on agaricus data diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index beee4f850..4f63b5e92 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -17,6 +17,6 @@ Get information of an xgb.DMatrix object data(iris) iris[,5] <- as.numeric(iris[,5]) dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) -labels <- getinfo(dtest, "label") +labels <- getinfo(dtrain, "label") } diff --git a/R-package/src/Makevars b/R-package/src/Makevars index b0d3283b9..3539a913d 100644 --- a/R-package/src/Makevars +++ b/R-package/src/Makevars @@ -2,13 +2,11 @@ PKGROOT=../../ # _*_ mode: Makefile; _*_ CXX=`R CMD config CXX` -CFLAGS=`R CMD config CFLAGS` +TCFLAGS=`R CMD config CFLAGS` # expose these flags to R CMD SHLIB PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT) $(SHLIB_OPENMP_CFLAGS) PKG_CPPFLAGS+= 
$(SHLIB_PTHREAD_FLAGS) -XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) - -PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) +XGBFLAG= $(TCFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) ifeq ($(no_omp),1) PKG_CPPFLAGS += -DDISABLE_OPENMP @@ -26,7 +24,7 @@ xgboost_io.o: $(PKGROOT)/src/io/io.cpp xgboost_gbm.o: $(PKGROOT)/src/gbm/gbm.cpp xgboost_updater.o: $(PKGROOT)/src/tree/updater.cpp -$(CXXOBJ) : +$(CXXOBJ) : $(CXX) -c $(XGBFLAG) -o $@ $(firstword $(filter %.cpp %.c, $^) ) clean: diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index 8f5f7ed98..ae599fbf3 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -2,11 +2,11 @@ PKGROOT=../../ # _*_ mode: Makefile; _*_ CXX=`Rcmd config CXX` -CFLAGS=`Rcmd config CFLAGS` +TCFLAGS=`Rcmd config CFLAGS` # expose these flags to R CMD SHLIB PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT) $(SHLIB_OPENMP_CFLAGS) PKG_CPPFLAGS+= $(SHLIB_PTHREAD_FLAGS) -XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) +XGBFLAG= -O3 -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) ifeq ($(no_omp),1) @@ -21,12 +21,13 @@ all: $(SHLIB) $(SHLIB): $(OBJECTS) xgboost_wrapper.o: $(PKGROOT)/wrapper/xgboost_wrapper.cpp -xgboost_io.o: $(PKGROOT)/src/io/io.cpp +xgboost_io.o: $(PKGROOT)/src/io/io.cpp xgboost_gbm.o: $(PKGROOT)/src/gbm/gbm.cpp xgboost_updater.o: $(PKGROOT)/src/tree/updater.cpp -$(CXXOBJ) : - $(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) +$(CXXOBJ) : + $(CXX) -c $(XGBFLAG) -o $@ $(firstword $(filter %.cpp %.c, $^) ) clean: rm -rf *.so *.o *~ *.dll + diff --git a/R-package/inst/doc/xgboost.Rnw b/R-package/vignettes/xgboost.Rnw similarity index 57% rename from R-package/inst/doc/xgboost.Rnw rename to R-package/vignettes/xgboost.Rnw index 
8fabff2ab..ed4447d57 100644 --- a/R-package/inst/doc/xgboost.Rnw +++ b/R-package/vignettes/xgboost.Rnw @@ -1,39 +1,63 @@ \documentclass{article} - -\usepackage{natbib} -\usepackage{graphics} -\usepackage{amsmath} +\RequirePackage{url} \usepackage{hyperref} -\usepackage{indentfirst} -\usepackage[utf8]{inputenc} +\RequirePackage{amsmath} +\RequirePackage{natbib} +\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry} -\DeclareMathOperator{\var}{var} -\DeclareMathOperator{\cov}{cov} - -% \VignetteIndexEntry{xgboost} +\makeatletter +% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting} +%\VignetteKeywords{xgboost, gbm, gradient boosting machines} +%\VignettePackage{xgboost} +% \VignetteEngine{knitr::knitr} +\makeatother \begin{document} +%\SweaveOpts{concordance=TRUE} -<>= -options(keep.source = TRUE, width = 60) -foo <- packageDescription("xgboost") +<>= +if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE) @ -\title{xgboost Package Example (Version \Sexpr{foo$Version})} -\author{Tong He} -\maketitle +% +<>= +xgboost.version = '0.3-0' +@ +% + + \begin{center} + \vspace*{6\baselineskip} + \rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt} + \rule{\textwidth}{0.4pt}\\[2\baselineskip] + {\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip] + \rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt} + \rule{\textwidth}{1.6pt}\\[2\baselineskip] + {\Large Tianqi Chen, Tong He}\\[\baselineskip] + {\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip] + {\large \today}\par + \vfill + \end{center} + +\thispagestyle{empty} + +\clearpage + +\setcounter{page}{1} \section{Introduction} -This is an example of using the \verb@xgboost@ package in R. +This is an introductory document of using the \verb@xgboost@ package in R. -\verb@xgboost@ is short for eXtreme Gradient Boosting (Tree). 
It supports -regression and classification analysis on different types of input datasets. - -Comparing to \verb@gbm@ in R, it has several features: +\verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an efficient + and scalable implementation of gradient boosting framework by \citep{friedman2001greedy}. +The package includes efficient linear model solver and tree learning algorithm. +It supports various objective functions, including regression, classification +and ranking. The package is made to be extendible, so that users are also allowed +to define their own objectives easily. It has several features: \begin{enumerate} \item{Speed: }{\verb@xgboost@ can automatically do parallel computation on - Windows and Linux, with openmp.} + Windows and Linux, with openmp. It is generally over 10 times faster than + \verb@gbm@.} \item{Input Type: }{\verb@xgboost@ takes several types of input data:} \begin{itemize} \item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@} @@ -41,15 +65,15 @@ Comparing to \verb@gbm@ in R, it has several features: \item{Data File: }{Local data files} \item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.} \end{itemize} - \item{Regularization: }{\verb@xgboost@ supports regularization for - $L_1,L_2$ term on weights and $L_2$ term on bias.} + \item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster + and linear booster, and is optimized for sparse input.} \item{Customization: }{\verb@xgboost@ supports customized objective function and evaluation function} \item{Performance: }{\verb@xgboost@ has better performance on several different - datasets. Its rising popularity and fame in different Kaggle competitions - is the evidence.} + datasets.} \end{enumerate} + \section{Example with iris} In this section, we will illustrate some common usage of \verb@xgboost@. 
@@ -62,7 +86,6 @@ bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), xgb.save(bst, 'model.save') bst = xgb.load('model.save') pred <- predict(bst, as.matrix(iris[,1:4])) -hist(pred) @ \verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model. @@ -93,7 +116,8 @@ booster[1]: \end{verbatim} It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@. -It speeds up \verb@xgboost@. +It speeds up \verb@xgboost@, and is needed for advanced features such as +training from initial prediction value, weighted training instance. We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object: <>= @@ -119,7 +143,7 @@ is more flexible than \verb@xgboost@, but it requires users to read the document a bit more carefully. \verb@xgb.train@ only accept a \verb@xgb.DMatrix@ object as its input, while it -supports some additional features as custom objective and evaluation functions. +supports advanced features as custom objective and evaluation functions. <>= logregobj <- function(preds, dtrain) { @@ -149,14 +173,14 @@ objective function. We also have \verb@slice@ for row extraction. It is useful in cross-validation. +For a walkthrough demo, please see \verb@R-package/demo/demo.R@ for further +details. + \section{The Higgs Boson competition} We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs Boson Machine Learning Challenge}. -Our result reaches 3.60 with a single model. This results stands in the top 30% -of the competition. - Here are the instructions to make a submission \begin{enumerate} \item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets} @@ -169,5 +193,23 @@ Here are the instructions to make a submission and submit your result. \end{enumerate} +We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script} +to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@. 
+The training set contains 350000 records and 30 features. + +\verb@xgboost@ can automatically do parallel computation. On a machine with Intel +i7-4700MQ and 24GB memory, we found that \verb@xgboost@ costs about 35 seconds, which is about 20 times faster +than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was +still about two times faster than \verb@gbm@. + +Meanwhile, the result from \verb@xgboost@ reaches +\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a +single model. This result stands in the +\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the +competition. + +\bibliographystyle{jss} +\nocite{*} % list uncited references +\bibliography{xgboost} \end{document} diff --git a/R-package/vignettes/xgboost.bib b/R-package/vignettes/xgboost.bib new file mode 100644 index 000000000..f69866f04 --- /dev/null +++ b/R-package/vignettes/xgboost.bib @@ -0,0 +1,20 @@ +@article{friedman2001greedy, + title={Greedy function approximation: a gradient boosting machine}, + author={Friedman, Jerome H}, + journal={Annals of Statistics}, + pages={1189--1232}, + year={2001}, + publisher={JSTOR} +} + +@article{friedman2000additive, + title={Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors)}, + author={Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert and others}, + journal={The annals of statistics}, + volume={28}, + number={2}, + pages={337--407}, + year={2000}, + publisher={Institute of Mathematical Statistics} +} + diff --git a/README.md b/README.md index c81059773..ba4b08bfd 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Build Version ====== -* This version is named xgboost-unity, the code has been refactored from 0.2x to be cleaner and more flexibility +* This version is xgboost-0.3, the code has been refactored from 0.2x to be cleaner and more flexible * This version of xgboost is not compatible with 0.2x, due to huge amount 
of changes in code structure - This means the model and buffer file of previous version can not be loaded in xgboost-unity * For legacy 0.2x code, refer to [Here](https://github.com/tqchen/xgboost/releases/tag/v0.22) diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp index 8d7064bdd..bd18f0476 100644 --- a/src/io/simple_dmatrix-inl.hpp +++ b/src/io/simple_dmatrix-inl.hpp @@ -105,7 +105,9 @@ class DMatrixSimple : public DataMatrix { if (!silent) { printf("%lux%lu matrix with %lu entries is loaded from %s\n", - info.num_row(), info.num_col(), row_data_.size(), fname); + static_cast<unsigned long>(info.num_row()), + static_cast<unsigned long>(info.num_col()), + static_cast<unsigned long>(row_data_.size()), fname); } fclose(file); // try to load in additional file @@ -155,7 +157,9 @@ class DMatrixSimple : public DataMatrix { if (!silent) { printf("%lux%lu matrix with %lu entries is loaded", - info.num_row(), info.num_col(), row_data_.size()); + static_cast<unsigned long>(info.num_row()), + static_cast<unsigned long>(info.num_col()), + static_cast<unsigned long>(row_data_.size())); if (fname != NULL) { printf(" from %s\n", fname); } else { @@ -183,9 +187,12 @@ class DMatrixSimple : public DataMatrix { if (!silent) { printf("%lux%lu matrix with %lu entries is saved to %s\n", - info.num_row(), info.num_col(), row_data_.size(), fname); + static_cast<unsigned long>(info.num_row()), + static_cast<unsigned long>(info.num_col()), + static_cast<unsigned long>(row_data_.size()), fname); if (info.group_ptr.size() != 0) { - printf("data contains %lu groups\n", info.group_ptr.size()-1); + printf("data contains %u groups\n", + static_cast<unsigned>(info.group_ptr.size()-1)); } } } diff --git a/src/learner/dmatrix.h b/src/learner/dmatrix.h index cd897f1d5..542b6f6f5 100644 --- a/src/learner/dmatrix.h +++ b/src/learner/dmatrix.h @@ -98,7 +98,8 @@ struct MetaInfo { group_ptr.push_back(group_ptr.back()+nline); } if (!silent) { - printf("%lu groups are loaded from %s\n", group_ptr.size()-1, fname); + printf("%u groups are loaded from %s\n", + static_cast<unsigned>(group_ptr.size()-1), fname); } fclose(fi); return 
true; diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index c01e2ec15..8e7bce0a8 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -66,10 +66,11 @@ class BoostLearner { snprintf(str_temp, sizeof(str_temp), "%u", num_feature); this->SetParam("bst:num_feature", str_temp); } - snprintf(str_temp, sizeof(str_temp), "%lu", buffer_size); + snprintf(str_temp, sizeof(str_temp), "%lu", + static_cast<unsigned long>(buffer_size)); this->SetParam("num_pbuffer", str_temp); if (!silent) { - printf("buffer_size=%ld\n", buffer_size); + printf("buffer_size=%ld\n", static_cast<long>(buffer_size)); } } /*!