commit e18a4fc5b6
Author: tqchen
Date:   2014-08-30 15:01:52 -07:00

    Merge branch 'master' into unity

15 changed files with 140 additions and 55 deletions


@@ -11,7 +11,7 @@ xgboost-0.2x
* Weighted samples instances
* Initial version of pairwise rank
xgboost-unity
xgboost-0.3
=====
* Faster tree construction module
* Allows subsampling of columns during tree construction via ```bst:col_samplebytree=ratio```
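For illustration, a minimal sketch of the new option (the parameter name is taken from the changelog entry above; the argument names and the pass-through convention for `bst:` parameters are assumptions about this version's R interface):

```r
# Sketch: grow each tree on a random 50% subset of the columns.
library(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nrounds = 10, objective = "binary:logistic",
               "bst:col_samplebytree" = 0.5)  # assumed pass-through of bst: parameters
```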


@@ -6,7 +6,7 @@ Date: 2014-08-23
Author: Tianqi Chen, Tong He
Maintainer: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>
Description: xgboost
License: See LICENSE file in the project root of xgboost.
License: file LICENSE
URL: https://github.com/tqchen/xgboost
BugReports: https://github.com/tqchen/xgboost/issues
Depends:

R-package/LICENSE (new file, 13 lines)

@@ -0,0 +1,13 @@
Copyright (c) 2014 by Tianqi Chen and Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@@ -11,7 +11,7 @@ setClass('xgb.DMatrix')
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' labels <- getinfo(dtest, "label")
#' labels <- getinfo(dtrain, "label")
#' @export
#'
getinfo <- function(object, ...){


@@ -21,6 +21,7 @@ xgb.save <- function(model, fname) {
.Call("XGBoosterSaveModel_R", model, fname, PACKAGE = "xgboost")
return(TRUE)
}
stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save
xgb.DMatrix object.")
return(FALSE)
}
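The message now points at two distinct save paths. A short sketch of both (assuming this era's R interface, with xgb.DMatrix.save as named in the message):

```r
# Sketch: models go through xgb.save, xgb.DMatrix objects through xgb.DMatrix.save.
library(xgboost)
bst <- xgboost(as.matrix(iris[, 1:4]), as.numeric(iris[, 5]), nrounds = 2)
xgb.save(bst, "model.save")                # xgb.Booster -> xgb.save
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = as.numeric(iris[, 5]))
xgb.DMatrix.save(dtrain, "train.buffer")   # xgb.DMatrix -> xgb.DMatrix.save
```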

R-package/demo/00Index (new file, 1 line)

@@ -0,0 +1 @@
demo    R code for xgboost usage on agaricus data


@@ -17,6 +17,6 @@ Get information of an xgb.DMatrix object
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
labels <- getinfo(dtest, "label")
labels <- getinfo(dtrain, "label")
}


@@ -2,13 +2,11 @@
PKGROOT=../../
# _*_ mode: Makefile; _*_
CXX=`R CMD config CXX`
CFLAGS=`R CMD config CFLAGS`
TCFLAGS=`R CMD config CFLAGS`
# expose these flags to R CMD SHLIB
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT) $(SHLIB_OPENMP_CFLAGS)
PKG_CPPFLAGS+= $(SHLIB_PTHREAD_FLAGS)
XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
XGBFLAG= $(TCFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
ifeq ($(no_omp),1)
PKG_CPPFLAGS += -DDISABLE_OPENMP
@@ -26,7 +24,7 @@ xgboost_io.o: $(PKGROOT)/src/io/io.cpp
xgboost_gbm.o: $(PKGROOT)/src/gbm/gbm.cpp
xgboost_updater.o: $(PKGROOT)/src/tree/updater.cpp
$(CXXOBJ) :
$(CXXOBJ) :
$(CXX) -c $(XGBFLAG) -o $@ $(firstword $(filter %.cpp %.c, $^) )
clean:


@@ -2,11 +2,11 @@
PKGROOT=../../
# _*_ mode: Makefile; _*_
CXX=`Rcmd config CXX`
CFLAGS=`Rcmd config CFLAGS`
TCFLAGS=`Rcmd config CFLAGS`
# expose these flags to R CMD SHLIB
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT) $(SHLIB_OPENMP_CFLAGS)
PKG_CPPFLAGS+= $(SHLIB_PTHREAD_FLAGS)
XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
XGBFLAG= -O3 -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
ifeq ($(no_omp),1)
@@ -21,12 +21,13 @@ all: $(SHLIB)
$(SHLIB): $(OBJECTS)
xgboost_wrapper.o: $(PKGROOT)/wrapper/xgboost_wrapper.cpp
xgboost_io.o: $(PKGROOT)/src/io/io.cpp
xgboost_io.o: $(PKGROOT)/src/io/io.cpp
xgboost_gbm.o: $(PKGROOT)/src/gbm/gbm.cpp
xgboost_updater.o: $(PKGROOT)/src/tree/updater.cpp
$(CXXOBJ) :
$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
$(CXXOBJ) :
$(CXX) -c $(XGBFLAG) -o $@ $(firstword $(filter %.cpp %.c, $^) )
clean:
rm -rf *.so *.o *~ *.dll
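A note on the recurring rename in both Makevars files: switching the captured compiler flags from CFLAGS to TCFLAGS presumably keeps the local assignment from shadowing the CFLAGS that R CMD SHLIB defines itself, while the object recipe now consistently compiles the xgboost sources with XGBFLAG.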


@@ -1,39 +1,63 @@
\documentclass{article}
\usepackage{natbib}
\usepackage{graphics}
\usepackage{amsmath}
\RequirePackage{url}
\usepackage{hyperref}
\usepackage{indentfirst}
\usepackage[utf8]{inputenc}
\RequirePackage{amsmath}
\RequirePackage{natbib}
\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry}
\DeclareMathOperator{\var}{var}
\DeclareMathOperator{\cov}{cov}
% \VignetteIndexEntry{xgboost}
\makeatletter
% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting}
%\VignetteKeywords{xgboost, gbm, gradient boosting machines}
%\VignettePackage{xgboost}
% \VignetteEngine{knitr::knitr}
\makeatother
\begin{document}
%\SweaveOpts{concordance=TRUE}
<<foo,include=FALSE,echo=FALSE>>=
options(keep.source = TRUE, width = 60)
foo <- packageDescription("xgboost")
<<knitropts,echo=FALSE,message=FALSE>>=
if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE)
@
\title{xgboost Package Example (Version \Sexpr{foo$Version})}
\author{Tong He}
\maketitle
%
<<prelim,echo=FALSE>>=
xgboost.version = '0.3-0'
@
%
\begin{center}
\vspace*{6\baselineskip}
\rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt}
\rule{\textwidth}{0.4pt}\\[2\baselineskip]
{\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip]
\rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt}
\rule{\textwidth}{1.6pt}\\[2\baselineskip]
{\Large Tianqi Chen, Tong He}\\[\baselineskip]
{\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip]
{\large \today}\par
\vfill
\end{center}
\thispagestyle{empty}
\clearpage
\setcounter{page}{1}
\section{Introduction}
This is an example of using the \verb@xgboost@ package in R.
This is an introductory document for using the \verb@xgboost@ package in R.
\verb@xgboost@ is short for eXtreme Gradient Boosting (Tree). It supports
regression and classification analysis on different types of input datasets.
Comparing to \verb@gbm@ in R, it has several features:
\verb@xgboost@ is short for the eXtreme Gradient Boosting package. It is an efficient
and scalable implementation of the gradient boosting framework by \citet{friedman2001greedy}.
The package includes an efficient linear model solver and a tree learning algorithm.
It supports various objective functions, including regression, classification
and ranking. The package is made to be extensible, so that users are also allowed
to define their own objectives easily. It has several features:
\begin{enumerate}
\item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
Windows and Linux, with openmp.}
Windows and Linux, with openmp. It is generally over 10 times faster than
\verb@gbm@.}
\item{Input Type: }{\verb@xgboost@ takes several types of input data:}
\begin{itemize}
\item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@}
@@ -41,15 +65,15 @@ Comparing to \verb@gbm@ in R, it has several features:
\item{Data File: }{Local data files}
\item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.}
\end{itemize}
\item{Regularization: }{\verb@xgboost@ supports regularization for
$L_1,L_2$ term on weights and $L_2$ term on bias.}
\item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster
and linear booster, and is optimized for sparse input.}
\item{Customization: }{\verb@xgboost@ supports customized objective function
and evaluation function}
\item{Performance: }{\verb@xgboost@ has better performance on several different
datasets. Its rising popularity and fame in different Kaggle competitions
is the evidence.}
datasets.}
\end{enumerate}
\section{Example with iris}
In this section, we will illustrate some common usage of \verb@xgboost@.
@@ -62,7 +86,6 @@ bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]),
xgb.save(bst, 'model.save')
bst = xgb.load('model.save')
pred <- predict(bst, as.matrix(iris[,1:4]))
hist(pred)
@
\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
@@ -93,7 +116,8 @@ booster[1]:
\end{verbatim}
It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
It speeds up \verb@xgboost@.
It speeds up \verb@xgboost@, and is needed for advanced features such as
training from an initial prediction value and weighted training instances.
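A minimal sketch of those two features (the setinfo field names "weight" and "base_margin" are assumptions about this version's API):

```r
# Sketch: attach per-instance weights and an initial prediction (base margin)
# to an xgb.DMatrix -- neither can be expressed with a plain matrix input.
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = as.numeric(iris[, 5]))
setinfo(dtrain, "weight", rep(1, nrow(iris)))         # weighted training instances
setinfo(dtrain, "base_margin", rep(0.5, nrow(iris)))  # start from an initial prediction
```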
We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
<<xgb.DMatrix>>=
@@ -119,7 +143,7 @@ is more flexible than \verb@xgboost@, but it requires users to read the document
a bit more carefully.
\verb@xgb.train@ only accepts an \verb@xgb.DMatrix@ object as its input, while it
supports some additional features as custom objective and evaluation functions.
supports advanced features such as custom objective and evaluation functions.
<<Customized loss function>>=
logregobj <- function(preds, dtrain) {
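A complete objective in this style returns the gradient and hessian of the loss with respect to the current predictions. A sketch along the lines of the package demos:

```r
# Sketch of a custom log-loss objective for xgb.train: return the first and
# second derivatives of the loss w.r.t. the raw predictions.
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))    # margin -> probability
  grad <- preds - labels            # gradient of the log loss
  hess <- preds * (1 - preds)       # hessian of the log loss
  return(list(grad = grad, hess = hess))
}
```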
@@ -149,14 +173,14 @@ objective function.
We also have \verb@slice@ for row extraction. It is useful in
cross-validation.
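A one-line sketch of that operation (assuming slice takes the xgb.DMatrix and a row index set):

```r
# Sketch: extract rows 1..100 of dtrain as a new xgb.DMatrix, e.g. one CV fold.
dfold <- slice(dtrain, 1:100)
```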
For a walkthrough demo, please see \verb@R-package/demo/demo.R@.
\section{The Higgs Boson competition}
We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs
Boson Machine Learning Challenge}.
Our result reaches 3.60 with a single model. This results stands in the top 30%
of the competition.
Here are the instructions to make a submission
\begin{enumerate}
\item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets}
@@ -169,5 +193,23 @@ Here are the instructions to make a submission
and submit your result.
\end{enumerate}
We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script}
to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@.
The training set contains 350000 records and 30 features.
\verb@xgboost@ can automatically do parallel computation. On a machine with an Intel
i7-4700MQ CPU and 24GB of memory, we found that \verb@xgboost@ takes about 35 seconds, which is about 20 times faster
than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was
still about two times faster than \verb@gbm@.
Meanwhile, the result from \verb@xgboost@ reaches
\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a
single model. This result stands in the
\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
competition.
\bibliographystyle{jss}
\nocite{*} % list uncited references
\bibliography{xgboost}
\end{document}


@@ -0,0 +1,20 @@
@article{friedman2001greedy,
title={Greedy function approximation: a gradient boosting machine},
author={Friedman, Jerome H},
journal={The Annals of Statistics},
pages={1189--1232},
year={2001},
publisher={JSTOR}
}
@article{friedman2000additive,
title={Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors)},
author={Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert and others},
journal={The Annals of Statistics},
volume={28},
number={2},
pages={337--407},
year={2000},
publisher={Institute of Mathematical Statistics}
}


@@ -33,7 +33,7 @@ Build
Version
======
* This version is named xgboost-unity, the code has been refactored from 0.2x to be cleaner and more flexibility
* This version is xgboost-0.3; the code has been refactored from 0.2x to be cleaner and more flexible
* This version of xgboost is not compatible with 0.2x, due to the large number of changes in code structure
- This means the model and buffer files of previous versions cannot be loaded in xgboost-unity
* For legacy 0.2x code, refer to [Here](https://github.com/tqchen/xgboost/releases/tag/v0.22)


@@ -105,7 +105,9 @@ class DMatrixSimple : public DataMatrix {
if (!silent) {
printf("%lux%lu matrix with %lu entries is loaded from %s\n",
info.num_row(), info.num_col(), row_data_.size(), fname);
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()), fname);
}
fclose(file);
// try to load in additional file
@@ -155,7 +157,9 @@ class DMatrixSimple : public DataMatrix {
if (!silent) {
printf("%lux%lu matrix with %lu entries is loaded",
info.num_row(), info.num_col(), row_data_.size());
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()));
if (fname != NULL) {
printf(" from %s\n", fname);
} else {
@@ -183,9 +187,12 @@ class DMatrixSimple : public DataMatrix {
if (!silent) {
printf("%lux%lu matrix with %lu entries is saved to %s\n",
info.num_row(), info.num_col(), row_data_.size(), fname);
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()), fname);
if (info.group_ptr.size() != 0) {
printf("data contains %lu groups\n", info.group_ptr.size()-1);
printf("data contains %u groups\n",
static_cast<unsigned>(info.group_ptr.size()-1));
}
}
}
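The casts in this and the following two files are a printf portability fix: num_row(), num_col() and size() return types whose width varies across platforms, and passing them unchanged to a %lu or %u conversion is undefined behavior when the argument type does not match, so each argument is cast to the exact type the format string names.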


@@ -98,7 +98,8 @@ struct MetaInfo {
group_ptr.push_back(group_ptr.back()+nline);
}
if (!silent) {
printf("%lu groups are loaded from %s\n", group_ptr.size()-1, fname);
printf("%u groups are loaded from %s\n",
static_cast<unsigned>(group_ptr.size()-1), fname);
}
fclose(fi);
return true;


@@ -66,10 +66,11 @@ class BoostLearner {
snprintf(str_temp, sizeof(str_temp), "%u", num_feature);
this->SetParam("bst:num_feature", str_temp);
}
snprintf(str_temp, sizeof(str_temp), "%lu", buffer_size);
snprintf(str_temp, sizeof(str_temp), "%lu",
static_cast<unsigned long>(buffer_size));
this->SetParam("num_pbuffer", str_temp);
if (!silent) {
printf("buffer_size=%ld\n", buffer_size);
printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
}
}
/*!