diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd
index 05a25c152..7206d6b17 100644
--- a/R-package/man/getinfo.Rd
+++ b/R-package/man/getinfo.Rd
@@ -21,7 +21,7 @@ Get information of an xgb.DMatrix object
 }
 \examples{
 data(iris)
-iris[,5] <- as.numeric(iris[,5])
+iris[,5] <- as.numeric(iris[,5]=='setosa')
 dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
 labels <- getinfo(dtrain, "label")
 }
diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd
index d192997d2..9c19b8f33 100644
--- a/R-package/man/predict-xgb.Booster-method.Rd
+++ b/R-package/man/predict-xgb.Booster-method.Rd
@@ -18,15 +18,16 @@ value of sum of functions, when outputmargin=TRUE, the prediction is
 untransformed margin value. In logistic regression, outputmargin=T will
 output value before logistic transformation.}
 
-\item{ntreelimit}{limit number of trees used in prediction, this parameter is only valid for gbtree, but not for gblinear.
-set it to be value bigger than 0. It will use all trees by default.}
+\item{ntreelimit}{limit the number of trees used in prediction; this parameter
+is only valid for gbtree, but not for gblinear. Set it to a value bigger
+than 0. All trees are used by default.}
 }
 \description{
 Predicted values based on xgboost model object.
 }
 \examples{
 data(iris)
-bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2)
 pred <- predict(bst, as.matrix(iris[,1:4]))
 }
diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd
index 7acb14a32..a4d0a4568 100644
--- a/R-package/man/slice.Rd
+++ b/R-package/man/slice.Rd
@@ -23,7 +23,7 @@ original xgb.DMatrix object
 }
 \examples{
 data(iris)
-iris[,5] <- as.numeric(iris[,5])
+iris[,5] <- as.numeric(iris[,5]=='setosa')
 dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
 dsub <- slice(dtrain, 1:3)
 }
diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd
index 166d69f68..ea7ff8ce6 100644
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -20,7 +20,7 @@ Construct xgb.DMatrix object from dense matrix, sparse matrix or local file.
 }
 \examples{
 data(iris)
-iris[,5] <- as.numeric(iris[,5])
+iris[,5] <- as.numeric(iris[,5]=='setosa')
 dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
 xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
 dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd
index e5e70501d..2692069dc 100644
--- a/R-package/man/xgb.DMatrix.save.Rd
+++ b/R-package/man/xgb.DMatrix.save.Rd
@@ -15,7 +15,7 @@ Save xgb.DMatrix object to binary file
 }
 \examples{
 data(iris)
-iris[,5] <- as.numeric(iris[,5])
+iris[,5] <- as.numeric(iris[,5]=='setosa')
 dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
 xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
 dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd
index 4d6933811..a4ac12cd4 100644
--- a/R-package/man/xgb.dump.Rd
+++ b/R-package/man/xgb.dump.Rd
@@ -21,7 +21,7 @@ Save an xgboost model to a text file. Could be parsed later.
 }
 \examples{
 data(iris)
-bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2)
 xgb.dump(bst, 'iris.xgb.model.dump')
 }
diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd
index 980daf88d..a8969c07d 100644
--- a/R-package/man/xgb.load.Rd
+++ b/R-package/man/xgb.load.Rd
@@ -13,7 +13,7 @@ Load xgboost model from the binary model file
 }
 \examples{
 data(iris)
-bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2)
 xgb.save(bst, 'iris.xgb.model')
 bst <- xgb.load('iris.xgb.model')
 pred <- predict(bst, as.matrix(iris[,1:4]))
diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd
index ba390d1b4..0dca58287 100644
--- a/R-package/man/xgb.save.Rd
+++ b/R-package/man/xgb.save.Rd
@@ -15,7 +15,7 @@ Save xgboost model from xgboost or xgb.train
 }
 \examples{
 data(iris)
-bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2)
 xgb.save(bst, 'iris.xgb.model')
 bst <- xgb.load('iris.xgb.model')
 pred <- predict(bst, as.matrix(iris[,1:4]))
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd
index 4da3b0013..75c43cd56 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -56,7 +56,7 @@ therefore it is more flexible than \code{\link{xgboost}}.
 }
 \examples{
 data(iris)
-iris[,5] <- as.numeric(iris[,5])
+iris[,5] <- as.numeric(iris[,5]=='setosa')
 dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
 dtest <- dtrain
 watchlist <- list(eval = dtest, train = dtrain)
diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd
index 2b6c1a124..435423d28 100644
--- a/R-package/man/xgboost.Rd
+++ b/R-package/man/xgboost.Rd
@@ -46,7 +46,7 @@ Number of threads can also be manually specified via "nthread" parameter
 }
 \examples{
 data(iris)
-bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2)
 pred <- predict(bst, as.matrix(iris[,1:4]))
 }
diff --git a/R-package/vignettes/xgboost.aux b/R-package/vignettes/xgboost.aux
new file mode 100644
index 000000000..6e6babc4c
--- /dev/null
+++ b/R-package/vignettes/xgboost.aux
@@ -0,0 +1,28 @@
+\relax
+\providecommand\hyper@newdestlabel[2]{}
+\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
+\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
+\global\let\oldcontentsline\contentsline
+\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
+\global\let\oldnewlabel\newlabel
+\gdef\newlabel#1#2{\newlabelxx{#1}#2}
+\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
+\AtEndDocument{\ifx\hyper@anchor\@undefined
+\let\contentsline\oldcontentsline
+\let\newlabel\oldnewlabel
+\fi}
+\fi}
+\global\let\hyper@last\relax
+\gdef\HyperFirstAtBeginDocument#1{#1}
+\providecommand\HyField@AuxAddToFields[1]{}
+\providecommand\HyField@AuxAddToCoFields[2]{}
+\citation{friedman2001greedy}
+\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}}
+\@writefile{toc}{\contentsline {section}{\numberline {2}Example with iris}{1}{section.2}}
+\@writefile{toc}{\contentsline {section}{\numberline {3}Advanced Examples}{2}{section.3}}
+\bibstyle{jss}
+\citation{*}
+\bibdata{xgboost}
+\bibcite{friedman2000additive}{{1}{2000}{{Friedman \emph {et~al.}}}{{Friedman, Hastie, Tibshirani \emph {et~al.}}}}
+\bibcite{friedman2001greedy}{{2}{2001}{{Friedman}}{{}}}
+\@writefile{toc}{\contentsline {section}{\numberline {4}The Higgs Boson competition}{3}{section.4}}
diff --git a/R-package/vignettes/xgboost.bbl b/R-package/vignettes/xgboost.bbl
new file mode 100644
index 000000000..fdf58e763
--- /dev/null
+++ b/R-package/vignettes/xgboost.bbl
@@ -0,0 +1,24 @@
+\begin{thebibliography}{2}
+\newcommand{\enquote}[1]{``#1''}
+\providecommand{\natexlab}[1]{#1}
+\providecommand{\url}[1]{\texttt{#1}}
+\providecommand{\urlprefix}{URL }
+\expandafter\ifx\csname urlstyle\endcsname\relax
+  \providecommand{\doi}[1]{doi:\discretionary{}{}{}#1}\else
+  \providecommand{\doi}{doi:\discretionary{}{}{}\begingroup
+  \urlstyle{rm}\Url}\fi
+\providecommand{\eprint}[2][]{\url{#2}}
+
+\bibitem[{Friedman \emph{et~al.}(2000)Friedman, Hastie, Tibshirani
+  \emph{et~al.}}]{friedman2000additive}
+Friedman J, Hastie T, Tibshirani R, \emph{et~al.} (2000).
+\newblock \enquote{Additive logistic regression: a statistical view of boosting
+  (with discussion and a rejoinder by the authors).}
+\newblock \emph{The Annals of Statistics}, \textbf{28}(2), 337--407.
+
+\bibitem[{Friedman(2001)}]{friedman2001greedy}
+Friedman JH (2001).
+\newblock \enquote{Greedy function approximation: a gradient boosting machine.}
+\newblock \emph{Annals of Statistics}, pp. 1189--1232.
+
+\end{thebibliography}
diff --git a/R-package/vignettes/xgboost.blg b/R-package/vignettes/xgboost.blg
new file mode 100644
index 000000000..2c0e87387
--- /dev/null
+++ b/R-package/vignettes/xgboost.blg
@@ -0,0 +1,47 @@
+This is BibTeX, Version 0.99d (TeX Live 2013/Debian)
+Capacity: max_strings=35307, hash_size=35307, hash_prime=30011
+The top-level auxiliary file: xgboost.aux
+The style file: jss.bst
+Database file #1: xgboost.bib
+Reallocated wiz_functions (elt_size=4) to 6000 items from 3000.
+You've used 2 entries,
+            3140 wiz_defined-function locations,
+            641 strings with 5430 characters,
+and the built_in function-call counts, 1920 in all, are:
+= -- 162
+> -- 44
+< -- 2
++ -- 17
+- -- 15
+* -- 149
+:= -- 256
+add.period$ -- 8
+call.type$ -- 2
+change.case$ -- 12
+chr.to.int$ -- 2
+cite$ -- 2
+duplicate$ -- 171
+empty$ -- 175
+format.name$ -- 19
+if$ -- 395
+int.to.chr$ -- 1
+int.to.str$ -- 1
+missing$ -- 24
+newline$ -- 21
+num.names$ -- 8
+pop$ -- 51
+preamble$ -- 1
+purify$ -- 12
+quote$ -- 0
+skip$ -- 53
+stack$ -- 0
+substring$ -- 181
+swap$ -- 65
+text.length$ -- 1
+text.prefix$ -- 0
+top$ -- 0
+type$ -- 18
+warning$ -- 0
+while$ -- 16
+width$ -- 0
+write$ -- 36
diff --git a/R-package/vignettes/xgboost.out b/R-package/vignettes/xgboost.out
new file mode 100644
index 000000000..6d60796a3
--- /dev/null
+++ b/R-package/vignettes/xgboost.out
@@ -0,0 +1,4 @@
+\BOOKMARK [1][-]{section.1}{Introduction}{}% 1
+\BOOKMARK [1][-]{section.2}{Example with iris}{}% 2
+\BOOKMARK [1][-]{section.3}{Advanced Examples}{}% 3
+\BOOKMARK [1][-]{section.4}{The Higgs Boson competition}{}% 4
diff --git a/R-package/vignettes/xgboost.tex b/R-package/vignettes/xgboost.tex
new file mode 100644
index 000000000..0ed4015b7
--- /dev/null
+++ b/R-package/vignettes/xgboost.tex
@@ -0,0 +1,319 @@
+\documentclass{article}\usepackage[]{graphicx}\usepackage[]{color}
+%% maxwidth is the original width if it is less than linewidth
+%% otherwise use linewidth (to make sure the graphics do not exceed the margin)
+\makeatletter
+\def\maxwidth{ %
+  \ifdim\Gin@nat@width>\linewidth
+    \linewidth
+  \else
+    \Gin@nat@width
+  \fi
+}
+\makeatother
+
+\definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345}
+\newcommand{\hlnum}[1]{\textcolor[rgb]{0.686,0.059,0.569}{#1}}%
+\newcommand{\hlstr}[1]{\textcolor[rgb]{0.192,0.494,0.8}{#1}}%
+\newcommand{\hlcom}[1]{\textcolor[rgb]{0.678,0.584,0.686}{\textit{#1}}}%
+\newcommand{\hlopt}[1]{\textcolor[rgb]{0,0,0}{#1}}%
+\newcommand{\hlstd}[1]{\textcolor[rgb]{0.345,0.345,0.345}{#1}}%
+\newcommand{\hlkwa}[1]{\textcolor[rgb]{0.161,0.373,0.58}{\textbf{#1}}}%
+\newcommand{\hlkwb}[1]{\textcolor[rgb]{0.69,0.353,0.396}{#1}}%
+\newcommand{\hlkwc}[1]{\textcolor[rgb]{0.333,0.667,0.333}{#1}}%
+\newcommand{\hlkwd}[1]{\textcolor[rgb]{0.737,0.353,0.396}{\textbf{#1}}}%
+
+\usepackage{framed}
+\makeatletter
+\newenvironment{kframe}{%
+ \def\at@end@of@kframe{}%
+ \ifinner\ifhmode%
+  \def\at@end@of@kframe{\end{minipage}}%
+  \begin{minipage}{\columnwidth}%
+ \fi\fi%
+ \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep
+ \colorbox{shadecolor}{##1}\hskip-\fboxsep
+     % There is no \\@totalrightmargin, so:
+     \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}%
+ \MakeFramed {\advance\hsize-\width
+   \@totalleftmargin\z@ \linewidth\hsize
+   \@setminipage}}%
+ {\par\unskip\endMakeFramed%
+ \at@end@of@kframe}
+\makeatother
+
+\definecolor{shadecolor}{rgb}{.97, .97, .97}
+\definecolor{messagecolor}{rgb}{0, 0, 0}
+\definecolor{warningcolor}{rgb}{1, 0, 1}
+\definecolor{errorcolor}{rgb}{1, 0, 0}
+\newenvironment{knitrout}{}{} % an empty environment to be redefined in TeX
+
+\usepackage{alltt}
+\RequirePackage{url}
+\usepackage{hyperref}
+\RequirePackage{amsmath}
+\RequirePackage{natbib}
+\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry}
+
+\makeatletter
+% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting}
+%\VignetteKeywords{xgboost, gbm, gradient boosting machines}
+%\VignettePackage{xgboost}
+% \VignetteEngine{knitr::knitr}
+\makeatother
+\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
+\begin{document}
+%\SweaveOpts{concordance=TRUE}
+
+
+
+%
+
+%
+
+  \begin{center}
+    \vspace*{6\baselineskip}
+    \rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt}
+    \rule{\textwidth}{0.4pt}\\[2\baselineskip]
+    {\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip]
+    \rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt}
+    \rule{\textwidth}{1.6pt}\\[2\baselineskip]
+    {\Large Tianqi Chen, Tong He}\\[\baselineskip]
+    {\large Package Version: 0.3-0}\\[\baselineskip]
+    {\large \today}\par
+    \vfill
+  \end{center}
+
+\thispagestyle{empty}
+
+\clearpage
+
+\setcounter{page}{1}
+
+\section{Introduction}
+
+This is an introductory document for using the \verb@xgboost@ package in R.
+
+\verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an
+efficient and scalable implementation of the gradient boosting framework of
+\citet{friedman2001greedy}. The package includes an efficient linear model
+solver and a tree learning algorithm. It supports various objective functions,
+including regression, classification and ranking. The package is designed to
+be extensible, so that users can also easily define their own objectives.
+It has several features:
+\begin{enumerate}
+  \item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
+  Windows and Linux, with OpenMP. It is generally over 10 times faster than
+  \verb@gbm@.}
+  \item{Input Type: }{\verb@xgboost@ takes several types of input data:}
+  \begin{itemize}
+    \item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@}
+    \item{Sparse Matrix: }{R's sparse matrix \verb@Matrix::dgCMatrix@}
+    \item{Data File: }{Local data files}
+    \item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.}
+  \end{itemize}
+  \item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster
+  and linear booster, and is optimized for sparse input.}
+  \item{Customization: }{\verb@xgboost@ supports customized objective and
+  evaluation functions.}
+  \item{Performance: }{\verb@xgboost@ has better performance on several
+  different datasets.}
+\end{enumerate}
+
+
+\section{Example with iris}
+
+In this section, we will illustrate some common usage of \verb@xgboost@.
+
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
+\begin{alltt}
+\hlkwd{library}\hlstd{(xgboost)}
+\hlkwd{data}\hlstd{(iris)}
+\hlstd{bst} \hlkwb{<-} \hlkwd{xgboost}\hlstd{(}\hlkwd{as.matrix}\hlstd{(iris[,}\hlnum{1}\hlopt{:}\hlnum{4}\hlstd{]),}\hlkwd{as.numeric}\hlstd{(iris[,}\hlnum{5}\hlstd{]}\hlopt{==}\hlstr{'setosa'}\hlstd{),}
+                \hlkwc{nrounds} \hlstd{=} \hlnum{5}\hlstd{)}
+\end{alltt}
+\begin{verbatim}
+## [0] train-rmse:0.351971
+## [1] train-rmse:0.247769
+## [2] train-rmse:0.174418
+## [3] train-rmse:0.122783
+## [4] train-rmse:0.086435
+\end{verbatim}
+\begin{alltt}
+\hlkwd{xgb.save}\hlstd{(bst,} \hlstr{'model.save'}\hlstd{)}
+\end{alltt}
+\begin{verbatim}
+## [1] TRUE
+\end{verbatim}
+\begin{alltt}
+\hlstd{bst} \hlkwb{=} \hlkwd{xgb.load}\hlstd{(}\hlstr{'model.save'}\hlstd{)}
+\hlstd{pred} \hlkwb{<-} \hlkwd{predict}\hlstd{(bst,} \hlkwd{as.matrix}\hlstd{(iris[,}\hlnum{1}\hlopt{:}\hlnum{4}\hlstd{]))}
+\end{alltt}
+\end{kframe}
+\end{knitrout}
+
+\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model;
+\verb@predict@ makes predictions from the trained model.
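+
+As a quick aside (a sketch, not evaluated in this vignette), \verb@predict@
+also accepts the \verb@outputmargin@ and \verb@ntreelimit@ arguments described
+in its help page:
+
+\begin{verbatim}
+# margin scores before the objective's transformation
+margin <- predict(bst, as.matrix(iris[,1:4]), outputmargin = TRUE)
+# restrict prediction to the first tree (gbtree booster only)
+pred1 <- predict(bst, as.matrix(iris[,1:4]), ntreelimit = 1)
+\end{verbatim}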
+
+Here we can save the model to a binary local file, and load it when needed.
+We can't inspect the trees inside. However, we have another function to save
+the model in plain text.
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
+\begin{alltt}
+\hlkwd{xgb.dump}\hlstd{(bst,} \hlstr{'model.dump'}\hlstd{)}
+\end{alltt}
+\begin{verbatim}
+## [1] TRUE
+\end{verbatim}
+\end{kframe}
+\end{knitrout}
+
+The output looks like
+
+\begin{verbatim}
+booster[0]:
+0:[f2<2.45] yes=1,no=2,missing=1
+ 1:leaf=0.147059
+ 2:[f3<1.65] yes=3,no=4,missing=3
+  3:leaf=0.464151
+  4:leaf=0.722449
+booster[1]:
+0:[f2<2.45] yes=1,no=2,missing=1
+ 1:leaf=0.103806
+ 2:[f2<4.85] yes=3,no=4,missing=3
+  3:leaf=0.316341
+  4:leaf=0.510365
+\end{verbatim}
+
+It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
+It speeds up \verb@xgboost@, and is needed for advanced features such as
+training from an initial prediction value or weighted training instances.
+
+We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
+\begin{alltt}
+\hlstd{iris.mat} \hlkwb{<-} \hlkwd{as.matrix}\hlstd{(iris[,}\hlnum{1}\hlopt{:}\hlnum{4}\hlstd{])}
+\hlstd{iris.label} \hlkwb{<-} \hlkwd{as.numeric}\hlstd{(iris[,}\hlnum{5}\hlstd{]}\hlopt{==}\hlstr{'setosa'}\hlstd{)}
+\hlstd{diris} \hlkwb{<-} \hlkwd{xgb.DMatrix}\hlstd{(iris.mat,} \hlkwc{label} \hlstd{= iris.label)}
+\hlkwd{class}\hlstd{(diris)}
+\end{alltt}
+\begin{verbatim}
+## [1] "xgb.DMatrix"
+\end{verbatim}
+\begin{alltt}
+\hlkwd{getinfo}\hlstd{(diris,}\hlstr{'label'}\hlstd{)}
+\end{alltt}
+\begin{verbatim}
+##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+##  [71] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+## [106] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+## [141] 0 0 0 0 0 0 0 0 0 0
+\end{verbatim}
+\end{kframe}
+\end{knitrout}
+
+We can also save the matrix to a binary file, and then simply load it with
+\verb@xgb.DMatrix@:
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
+\begin{alltt}
+\hlkwd{xgb.DMatrix.save}\hlstd{(diris,} \hlstr{'iris.xgb.DMatrix'}\hlstd{)}
+\end{alltt}
+\begin{verbatim}
+## 150x4 matrix with 600 entries is saved to iris.xgb.DMatrix
+## [1] TRUE
+\end{verbatim}
+\begin{alltt}
+\hlstd{diris} \hlkwb{=} \hlkwd{xgb.DMatrix}\hlstd{(}\hlstr{'iris.xgb.DMatrix'}\hlstd{)}
+\end{alltt}
+\begin{verbatim}
+## 150x4 matrix with 600 entries is loaded from iris.xgb.DMatrix
+\end{verbatim}
+\end{kframe}
+\end{knitrout}
+
+\section{Advanced Examples}
+
+The function \verb@xgboost@ is a simple function with fewer parameters, in
+order to be R-friendly. The core training function is wrapped in
+\verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires
+users to read the documentation a bit more carefully.
+
+\verb@xgb.train@ only accepts an \verb@xgb.DMatrix@ object as its input, but it
+supports advanced features such as customized objective and evaluation
+functions.
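+
+As a brief refresher of where the \verb@grad@ and \verb@hess@ values in the
+next example come from: for the logistic loss with label $y \in \{0,1\}$ and
+margin prediction $\hat{y}$, writing $p = 1/(1+e^{-\hat{y}})$, we have
+\begin{align*}
+\ell(y, \hat{y}) &= -\bigl[y \log p + (1-y)\log(1-p)\bigr], \\
+\frac{\partial \ell}{\partial \hat{y}} &= p - y, \qquad
+\frac{\partial^2 \ell}{\partial \hat{y}^2} = p\,(1-p).
+\end{align*}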
+
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
+\begin{alltt}
+\hlstd{logregobj} \hlkwb{<-} \hlkwa{function}\hlstd{(}\hlkwc{preds}\hlstd{,} \hlkwc{dtrain}\hlstd{) \{}
+    \hlstd{labels} \hlkwb{<-} \hlkwd{getinfo}\hlstd{(dtrain,} \hlstr{"label"}\hlstd{)}
+    \hlstd{preds} \hlkwb{<-} \hlnum{1}\hlopt{/}\hlstd{(}\hlnum{1} \hlopt{+} \hlkwd{exp}\hlstd{(}\hlopt{-}\hlstd{preds))}
+    \hlstd{grad} \hlkwb{<-} \hlstd{preds} \hlopt{-} \hlstd{labels}
+    \hlstd{hess} \hlkwb{<-} \hlstd{preds} \hlopt{*} \hlstd{(}\hlnum{1} \hlopt{-} \hlstd{preds)}
+    \hlkwd{return}\hlstd{(}\hlkwd{list}\hlstd{(}\hlkwc{grad} \hlstd{= grad,} \hlkwc{hess} \hlstd{= hess))}
+\hlstd{\}}
+
+\hlstd{evalerror} \hlkwb{<-} \hlkwa{function}\hlstd{(}\hlkwc{preds}\hlstd{,} \hlkwc{dtrain}\hlstd{) \{}
+    \hlstd{labels} \hlkwb{<-} \hlkwd{getinfo}\hlstd{(dtrain,} \hlstr{"label"}\hlstd{)}
+    \hlstd{err} \hlkwb{<-} \hlkwd{sqrt}\hlstd{(}\hlkwd{mean}\hlstd{((preds}\hlopt{-}\hlstd{labels)}\hlopt{^}\hlnum{2}\hlstd{))}
+    \hlkwd{return}\hlstd{(}\hlkwd{list}\hlstd{(}\hlkwc{metric} \hlstd{=} \hlstr{"MSE"}\hlstd{,} \hlkwc{value} \hlstd{= err))}
+\hlstd{\}}
+
+\hlstd{dtest} \hlkwb{<-} \hlkwd{slice}\hlstd{(diris,}\hlnum{1}\hlopt{:}\hlnum{100}\hlstd{)}
+\hlstd{watchlist} \hlkwb{<-} \hlkwd{list}\hlstd{(}\hlkwc{eval} \hlstd{= dtest,} \hlkwc{train} \hlstd{= diris)}
+\hlstd{param} \hlkwb{<-} \hlkwd{list}\hlstd{(}\hlkwc{max_depth} \hlstd{=} \hlnum{2}\hlstd{,} \hlkwc{eta} \hlstd{=} \hlnum{1}\hlstd{,} \hlkwc{silent} \hlstd{=} \hlnum{1}\hlstd{)}
+
+\hlstd{bst} \hlkwb{<-} \hlkwd{xgb.train}\hlstd{(param, diris,} \hlkwc{nround} \hlstd{=} \hlnum{2}\hlstd{, watchlist, logregobj, evalerror)}
+\end{alltt}
+\begin{verbatim}
+## [1]  eval-MSE:1.601  train-MSE:1.76
+## [2]  eval-MSE:2.567  train-MSE:2.745
+\end{verbatim}
+\end{kframe}
+\end{knitrout}
+
+The gradient and the second-order gradient are required as the output of a
+customized objective function.
+
+We also have \verb@slice@ for row extraction. It is useful in
+cross-validation.
+
+For a walkthrough demo, please see \verb@R-package/inst/examples/demo.R@.
+
+\section{The Higgs Boson competition}
+
+We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs
+Boson Machine Learning Challenge}.
+
+Here are the instructions to make a submission:
+\begin{enumerate}
+  \item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets}
+  and extract them to \verb@data/@.
+  \item Run scripts under \verb@xgboost/demo/kaggle-higgs/@:
+  \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{higgs-train.R}
+  and \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-pred.R}{higgs-pred.R}.
+  The computation will take less than a minute on an Intel i7.
+  \item Go to the \href{http://www.kaggle.com/c/higgs-boson/submissions/attach}{submission page}
+  and submit your result.
+\end{enumerate}
+
+We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script}
+to compare the time cost on the Higgs dataset with \verb@gbm@ and \verb@xgboost@.
+The training set contains 350000 records and 30 features.
+
+\verb@xgboost@ can automatically do parallel computation. On a machine with an
+Intel i7-4700MQ and 24GB of memory, we found that \verb@xgboost@ takes about
+35 seconds, which is about 20 times faster than \verb@gbm@. When we limited
+\verb@xgboost@ to use only one thread, it was still about two times faster
+than \verb@gbm@.
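+
+In outline, such a comparison looks like the following sketch (not run here;
+it assumes the extracted Kaggle file \verb@data/training.csv@, whose columns
+are an event id, the 30 features, a weight, and the \verb@Label@ column; the
+exact benchmark settings are those in the linked script):
+
+\begin{verbatim}
+library(xgboost)
+library(gbm)
+
+train <- read.csv('data/training.csv')
+y <- as.numeric(train$Label == 's')   # signal vs. background
+x <- as.matrix(train[, 2:31])         # the 30 feature columns
+
+# xgboost with all available threads (nrounds chosen for illustration)
+time.xgb <- system.time(
+  bst <- xgboost(x, y, params = list(objective = 'binary:logistic'),
+                 nrounds = 120)
+)
+
+# single-threaded gbm on the same task
+time.gbm <- system.time(
+  fit.gbm <- gbm.fit(x, y, distribution = 'bernoulli', n.trees = 120)
+)
+
+print(time.xgb)
+print(time.gbm)
+\end{verbatim}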
+
+Meanwhile, the result from \verb@xgboost@ reaches
+\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a
+single model. This result stands in the
+\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
+competition.
+
+\bibliographystyle{jss}
+\nocite{*} % list uncited references
+\bibliography{xgboost}
+
+\end{document}