From d776e0fdf54b5c86d8ac2547e09f54423d92fd34 Mon Sep 17 00:00:00 2001 From: hetong Date: Fri, 5 Sep 2014 19:22:27 -0700 Subject: [PATCH 1/5] fix iris multiclass problem --- R-package/R/getinfo.xgb.DMatrix.R | 2 +- R-package/R/predict.xgb.Booster.R | 7 ++++--- R-package/R/slice.xgb.DMatrix.R | 2 +- R-package/R/xgb.DMatrix.R | 2 +- R-package/R/xgb.DMatrix.save.R | 2 +- R-package/R/xgb.dump.R | 2 +- R-package/R/xgb.load.R | 2 +- R-package/R/xgb.save.R | 2 +- R-package/R/xgb.train.R | 2 +- R-package/R/xgboost.R | 2 +- R-package/vignettes/xgboost.Rnw | 4 ++-- 11 files changed, 15 insertions(+), 14 deletions(-) diff --git a/R-package/R/getinfo.xgb.DMatrix.R b/R-package/R/getinfo.xgb.DMatrix.R index 3a79fd2fb..2a7ae8e5e 100644 --- a/R-package/R/getinfo.xgb.DMatrix.R +++ b/R-package/R/getinfo.xgb.DMatrix.R @@ -6,7 +6,7 @@ setClass('xgb.DMatrix') #' #' @examples #' data(iris) -#' iris[,5] <- as.numeric(iris[,5]) +#' iris[,5] <- as.numeric(iris[,5]=='setosa') #' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) #' labels <- getinfo(dtrain, "label") #' @rdname getinfo diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R index 390ac689e..a41b26873 100644 --- a/R-package/R/predict.xgb.Booster.R +++ b/R-package/R/predict.xgb.Booster.R @@ -11,11 +11,12 @@ setClass("xgb.Booster") #' value of sum of functions, when outputmargin=TRUE, the prediction is #' untransformed margin value. In logistic regression, outputmargin=T will #' output value before logistic transformation. -#' @param ntreelimit limit number of trees used in prediction, this parameter is only valid for gbtree, but not for gblinear. -#' set it to be value bigger than 0. It will use all trees by default. +#' @param ntreelimit limit number of trees used in prediction, this parameter is +#' only valid for gbtree, but not for gblinear. set it to be value bigger +#' than 0. It will use all trees by default. 
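+#'
+#'        For instance (a hypothetical call, assuming a gbtree booster
+#'        \code{bst} and a feature matrix \code{x} already exist),
+#'        \code{predict(bst, x, ntreelimit = 1)} would restrict the
+#'        prediction to the first tree only.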
#' @examples #' data(iris) -#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2) +#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2) #' pred <- predict(bst, as.matrix(iris[,1:4])) #' @export #' diff --git a/R-package/R/slice.xgb.DMatrix.R b/R-package/R/slice.xgb.DMatrix.R index 8a93efc4d..72f94893a 100644 --- a/R-package/R/slice.xgb.DMatrix.R +++ b/R-package/R/slice.xgb.DMatrix.R @@ -8,7 +8,7 @@ setClass('xgb.DMatrix') #' #' @examples #' data(iris) -#' iris[,5] <- as.numeric(iris[,5]) +#' iris[,5] <- as.numeric(iris[,5]=='setosa') #' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) #' dsub <- slice(dtrain, 1:3) #' @rdname slice diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index d52847ef2..3b320d73f 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -12,7 +12,7 @@ #' #' @examples #' data(iris) -#' iris[,5] <- as.numeric(iris[,5]) +#' iris[,5] <- as.numeric(iris[,5]=='setosa') #' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) #' xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix') #' dtrain <- xgb.DMatrix('iris.xgb.DMatrix') diff --git a/R-package/R/xgb.DMatrix.save.R b/R-package/R/xgb.DMatrix.save.R index 4fcb71301..4f4f49399 100644 --- a/R-package/R/xgb.DMatrix.save.R +++ b/R-package/R/xgb.DMatrix.save.R @@ -7,7 +7,7 @@ #' #' @examples #' data(iris) -#' iris[,5] <- as.numeric(iris[,5]) +#' iris[,5] <- as.numeric(iris[,5]=='setosa') #' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) #' xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix') #' dtrain <- xgb.DMatrix('iris.xgb.DMatrix') diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 09406dc99..78fcf4d0b 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -13,7 +13,7 @@ #' #' @examples #' data(iris) -#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2) +#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2) #' xgb.dump(bst, 'iris.xgb.model.dump') #' @export #' diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index 626c08d0d..54afe65dd 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -6,7 +6,7 @@ #' #' @examples #' data(iris) -#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2) +#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2) #' xgb.save(bst, 'iris.xgb.model') #' bst <- xgb.load('iris.xgb.model') #' pred <- predict(bst, as.matrix(iris[,1:4])) diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index 64add0ca9..c211429ad 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -7,7 +7,7 @@ #' #' @examples #' data(iris) -#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2) +#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2) #' xgb.save(bst, 'iris.xgb.model') #' bst <- xgb.load('iris.xgb.model') #' pred <- predict(bst, as.matrix(iris[,1:4])) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 58a575d03..e5400829f 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -44,7 +44,7 @@ #' #' @examples #' data(iris) -#' iris[,5] <- as.numeric(iris[,5]) +#' iris[,5] <- as.numeric(iris[,5]=='setosa') #' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) #' dtest <- dtrain #' watchlist <- list(eval = dtest, train = dtrain) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 6f4633fb8..dc8b17fa0 100644 --- a/R-package/R/xgboost.R +++ 
b/R-package/R/xgboost.R @@ -34,7 +34,7 @@ #' #' @examples #' data(iris) -#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2) +#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2) #' pred <- predict(bst, as.matrix(iris[,1:4])) #' @export #' diff --git a/R-package/vignettes/xgboost.Rnw b/R-package/vignettes/xgboost.Rnw index 9ecceca17..45ab1a096 100644 --- a/R-package/vignettes/xgboost.Rnw +++ b/R-package/vignettes/xgboost.Rnw @@ -80,7 +80,7 @@ In this section, we will illustrate some common usage of \verb@xgboost@. <>= library(xgboost) data(iris) -bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), +bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 5) xgb.save(bst, 'model.save') bst = xgb.load('model.save') @@ -121,7 +121,7 @@ training from initial prediction value, weighted training instance. We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object: <>= iris.mat <- as.matrix(iris[,1:4]) -iris.label <- as.numeric(iris[,5]) +iris.label <- as.numeric(iris[,5]=='setosa') diris <- xgb.DMatrix(iris.mat, label = iris.label) class(diris) getinfo(diris,'label') From 801a17fa02cf4357caf292724b876f0c8807e7dc Mon Sep 17 00:00:00 2001 From: hetong Date: Fri, 5 Sep 2014 19:47:58 -0700 Subject: [PATCH 2/5] fix iris to Rd files --- R-package/man/getinfo.Rd | 2 +- R-package/man/predict-xgb.Booster-method.Rd | 7 +- R-package/man/slice.Rd | 2 +- R-package/man/xgb.DMatrix.Rd | 2 +- R-package/man/xgb.DMatrix.save.Rd | 2 +- R-package/man/xgb.dump.Rd | 2 +- R-package/man/xgb.load.Rd | 2 +- R-package/man/xgb.save.Rd | 2 +- R-package/man/xgb.train.Rd | 2 +- R-package/man/xgboost.Rd | 2 +- R-package/vignettes/xgboost.aux | 28 ++ R-package/vignettes/xgboost.bbl | 24 ++ R-package/vignettes/xgboost.blg | 47 +++ R-package/vignettes/xgboost.out | 4 + R-package/vignettes/xgboost.tex | 319 ++++++++++++++++++++ 15 files changed, 435 insertions(+), 12 deletions(-) create mode 100644 R-package/vignettes/xgboost.aux create mode 100644 R-package/vignettes/xgboost.bbl create mode 100644 R-package/vignettes/xgboost.blg create mode 100644 R-package/vignettes/xgboost.out create mode 100644 R-package/vignettes/xgboost.tex diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index 05a25c152..7206d6b17 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -21,7 +21,7 @@ Get information of an xgb.DMatrix object } \examples{ data(iris) -iris[,5] <- as.numeric(iris[,5]) +iris[,5] <- as.numeric(iris[,5]=='setosa') dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) labels <- getinfo(dtrain, "label") } diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd index d192997d2..9c19b8f33 100644 --- a/R-package/man/predict-xgb.Booster-method.Rd +++ b/R-package/man/predict-xgb.Booster-method.Rd @@ -18,15 +18,16 @@ value of sum of functions, when outputmargin=TRUE, the prediction is untransformed margin value. In logistic regression, outputmargin=T will output value before logistic transformation.} -\item{ntreelimit}{limit number of trees used in prediction, this parameter is only valid for gbtree, but not for gblinear. -set it to be value bigger than 0. It will use all trees by default.} +\item{ntreelimit}{limit number of trees used in prediction, this parameter is +only valid for gbtree, but not for gblinear. set it to be value bigger +than 0. It will use all trees by default.} } \description{ Predicted values based on xgboost model object. 
} \examples{ data(iris) -bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2) +bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2) pred <- predict(bst, as.matrix(iris[,1:4])) } diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index 7acb14a32..a4d0a4568 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -23,7 +23,7 @@ orginal xgb.DMatrix object } \examples{ data(iris) -iris[,5] <- as.numeric(iris[,5]) +iris[,5] <- as.numeric(iris[,5]=='setosa') dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) dsub <- slice(dtrain, 1:3) } diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 166d69f68..ea7ff8ce6 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -20,7 +20,7 @@ Contruct xgb.DMatrix object from dense matrix, sparse matrix or local file. } \examples{ data(iris) -iris[,5] <- as.numeric(iris[,5]) +iris[,5] <- as.numeric(iris[,5]=='setosa') dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix') dtrain <- xgb.DMatrix('iris.xgb.DMatrix') diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd index e5e70501d..2692069dc 100644 --- a/R-package/man/xgb.DMatrix.save.Rd +++ b/R-package/man/xgb.DMatrix.save.Rd @@ -15,7 +15,7 @@ Save xgb.DMatrix object to binary file } \examples{ data(iris) -iris[,5] <- as.numeric(iris[,5]) +iris[,5] <- as.numeric(iris[,5]=='setosa') dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix') dtrain <- xgb.DMatrix('iris.xgb.DMatrix') diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 4d6933811..a4ac12cd4 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -21,7 +21,7 @@ Save a xgboost model to text file. Could be parsed later. } \examples{ data(iris) -bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2) +bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2) xgb.dump(bst, 'iris.xgb.model.dump') } diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 980daf88d..a8969c07d 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -13,7 +13,7 @@ Load xgboost model from the binary model file } \examples{ data(iris) -bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2) +bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2) xgb.save(bst, 'iris.xgb.model') bst <- xgb.load('iris.xgb.model') pred <- predict(bst, as.matrix(iris[,1:4])) diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index ba390d1b4..0dca58287 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -15,7 +15,7 @@ Save xgboost model from xgboost or xgb.train } \examples{ data(iris) -bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2) +bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2) xgb.save(bst, 'iris.xgb.model') bst <- xgb.load('iris.xgb.model') pred <- predict(bst, as.matrix(iris[,1:4])) diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 4da3b0013..75c43cd56 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -56,7 +56,7 @@ therefore it is more flexible than \code{\link{xgboost}}. 
} \examples{ data(iris) -iris[,5] <- as.numeric(iris[,5]) +iris[,5] <- as.numeric(iris[,5]=='setosa') dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5]) dtest <- dtrain watchlist <- list(eval = dtest, train = dtrain) diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 2b6c1a124..435423d28 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -46,7 +46,7 @@ Number of threads can also be manually specified via "nthread" parameter } \examples{ data(iris) -bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2) +bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'), nrounds = 2) pred <- predict(bst, as.matrix(iris[,1:4])) } diff --git a/R-package/vignettes/xgboost.aux b/R-package/vignettes/xgboost.aux new file mode 100644 index 000000000..6e6babc4c --- /dev/null +++ b/R-package/vignettes/xgboost.aux @@ -0,0 +1,28 @@ +\relax +\providecommand\hyper@newdestlabel[2]{} +\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} +\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined +\global\let\oldcontentsline\contentsline +\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} +\global\let\oldnewlabel\newlabel +\gdef\newlabel#1#2{\newlabelxx{#1}#2} +\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} +\AtEndDocument{\ifx\hyper@anchor\@undefined +\let\contentsline\oldcontentsline +\let\newlabel\oldnewlabel +\fi} +\fi} +\global\let\hyper@last\relax +\gdef\HyperFirstAtBeginDocument#1{#1} +\providecommand\HyField@AuxAddToFields[1]{} +\providecommand\HyField@AuxAddToCoFields[2]{} +\citation{friedman2001greedy} +\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}} +\@writefile{toc}{\contentsline {section}{\numberline {2}Example with iris}{1}{section.2}} +\@writefile{toc}{\contentsline {section}{\numberline {3}Advanced Examples}{2}{section.3}} +\bibstyle{jss} +\citation{*} +\bibdata{xgboost} +\bibcite{friedman2000additive}{{1}{2000}{{Friedman \emph {et~al.}}}{{Friedman, Hastie, Tibshirani \emph {et~al.}}}} +\bibcite{friedman2001greedy}{{2}{2001}{{Friedman}}{{}}} +\@writefile{toc}{\contentsline {section}{\numberline {4}The Higgs Boson competition}{3}{section.4}} diff --git a/R-package/vignettes/xgboost.bbl b/R-package/vignettes/xgboost.bbl new file mode 100644 index 000000000..fdf58e763 --- /dev/null +++ b/R-package/vignettes/xgboost.bbl @@ -0,0 +1,24 @@ +\begin{thebibliography}{2} +\newcommand{\enquote}[1]{``#1''} +\providecommand{\natexlab}[1]{#1} +\providecommand{\url}[1]{\texttt{#1}} +\providecommand{\urlprefix}{URL } +\expandafter\ifx\csname urlstyle\endcsname\relax + \providecommand{\doi}[1]{doi:\discretionary{}{}{}#1}\else + \providecommand{\doi}{doi:\discretionary{}{}{}\begingroup + \urlstyle{rm}\Url}\fi +\providecommand{\eprint}[2][]{\url{#2}} + +\bibitem[{Friedman \emph{et~al.}(2000)Friedman, Hastie, Tibshirani + \emph{et~al.}}]{friedman2000additive} +Friedman J, Hastie T, Tibshirani R, \emph{et~al.} (2000). +\newblock \enquote{Additive logistic regression: a statistical view of boosting + (with discussion and a rejoinder by the authors).} +\newblock \emph{The annals of statistics}, \textbf{28}(2), 337--407. + +\bibitem[{Friedman(2001)}]{friedman2001greedy} +Friedman JH (2001). +\newblock \enquote{Greedy function approximation: a gradient boosting machine.} +\newblock \emph{Annals of Statistics}, pp. 1189--1232. 
+ +\end{thebibliography} diff --git a/R-package/vignettes/xgboost.blg b/R-package/vignettes/xgboost.blg new file mode 100644 index 000000000..2c0e87387 --- /dev/null +++ b/R-package/vignettes/xgboost.blg @@ -0,0 +1,47 @@ +This is BibTeX, Version 0.99d (TeX Live 2013/Debian) +Capacity: max_strings=35307, hash_size=35307, hash_prime=30011 +The top-level auxiliary file: xgboost.aux +The style file: jss.bst +Database file #1: xgboost.bib +Reallocated wiz_functions (elt_size=4) to 6000 items from 3000. +You've used 2 entries, + 3140 wiz_defined-function locations, + 641 strings with 5430 characters, +and the built_in function-call counts, 1920 in all, are: += -- 162 +> -- 44 +< -- 2 ++ -- 17 +- -- 15 +* -- 149 +:= -- 256 +add.period$ -- 8 +call.type$ -- 2 +change.case$ -- 12 +chr.to.int$ -- 2 +cite$ -- 2 +duplicate$ -- 171 +empty$ -- 175 +format.name$ -- 19 +if$ -- 395 +int.to.chr$ -- 1 +int.to.str$ -- 1 +missing$ -- 24 +newline$ -- 21 +num.names$ -- 8 +pop$ -- 51 +preamble$ -- 1 +purify$ -- 12 +quote$ -- 0 +skip$ -- 53 +stack$ -- 0 +substring$ -- 181 +swap$ -- 65 +text.length$ -- 1 +text.prefix$ -- 0 +top$ -- 0 +type$ -- 18 +warning$ -- 0 +while$ -- 16 +width$ -- 0 +write$ -- 36 diff --git a/R-package/vignettes/xgboost.out b/R-package/vignettes/xgboost.out new file mode 100644 index 000000000..6d60796a3 --- /dev/null +++ b/R-package/vignettes/xgboost.out @@ -0,0 +1,4 @@ +\BOOKMARK [1][-]{section.1}{Introduction}{}% 1 +\BOOKMARK [1][-]{section.2}{Example with iris}{}% 2 +\BOOKMARK [1][-]{section.3}{Advanced Examples}{}% 3 +\BOOKMARK [1][-]{section.4}{The Higgs Boson competition}{}% 4 diff --git a/R-package/vignettes/xgboost.tex b/R-package/vignettes/xgboost.tex new file mode 100644 index 000000000..0ed4015b7 --- /dev/null +++ b/R-package/vignettes/xgboost.tex @@ -0,0 +1,319 @@ +\documentclass{article}\usepackage[]{graphicx}\usepackage[]{color} +%% maxwidth is the original width if it is less than linewidth +%% otherwise use linewidth (to make sure the graphics do not exceed the margin) +\makeatletter +\def\maxwidth{ % + \ifdim\Gin@nat@width>\linewidth + \linewidth + \else + \Gin@nat@width + \fi +} +\makeatother + +\definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345} +\newcommand{\hlnum}[1]{\textcolor[rgb]{0.686,0.059,0.569}{#1}}% +\newcommand{\hlstr}[1]{\textcolor[rgb]{0.192,0.494,0.8}{#1}}% +\newcommand{\hlcom}[1]{\textcolor[rgb]{0.678,0.584,0.686}{\textit{#1}}}% +\newcommand{\hlopt}[1]{\textcolor[rgb]{0,0,0}{#1}}% +\newcommand{\hlstd}[1]{\textcolor[rgb]{0.345,0.345,0.345}{#1}}% +\newcommand{\hlkwa}[1]{\textcolor[rgb]{0.161,0.373,0.58}{\textbf{#1}}}% +\newcommand{\hlkwb}[1]{\textcolor[rgb]{0.69,0.353,0.396}{#1}}% +\newcommand{\hlkwc}[1]{\textcolor[rgb]{0.333,0.667,0.333}{#1}}% +\newcommand{\hlkwd}[1]{\textcolor[rgb]{0.737,0.353,0.396}{\textbf{#1}}}% + +\usepackage{framed} +\makeatletter +\newenvironment{kframe}{% + \def\at@end@of@kframe{}% + \ifinner\ifhmode% + \def\at@end@of@kframe{\end{minipage}}% + \begin{minipage}{\columnwidth}% + \fi\fi% + \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep + \colorbox{shadecolor}{##1}\hskip-\fboxsep + % There is no \\@totalrightmargin, so: + \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}% + \MakeFramed {\advance\hsize-\width + \@totalleftmargin\z@ \linewidth\hsize + \@setminipage}}% + {\par\unskip\endMakeFramed% + \at@end@of@kframe} +\makeatother + +\definecolor{shadecolor}{rgb}{.97, .97, .97} +\definecolor{messagecolor}{rgb}{0, 0, 0} +\definecolor{warningcolor}{rgb}{1, 0, 1} +\definecolor{errorcolor}{rgb}{1, 0, 0} 
+\newenvironment{knitrout}{}{} % an empty environment to be redefined in TeX + +\usepackage{alltt} +\RequirePackage{url} +\usepackage{hyperref} +\RequirePackage{amsmath} +\RequirePackage{natbib} +\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry} + +\makeatletter +% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting} +%\VignetteKeywords{xgboost, gbm, gradient boosting machines} +%\VignettePackage{xgboost} +% \VignetteEngine{knitr::knitr} +\makeatother +\IfFileExists{upquote.sty}{\usepackage{upquote}}{} +\begin{document} +%\SweaveOpts{concordance=TRUE} + + + +% + +% + + \begin{center} + \vspace*{6\baselineskip} + \rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt} + \rule{\textwidth}{0.4pt}\\[2\baselineskip] + {\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip] + \rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt} + \rule{\textwidth}{1.6pt}\\[2\baselineskip] + {\Large Tianqi Chen, Tong He}\\[\baselineskip] + {\large Package Version: 0.3-0}\\[\baselineskip] + {\large \today}\par + \vfill + \end{center} + +\thispagestyle{empty} + +\clearpage + +\setcounter{page}{1} + +\section{Introduction} + +This is an introductory document of using the \verb@xgboost@ package in R. + +\verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an efficient + and scalable implementation of gradient boosting framework by \citep{friedman2001greedy}. +The package includes efficient linear model solver and tree learning algorithm. +It supports various objective functions, including regression, classification +and ranking. The package is made to be extendible, so that users are also allowed to define their own objectives easily. It has several features: +\begin{enumerate} + \item{Speed: }{\verb@xgboost@ can automatically do parallel computation on + Windows and Linux, with openmp. It is generally over 10 times faster than + \verb@gbm@.} + \item{Input Type: }{\verb@xgboost@ takes several types of input data:} + \begin{itemize} + \item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@} + \item{Sparse Matrix: }{R's sparse matrix \verb@Matrix::dgCMatrix@} + \item{Data File: }{Local data files} + \item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.} + \end{itemize} + \item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster + and linear booster, and is optimized for sparse input.} + \item{Customization: }{\verb@xgboost@ supports customized objective function + and evaluation function} + \item{Performance: }{\verb@xgboost@ has better performance on several different + datasets.} +\end{enumerate} + + +\section{Example with iris} + +In this section, we will illustrate some common usage of \verb@xgboost@. 
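+
+Note that \verb@iris@ contains three species, while these walkthroughs train
+with the default objective. We therefore first collapse the label into a
+binary indicator for \verb@setosa@; that conversion is the fix this patch
+series applies to every example. A multiclass variant is sketched below
+(not run; it assumes the \verb@multi:softmax@ objective and a
+\verb@num_class@ parameter are reachable from the R interface, with
+zero-based class labels):
+
+\begin{verbatim}
+# hypothetical multiclass setup: labels must lie in 0..(num_class-1)
+labels <- as.numeric(iris[, 5]) - 1
+bst.mc <- xgboost(as.matrix(iris[, 1:4]), labels, nrounds = 5,
+                  params = list(objective = "multi:softmax", num_class = 3))
+\end{verbatim}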
+ +\begin{knitrout} +\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} +\begin{alltt} +\hlkwd{library}\hlstd{(xgboost)} +\hlkwd{data}\hlstd{(iris)} +\hlstd{bst} \hlkwb{<-} \hlkwd{xgboost}\hlstd{(}\hlkwd{as.matrix}\hlstd{(iris[,}\hlnum{1}\hlopt{:}\hlnum{4}\hlstd{]),}\hlkwd{as.numeric}\hlstd{(iris[,}\hlnum{5}\hlstd{]}\hlopt{==}\hlstr{'setosa'}\hlstd{),} + \hlkwc{nrounds} \hlstd{=} \hlnum{5}\hlstd{)} +\end{alltt} +\begin{verbatim} +## [0] train-rmse:0.351971 +## [1] train-rmse:0.247769 +## [2] train-rmse:0.174418 +## [3] train-rmse:0.122783 +## [4] train-rmse:0.086435 +\end{verbatim} +\begin{alltt} +\hlkwd{xgb.save}\hlstd{(bst,} \hlstr{'model.save'}\hlstd{)} +\end{alltt} +\begin{verbatim} +## [1] TRUE +\end{verbatim} +\begin{alltt} +\hlstd{bst} \hlkwb{=} \hlkwd{xgb.load}\hlstd{(}\hlstr{'model.save'}\hlstd{)} +\hlstd{pred} \hlkwb{<-} \hlkwd{predict}\hlstd{(bst,} \hlkwd{as.matrix}\hlstd{(iris[,}\hlnum{1}\hlopt{:}\hlnum{4}\hlstd{]))} +\end{alltt} +\end{kframe} +\end{knitrout} + +\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model. +\verb@predict@ does prediction on the model. + +Here we can save the model to a binary local file, and load it when needed. +We can't inspect the trees inside. However we have another function to save the +model in plain text. +\begin{knitrout} +\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} +\begin{alltt} +\hlkwd{xgb.dump}\hlstd{(bst,} \hlstr{'model.dump'}\hlstd{)} +\end{alltt} +\begin{verbatim} +## [1] TRUE +\end{verbatim} +\end{kframe} +\end{knitrout} + +The output looks like + +\begin{verbatim} +booster[0]: +0:[f2<2.45] yes=1,no=2,missing=1 + 1:leaf=0.147059 + 2:[f3<1.65] yes=3,no=4,missing=3 + 3:leaf=0.464151 + 4:leaf=0.722449 +booster[1]: +0:[f2<2.45] yes=1,no=2,missing=1 + 1:leaf=0.103806 + 2:[f2<4.85] yes=3,no=4,missing=3 + 3:leaf=0.316341 + 4:leaf=0.510365 +\end{verbatim} + +It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@. +It speeds up \verb@xgboost@, and is needed for advanced features such as +training from initial prediction value, weighted training instance. + +We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object: +\begin{knitrout} +\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} +\begin{alltt} +\hlstd{iris.mat} \hlkwb{<-} \hlkwd{as.matrix}\hlstd{(iris[,}\hlnum{1}\hlopt{:}\hlnum{4}\hlstd{])} +\hlstd{iris.label} \hlkwb{<-} \hlkwd{as.numeric}\hlstd{(iris[,}\hlnum{5}\hlstd{]}\hlopt{==}\hlstr{'setosa'}\hlstd{)} +\hlstd{diris} \hlkwb{<-} \hlkwd{xgb.DMatrix}\hlstd{(iris.mat,} \hlkwc{label} \hlstd{= iris.label)} +\hlkwd{class}\hlstd{(diris)} +\end{alltt} +\begin{verbatim} +## [1] "xgb.DMatrix" +\end{verbatim} +\begin{alltt} +\hlkwd{getinfo}\hlstd{(diris,}\hlstr{'label'}\hlstd{)} +\end{alltt} +\begin{verbatim} +## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [71] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [106] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [141] 0 0 0 0 0 0 0 0 0 0 +\end{verbatim} +\end{kframe} +\end{knitrout} + +We can also save the matrix to a binary file. 
Then load it simply with +\verb@xgb.DMatrix@ +\begin{knitrout} +\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} +\begin{alltt} +\hlkwd{xgb.DMatrix.save}\hlstd{(diris,} \hlstr{'iris.xgb.DMatrix'}\hlstd{)} +\end{alltt} +\begin{verbatim} +## 150x4 matrix with 600 entries is saved to iris.xgb.DMatrix +## [1] TRUE +\end{verbatim} +\begin{alltt} +\hlstd{diris} \hlkwb{=} \hlkwd{xgb.DMatrix}\hlstd{(}\hlstr{'iris.xgb.DMatrix'}\hlstd{)} +\end{alltt} +\begin{verbatim} +## 150x4 matrix with 600 entries is loaded from iris.xgb.DMatrix +\end{verbatim} +\end{kframe} +\end{knitrout} + +\section{Advanced Examples} + +The function \verb@xgboost@ is a simple function with less parameter, in order +to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires users to read the document a bit more carefully. + +\verb@xgb.train@ only accept a \verb@xgb.DMatrix@ object as its input, while it supports advanced features as custom objective and evaluation functions. + +\begin{knitrout} +\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} +\begin{alltt} +\hlstd{logregobj} \hlkwb{<-} \hlkwa{function}\hlstd{(}\hlkwc{preds}\hlstd{,} \hlkwc{dtrain}\hlstd{) \{} + \hlstd{labels} \hlkwb{<-} \hlkwd{getinfo}\hlstd{(dtrain,} \hlstr{"label"}\hlstd{)} + \hlstd{preds} \hlkwb{<-} \hlnum{1}\hlopt{/}\hlstd{(}\hlnum{1} \hlopt{+} \hlkwd{exp}\hlstd{(}\hlopt{-}\hlstd{preds))} + \hlstd{grad} \hlkwb{<-} \hlstd{preds} \hlopt{-} \hlstd{labels} + \hlstd{hess} \hlkwb{<-} \hlstd{preds} \hlopt{*} \hlstd{(}\hlnum{1} \hlopt{-} \hlstd{preds)} + \hlkwd{return}\hlstd{(}\hlkwd{list}\hlstd{(}\hlkwc{grad} \hlstd{= grad,} \hlkwc{hess} \hlstd{= hess))} +\hlstd{\}} + +\hlstd{evalerror} \hlkwb{<-} \hlkwa{function}\hlstd{(}\hlkwc{preds}\hlstd{,} \hlkwc{dtrain}\hlstd{) \{} + \hlstd{labels} \hlkwb{<-} \hlkwd{getinfo}\hlstd{(dtrain,} \hlstr{"label"}\hlstd{)} + \hlstd{err} \hlkwb{<-} \hlkwd{sqrt}\hlstd{(}\hlkwd{mean}\hlstd{((preds}\hlopt{-}\hlstd{labels)}\hlopt{^}\hlnum{2}\hlstd{))} + \hlkwd{return}\hlstd{(}\hlkwd{list}\hlstd{(}\hlkwc{metric} \hlstd{=} \hlstr{"MSE"}\hlstd{,} \hlkwc{value} \hlstd{= err))} +\hlstd{\}} + +\hlstd{dtest} \hlkwb{<-} \hlkwd{slice}\hlstd{(diris,}\hlnum{1}\hlopt{:}\hlnum{100}\hlstd{)} +\hlstd{watchlist} \hlkwb{<-} \hlkwd{list}\hlstd{(}\hlkwc{eval} \hlstd{= dtest,} \hlkwc{train} \hlstd{= diris)} +\hlstd{param} \hlkwb{<-} \hlkwd{list}\hlstd{(}\hlkwc{max_depth} \hlstd{=} \hlnum{2}\hlstd{,} \hlkwc{eta} \hlstd{=} \hlnum{1}\hlstd{,} \hlkwc{silent} \hlstd{=} \hlnum{1}\hlstd{)} + +\hlstd{bst} \hlkwb{<-} \hlkwd{xgb.train}\hlstd{(param, diris,} \hlkwc{nround} \hlstd{=} \hlnum{2}\hlstd{, watchlist, logregobj, evalerror)} +\end{alltt} +\begin{verbatim} +## [1] eval-MSE:1.601 train-MSE:1.76 +## [2] eval-MSE:2.567 train-MSE:2.745 +\end{verbatim} +\end{kframe} +\end{knitrout} + +The gradient and second order gradient is required for the output of customized +objective function. + +We also have \verb@slice@ for row extraction. It is useful in +cross-validation. + +For a walkthrough demo, please see \verb@R-package/inst/examples/demo.R@ for further +details. + +\section{The Higgs Boson competition} + +We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs +Boson Machine Learning Challenge}. + +Here are the instructions to make a submission +\begin{enumerate} + \item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets} + and extract them to \verb@data/@. 
+ \item Run scripts under \verb@xgboost/demo/kaggle-higgs/@: + \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{higgs-train.R} + and \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-pred.R}{higgs-pred.R}. + The computation will take less than a minute on Intel i7. + \item Go to the \href{http://www.kaggle.com/c/higgs-boson/submissions/attach}{submission page} + and submit your result. +\end{enumerate} + +We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script} +to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@. +The training set contains 350000 records and 30 features. + +\verb@xgboost@ can automatically do parallel computation. On a machine with Intel +i7-4700MQ and 24GB memories, we found that \verb@xgboost@ costs about 35 seconds, which is about 20 times faster +than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was +still about two times faster than \verb@gbm@. + +Meanwhile, the result from \verb@xgboost@ reaches +\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a +single model. This results stands in the +\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the +competition. + +\bibliographystyle{jss} +\nocite{*} % list uncited references +\bibliography{xgboost} + +\end{document} + From de08c5a3da691dd2be10ee4d45c5457eeb31e740 Mon Sep 17 00:00:00 2001 From: hetong Date: Fri, 5 Sep 2014 19:49:25 -0700 Subject: [PATCH 3/5] remove temp files --- R-package/vignettes/xgboost.aux | 28 --- R-package/vignettes/xgboost.bbl | 24 --- R-package/vignettes/xgboost.blg | 47 ----- R-package/vignettes/xgboost.out | 4 - R-package/vignettes/xgboost.tex | 319 -------------------------------- 5 files changed, 422 deletions(-) delete mode 100644 R-package/vignettes/xgboost.aux delete mode 100644 R-package/vignettes/xgboost.bbl delete mode 100644 R-package/vignettes/xgboost.blg delete mode 100644 R-package/vignettes/xgboost.out delete mode 100644 R-package/vignettes/xgboost.tex diff --git a/R-package/vignettes/xgboost.aux b/R-package/vignettes/xgboost.aux deleted file mode 100644 index 6e6babc4c..000000000 --- a/R-package/vignettes/xgboost.aux +++ /dev/null @@ -1,28 +0,0 @@ -\relax -\providecommand\hyper@newdestlabel[2]{} -\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} -\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined -\global\let\oldcontentsline\contentsline -\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} -\global\let\oldnewlabel\newlabel -\gdef\newlabel#1#2{\newlabelxx{#1}#2} -\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} -\AtEndDocument{\ifx\hyper@anchor\@undefined -\let\contentsline\oldcontentsline -\let\newlabel\oldnewlabel -\fi} -\fi} -\global\let\hyper@last\relax -\gdef\HyperFirstAtBeginDocument#1{#1} -\providecommand\HyField@AuxAddToFields[1]{} -\providecommand\HyField@AuxAddToCoFields[2]{} -\citation{friedman2001greedy} -\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}} -\@writefile{toc}{\contentsline {section}{\numberline {2}Example with iris}{1}{section.2}} -\@writefile{toc}{\contentsline {section}{\numberline {3}Advanced Examples}{2}{section.3}} -\bibstyle{jss} -\citation{*} -\bibdata{xgboost} -\bibcite{friedman2000additive}{{1}{2000}{{Friedman \emph {et~al.}}}{{Friedman, Hastie, Tibshirani \emph {et~al.}}}} -\bibcite{friedman2001greedy}{{2}{2001}{{Friedman}}{{}}} -\@writefile{toc}{\contentsline {section}{\numberline 
{4}The Higgs Boson competition}{3}{section.4}} diff --git a/R-package/vignettes/xgboost.bbl b/R-package/vignettes/xgboost.bbl deleted file mode 100644 index fdf58e763..000000000 --- a/R-package/vignettes/xgboost.bbl +++ /dev/null @@ -1,24 +0,0 @@ -\begin{thebibliography}{2} -\newcommand{\enquote}[1]{``#1''} -\providecommand{\natexlab}[1]{#1} -\providecommand{\url}[1]{\texttt{#1}} -\providecommand{\urlprefix}{URL } -\expandafter\ifx\csname urlstyle\endcsname\relax - \providecommand{\doi}[1]{doi:\discretionary{}{}{}#1}\else - \providecommand{\doi}{doi:\discretionary{}{}{}\begingroup - \urlstyle{rm}\Url}\fi -\providecommand{\eprint}[2][]{\url{#2}} - -\bibitem[{Friedman \emph{et~al.}(2000)Friedman, Hastie, Tibshirani - \emph{et~al.}}]{friedman2000additive} -Friedman J, Hastie T, Tibshirani R, \emph{et~al.} (2000). -\newblock \enquote{Additive logistic regression: a statistical view of boosting - (with discussion and a rejoinder by the authors).} -\newblock \emph{The annals of statistics}, \textbf{28}(2), 337--407. - -\bibitem[{Friedman(2001)}]{friedman2001greedy} -Friedman JH (2001). -\newblock \enquote{Greedy function approximation: a gradient boosting machine.} -\newblock \emph{Annals of Statistics}, pp. 1189--1232. - -\end{thebibliography} diff --git a/R-package/vignettes/xgboost.blg b/R-package/vignettes/xgboost.blg deleted file mode 100644 index 2c0e87387..000000000 --- a/R-package/vignettes/xgboost.blg +++ /dev/null @@ -1,47 +0,0 @@ -This is BibTeX, Version 0.99d (TeX Live 2013/Debian) -Capacity: max_strings=35307, hash_size=35307, hash_prime=30011 -The top-level auxiliary file: xgboost.aux -The style file: jss.bst -Database file #1: xgboost.bib -Reallocated wiz_functions (elt_size=4) to 6000 items from 3000. -You've used 2 entries, - 3140 wiz_defined-function locations, - 641 strings with 5430 characters, -and the built_in function-call counts, 1920 in all, are: -= -- 162 -> -- 44 -< -- 2 -+ -- 17 -- -- 15 -* -- 149 -:= -- 256 -add.period$ -- 8 -call.type$ -- 2 -change.case$ -- 12 -chr.to.int$ -- 2 -cite$ -- 2 -duplicate$ -- 171 -empty$ -- 175 -format.name$ -- 19 -if$ -- 395 -int.to.chr$ -- 1 -int.to.str$ -- 1 -missing$ -- 24 -newline$ -- 21 -num.names$ -- 8 -pop$ -- 51 -preamble$ -- 1 -purify$ -- 12 -quote$ -- 0 -skip$ -- 53 -stack$ -- 0 -substring$ -- 181 -swap$ -- 65 -text.length$ -- 1 -text.prefix$ -- 0 -top$ -- 0 -type$ -- 18 -warning$ -- 0 -while$ -- 16 -width$ -- 0 -write$ -- 36 diff --git a/R-package/vignettes/xgboost.out b/R-package/vignettes/xgboost.out deleted file mode 100644 index 6d60796a3..000000000 --- a/R-package/vignettes/xgboost.out +++ /dev/null @@ -1,4 +0,0 @@ -\BOOKMARK [1][-]{section.1}{Introduction}{}% 1 -\BOOKMARK [1][-]{section.2}{Example with iris}{}% 2 -\BOOKMARK [1][-]{section.3}{Advanced Examples}{}% 3 -\BOOKMARK [1][-]{section.4}{The Higgs Boson competition}{}% 4 diff --git a/R-package/vignettes/xgboost.tex b/R-package/vignettes/xgboost.tex deleted file mode 100644 index 0ed4015b7..000000000 --- a/R-package/vignettes/xgboost.tex +++ /dev/null @@ -1,319 +0,0 @@ -\documentclass{article}\usepackage[]{graphicx}\usepackage[]{color} -%% maxwidth is the original width if it is less than linewidth -%% otherwise use linewidth (to make sure the graphics do not exceed the margin) -\makeatletter -\def\maxwidth{ % - \ifdim\Gin@nat@width>\linewidth - \linewidth - \else - \Gin@nat@width - \fi -} -\makeatother - -\definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345} -\newcommand{\hlnum}[1]{\textcolor[rgb]{0.686,0.059,0.569}{#1}}% 
-\newcommand{\hlstr}[1]{\textcolor[rgb]{0.192,0.494,0.8}{#1}}% -\newcommand{\hlcom}[1]{\textcolor[rgb]{0.678,0.584,0.686}{\textit{#1}}}% -\newcommand{\hlopt}[1]{\textcolor[rgb]{0,0,0}{#1}}% -\newcommand{\hlstd}[1]{\textcolor[rgb]{0.345,0.345,0.345}{#1}}% -\newcommand{\hlkwa}[1]{\textcolor[rgb]{0.161,0.373,0.58}{\textbf{#1}}}% -\newcommand{\hlkwb}[1]{\textcolor[rgb]{0.69,0.353,0.396}{#1}}% -\newcommand{\hlkwc}[1]{\textcolor[rgb]{0.333,0.667,0.333}{#1}}% -\newcommand{\hlkwd}[1]{\textcolor[rgb]{0.737,0.353,0.396}{\textbf{#1}}}% - -\usepackage{framed} -\makeatletter -\newenvironment{kframe}{% - \def\at@end@of@kframe{}% - \ifinner\ifhmode% - \def\at@end@of@kframe{\end{minipage}}% - \begin{minipage}{\columnwidth}% - \fi\fi% - \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep - \colorbox{shadecolor}{##1}\hskip-\fboxsep - % There is no \\@totalrightmargin, so: - \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}% - \MakeFramed {\advance\hsize-\width - \@totalleftmargin\z@ \linewidth\hsize - \@setminipage}}% - {\par\unskip\endMakeFramed% - \at@end@of@kframe} -\makeatother - -\definecolor{shadecolor}{rgb}{.97, .97, .97} -\definecolor{messagecolor}{rgb}{0, 0, 0} -\definecolor{warningcolor}{rgb}{1, 0, 1} -\definecolor{errorcolor}{rgb}{1, 0, 0} -\newenvironment{knitrout}{}{} % an empty environment to be redefined in TeX - -\usepackage{alltt} -\RequirePackage{url} -\usepackage{hyperref} -\RequirePackage{amsmath} -\RequirePackage{natbib} -\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry} - -\makeatletter -% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting} -%\VignetteKeywords{xgboost, gbm, gradient boosting machines} -%\VignettePackage{xgboost} -% \VignetteEngine{knitr::knitr} -\makeatother -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\begin{document} -%\SweaveOpts{concordance=TRUE} - - - -% - -% - - \begin{center} - \vspace*{6\baselineskip} - \rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt} - \rule{\textwidth}{0.4pt}\\[2\baselineskip] - {\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip] - \rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt} - \rule{\textwidth}{1.6pt}\\[2\baselineskip] - {\Large Tianqi Chen, Tong He}\\[\baselineskip] - {\large Package Version: 0.3-0}\\[\baselineskip] - {\large \today}\par - \vfill - \end{center} - -\thispagestyle{empty} - -\clearpage - -\setcounter{page}{1} - -\section{Introduction} - -This is an introductory document of using the \verb@xgboost@ package in R. - -\verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an efficient - and scalable implementation of gradient boosting framework by \citep{friedman2001greedy}. -The package includes efficient linear model solver and tree learning algorithm. -It supports various objective functions, including regression, classification -and ranking. The package is made to be extendible, so that users are also allowed to define their own objectives easily. It has several features: -\begin{enumerate} - \item{Speed: }{\verb@xgboost@ can automatically do parallel computation on - Windows and Linux, with openmp. It is generally over 10 times faster than - \verb@gbm@.} - \item{Input Type: }{\verb@xgboost@ takes several types of input data:} - \begin{itemize} - \item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@} - \item{Sparse Matrix: }{R's sparse matrix \verb@Matrix::dgCMatrix@} - \item{Data File: }{Local data files} - \item{xgb.DMatrix: }{\verb@xgboost@'s own class. 
Recommended.} - \end{itemize} - \item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster - and linear booster, and is optimized for sparse input.} - \item{Customization: }{\verb@xgboost@ supports customized objective function - and evaluation function} - \item{Performance: }{\verb@xgboost@ has better performance on several different - datasets.} -\end{enumerate} - - -\section{Example with iris} - -In this section, we will illustrate some common usage of \verb@xgboost@. - -\begin{knitrout} -\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} -\begin{alltt} -\hlkwd{library}\hlstd{(xgboost)} -\hlkwd{data}\hlstd{(iris)} -\hlstd{bst} \hlkwb{<-} \hlkwd{xgboost}\hlstd{(}\hlkwd{as.matrix}\hlstd{(iris[,}\hlnum{1}\hlopt{:}\hlnum{4}\hlstd{]),}\hlkwd{as.numeric}\hlstd{(iris[,}\hlnum{5}\hlstd{]}\hlopt{==}\hlstr{'setosa'}\hlstd{),} - \hlkwc{nrounds} \hlstd{=} \hlnum{5}\hlstd{)} -\end{alltt} -\begin{verbatim} -## [0] train-rmse:0.351971 -## [1] train-rmse:0.247769 -## [2] train-rmse:0.174418 -## [3] train-rmse:0.122783 -## [4] train-rmse:0.086435 -\end{verbatim} -\begin{alltt} -\hlkwd{xgb.save}\hlstd{(bst,} \hlstr{'model.save'}\hlstd{)} -\end{alltt} -\begin{verbatim} -## [1] TRUE -\end{verbatim} -\begin{alltt} -\hlstd{bst} \hlkwb{=} \hlkwd{xgb.load}\hlstd{(}\hlstr{'model.save'}\hlstd{)} -\hlstd{pred} \hlkwb{<-} \hlkwd{predict}\hlstd{(bst,} \hlkwd{as.matrix}\hlstd{(iris[,}\hlnum{1}\hlopt{:}\hlnum{4}\hlstd{]))} -\end{alltt} -\end{kframe} -\end{knitrout} - -\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model. -\verb@predict@ does prediction on the model. - -Here we can save the model to a binary local file, and load it when needed. -We can't inspect the trees inside. However we have another function to save the -model in plain text. -\begin{knitrout} -\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} -\begin{alltt} -\hlkwd{xgb.dump}\hlstd{(bst,} \hlstr{'model.dump'}\hlstd{)} -\end{alltt} -\begin{verbatim} -## [1] TRUE -\end{verbatim} -\end{kframe} -\end{knitrout} - -The output looks like - -\begin{verbatim} -booster[0]: -0:[f2<2.45] yes=1,no=2,missing=1 - 1:leaf=0.147059 - 2:[f3<1.65] yes=3,no=4,missing=3 - 3:leaf=0.464151 - 4:leaf=0.722449 -booster[1]: -0:[f2<2.45] yes=1,no=2,missing=1 - 1:leaf=0.103806 - 2:[f2<4.85] yes=3,no=4,missing=3 - 3:leaf=0.316341 - 4:leaf=0.510365 -\end{verbatim} - -It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@. -It speeds up \verb@xgboost@, and is needed for advanced features such as -training from initial prediction value, weighted training instance. 
- -We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object: -\begin{knitrout} -\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} -\begin{alltt} -\hlstd{iris.mat} \hlkwb{<-} \hlkwd{as.matrix}\hlstd{(iris[,}\hlnum{1}\hlopt{:}\hlnum{4}\hlstd{])} -\hlstd{iris.label} \hlkwb{<-} \hlkwd{as.numeric}\hlstd{(iris[,}\hlnum{5}\hlstd{]}\hlopt{==}\hlstr{'setosa'}\hlstd{)} -\hlstd{diris} \hlkwb{<-} \hlkwd{xgb.DMatrix}\hlstd{(iris.mat,} \hlkwc{label} \hlstd{= iris.label)} -\hlkwd{class}\hlstd{(diris)} -\end{alltt} -\begin{verbatim} -## [1] "xgb.DMatrix" -\end{verbatim} -\begin{alltt} -\hlkwd{getinfo}\hlstd{(diris,}\hlstr{'label'}\hlstd{)} -\end{alltt} -\begin{verbatim} -## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 -## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -## [71] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -## [106] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -## [141] 0 0 0 0 0 0 0 0 0 0 -\end{verbatim} -\end{kframe} -\end{knitrout} - -We can also save the matrix to a binary file. Then load it simply with -\verb@xgb.DMatrix@ -\begin{knitrout} -\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} -\begin{alltt} -\hlkwd{xgb.DMatrix.save}\hlstd{(diris,} \hlstr{'iris.xgb.DMatrix'}\hlstd{)} -\end{alltt} -\begin{verbatim} -## 150x4 matrix with 600 entries is saved to iris.xgb.DMatrix -## [1] TRUE -\end{verbatim} -\begin{alltt} -\hlstd{diris} \hlkwb{=} \hlkwd{xgb.DMatrix}\hlstd{(}\hlstr{'iris.xgb.DMatrix'}\hlstd{)} -\end{alltt} -\begin{verbatim} -## 150x4 matrix with 600 entries is loaded from iris.xgb.DMatrix -\end{verbatim} -\end{kframe} -\end{knitrout} - -\section{Advanced Examples} - -The function \verb@xgboost@ is a simple function with less parameter, in order -to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires users to read the document a bit more carefully. - -\verb@xgb.train@ only accept a \verb@xgb.DMatrix@ object as its input, while it supports advanced features as custom objective and evaluation functions. 
- -\begin{knitrout} -\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe} -\begin{alltt} -\hlstd{logregobj} \hlkwb{<-} \hlkwa{function}\hlstd{(}\hlkwc{preds}\hlstd{,} \hlkwc{dtrain}\hlstd{) \{} - \hlstd{labels} \hlkwb{<-} \hlkwd{getinfo}\hlstd{(dtrain,} \hlstr{"label"}\hlstd{)} - \hlstd{preds} \hlkwb{<-} \hlnum{1}\hlopt{/}\hlstd{(}\hlnum{1} \hlopt{+} \hlkwd{exp}\hlstd{(}\hlopt{-}\hlstd{preds))} - \hlstd{grad} \hlkwb{<-} \hlstd{preds} \hlopt{-} \hlstd{labels} - \hlstd{hess} \hlkwb{<-} \hlstd{preds} \hlopt{*} \hlstd{(}\hlnum{1} \hlopt{-} \hlstd{preds)} - \hlkwd{return}\hlstd{(}\hlkwd{list}\hlstd{(}\hlkwc{grad} \hlstd{= grad,} \hlkwc{hess} \hlstd{= hess))} -\hlstd{\}} - -\hlstd{evalerror} \hlkwb{<-} \hlkwa{function}\hlstd{(}\hlkwc{preds}\hlstd{,} \hlkwc{dtrain}\hlstd{) \{} - \hlstd{labels} \hlkwb{<-} \hlkwd{getinfo}\hlstd{(dtrain,} \hlstr{"label"}\hlstd{)} - \hlstd{err} \hlkwb{<-} \hlkwd{sqrt}\hlstd{(}\hlkwd{mean}\hlstd{((preds}\hlopt{-}\hlstd{labels)}\hlopt{^}\hlnum{2}\hlstd{))} - \hlkwd{return}\hlstd{(}\hlkwd{list}\hlstd{(}\hlkwc{metric} \hlstd{=} \hlstr{"MSE"}\hlstd{,} \hlkwc{value} \hlstd{= err))} -\hlstd{\}} - -\hlstd{dtest} \hlkwb{<-} \hlkwd{slice}\hlstd{(diris,}\hlnum{1}\hlopt{:}\hlnum{100}\hlstd{)} -\hlstd{watchlist} \hlkwb{<-} \hlkwd{list}\hlstd{(}\hlkwc{eval} \hlstd{= dtest,} \hlkwc{train} \hlstd{= diris)} -\hlstd{param} \hlkwb{<-} \hlkwd{list}\hlstd{(}\hlkwc{max_depth} \hlstd{=} \hlnum{2}\hlstd{,} \hlkwc{eta} \hlstd{=} \hlnum{1}\hlstd{,} \hlkwc{silent} \hlstd{=} \hlnum{1}\hlstd{)} - -\hlstd{bst} \hlkwb{<-} \hlkwd{xgb.train}\hlstd{(param, diris,} \hlkwc{nround} \hlstd{=} \hlnum{2}\hlstd{, watchlist, logregobj, evalerror)} -\end{alltt} -\begin{verbatim} -## [1] eval-MSE:1.601 train-MSE:1.76 -## [2] eval-MSE:2.567 train-MSE:2.745 -\end{verbatim} -\end{kframe} -\end{knitrout} - -The gradient and second order gradient is required for the output of customized -objective function. - -We also have \verb@slice@ for row extraction. It is useful in -cross-validation. - -For a walkthrough demo, please see \verb@R-package/inst/examples/demo.R@ for further -details. - -\section{The Higgs Boson competition} - -We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs -Boson Machine Learning Challenge}. - -Here are the instructions to make a submission -\begin{enumerate} - \item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets} - and extract them to \verb@data/@. - \item Run scripts under \verb@xgboost/demo/kaggle-higgs/@: - \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{higgs-train.R} - and \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-pred.R}{higgs-pred.R}. - The computation will take less than a minute on Intel i7. - \item Go to the \href{http://www.kaggle.com/c/higgs-boson/submissions/attach}{submission page} - and submit your result. -\end{enumerate} - -We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script} -to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@. -The training set contains 350000 records and 30 features. - -\verb@xgboost@ can automatically do parallel computation. On a machine with Intel -i7-4700MQ and 24GB memories, we found that \verb@xgboost@ costs about 35 seconds, which is about 20 times faster -than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was -still about two times faster than \verb@gbm@. 
-
-Meanwhile, the result from \verb@xgboost@ reaches
-\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a
-single model. This results stands in the
-\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
-competition.
-
-\bibliographystyle{jss}
-\nocite{*} % list uncited references
-\bibliography{xgboost}
-
-\end{document}
-

From 63dd037db6224eb19e803660e6ef7cbbc8e5ff5d Mon Sep 17 00:00:00 2001
From: hetong
Date: Fri, 5 Sep 2014 20:25:38 -0700
Subject: [PATCH 4/5] add r basic walkthrough

---
 demo/guide-R/basic_walkthrough.R | 57 +++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 demo/guide-R/basic_walkthrough.R

diff --git a/demo/guide-R/basic_walkthrough.R b/demo/guide-R/basic_walkthrough.R
new file mode 100644
index 000000000..959e5f0ed
--- /dev/null
+++ b/demo/guide-R/basic_walkthrough.R
@@ -0,0 +1,57 @@
+require(xgboost)
+# Matrix is needed for the sparse dgCMatrix coercion below
+require(Matrix)
+
+dtrain <- xgb.DMatrix('../data/agaricus.txt.train')
+dtest <- xgb.DMatrix('../data/agaricus.txt.test')
+param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
+watchlist <- list(eval = dtest, train = dtrain)
+num_round <- 2
+bst <- xgb.train(param, dtrain, num_round, watchlist)
+preds <- predict(bst, dtest)
+labels <- getinfo(dtest,'label')
+cat('error=', mean(as.numeric(preds>0.5)!=labels),'\n')
+xgb.save(bst, 'xgb.model')
+xgb.dump(bst, 'dump.raw.txt')
+xgb.dump(bst, 'dump.nice.txt','../data/featmap.txt')
+
+bst2 <- xgb.load('xgb.model')
+preds2 <- predict(bst2,dtest)
+# the reloaded model must give identical predictions
+stopifnot(sum((preds-preds2)^2)==0)
+
+
+cat('start running example of building a DMatrix from a sparse dgCMatrix\n')
+# minimal libsvm reader: returns the label vector and a sparse feature matrix
+read.libsvm <- function(fname, maxcol) {
+  content <- readLines(fname)
+  nline <- length(content)
+  label <- numeric(nline)
+  mat <- matrix(0, nline, maxcol + 1)
+  for (i in 1:nline) {
+    arr <- as.vector(strsplit(content[i], " ")[[1]])
+    label[i] <- as.numeric(arr[[1]])
+    for (j in 2:length(arr)) {
+      kv <- strsplit(arr[j], ":")[[1]]
+      # shift by one to avoid the 0 index
+      findex <- as.integer(kv[1]) + 1
+      fvalue <- as.numeric(kv[2])
+      mat[i, findex] <- fvalue
+    }
+  }
+  mat <- as(mat, "sparseMatrix")
+  return(list(label = label, data = mat))
+}
+csc <- read.libsvm("../data/agaricus.txt.train", 126)
+y <- csc$label
+x <- csc$data
+class(x)
+dtrain <- xgb.DMatrix(x, label = y)
+bst <- xgb.train(param, dtrain, num_round, watchlist)
+
+cat('start running example of building a DMatrix from a dense matrix\n')
+x <- as.matrix(x)
+class(x)
+dtrain <- xgb.DMatrix(x, label = y)
+bst <- xgb.train(param, dtrain, num_round, watchlist)
+

From af07f5135a56a985c0f16fadb667de93d0873d29 Mon Sep 17 00:00:00 2001
From: hetong
Date: Fri, 5 Sep 2014 20:33:39 -0700
Subject: [PATCH 5/5] cleaning