commit 909a61edac

Merge branch 'master' of https://github.com/tqchen/xgboost into tqchen-master

Conflicts:
	README.md
.gitignore (vendored) · 6 changes
@@ -6,12 +6,15 @@
# Compiled Dynamic libraries
*.so
*.dylib

*.page
# Compiled Static libraries
*.lai
*.la
*.a
*~
*.Rcheck
*.rds
*.tar.gz
*txt*
*conf
*buffer

@@ -40,3 +43,4 @@ Debug
*x64
*dump
*save
*csv

CHANGES.md

@@ -11,7 +11,7 @@ xgboost-0.2x
* Weighted sample instances
* Initial version of pairwise rank

xgboost-unity
xgboost-0.3
=====
* Faster tree construction module
  - Allows subsampling columns during tree construction via `bst:col_samplebytree=ratio`

Makefile · 57 changes
@@ -1,32 +1,32 @@
export CC = gcc
export CXX = g++
export LDFLAGS= -pthread -lm
# note for R module
# add include path to Rinternals.h here

export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -pedantic

ifeq ($(no_omp),1)
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -DDISABLE_OPENMP
else
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
	CFLAGS += -DDISABLE_OPENMP
else
	CFLAGS += -fopenmp
endif

# expose these flags to R CMD SHLIB
export PKG_CPPFLAGS = $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_

# specify tensor path
BIN = xgboost
OBJ =
OBJ = updater.o gbm.o io.o
SLIB = wrapper/libxgboostwrapper.so
RLIB = wrapper/libxgboostR.so
.PHONY: clean all R

all: $(BIN) wrapper/libxgboostwrapper.so
R: wrapper/libxgboostR.so
.PHONY: clean all python Rpack

xgboost: src/xgboost_main.cpp src/io/io.cpp src/data.h src/tree/*.h src/tree/*.hpp src/gbm/*.h src/gbm/*.hpp src/utils/*.h src/learner/*.h src/learner/*.hpp
all: $(BIN) $(OBJ) $(SLIB)

python: wrapper/libxgboostwrapper.so
# now the wrapper takes in two files: the io and wrapper parts
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/io/io.cpp src/*.h src/*/*.hpp src/*/*.h
wrapper/libxgboostR.so: wrapper/xgboost_wrapper.cpp wrapper/xgboost_R.cpp src/io/io.cpp src/*.h src/*/*.hpp src/*/*.h
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)

$(BIN) :
	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)

@@ -34,14 +34,31 @@ $(BIN) :
$(SLIB) :
	$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)

$(RLIB) :
	R CMD SHLIB -c -o $@ $(filter %.cpp %.o %.c, $^)

$(OBJ) :
	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

install:
	cp -f -r $(BIN) $(INSTALL_PATH)

Rpack:
	make clean
	rm -rf xgboost xgboost*.tar.gz
	cp -r R-package xgboost
	rm -rf xgboost/inst/examples/*.buffer
	rm -rf xgboost/inst/examples/*.model
	rm -rf xgboost/inst/examples/dump*
	rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
	rm -rf xgboost/demo/*.model xgboost/demo/*.buffer
	cp -r src xgboost/src/src
	mkdir xgboost/src/wrapper
	cp wrapper/xgboost_wrapper.h xgboost/src/wrapper
	cp wrapper/xgboost_wrapper.cpp xgboost/src/wrapper
	cp ./LICENSE xgboost
	cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
	cat R-package/src/Makevars.win|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars.win
	R CMD build xgboost
	rm -rf xgboost
	R CMD check --as-cran xgboost*.tar.gz

clean:
	$(RM) $(OBJ) $(BIN) $(SLIB) $(RLIB) *~ */*~ */*/*~
	$(RM) $(OBJ) $(BIN) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~

R-package/DESCRIPTION

@@ -1,12 +1,20 @@
Package: xgboost
Type: Package
Title: R wrapper of xgboost
Version: 0.3-0
Title: eXtreme Gradient Boosting
Version: 0.3-1
Date: 2014-08-23
Author: Tianqi Chen
Maintainer: Tianqi Chen <tianqi.tchen@gmail.com>
Description: xgboost
License: See LICENSE file
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>
Maintainer: Tong He <hetong007@gmail.com>
Description: This package is an R wrapper of xgboost, which is short for eXtreme
    Gradient Boosting. It is an efficient and scalable implementation of the
    gradient boosting framework. The package includes an efficient linear model
    solver and tree learning algorithms. The package can automatically do
    parallel computation with OpenMP, and it can be more than 10 times faster
    than existing gradient boosting packages such as gbm. It supports various
    objective functions, including regression, classification and ranking. The
    package is made to be extensible, so that users are also allowed to define
    their own objectives easily.
License: Apache License (== 2.0) | file LICENSE
URL: https://github.com/tqchen/xgboost
BugReports: https://github.com/tqchen/xgboost/issues
Depends:

R-package/NAMESPACE

@@ -1,10 +1,15 @@
importClassesFrom("Matrix", dgCMatrix, dgeMatrix)
# Generated by roxygen2 (4.0.1): do not edit by hand

export(xgboost)
export(getinfo)
export(slice)
export(xgb.DMatrix)
export(xgb.getinfo)
exportMethods(predict)
export(xgb.train)
export(xgb.save)
export(xgb.load)
export(xgb.DMatrix.save)
export(xgb.dump)
export(xgb.load)
export(xgb.save)
export(xgb.train)
export(xgboost)
exportMethods(predict)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgeMatrix)

R-package/R/getinfo.xgb.DMatrix.R · new file, 38 lines
@@ -0,0 +1,38 @@
setClass('xgb.DMatrix')

#' Get information of an xgb.DMatrix object
#'
#' Get information of an xgb.DMatrix object
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' labels <- getinfo(dtrain, "label")
#' @rdname getinfo
#' @export
#'
getinfo <- function(object, ...){
  UseMethod("getinfo")
}

#' @param object Object of class "xgb.DMatrix"
#' @param name the name of the field to get
#' @param ... other parameters
#' @rdname getinfo
#' @method getinfo xgb.DMatrix
setMethod("getinfo", signature = "xgb.DMatrix",
          definition = function(object, name) {
            if (typeof(name) != "character") {
              stop("xgb.getinfo: name must be character")
            }
            if (class(object) != "xgb.DMatrix") {
              stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
            }
            if (name != "label" && name != "weight" && name != "base_margin") {
              stop(paste("xgb.getinfo: unknown info name", name))
            }
            ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
            return(ret)
          })

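Beyond "label", the validation branch above also accepts "weight" and "base_margin". A minimal sketch of retrieving one of the other fields (it assumes the new xgb.DMatrix signature that appears later in this commit, where extra info fields are passed through `...`):

```r
# Sketch only: attach weights at construction, then read them back with getinfo.
data(iris)
w <- rep(1, nrow(iris))  # uniform instance weights, purely for illustration
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]),
                      label = as.numeric(iris[, 5]), weight = w)
stopifnot(all(getinfo(dtrain, "weight") == w))
```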
@@ -1,16 +1,37 @@
#' @export
setClass("xgb.Booster")

#' Predict method for eXtreme Gradient Boosting model
#'
#' Predicted values based on an xgboost model object.
#'
#' @param object Object of class "xgb.Booster"
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
#'   \code{xgb.DMatrix}.
#' @param outputmargin whether the prediction should be shown as the original
#'   value of the sum of functions; when outputmargin=TRUE, the prediction is
#'   the untransformed margin value. In logistic regression, outputmargin=TRUE
#'   will output the value before the logistic transformation.
#' @param ntreelimit limit the number of trees used in prediction; this parameter
#'   is only valid for gbtree, not for gblinear. Set it to a value bigger than 0.
#'   All trees are used by default.
#' @examples
#' data(iris)
#' bst <- xgboost(as.matrix(iris[,1:4]), as.numeric(iris[,5]), nrounds = 2)
#' pred <- predict(bst, as.matrix(iris[,1:4]))
#' @export
setMethod("predict",
          signature = "xgb.Booster",
          definition = function(object, newdata, outputmargin = FALSE)
          {
            if (class(newdata) != "xgb.DMatrix") {
              newdata = xgb.DMatrix(newdata)
            }
            ret <- .Call("XGBoosterPredict_R", object, newdata,
                         as.integer(outputmargin), PACKAGE="xgboost")
            return(ret)
          })

#'
setMethod("predict", signature = "xgb.Booster",
          definition = function(object, newdata, outputmargin = FALSE, ntreelimit = NULL) {
            if (class(newdata) != "xgb.DMatrix") {
              newdata <- xgb.DMatrix(newdata)
            }
            if (is.null(ntreelimit)) {
              ntreelimit <- 0
            } else {
              if (ntreelimit < 1) {
                stop("predict: ntreelimit must be equal to or greater than 1")
              }
            }
            ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(outputmargin),
                         as.integer(ntreelimit), PACKAGE = "xgboost")
            return(ret)
          })

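A hedged sketch of the new ntreelimit argument in use; it assumes a gbtree booster trained for more rounds than the limit and is not itself part of the diff:

```r
# Sketch: limit prediction to the first k boosting rounds (gbtree only)
data(iris)
X <- as.matrix(iris[, 1:4])
bst <- xgboost(X, as.numeric(iris[, 5]), nrounds = 10)
pred.all   <- predict(bst, X)                  # default: all 10 trees
pred.first <- predict(bst, X, ntreelimit = 2)  # only the first 2 rounds
```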
R-package/R/slice.xgb.DMatrix.R · new file, 33 lines
@@ -0,0 +1,33 @@
setClass('xgb.DMatrix')

#' Get a new DMatrix containing the specified rows of
#' the original xgb.DMatrix object
#'
#' Get a new DMatrix containing the specified rows of
#' the original xgb.DMatrix object
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' dsub <- slice(dtrain, 1:3)
#' @rdname slice
#' @export
#'
slice <- function(object, ...){
  UseMethod("slice")
}

#' @param object Object of class "xgb.DMatrix"
#' @param idxset an integer vector of indices of the rows needed
#' @param ... other parameters
#' @rdname slice
#' @method slice xgb.DMatrix
setMethod("slice", signature = "xgb.DMatrix",
          definition = function(object, idxset, ...) {
            if (class(object) != "xgb.DMatrix") {
              stop("slice: first argument dtrain must be xgb.DMatrix")
            }
            ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
            return(structure(ret, class = "xgb.DMatrix"))
          })
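slice is the natural building block for row subsetting such as manual train/validation splits; a minimal sketch under the signature above, reusing dtrain from the roxygen example:

```r
# Sketch: an 80/20 split of the iris DMatrix from the example above
set.seed(1)
idx <- sample(150, 120)                 # 150 = nrow(iris)
dsub.train <- slice(dtrain, idx)
dsub.valid <- slice(dtrain, setdiff(1:150, idx))
```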
@@ -1,128 +1,139 @@
#' @importClassesFrom Matrix dgCMatrix dgeMatrix
#' @import methods

# depends on Matrix
.onLoad <- function(libname, pkgname) {
  library.dynam("xgboost", pkgname, libname);
  library.dynam("xgboost", pkgname, libname)
}
.onUnload <- function(libpath) {
  library.dynam.unload("xgboost", libpath);
  library.dynam.unload("xgboost", libpath)
}

# set information into dmatrix; this mutates the dmatrix
xgb.setinfo <- function(dmat, name, info) {
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
  }
  if (name == "label") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
    return(TRUE)
  }
  if (name == "weight") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
    return(TRUE)
  }
  if (name == "base_margin") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
    return(TRUE)
  }
  if (name == "group") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), PACKAGE="xgboost")
    return(TRUE)
  }
  stop(paste("xgb.setinfo: unknown info name", name))
  return(FALSE)
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
  }
  if (name == "label") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
          PACKAGE = "xgboost")
    return(TRUE)
  }
  if (name == "weight") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
          PACKAGE = "xgboost")
    return(TRUE)
  }
  if (name == "base_margin") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
          PACKAGE = "xgboost")
    return(TRUE)
  }
  if (name == "group") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info),
          PACKAGE = "xgboost")
    return(TRUE)
  }
  stop(paste("xgb.setinfo: unknown info name", name))
  return(FALSE)
}

# construct a Booster from cachelist
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
  if (typeof(cachelist) != "list") {
    stop("xgb.Booster: only accepts list of DMatrix as cachelist")
  if (typeof(cachelist) != "list") {
    stop("xgb.Booster: only accepts list of DMatrix as cachelist")
  }
  for (dm in cachelist) {
    if (class(dm) != "xgb.DMatrix") {
      stop("xgb.Booster: only accepts list of DMatrix as cachelist")
    }
  for (dm in cachelist) {
    if (class(dm) != "xgb.DMatrix") {
      stop("xgb.Booster: only accepts list of DMatrix as cachelist")
    }
  }
  handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost")
  if (length(params) != 0) {
    for (i in 1:length(params)) {
      p <- params[i]
      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p),
            PACKAGE = "xgboost")
    }
  handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE="xgboost")
  .Call("XGBoosterSetParam_R", handle, "seed", "0", PACKAGE="xgboost")
  if (length(params) != 0) {
    for (i in 1:length(params)) {
      p <- params[i]
      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p), PACKAGE="xgboost")
    }
  }
  if (!is.null(modelfile)) {
    if (typeof(modelfile) != "character") {
      stop("xgb.Booster: modelfile must be character")
    }
  if (!is.null(modelfile)) {
    if (typeof(modelfile) != "character"){
      stop("xgb.Booster: modelfile must be character");
    }
    .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE="xgboost")
  }
  return(structure(handle, class="xgb.Booster"))
    .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
  }
  return(structure(handle, class = "xgb.Booster"))
}

# predict, deprecated
xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
  if (class(booster) != "xgb.Booster") {
    stop("xgb.predict: first argument must be type xgb.Booster")
  }
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.predict: second argument must be type xgb.DMatrix")
  }
  ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin), PACKAGE="xgboost")
  return(ret)
  if (class(booster) != "xgb.Booster") {
    stop("xgb.predict: first argument must be type xgb.Booster")
  }
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.predict: second argument must be type xgb.DMatrix")
  }
  ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin),
               PACKAGE = "xgboost")
  return(ret)
}

##--------------------------------------
# the following are low-level iterative functions, not needed
# if you do not want to use them
#---------------------------------------
## ----the following are low-level iterative functions, not needed if
## you do not want to use them ---------------------------------------

# iteratively update booster with dtrain
xgb.iter.update <- function(booster, dtrain, iter) {
  if (class(booster) != "xgb.Booster") {
    stop("xgb.iter.update: first argument must be type xgb.Booster")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.iter.update: second argument must be type xgb.DMatrix")
  }
  .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, PACKAGE="xgboost")
  return(TRUE)
  if (class(booster) != "xgb.Booster") {
    stop("xgb.iter.update: first argument must be type xgb.Booster")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.iter.update: second argument must be type xgb.DMatrix")
  }
  .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
        PACKAGE = "xgboost")
  return(TRUE)
}

# iteratively update booster with customized statistics
xgb.iter.boost <- function(booster, dtrain, gpair) {
  if (class(booster) != "xgb.Booster") {
    stop("xgb.iter.update: first argument must be type xgb.Booster")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.iter.update: second argument must be type xgb.DMatrix")
  }
  .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE="xgboost")
  return(TRUE)
  if (class(booster) != "xgb.Booster") {
    stop("xgb.iter.update: first argument must be type xgb.Booster")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.iter.update: second argument must be type xgb.DMatrix")
  }
  .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess,
        PACKAGE = "xgboost")
  return(TRUE)
}

# iteratively evaluate one iteration
xgb.iter.eval <- function(booster, watchlist, iter) {
  if (class(booster) != "xgb.Booster") {
    stop("xgb.eval: first argument must be type xgb.Booster")
  }
  if (typeof(watchlist) != "list") {
    stop("xgb.eval: only accepts list of DMatrix as watchlist")
  }
  for (w in watchlist) {
    if (class(w) != "xgb.DMatrix") {
      stop("xgb.eval: watch list can only contain xgb.DMatrix")
    }
  if (class(booster) != "xgb.Booster") {
    stop("xgb.eval: first argument must be type xgb.Booster")
  }
  if (typeof(watchlist) != "list") {
    stop("xgb.eval: only accepts list of DMatrix as watchlist")
  }
  for (w in watchlist) {
    if (class(w) != "xgb.DMatrix") {
      stop("xgb.eval: watch list can only contain xgb.DMatrix")
    }
  }
  if (length(watchlist) != 0) {
    evnames <- list()
  if (length(watchlist) != 0) {
    for (i in 1:length(watchlist)) {
      w <- watchlist[i]
      if (length(names(w)) == 0) {
        stop("xgb.eval: name tag must be present for every element in watchlist")
      }
      evnames <- append(evnames, names(w))
    }
    for (i in 1:length(watchlist)) {
      w <- watchlist[i]
      if (length(names(w)) == 0) {
        stop("xgb.eval: name tag must be present for every element in watchlist")
      }
      evnames <- append(evnames, names(w))
    }
    msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames, PACKAGE="xgboost")
    return(msg)
  }
    msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
                 evnames, PACKAGE = "xgboost")
  } else {
    msg <- ""
  }
  return(msg)
}

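The low-level xgb.iter.* functions compose into a manual training loop; a minimal sketch using the signatures above (the parameter values and watchlist name are illustrative, and dtrain is assumed to be an existing xgb.DMatrix):

```r
# Sketch: one hand-rolled boosting loop built on the low-level API above
bst <- xgb.Booster(params = list(max_depth = 2, eta = 1),
                   cachelist = list(dtrain))
for (i in 1:5) {
  xgb.iter.update(bst, dtrain, i - 1)                         # one boosting round
  cat(xgb.iter.eval(bst, list(train = dtrain), i - 1), "\n")  # per-round metrics
}
```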
@@ -1,22 +1,45 @@
# constructing DMatrix
xgb.DMatrix <- function(data, missing=0.0, ...) {
  if (typeof(data) == "character") {
    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE="xgboost")
  } else if(is.matrix(data)) {
    handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, PACKAGE="xgboost")
  } else if(class(data) == "dgCMatrix") {
    handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, PACKAGE="xgboost")
  } else {
    stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data)))
  }
  dmat <- structure(handle, class="xgb.DMatrix")

  info = list(...)
  if (length(info)==0)
    return(dmat)
  for (i in 1:length(info)) {
    p = info[i]
    xgb.setinfo(dmat, names(p), p[[1]])
  }
#' Construct xgb.DMatrix object
#'
#' Construct xgb.DMatrix object from dense matrix, sparse matrix or local file.
#'
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character
#'   indicating the data file.
#' @param info a list of information of the xgb.DMatrix object
#' @param missing only used when the input is a dense matrix; pick a float
#'   value that represents missing values. Sometimes a dataset uses 0 or another
#'   extreme value to represent missing values.
#' @param ... other information to pass to \code{info}.
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
#' dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
#' @export
#'
xgb.DMatrix <- function(data, info = list(), missing = 0, ...) {
  if (typeof(data) == "character") {
    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE),
                    PACKAGE = "xgboost")
  } else if (is.matrix(data)) {
    handle <- .Call("XGDMatrixCreateFromMat_R", data, missing,
                    PACKAGE = "xgboost")
  } else if (class(data) == "dgCMatrix") {
    handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x,
                    PACKAGE = "xgboost")
  } else {
    stop(paste("xgb.DMatrix: does not support to construct from ",
               typeof(data)))
  }
  dmat <- structure(handle, class = "xgb.DMatrix")

  info <- append(info, list(...))
  if (length(info) == 0)
    return(dmat)
  }
  for (i in 1:length(info)) {
    p <- info[i]
    xgb.setinfo(dmat, names(p), p[[1]])
  }
  return(dmat)
}

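A short sketch of the two new ways to attach information at construction time, either via the info list or via `...`; both routes go through xgb.setinfo as shown above:

```r
# Sketch: equivalent constructions (iris data as in the examples above)
data(iris)
m <- as.matrix(iris[, 1:4])
y <- as.numeric(iris[, 5])
d1 <- xgb.DMatrix(m, info = list(label = y))  # explicit info list
d2 <- xgb.DMatrix(m, label = y)               # same thing through ...
```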
R-package/R/xgb.DMatrix.save.R · new file, 27 lines
@@ -0,0 +1,27 @@
#' Save xgb.DMatrix object to binary file
#'
#' Save xgb.DMatrix object to binary file
#'
#' @param DMatrix the xgb.DMatrix object to be saved.
#' @param fname the name of the binary file.
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
#' dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
#' @export
#'
xgb.DMatrix.save <- function(DMatrix, fname) {
  if (typeof(fname) != "character") {
    stop("xgb.DMatrix.save: fname must be character")
  }
  if (class(DMatrix) == "xgb.DMatrix") {
    .Call("XGDMatrixSaveBinary_R", DMatrix, fname, as.integer(FALSE),
          PACKAGE = "xgboost")
    return(TRUE)
  }
  stop("xgb.DMatrix.save: the input must be xgb.DMatrix")
  return(FALSE)
}
@@ -1,11 +1,29 @@
# dump model
xgb.dump <- function(booster, fname, fmap = "") {
  if (class(booster) != "xgb.Booster") {
    stop("xgb.dump: first argument must be type xgb.Booster")
  }
  if (typeof(fname) != "character"){
    stop("xgb.dump: second argument must be type character")
  }
  .Call("XGBoosterDumpModel_R", booster, fname, fmap, PACKAGE="xgboost")
  return(TRUE)
}
#' Save xgboost model to text file
#'
#' Save an xgboost model to a text file that can be parsed later.
#'
#' @param model the model object.
#' @param fname the name of the text file.
#' @param fmap feature map file representing the type of each feature.
#'   A detailed description can be found at
#'   \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
#'   Run inst/examples/demo.R for the result and see inst/examples/featmap.txt
#'   for an example format.
#'
#'
#' @examples
#' data(iris)
#' bst <- xgboost(as.matrix(iris[,1:4]), as.numeric(iris[,5]), nrounds = 2)
#' xgb.dump(bst, 'iris.xgb.model.dump')
#' @export
#'
xgb.dump <- function(model, fname, fmap = "") {
  if (class(model) != "xgb.Booster") {
    stop("xgb.dump: first argument must be type xgb.Booster")
  }
  if (typeof(fname) != "character") {
    stop("xgb.dump: second argument must be type character")
  }
  .Call("XGBoosterDumpModel_R", model, fname, fmap, PACKAGE = "xgboost")
  return(TRUE)
}

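Since the dump is plain text, it can be inspected directly; a hedged sketch reusing the file name from the example above:

```r
# Sketch: peek at the dumped tree structure
data(iris)
bst <- xgboost(as.matrix(iris[, 1:4]), as.numeric(iris[, 5]), nrounds = 2)
xgb.dump(bst, "iris.xgb.model.dump")
head(readLines("iris.xgb.model.dump"))  # one line per tree node
```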
@@ -1,16 +0,0 @@
# get information from dmatrix
xgb.getinfo <- function(dmat, name) {
  if (typeof(name) != "character") {
    stop("xgb.getinfo: name must be character")
  }
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
  }
  if (name != "label" &&
      name != "weight" &&
      name != "base_margin" ) {
    stop(paste("xgb.getinfo: unknown info name", name))
  }
  ret <- .Call("XGDMatrixGetInfo_R", dmat, name, PACKAGE="xgboost")
  return(ret)
}
@@ -1,5 +1,19 @@
#' Load xgboost model from binary file
#'
#' Load xgboost model from the binary model file
#'
#' @param modelfile the name of the binary file.
#'
#' @examples
#' data(iris)
#' bst <- xgboost(as.matrix(iris[,1:4]), as.numeric(iris[,5]), nrounds = 2)
#' xgb.save(bst, 'iris.xgb.model')
#' bst <- xgb.load('iris.xgb.model')
#' pred <- predict(bst, as.matrix(iris[,1:4]))
#' @export
#'
xgb.load <- function(modelfile) {
  if (is.null(modelfile))
    stop('xgb.load: modelfile cannot be NULL')
  xgb.Booster(modelfile=modelfile)
}
  if (is.null(modelfile))
    stop("xgb.load: modelfile cannot be NULL")
  xgb.Booster(modelfile = modelfile)
}

@@ -1,16 +1,27 @@
# save model or DMatrix to file
xgb.save <- function(handle, fname) {
  if (typeof(fname) != "character") {
    stop("xgb.save: fname must be character")
  }
  if (class(handle) == "xgb.Booster") {
    .Call("XGBoosterSaveModel_R", handle, fname, PACKAGE="xgboost")
    return(TRUE)
  }
  if (class(handle) == "xgb.DMatrix") {
    .Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE), PACKAGE="xgboost")
    return(TRUE)
  }
  stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
  return(FALSE)
}
#' Save xgboost model to binary file
#'
#' Save an xgboost model from xgboost or xgb.train
#'
#' @param model the model object.
#' @param fname the name of the binary file.
#'
#' @examples
#' data(iris)
#' bst <- xgboost(as.matrix(iris[,1:4]), as.numeric(iris[,5]), nrounds = 2)
#' xgb.save(bst, 'iris.xgb.model')
#' bst <- xgb.load('iris.xgb.model')
#' pred <- predict(bst, as.matrix(iris[,1:4]))
#' @export
#'
xgb.save <- function(model, fname) {
  if (typeof(fname) != "character") {
    stop("xgb.save: fname must be character")
  }
  if (class(model) == "xgb.Booster") {
    .Call("XGBoosterSaveModel_R", model, fname, PACKAGE = "xgboost")
    return(TRUE)
  }
  stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save
       an xgb.DMatrix object.")
  return(FALSE)
}

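With this change the two object kinds save through separate entry points; a minimal sketch, assuming bst and dtrain from the examples above:

```r
# Sketch: boosters via xgb.save, DMatrix objects via xgb.DMatrix.save
xgb.save(bst, "iris.xgb.model")            # model -> binary model file
xgb.DMatrix.save(dtrain, "dtrain.buffer")  # data  -> binary DMatrix buffer
```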
@@ -1,38 +1,112 @@
# train a model using given parameters
xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
  if (typeof(params) != "list") {
    stop("xgb.train: first argument params must be list");
#' eXtreme Gradient Boosting Training
#'
#' The training function of xgboost
#'
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#'   \item \code{objective} objective function; common ones are
#'     \itemize{
#'       \item \code{reg:linear} linear regression
#'       \item \code{binary:logistic} logistic regression for classification
#'     }
#'   \item \code{eta} step size of each boosting step
#'   \item \code{max_depth} maximum depth of the tree
#'   \item \code{nthread} number of threads used in training; if not set, all threads are used
#' }
#'
#'   See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
#'   further details. See also inst/examples/demo.R for a walkthrough example in R.
#' @param dtrain takes an \code{xgb.DMatrix} as the input.
#' @param nrounds the max number of iterations
#' @param watchlist what information should be printed when \code{verbose=1} or
#'   \code{verbose=2}. The watchlist is used to specify validation set monitoring
#'   during training. For example, the user can specify
#'   watchlist=list(validation1=mat1, validation2=mat2) to watch
#'   the performance of each round's model on mat1 and mat2.
#'
#' @param obj customized objective function. Returns the gradient and second order
#'   gradient for the given prediction and dtrain.
#' @param feval customized evaluation function. Returns
#'   \code{list(metric='metric-name', value='metric-value')} for the given
#'   prediction and dtrain.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
#' This is the training function for xgboost.
#'
#' Parallelization is automatically enabled if OpenMP is present.
#' The number of threads can also be manually specified via the "nthread" parameter.
#'
#' This function only accepts an \code{xgb.DMatrix} object as the input.
#' It supports advanced features such as watchlist and customized objective functions,
#' therefore it is more flexible than \code{\link{xgboost}}.
#'
#'
#' @examples
#' data(iris)
#' iris[,5] <- as.numeric(iris[,5])
#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
#' dtest <- dtrain
#' watchlist <- list(eval = dtest, train = dtrain)
#' param <- list(max_depth = 2, eta = 1, silent = 1)
#' logregobj <- function(preds, dtrain) {
#'    labels <- getinfo(dtrain, "label")
#'    preds <- 1/(1 + exp(-preds))
#'    grad <- preds - labels
#'    hess <- preds * (1 - preds)
#'    return(list(grad = grad, hess = hess))
#' }
#' evalerror <- function(preds, dtrain) {
#'    labels <- getinfo(dtrain, "label")
#'    err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
#'    return(list(metric = "error", value = err))
#' }
#' bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
#' @export
#'
xgb.train <- function(params=list(), dtrain, nrounds, watchlist = list(),
                      obj = NULL, feval = NULL, ...) {
  if (typeof(params) != "list") {
    stop("xgb.train: first argument params must be list")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.train: second argument dtrain must be xgb.DMatrix")
  }
  params = append(params, list(...))
  bst <- xgb.Booster(params, append(watchlist, dtrain))
  for (i in 1:nrounds) {
    if (is.null(obj)) {
      succ <- xgb.iter.update(bst, dtrain, i - 1)
    } else {
      pred <- xgb.predict(bst, dtrain)
      gpair <- obj(pred, dtrain)
      succ <- xgb.iter.boost(bst, dtrain, gpair)
    }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.train: second argument dtrain must be xgb.DMatrix");
  }
  bst <- xgb.Booster(params, append(watchlist,dtrain))
  for (i in 1:nrounds) {
    if (is.null(obj)) {
      succ <- xgb.iter.update(bst, dtrain, i-1)
    } else {
      pred <- xgb.predict(bst, dtrain)
      gpair <- obj(pred, dtrain)
      succ <- xgb.iter.boost(bst, dtrain, gpair)
    }
    if (length(watchlist) != 0) {
      if (is.null(feval)) {
        msg <- xgb.iter.eval(bst, watchlist, i-1)
        cat(msg); cat("\n")
      } else {
        cat("["); cat(i); cat("]");
        for (j in 1:length(watchlist)) {
          w <- watchlist[j]
          if (length(names(w)) == 0) {
            stop("xgb.eval: name tag must be present for every element in watchlist")
          }
          ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
          cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
          cat(":"); cat(ret$value)
        }
        cat("\n")
      }
    if (length(watchlist) != 0) {
      if (is.null(feval)) {
        msg <- xgb.iter.eval(bst, watchlist, i - 1)
        cat(msg)
        cat("\n")
      } else {
        cat("[")
        cat(i)
        cat("]")
        for (j in 1:length(watchlist)) {
          w <- watchlist[j]
          if (length(names(w)) == 0) {
            stop("xgb.eval: name tag must be present for every element in watchlist")
          }
          ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
          cat("\t")
          cat(names(w))
          cat("-")
          cat(ret$metric)
          cat(":")
          cat(ret$value)
        }
        cat("\n")
      }
    }
    return(bst)
  }
  }
  return(bst)
}

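Because the new signature appends list(...) to params, boosting parameters can be given either way; a minimal sketch, with dtrain and watchlist as in the roxygen example above:

```r
# Sketch: two equivalent ways to pass parameters to the new xgb.train
bst1 <- xgb.train(list(max_depth = 2, eta = 1), dtrain, nrounds = 2, watchlist)
bst2 <- xgb.train(list(), dtrain, nrounds = 2, watchlist,
                  max_depth = 2, eta = 1)  # picked up through ...
```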
@@ -1,49 +1,71 @@
# Main function for xgboost-package

xgboost = function(x=NULL,y=NULL,DMatrix=NULL, file=NULL, validation=NULL,
                   nrounds=10, obj=NULL, feval=NULL, margin=NULL, verbose = T, ...)
{
  if (!is.null(DMatrix))
    dtrain = DMatrix
  else
  {
    if (is.null(x) && is.null(y))
    {
      if (is.null(file))
        stop('xgboost needs input data: either R objects, local files or a DMatrix object.')
      dtrain = xgb.DMatrix(file)
    }
    else
      dtrain = xgb.DMatrix(x, label=y)
    if (!is.null(margin))
    {
      succ <- xgb.setinfo(dtrain, "base_margin", margin)
      if (!succ)
        warning('Attempt to use margin failed.')
    }
  }

  params = list(...)

  watchlist=list()
  if (verbose)
  {
    if (!is.null(validation))
    {
      if (class(validation)!='xgb.DMatrix')
        dtest = xgb.DMatrix(validation)
      else
        dtest = validation
      watchlist = list(eval=dtest,train=dtrain)
    }

    else
      watchlist = list(train=dtrain)
  }

  bst <- xgb.train(params, dtrain, nrounds, watchlist, obj, feval)

  return(bst)
}


#' eXtreme Gradient Boosting (Tree) library
#'
#' A simple interface for xgboost in R
#'
#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
#'   \code{xgb.DMatrix}.
#' @param label the response variable. The user should not set this field
#'   if data is a local data file or \code{xgb.DMatrix}.
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#'   \item \code{objective} objective function; common ones are
#'     \itemize{
#'       \item \code{reg:linear} linear regression
#'       \item \code{binary:logistic} logistic regression for classification
#'     }
#'   \item \code{eta} step size of each boosting step
#'   \item \code{max_depth} maximum depth of the tree
#'   \item \code{nthread} number of threads used in training; if not set, all threads are used
#' }
#'
#'   See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
#'   further details. See also inst/examples/demo.R for a walkthrough example in R.
#' @param nrounds the max number of iterations
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#'   performance information. If 2, xgboost will print information on both
#'   performance and construction progress.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
#' This is the modeling function for xgboost.
#'
#' Parallelization is automatically enabled if OpenMP is present.
#' The number of threads can also be manually specified via the "nthread" parameter.
#'
#' @examples
#' data(iris)
#' bst <- xgboost(as.matrix(iris[,1:4]), as.numeric(iris[,5]), nrounds = 2)
#' pred <- predict(bst, as.matrix(iris[,1:4]))
#' @export
#'
xgboost <- function(data = NULL, label = NULL, params = list(), nrounds,
                    verbose = 1, ...) {
  inClass <- class(data)
  if (inClass == "dgCMatrix" || inClass == "matrix") {
    if (is.null(label))
      stop("xgboost: need label when data is a matrix")
    dtrain <- xgb.DMatrix(data, label = label)
  } else {
    if (!is.null(label))
      warning("xgboost: label will be ignored.")
    if (inClass == "character")
      dtrain <- xgb.DMatrix(data) else if (inClass == "xgb.DMatrix")
      dtrain <- data else stop("xgboost: Invalid input of data")
  }

  if (verbose > 1) {
    silent <- 0
  } else {
    silent <- 1
  }

  params <- append(params, list(silent = silent))
  params <- append(params, list(...))

  if (verbose > 0)
    watchlist <- list(train = dtrain) else watchlist <- list()

  bst <- xgb.train(params, dtrain, nrounds, watchlist)

  return(bst)
}

@@ -1,10 +1,21 @@
This is a subfolder for the experimental version of the R package.
# R package for xgboost

Not yet ready.
## Installation

Installation:
For the up-to-date version (recommended), please install from GitHub. Windows users will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.

```r
require(devtools)
install_github('xgboost','tqchen',subdir='R-package')
```

For the stable version on CRAN, please run

```r
install.packages('xgboost')
```

## Examples

* Please visit the [demo](https://github.com/tqchen/xgboost/blob/master/R-package/inst/examples/demo.R) for a walkthrough example.
* See also the [example scripts](https://github.com/tqchen/xgboost/tree/master/demo/kaggle-higgs) for the Kaggle Higgs Challenge, including the [speedtest script](https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R) on this dataset.

@@ -1,133 +0,0 @@
require(xgboost)
require(methods)

# helper function to read libsvm format
# this is very badly written, load in dense, and convert to sparse
# use this only for demo purpose
# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm = function(fname, maxcol) {
  content = readLines(fname)
  nline = length(content)
  label = numeric(nline)
  mat = matrix(0, nline, maxcol+1)
  for (i in 1:nline) {
    arr = as.vector(strsplit(content[i], " ")[[1]])
    label[i] = as.numeric(arr[[1]])
    for (j in 2:length(arr)) {
      kv = strsplit(arr[j], ":")[[1]]
      # to avoid 0 index
      findex = as.integer(kv[1]) + 1
      fvalue = as.numeric(kv[2])
      mat[i,findex] = fvalue
    }
  }
  mat = as(mat, "sparseMatrix")
  return(list(label=label, data=mat))
}

############################
# Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
############################

# Directly read in local file
dtrain = xgb.DMatrix('agaricus.txt.train')
class(dtrain)

# read file in R
csc = read.libsvm("agaricus.txt.train", 126)
y = csc$label
x = csc$data

# x as Sparse Matrix
class(x)
dtrain = xgb.DMatrix(x, label=y)

# x as dense matrix
dense.x = as.matrix(x)
dtrain = xgb.DMatrix(dense.x, label=y)

############################
# Test xgboost with local file, sparse matrix and dense matrix in R.
############################

# Test with DMatrix object
bst = xgboost(DMatrix=dtrain, max_depth=2, eta=1, silent=1, objective='binary:logistic')

# Test with local file
bst = xgboost(file='agaricus.txt.train', max_depth=2, eta=1, silent=1, objective='binary:logistic')

# Test with Sparse Matrix
bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')

# Test with dense Matrix
bst = xgboost(x = dense.x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')

# Test with validation set
bst = xgboost(file='agaricus.txt.train', validation='agaricus.txt.test',
              max_depth=2, eta=1, silent=1, objective='binary:logistic')

############################
# Test predict
############################

# Prediction with DMatrix object
dtest = xgb.DMatrix('agaricus.txt.test')
pred = predict(bst, dtest)

# Prediction with local test file
pred = predict(bst, 'agaricus.txt.test')

# Prediction with Sparse Matrix
csc = read.libsvm("agaricus.txt.test", 126)
test.y = csc$label
test.x = csc$data
pred = predict(bst, test.x)

# Extract label with xgb.getinfo
labels = xgb.getinfo(dtest, "label")
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=",err))

############################
# Save and load model to hard disk
############################

# save model to binary local file
xgb.save(bst, 'model.save')

# load binary model to R
bst = xgb.load('model.save')
pred = predict(bst, test.x)

# save model to text file
xgb.dump(bst, 'model.dump')

############################
# Customized objective and evaluation function
############################

# user-defined objective function: given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj = function(preds, dtrain) {
  labels = xgb.getinfo(dtrain, "label")
  preds = 1.0 / (1.0 + exp(-preds))
  grad = preds - labels
  hess = preds * (1.0-preds)
  return(list(grad=grad, hess=hess))
}
# user-defined evaluation function: return a list(metric="metric-name", value="metric-value")
# NOTE: with a customized loss function, the default prediction value is the margin,
# which may make the built-in evaluation metrics not function properly.
# For example, with logistic loss the prediction is the score before the logistic transformation,
# while the built-in evaluation error assumes the input is after the logistic transformation.
# Keep this in mind when you use the customization; you may need to write a customized evaluation function.
evalerror = function(preds, dtrain) {
  labels = xgb.getinfo(dtrain, "label")
  err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
  return(list(metric="error", value=err))
}

bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic',
              obj=logregobj, feval=evalerror)

@@ -1,127 +0,0 @@
# load xgboost library
require(xgboost)
require(methods)

# helper function to read libsvm format
# this is very badly written, load in dense, and convert to sparse
# use this only for demo purpose
# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
  content <- readLines(fname)
  nline <- length(content)
  label <- numeric(nline)
  mat <- matrix(0, nline, maxcol+1)
  for (i in 1:nline) {
    arr <- as.vector(strsplit(content[i], " ")[[1]])
    label[i] <- as.numeric(arr[[1]])
    for (j in 2:length(arr)) {
      kv <- strsplit(arr[j], ":")[[1]]
      # to avoid 0 index
      findex <- as.integer(kv[1]) + 1
      fvalue <- as.numeric(kv[2])
      mat[i,findex] <- fvalue
    }
  }
  mat <- as(mat, "sparseMatrix")
  return(list(label=label, data=mat))
}

# test code here
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
watchlist <- list("eval"=dtest,"train"=dtrain)
# training xgboost model
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
# make prediction
preds <- xgb.predict(bst, dtest)
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
# print error rate
print(paste("error=",err))

# dump model
xgb.dump(bst, "dump.raw.txt")
# dump model with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")

# save dmatrix into binary buffer
succ <- xgb.save(dtest, "dtest.buffer")
# save model into file
succ <- xgb.save(bst, "xgb.model")
# load model and data in
bst2 <- xgb.Booster(modelfile="xgb.model")
dtest2 <- xgb.DMatrix("dtest.buffer")
preds2 <- xgb.predict(bst2, dtest2)
# assert they are the same
stopifnot(sum(abs(preds2-preds)) == 0)

###
# build dmatrix from sparseMatrix
###
print ('start running example of build DMatrix from R.sparseMatrix')
csc <- read.libsvm("agaricus.txt.train", 126)
label <- csc$label
data <- csc$data
dtrain <- xgb.DMatrix(data, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)

###
# build dmatrix from dense matrix
###
print ('start running example of build DMatrix from R.Matrix')
mat = as.matrix(data)
dtrain <- xgb.DMatrix(mat, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)

###
# advanced: customized loss function
#
print("start running example to use customized objective function")
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1)
# user-defined objective function: given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
  labels <- xgb.getinfo(dtrain, "label")
  preds <- 1.0 / (1.0 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1.0-preds)
  return(list(grad=grad, hess=hess))
}
# user-defined evaluation function: return a list(metric="metric-name", value="metric-value")
# NOTE: with a customized loss function, the default prediction value is the margin,
# which may make the built-in evaluation metrics not function properly.
# For example, with logistic loss the prediction is the score before the logistic transformation,
# while the built-in evaluation error assumes the input is after the logistic transformation.
# Keep this in mind when you use the customization; you may need to write a customized evaluation function.
evalerror <- function(preds, dtrain) {
  labels <- xgb.getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
  return(list(metric="error", value=err))
}

# training with customized objective; we can also do step-by-step training
# simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)

###
# advanced: start from an initial base prediction
#
print ("start running example to start from an initial prediction")
# specify parameters via map, definitions are the same as the C++ version
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
# train xgboost for 1 round
bst <- xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value instead of the transformed prediction in set_base_margin
# predicting with output_margin=TRUE will always give you margin values before the logistic transformation
ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
succ <- xgb.setinfo(dtest, "base_margin", ptest)
print ("this is result of running from initial prediction")
bst <- xgb.train( param, dtrain, 1, watchlist )
@ -1,103 +1,153 @@
|
||||
require(xgboost)
|
||||
require(methods)
|
||||
|
||||
# helper function to read libsvm format
|
||||
# this is very badly written, load in dense, and convert to sparse
|
||||
# use this only for demo purpose
|
||||
# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
|
||||
read.libsvm = function(fname, maxcol) {
|
||||
content = readLines(fname)
|
||||
nline = length(content)
|
||||
label = numeric(nline)
|
||||
mat = matrix(0, nline, maxcol+1)
|
||||
for (i in 1:nline) {
|
||||
arr = as.vector(strsplit(content[i], " ")[[1]])
|
||||
label[i] = as.numeric(arr[[1]])
|
||||
for (j in 2:length(arr)) {
|
||||
kv = strsplit(arr[j], ":")[[1]]
|
||||
# to avoid 0 index
|
||||
findex = as.integer(kv[1]) + 1
|
||||
fvalue = as.numeric(kv[2])
|
||||
mat[i,findex] = fvalue
|
||||
}
|
||||
# helper function to read libsvm format this is very badly written, load in dense, and convert to sparse
|
||||
# use this only for demo purpose adopted from
|
||||
# https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
|
||||
read.libsvm <- function(fname, maxcol) {
|
||||
content <- readLines(fname)
|
||||
nline <- length(content)
|
||||
label <- numeric(nline)
|
||||
mat <- matrix(0, nline, maxcol + 1)
|
||||
for (i in 1:nline) {
|
||||
arr <- as.vector(strsplit(content[i], " ")[[1]])
|
||||
label[i] <- as.numeric(arr[[1]])
|
||||
for (j in 2:length(arr)) {
|
||||
kv <- strsplit(arr[j], ":")[[1]]
|
||||
# to avoid 0 index
|
||||
findex <- as.integer(kv[1]) + 1
|
||||
fvalue <- as.numeric(kv[2])
|
||||
mat[i, findex] <- fvalue
|
||||
}
|
||||
mat = as(mat, "sparseMatrix")
|
||||
return(list(label=label, data=mat))
|
||||
}
|
||||
mat <- as(mat, "sparseMatrix")
|
||||
return(list(label = label, data = mat))
|
||||
}
|
||||
|
||||
# Parameter setting
|
||||
############################ Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
|
||||
|
||||
# Directly read in local file
|
||||
dtrain <- xgb.DMatrix("agaricus.txt.train")
|
||||
dtest <- xgb.DMatrix("agaricus.txt.test")
|
||||
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
|
||||
watchlist = list("eval"=dtest,"train"=dtrain)
|
||||
class(dtrain)
|
||||
|
||||
###########################
|
||||
# Train from local file
|
||||
###########################
|
||||
# read file in R
|
||||
csc <- read.libsvm("agaricus.txt.train", 126)
|
||||
y <- csc$label
|
||||
x <- csc$data
|
||||
|
||||
# Training
|
||||
bst = xgboost(file='agaricus.txt.train',params=param,watchlist=watchlist)
|
||||
# Prediction
|
||||
pred = predict(bst, 'agaricus.txt.test')
|
||||
# Performance
|
||||
labels = xgb.getinfo(dtest, "label")
|
||||
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
|
||||
print(paste("error=",err))
|
||||
|
||||
###########################
|
||||
# Train from R object
|
||||
###########################
|
||||
|
||||
csc = read.libsvm("agaricus.txt.train", 126)
|
||||
y = csc$label
|
||||
x = csc$data
|
||||
# x as Sparse Matrix
|
||||
class(x)
dtrain <- xgb.DMatrix(x, label = y)

# Training
bst = xgboost(x, y, params = param, watchlist = watchlist)

# Prediction
pred = predict(bst, 'agaricus.txt.test')

# Performance
labels = xgb.getinfo(dtest, "label")
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=", err))

# x as dense matrix
dense.x <- as.matrix(x)
dtrain <- xgb.DMatrix(dense.x, label = y)

# Training with dense matrix
x = as.matrix(x)
bst = xgboost(x, y, params = param, watchlist = watchlist)

############################ Test xgboost with local file, sparse matrix and dense matrix in R.

###########################
# Train with customization
###########################

# Test with DMatrix object
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# User-defined objective function: given the prediction, return the gradient
# and the second order gradient. This is log-likelihood loss.
logregobj = function(preds, dtrain) {
  labels = xgb.getinfo(dtrain, "label")
  preds = 1.0 / (1.0 + exp(-preds))
  grad = preds - labels
  hess = preds * (1.0 - preds)
  return(list(grad = grad, hess = hess))
}

# Verbose = 0,1,2
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 0)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 1)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 2)

# Test with local file
bst <- xgboost(data = "agaricus.txt.train", max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# Test with sparse matrix
bst <- xgboost(data = x, label = y, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# Test with dense matrix
bst <- xgboost(data = dense.x, label = y, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

############################ Test predict

# Prediction with DMatrix object
dtest <- xgb.DMatrix("agaricus.txt.test")
pred <- predict(bst, dtest)

# Prediction with local test file
pred <- predict(bst, "agaricus.txt.test")

# Prediction with sparse matrix
csc <- read.libsvm("agaricus.txt.test", 126)
test.y <- csc$label
test.x <- csc$data
pred <- predict(bst, test.x)

# Extract labels with getinfo
labels <- getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=", err))

############################ Save and load model to hard disk

# save model to binary local file
xgb.save(bst, "xgboost.model")

# load binary model into R
bst <- xgb.load("xgboost.model")
pred <- predict(bst, test.x)

# save model to text file
xgb.dump(bst, "dump.raw.txt")
# save model to text file, with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")

# save a DMatrix object to hard disk
xgb.DMatrix.save(dtrain, "dtrain.buffer")

# load a DMatrix object into R
dtrain <- xgb.DMatrix("dtrain.buffer")

############################ More flexible training function xgb.train

param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
watchlist <- list(eval = dtest, train = dtrain)

# training xgboost model
bst <- xgb.train(param, dtrain, nround = 2, watchlist = watchlist)

############################ Customized loss function

param <- list(max_depth = 2, eta = 1, silent = 1)

# Note: for a customized objective function, we leave objective as default.
# Note: what we get in the prediction is the margin value; you must know
# what you are doing.

# User-defined objective function: given the prediction, return the gradient
# and the second order gradient. This is log-likelihood loss.
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
# User-defined evaluation function: returns list(metric = "metric-name",
# value = "metric-value").
# NOTE: with a customized loss function, the default prediction value is the
# margin, which may keep the built-in evaluation metrics from working
# properly. For example, with logistic loss the prediction is the score
# before the logistic transformation, while the built-in evaluation error
# assumes input after the logistic transformation. Keep this in mind when
# using the customization; you may need to write a customized evaluation
# function.
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(metric = "error", value = err))
}

bst = xgboost(x, y, params = param, watchlist = watchlist, obj = logregobj, feval = evalerror)
# Training with customized objective; we can also do step-by-step training.
# Simply look at xgboost.py's implementation of train.
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)

############################
# Train with previous result
############################

bst = xgboost(x, y, params = param, watchlist = watchlist)
pred = predict(bst, 'agaricus.txt.train', outputmargin = TRUE)
bst2 = xgboost(x, y, params = param, watchlist = watchlist, margin = pred)

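# --- added note, not part of the diff above ---
# The margin hand-off works because bst2 starts boosting from bst's raw
# scores. A sketch of the same hand-off through an xgb.DMatrix follows;
# the setinfo() name and the "base_margin" field are assumptions here,
# inferred from the wrapper's XGDMatrixSetInfo_R and Python's set_base_margin:
# dtrain <- xgb.DMatrix(x, label = y)
# ptrain <- predict(bst, dtrain, outputmargin = TRUE)
# setinfo(dtrain, "base_margin", ptrain)   # assumed setter
# bst2 <- xgb.train(param, dtrain, nround = 2)
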
28
R-package/man/getinfo.Rd
Normal file
@ -0,0 +1,28 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{getinfo}
\alias{getinfo}
\alias{getinfo,xgb.DMatrix-method}
\title{Get information of an xgb.DMatrix object}
\usage{
getinfo(object, ...)

\S4method{getinfo}{xgb.DMatrix}(object, name)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}

\item{name}{the name of the field to get}

\item{...}{other parameters}
}
\description{
Get information of an xgb.DMatrix object
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
labels <- getinfo(dtrain, "label")
}

32
R-package/man/predict-xgb.Booster-method.Rd
Normal file
@ -0,0 +1,32 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method}
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\S4method{predict}{xgb.Booster}(object, newdata, outputmargin = FALSE,
  ntreelimit = NULL)
}
\arguments{
\item{object}{Object of class "xgb.Booster"}

\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}

\item{outputmargin}{whether the prediction should be shown as the original
value of the sum of functions. When outputmargin=TRUE, the prediction is
the untransformed margin value; in logistic regression, outputmargin=TRUE
outputs the value before the logistic transformation.}

\item{ntreelimit}{limit the number of trees used in prediction by setting it
to a value bigger than 0; all trees are used by default. This parameter is
only valid for gbtree, not for gblinear.}
}
\description{
Predicted values based on xgboost model object.
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
pred <- predict(bst, as.matrix(iris[,1:4]))
}

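A quick sketch of the two switches documented above, added for illustration (it is not part of the generated .Rd file) and reusing the `bst` from the example:

pred.margin <- predict(bst, as.matrix(iris[,1:4]), outputmargin = TRUE)  # raw sum of trees
pred.1tree  <- predict(bst, as.matrix(iris[,1:4]), ntreelimit = 1)       # score with the first tree only
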
30
R-package/man/slice.Rd
Normal file
@ -0,0 +1,30 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{slice}
\alias{slice}
\alias{slice,xgb.DMatrix-method}
\title{Get a new DMatrix containing the specified rows of
the original xgb.DMatrix object}
\usage{
slice(object, ...)

\S4method{slice}{xgb.DMatrix}(object, idxset, ...)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}

\item{idxset}{an integer vector of indices of the rows needed}

\item{...}{other parameters}
}
\description{
Get a new DMatrix containing the specified rows of
the original xgb.DMatrix object
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
dsub <- slice(dtrain, 1:3)
}

28
R-package/man/xgb.DMatrix.Rd
Normal file
@ -0,0 +1,28 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Construct xgb.DMatrix object}
\usage{
xgb.DMatrix(data, info = list(), missing = 0, ...)
}
\arguments{
\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character
string indicating the data file.}

\item{info}{a list of information of the xgb.DMatrix object}

\item{missing}{only used when the input is a dense matrix: the float value
that represents missing entries.}

\item{...}{other information to pass to \code{info}.}
}
\description{
Construct an xgb.DMatrix object from a dense matrix, a sparse matrix or a local file.
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
}

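The example above covers the dense-matrix path; a minimal sketch of the sparse path, added for illustration and assuming the Matrix package is installed:

library(Matrix)
sx <- as(as.matrix(iris[,1:4]), "dgCMatrix")
dtrain <- xgb.DMatrix(sx, label = as.numeric(iris[,5]))
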
23
R-package/man/xgb.DMatrix.save.Rd
Normal file
@ -0,0 +1,23 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file}
\usage{
xgb.DMatrix.save(DMatrix, fname)
}
\arguments{
\item{DMatrix}{the xgb.DMatrix object to save.}

\item{fname}{the name of the binary file.}
}
\description{
Save xgb.DMatrix object to binary file
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
}

27
R-package/man/xgb.dump.Rd
Normal file
@ -0,0 +1,27 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.dump}
\alias{xgb.dump}
\title{Save xgboost model to text file}
\usage{
xgb.dump(model, fname, fmap = "")
}
\arguments{
\item{model}{the model object.}

\item{fname}{the name of the text file.}

\item{fmap}{feature map file representing the type of each feature.
A detailed description can be found at
\url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
Run inst/examples/demo.R for the result and see inst/examples/featmap.txt
for an example of the format.}
}
\description{
Save an xgboost model to a text file that can be parsed later.
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.dump(bst, 'iris.xgb.model.dump')
}

21
R-package/man/xgb.load.Rd
Normal file
@ -0,0 +1,21 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.load}
\alias{xgb.load}
\title{Load xgboost model from binary file}
\usage{
xgb.load(modelfile)
}
\arguments{
\item{modelfile}{the name of the binary file.}
}
\description{
Load xgboost model from the binary model file
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.save(bst, 'iris.xgb.model')
bst <- xgb.load('iris.xgb.model')
pred <- predict(bst, as.matrix(iris[,1:4]))
}

23
R-package/man/xgb.save.Rd
Normal file
@ -0,0 +1,23 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.save}
\alias{xgb.save}
\title{Save xgboost model to binary file}
\usage{
xgb.save(model, fname)
}
\arguments{
\item{model}{the model object.}

\item{fname}{the name of the binary file.}
}
\description{
Save an xgboost model produced by \code{xgboost} or \code{xgb.train}
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.save(bst, 'iris.xgb.model')
bst <- xgb.load('iris.xgb.model')
pred <- predict(bst, as.matrix(iris[,1:4]))
}

78
R-package/man/xgb.train.Rd
Normal file
@ -0,0 +1,78 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.train}
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}
\usage{
xgb.train(params = list(), dtrain, nrounds, watchlist = list(),
  obj = NULL, feval = NULL, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
\itemize{
  \item \code{objective} objective function; common ones are
  \itemize{
    \item \code{reg:linear} linear regression
    \item \code{binary:logistic} logistic regression for classification
  }
  \item \code{eta} step size of each boosting step
  \item \code{max_depth} maximum depth of the tree
  \item \code{nthread} number of threads used in training; if not set, all threads are used
}

See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
further details. See also inst/examples/demo.R for a walkthrough example in R.}

\item{dtrain}{takes an \code{xgb.DMatrix} as the input.}

\item{nrounds}{the max number of iterations}

\item{watchlist}{what information should be printed when \code{verbose=1} or
\code{verbose=2}. The watchlist is used to specify validation set monitoring
during training. For example, a user can specify
watchlist=list(validation1=mat1, validation2=mat2) to watch
the performance of each round's model on mat1 and mat2.}

\item{obj}{customized objective function. Returns the gradient and second order
gradient given the prediction and dtrain.}

\item{feval}{customized evaluation function. Returns
\code{list(metric='metric-name', value='metric-value')} given the
prediction and dtrain.}

\item{...}{other parameters to pass to \code{params}.}
}
\description{
The training function of xgboost
}
\details{
This is the training function for xgboost.

Parallelization is automatically enabled if OpenMP is present.
The number of threads can also be specified manually via the "nthread" parameter.

This function only accepts an \code{xgb.DMatrix} object as its input.
It supports advanced features such as a watchlist and customized objective
functions, and is therefore more flexible than \code{\link{xgboost}}.
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
dtest <- dtrain
watchlist <- list(eval = dtest, train = dtrain)
param <- list(max_depth = 2, eta = 1, silent = 1)
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
  return(list(metric = "error", value = err))
}
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
}

52
R-package/man/xgboost.Rd
Normal file
@ -0,0 +1,52 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgboost}
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
\usage{
xgboost(data = NULL, label = NULL, params = list(), nrounds,
  verbose = 1, ...)
}
\arguments{
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}

\item{label}{the response variable. Users should not set this field.}

\item{params}{the list of parameters. Commonly used ones are:
\itemize{
  \item \code{objective} objective function; common ones are
  \itemize{
    \item \code{reg:linear} linear regression
    \item \code{binary:logistic} logistic regression for classification
  }
  \item \code{eta} step size of each boosting step
  \item \code{max_depth} maximum depth of the tree
  \item \code{nthread} number of threads used in training; if not set, all threads are used
}

See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
further details. See also inst/examples/demo.R for a walkthrough example in R.}

\item{nrounds}{the max number of iterations}

\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
performance information. If 2, xgboost will print information on both
performance and construction progress.}

\item{...}{other parameters to pass to \code{params}.}
}
\description{
A simple interface for xgboost in R
}
\details{
This is the modeling function for xgboost.

Parallelization is automatically enabled if OpenMP is present.
The number of threads can also be specified manually via the "nthread" parameter.
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
pred <- predict(bst, as.matrix(iris[,1:4]))
}

@ -1,28 +1,9 @@
# package root
PKGROOT=../../
# _*_ mode: Makefile; _*_
export CC = gcc
export CXX = g++

# expose these flags to R CMD SHLIB
PKG_CPPFLAGS = -O3 -Wno-unknown-pragmas -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS)
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o

ifeq ($(no_omp),1)
	PKG_CPPFLAGS += -DDISABLE_OPENMP
endif

CXXOBJ= xgboost_wrapper.o xgboost_io.o
OBJECTS= xgboost_R.o $(CXXOBJ)

.PHONY: all clean
all: $(SHLIB)
$(SHLIB): $(OBJECTS)

xgboost_wrapper.o: ../../wrapper/xgboost_wrapper.cpp
xgboost_io.o: ../../src/io/io.cpp

$(CXXOBJ) :
	$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

clean:
	rm -rf *.so *.o *~ *.dll

@ -1,32 +1,7 @@
# package root
PKGROOT=../../
# _*_ mode: Makefile; _*_
export CC = gcc
export CXX = g++

# expose these flags to R CMD SHLIB
PKG_CPPFLAGS = -O3 -Wno-unknown-pragmas -DXGBOOST_CUSTOMIZE_ERROR_ -fopenmp -fPIC $(SHLIB_OPENMP_CFLAGS)
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)

# add flag to build native code even in cross compiler
ifeq "$(WIN)" "64"
	PKG_CPPFLAGS += -m64
endif

ifeq ($(no_omp),1)
	PKG_CPPFLAGS += -DDISABLE_OPENMP
endif

CXXOBJ= xgboost_wrapper.o xgboost_io.o
OBJECTS= xgboost_R.o $(CXXOBJ)

.PHONY: all clean
all: $(SHLIB)
$(SHLIB): $(OBJECTS)

xgboost_wrapper.o: ../../wrapper/xgboost_wrapper.cpp
xgboost_io.o: ../../src/io/io.cpp

$(CXXOBJ) :
	$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

clean:
	rm -rf *.so *.o *~ *.dll
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o

@ -2,25 +2,54 @@
#include <string>
#include <utility>
#include <cstring>
#include <cstdio>
#include "xgboost_R.h"
#include "../../wrapper/xgboost_wrapper.h"
#include "../../src/utils/utils.h"
#include "../../src/utils/omp.h"
#include "../../src/utils/matrix_csr.h"

#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
#include "src/utils/omp.h"
using namespace std;
using namespace xgboost;

extern "C" {
  void XGBoostAssert_R(int exp, const char *fmt, ...);
  void XGBoostCheck_R(int exp, const char *fmt, ...);
  int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...);
}

// implements error handling
namespace xgboost {
namespace utils {
void HandleAssertError(const char *msg) {
  error("%s", msg);
}
void HandleCheckError(const char *msg) {
  error("%s", msg);
extern "C" {
  void (*Printf)(const char *fmt, ...) = Rprintf;
  int (*SPrintf)(char *buf, size_t size, const char *fmt, ...) = XGBoostSPrintf_R;
  void (*Assert)(int exp, const char *fmt, ...) = XGBoostAssert_R;
  void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R;
  void (*Error)(const char *fmt, ...) = error;
}
} // namespace utils

namespace random {
void Seed(unsigned seed) {
  warning("parameter seed is ignored, please set random seed using set.seed");
}
double Uniform(void) {
  return unif_rand();
}
double Normal(void) {
  return norm_rand();
}
} // namespace random
} // namespace xgboost

// call before the wrapper starts
inline void _WrapperBegin(void) {
  GetRNGstate();
}
// call after the wrapper finishes
inline void _WrapperEnd(void) {
  PutRNGstate();
}

extern "C" {
  void _DMatrixFinalizer(SEXP ext) {
    if (R_ExternalPtrAddr(ext) == NULL) return;
@ -28,14 +57,17 @@ extern "C" {
    R_ClearExternalPtr(ext);
  }
  SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
    _WrapperBegin();
    void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent));
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  SEXP XGDMatrixCreateFromMat_R(SEXP mat,
                                SEXP missing) {
    _WrapperBegin();
    SEXP dim = getAttrib(mat, R_DimSymbol);
    int nrow = INTEGER(dim)[0];
    int ncol = INTEGER(dim)[1];
@ -47,55 +79,64 @@ extern "C" {
        data[i * ncol +j] = din[i + nrow * j];
      }
    }
    void *handle = XGDMatrixCreateFromMat(&data[0], nrow, ncol, asReal(missing));
    void *handle = XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing));
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
                                SEXP indices,
                                SEXP data) {
    const int *col_ptr = INTEGER(indptr);
    const int *row_index = INTEGER(indices);
    const double *col_data = REAL(data);
    int ncol = length(indptr) - 1;
    _WrapperBegin();
    const int *p_indptr = INTEGER(indptr);
    const int *p_indices = INTEGER(indices);
    const double *p_data = REAL(data);
    int nindptr = length(indptr);
    int ndata = length(data);
    // transform into CSR format
    std::vector<bst_ulong> row_ptr;
    std::vector< std::pair<unsigned, float> > csr_data;
    utils::SparseCSRMBuilder<std::pair<unsigned,float>, false, bst_ulong> builder(row_ptr, csr_data);
    builder.InitBudget();
    for (int i = 0; i < ncol; ++i) {
      for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
        builder.AddBudget(row_index[j]);
      }
    std::vector<bst_ulong> col_ptr_(nindptr);
    std::vector<unsigned> indices_(ndata);
    std::vector<float> data_(ndata);

    for (int i = 0; i < nindptr; ++i) {
      col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
    }
    builder.InitStorage();
    for (int i = 0; i < ncol; ++i) {
      for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
        builder.PushElem(row_index[j], std::make_pair(i, col_data[j]));
      }
    }
    utils::Assert(csr_data.size() == static_cast<size_t>(ndata), "BUG CreateFromCSC");
    std::vector<float> row_data(ndata);
    std::vector<unsigned> col_index(ndata);
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < ndata; ++i) {
      col_index[i] = csr_data[i].first;
      row_data[i] = csr_data[i].second;
      indices_[i] = static_cast<unsigned>(p_indices[i]);
      data_[i] = static_cast<float>(p_data[i]);
    }
    void *handle = XGDMatrixCreateFromCSR(&row_ptr[0], &col_index[0], &row_data[0], row_ptr.size(), ndata );
    void *handle = XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
                                          BeginPtr(data_), nindptr, ndata);
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
    _WrapperBegin();
    int len = length(idxset);
    std::vector<int> idxvec(len);
    for (int i = 0; i < len; ++i) {
      idxvec[i] = INTEGER(idxset)[i] - 1;
    }
    void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len);
    SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
    _WrapperBegin();
    XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
                        CHAR(asChar(fname)), asInteger(silent));
    _WrapperEnd();
  }
  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
    _WrapperBegin();
    int len = length(array);
    const char *name = CHAR(asChar(field));
    if (!strcmp("group", name)) {
@ -104,7 +145,8 @@ extern "C" {
      for (int i = 0; i < len; ++i) {
        vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
      }
      XGDMatrixSetGroup(R_ExternalPtrAddr(handle), &vec[0], len);
      XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len);
      _WrapperEnd();
      return;
    }
    {
@ -115,10 +157,12 @@ extern "C" {
      }
      XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
                            CHAR(asChar(field)),
                            &vec[0], len);
                            BeginPtr(vec), len);
    }
    _WrapperEnd();
  }
  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
    _WrapperBegin();
    bst_ulong olen;
    const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
                                             CHAR(asChar(field)), &olen);
@ -127,6 +171,7 @@ extern "C" {
      REAL(ret)[i] = res[i];
    }
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  // functions related to booster
@ -136,28 +181,35 @@ extern "C" {
    R_ClearExternalPtr(ext);
  }
  SEXP XGBoosterCreate_R(SEXP dmats) {
    _WrapperBegin();
    int len = length(dmats);
    std::vector<void*> dvec;
    for (int i = 0; i < len; ++i){
      dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
    }
    void *handle = XGBoosterCreate(&dvec[0], dvec.size());
    void *handle = XGBoosterCreate(BeginPtr(dvec), dvec.size());
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
    _WrapperBegin();
    XGBoosterSetParam(R_ExternalPtrAddr(handle),
                      CHAR(asChar(name)),
                      CHAR(asChar(val)));
    _WrapperEnd();
  }
  void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
    _WrapperBegin();
    XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
                           asInteger(iter),
                           R_ExternalPtrAddr(dtrain));
    _WrapperEnd();
  }
  void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
    _WrapperBegin();
    utils::Check(length(grad) == length(hess), "gradient and hess must have same length");
    int len = length(grad);
    std::vector<float> tgrad(len), thess(len);
@ -168,9 +220,11 @@ extern "C" {
    }
    XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
                          R_ExternalPtrAddr(dtrain),
                          &tgrad[0], &thess[0], len);
                          BeginPtr(tgrad), BeginPtr(thess), len);
    _WrapperEnd();
  }
  SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
    _WrapperBegin();
    utils::Check(length(dmats) == length(evnames), "dmats and evnams must have same length");
    int len = length(dmats);
    std::vector<void*> vec_dmats;
@ -185,28 +239,37 @@ extern "C" {
    }
    return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
                                         asInteger(iter),
                                         &vec_dmats[0], &vec_sptr[0], len));
                                         BeginPtr(vec_dmats), BeginPtr(vec_sptr), len));
    _WrapperEnd();
  }
  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin) {
  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit) {
    _WrapperBegin();
    bst_ulong olen;
    const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
                                        R_ExternalPtrAddr(dmat),
                                        asInteger(output_margin),
                                        asInteger(ntree_limit),
                                        &olen);
    SEXP ret = PROTECT(allocVector(REALSXP, olen));
    for (size_t i = 0; i < olen; ++i) {
      REAL(ret)[i] = res[i];
    }
    UNPROTECT(1);
    _WrapperEnd();
    return ret;
  }
  void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
    _WrapperBegin();
    XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
    _WrapperEnd();
  }
  void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
    _WrapperBegin();
    XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
    _WrapperEnd();
  }
  void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
    _WrapperBegin();
    bst_ulong olen;
    const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
                                          CHAR(asChar(fmap)),
@ -217,5 +280,6 @@ extern "C" {
      fprintf(fo, "%s", res[i]);
    }
    fclose(fo);
    _WrapperEnd();
  }
}

@ -7,6 +7,7 @@
 */
extern "C" {
#include <Rinternals.h>
#include <R_ext/Random.h>
}

extern "C" {
@ -36,6 +37,13 @@ extern "C" {
  SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
                                SEXP indices,
                                SEXP data);
  /*!
   * \brief create a new dmatrix from sliced content of an existing matrix
   * \param handle instance of the data matrix to be sliced
   * \param idxset index set
   * \return a sliced new matrix
   */
  SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset);
  /*!
   * \brief save a data matrix into a binary file
   * \param handle an instance of data matrix
@ -99,8 +107,9 @@ extern "C" {
   * \param handle handle
   * \param dmat data matrix
   * \param output_margin whether to output only the raw margin value
   * \param ntree_limit limit the number of trees used in prediction
   */
  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin);
  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit);
  /*!
   * \brief load model from existing file
   * \param handle handle
@ -120,5 +129,5 @@ extern "C" {
   * \param fmap name of the fmap; can be an empty string
   */
  void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap);
};
}
#endif  // XGBOOST_WRAPPER_R_H_

33
R-package/src/xgboost_assert.c
Normal file
@ -0,0 +1,33 @@
#include <stdio.h>
#include <stdarg.h>
#include <Rinternals.h>

// implements error handling
void XGBoostAssert_R(int exp, const char *fmt, ...) {
  char buf[1024];
  if (exp == 0) {
    va_list args;
    va_start(args, fmt);
    vsprintf(buf, fmt, args);
    va_end(args);
    error("AssertError:%s\n", buf);
  }
}
void XGBoostCheck_R(int exp, const char *fmt, ...) {
  char buf[1024];
  if (exp == 0) {
    va_list args;
    va_start(args, fmt);
    vsprintf(buf, fmt, args);
    va_end(args);
    error("%s\n", buf);
  }
}
int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...) {
  int ret;
  va_list args;
  va_start(args, fmt);
  ret = vsnprintf(buf, size, fmt, args);
  va_end(args);
  return ret;
}
212
R-package/vignettes/xgboost.Rnw
Normal file
@ -0,0 +1,212 @@
\documentclass{article}
\RequirePackage{url}
\usepackage{hyperref}
\RequirePackage{amsmath}
\RequirePackage{natbib}
\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry}

\makeatletter
% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting}
%\VignetteKeywords{xgboost, gbm, gradient boosting machines}
%\VignettePackage{xgboost}
% \VignetteEngine{knitr::knitr}
\makeatother

\begin{document}
%\SweaveOpts{concordance=TRUE}

<<knitropts,echo=FALSE,message=FALSE>>=
if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE)
@

%
<<prelim,echo=FALSE>>=
xgboost.version = '0.3-0'
@
%

\begin{center}
\vspace*{6\baselineskip}
\rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt}
\rule{\textwidth}{0.4pt}\\[2\baselineskip]
{\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip]
\rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt}
\rule{\textwidth}{1.6pt}\\[2\baselineskip]
{\Large Tianqi Chen, Tong He}\\[\baselineskip]
{\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip]
{\large \today}\par
\vfill
\end{center}

\thispagestyle{empty}

\clearpage

\setcounter{page}{1}

\section{Introduction}

This is an introductory document for using the \verb@xgboost@ package in R.

\verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an efficient
and scalable implementation of the gradient boosting framework of \citet{friedman2001greedy}.
The package includes an efficient linear model solver and a tree learning algorithm.
It supports various objective functions, including regression, classification
and ranking. The package is designed to be extensible, so users can also easily define their own objectives. It has several features:
\begin{enumerate}
\item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
Windows and Linux, with OpenMP. It is generally over 10 times faster than
\verb@gbm@.}
\item{Input Type: }{\verb@xgboost@ takes several types of input data:}
\begin{itemize}
    \item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@}
    \item{Sparse Matrix: }{R's sparse matrix \verb@Matrix::dgCMatrix@}
    \item{Data File: }{Local data files}
    \item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.}
\end{itemize}
\item{Sparsity: }{\verb@xgboost@ accepts sparse input for both the tree booster
and the linear booster, and is optimized for sparse input.}
\item{Customization: }{\verb@xgboost@ supports customized objective functions
and evaluation functions.}
\item{Performance: }{\verb@xgboost@ has better performance on several different
datasets.}
\end{enumerate}


\section{Example with iris}

In this section, we will illustrate some common usage of \verb@xgboost@.

<<Training and prediction with iris>>=
library(xgboost)
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]),
               nrounds = 5)
xgb.save(bst, 'model.save')
bst = xgb.load('model.save')
pred <- predict(bst, as.matrix(iris[,1:4]))
@

\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
\verb@predict@ does prediction on the model.

Here we can save the model to a binary local file and load it when needed.
We can't inspect the trees inside. However, we have another function to save the
model in plain text.
<<Dump Model>>=
xgb.dump(bst, 'model.dump')
@

The output looks like

\begin{verbatim}
booster[0]:
0:[f2<2.45] yes=1,no=2,missing=1
1:leaf=0.147059
2:[f3<1.65] yes=3,no=4,missing=3
3:leaf=0.464151
4:leaf=0.722449
booster[1]:
0:[f2<2.45] yes=1,no=2,missing=1
1:leaf=0.103806
2:[f2<4.85] yes=3,no=4,missing=3
3:leaf=0.316341
4:leaf=0.510365
\end{verbatim}

It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
It speeds up \verb@xgboost@, and is needed for advanced features such as
training from an initial prediction value and weighted training instances.

We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
<<xgb.DMatrix>>=
iris.mat <- as.matrix(iris[,1:4])
iris.label <- as.numeric(iris[,5])
diris <- xgb.DMatrix(iris.mat, label = iris.label)
class(diris)
getinfo(diris,'label')
@

We can also save the matrix to a binary file, then load it simply with
\verb@xgb.DMatrix@:
<<save model>>=
xgb.DMatrix.save(diris, 'iris.xgb.DMatrix')
diris = xgb.DMatrix('iris.xgb.DMatrix')
@

\section{Advanced Examples}

The function \verb@xgboost@ is a simple function with fewer parameters, in order
to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires users to read the documentation a bit more carefully.

\verb@xgb.train@ only accepts an \verb@xgb.DMatrix@ object as its input, while it supports advanced features such as custom objective and evaluation functions.

<<Customized loss function>>=
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}

evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- sqrt(mean((preds-labels)^2))
  return(list(metric = "RMSE", value = err))
}

dtest <- slice(diris,1:100)
watchlist <- list(eval = dtest, train = diris)
param <- list(max_depth = 2, eta = 1, silent = 1)

bst <- xgb.train(param, diris, nround = 2, watchlist, logregobj, evalerror)
@

The gradient and second order gradient are required as the output of the
customized objective function.
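For the logistic objective used by \verb@logregobj@ this is a short computation (a derivation added here for clarity). Writing $p = 1/(1+e^{-\hat{y}})$ for the transformed margin $\hat{y}$ and $y$ for the label, the log-loss and its derivatives are
\begin{align*}
\ell(y,\hat{y}) &= -\bigl[\,y\log p + (1-y)\log(1-p)\,\bigr],\\
\frac{\partial \ell}{\partial \hat{y}} &= p - y, \qquad
\frac{\partial^2 \ell}{\partial \hat{y}^2} = p\,(1-p),
\end{align*}
which are exactly the \verb@grad@ and \verb@hess@ vectors returned above.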

We also have \verb@slice@ for row extraction. It is useful in
cross-validation.
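As a sketch of that use (not part of the original demo), one fold of a 5-fold split can be carved out of \verb@diris@ directly:
<<slice for cross validation, eval=FALSE>>=
idx  <- sample(nrow(iris))            # iris has 150 rows
fold <- idx[1:30]                     # hold out one fifth
dtest.fold  <- slice(diris, fold)
dtrain.fold <- slice(diris, setdiff(idx, fold))
@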

For a walkthrough demo, please see \verb@R-package/inst/examples/demo.R@ for further
details.

\section{The Higgs Boson competition}

We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs
Boson Machine Learning Challenge}.

Here are the instructions to make a submission:
\begin{enumerate}
\item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets}
and extract them to \verb@data/@.
\item Run scripts under \verb@xgboost/demo/kaggle-higgs/@:
\href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{higgs-train.R}
and \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-pred.R}{higgs-pred.R}.
The computation will take less than a minute on an Intel i7.
\item Go to the \href{http://www.kaggle.com/c/higgs-boson/submissions/attach}{submission page}
and submit your result.
\end{enumerate}

We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script}
to compare the time cost on the Higgs dataset with \verb@gbm@ and \verb@xgboost@.
The training set contains 350000 records and 30 features.

\verb@xgboost@ can automatically do parallel computation. On a machine with an Intel
i7-4700MQ and 24GB of memory, we found that \verb@xgboost@ takes about 35 seconds, which is about 20 times faster
than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was
still about two times faster than \verb@gbm@.

Meanwhile, the result from \verb@xgboost@ reaches
\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a
single model. This result stands in the
\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
competition.

\bibliographystyle{jss}
\nocite{*} % list uncited references
\bibliography{xgboost}

\end{document}

20
R-package/vignettes/xgboost.bib
Normal file
@ -0,0 +1,20 @@
@article{friedman2001greedy,
  title={Greedy function approximation: a gradient boosting machine},
  author={Friedman, Jerome H},
  journal={Annals of Statistics},
  pages={1189--1232},
  year={2001},
  publisher={JSTOR}
}

@article{friedman2000additive,
  title={Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors)},
  author={Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert and others},
  journal={The Annals of Statistics},
  volume={28},
  number={2},
  pages={337--407},
  year={2000},
  publisher={Institute of Mathematical Statistics}
}

26
README.md
@ -1,26 +0,0 @@
This is a fork of XGBoost from https://github.com/tqchen/xgboost

In the main repo you will already find 2 Windows projects for the porting of the executable and the Python library.

Here you have:

1) a C# dll wrapper, covering the passage from unmanaged to managed code, in https://github.com/giuliohome/xgboost/tree/master/windows/xgboost_sharp_wrapper

2) the C# Higgs Kaggle demo, instead of the Python one (actually you will get a higher score with the C# version, due to some changes I've made) in https://github.com/giuliohome/xgboost/tree/master/windows/kaggle_higgs_demo

Start the demo from the root folder like this:

bin\x64\Debug\kaggle_higgs_demo.exe training_path.csv test_path.csv sharp_pred.csv NFoldCV NRound

NFoldCV: 0 => no cv, 5 = 5-fold-cv, 10 = 10-fold-cv :-)

3) a 5-fold cv implementation in C# for the demo: you see inline cv ams while training (of course on a completely separate set)

In my latest commit I've added

4) parallel execution of n-fold cv, on top of dotnet multithreading

5) double inputted model training, stopping at a configured ams objective

27
demo/README.md
Normal file
@ -0,0 +1,27 @@
XGBoost Examples
====
This folder contains all the example code using xgboost.

* Contributions of examples and benchmarks are more than welcome!
* If you'd like to share how you use xgboost to solve your problem, send a pull request :)

Features Walkthrough
====
This is a list of short programs introducing different functionalities of xgboost and its wrappers.
* Basic walkthrough of wrappers [python](guide-python/basic_walkthrough.py)
* Customize loss function, and evaluation metric [python](guide-python/custom_objective.py)
* Boosting from existing prediction [python](guide-python/boost_from_prediction.py)
* Predicting using first n trees [python](guide-python/predict_first_ntree.py)
* Generalized Linear Model [python](guide-python/generalized_linear_model.py)
* Cross validation [python](guide-python/cross_validation.py)

Basic Examples by Tasks
====
* [Binary classification](binary_classification)
* [Multiclass classification](multiclass_classification)
* [Regression](regression)
* [Learning to Rank](rank)

Benchmarks
====
* [Starter script for Kaggle Higgs Boson](kaggle-higgs)
2
demo/data/README.md
Normal file
@ -0,0 +1,2 @@
This folder contains the processed example datasets used by the demos.
Copyright of the datasets belongs to the original copyright holders.
3
demo/guide-R/README.md
Normal file
@ -0,0 +1,3 @@
XGBoost R Feature Walkthrough
====
To be finished.
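Until those scripts are written, a minimal R walkthrough mirroring the Python one might look like the sketch below (added for illustration; file paths assume the data layout used by the Python guides):

```r
library(xgboost)
dtrain <- xgb.DMatrix('../data/agaricus.txt.train')
dtest  <- xgb.DMatrix('../data/agaricus.txt.test')
param  <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
bst    <- xgb.train(param, dtrain, nround = 2,
                    watchlist = list(eval = dtest, train = dtrain))
pred   <- predict(bst, dtest)
err    <- mean(as.integer(pred > 0.5) != getinfo(dtest, "label"))
print(paste("error=", err))
```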
5
demo/guide-R/runall.sh
Executable file
@ -0,0 +1,5 @@
#!/bin/bash
# todo
Rscript basic_walkthrough.R
Rscript custom_objective.R
Rscript boost_from_prediction.R
8
demo/guide-python/README.md
Normal file
@ -0,0 +1,8 @@
XGBoost Python Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.py)
* [Customize loss function, and evaluation metric](custom_objective.py)
* [Boosting from existing prediction](boost_from_prediction.py)
* [Predicting using first n trees](predict_first_ntree.py)
* [Generalized Linear Model](generalized_linear_model.py)
* [Cross validation](cross_validation.py)
76
demo/guide-python/basic_walkthrough.py
Executable file
@ -0,0 +1,76 @@
#!/usr/bin/python
import sys
import numpy as np
import scipy.sparse
# append the path to xgboost; you may need to change the following line
# alternatively, you can add the path to the PYTHONPATH environment variable
sys.path.append('../../wrapper')
import xgboost as xgb

### simple example
# load from a text file, or a binary buffer generated by xgboost
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')

# specify parameters via map; definitions are the same as the C++ version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }

# specify validation sets to watch performance
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)

# run prediction
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
bst.save_model('0001.model')
# dump model
bst.dump_model('dump.raw.txt')
# dump model with feature map
bst.dump_model('dump.nice.txt','../data/featmap.txt')

# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
bst.save_model('xgb.model')
# load model and data back in
bst2 = xgb.Booster(model_file='xgb.model')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2)
# assert they are the same
assert np.sum(np.abs(preds2-preds)) == 0

###
# build dmatrix from scipy.sparse
print ('start running example of build DMatrix from scipy.sparse CSR Matrix')
labels = []
row = []; col = []; dat = []
i = 0
for l in open('../data/agaricus.txt.train'):
    arr = l.split()
    labels.append( int(arr[0]))
    for it in arr[1:]:
        k,v = it.split(':')
        row.append(i); col.append(int(k)); dat.append(float(v))
    i += 1
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix( csr, label = labels )
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )

print ('start running example of build DMatrix from scipy.sparse CSC Matrix')
# we can also construct from a csc matrix
csc = scipy.sparse.csc_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix(csc, label=labels)
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )

print ('start running example of build DMatrix from numpy array')
# NOTE: npymat is a numpy array; internally it is converted into a
# scipy.sparse.csr_matrix, then into a DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix(npymat, label = labels)
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )

26
demo/guide-python/boost_from_prediction.py
Executable file
@ -0,0 +1,26 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
watchlist = [(dtest,'eval'), (dtrain,'train')]
###
# advanced: start from an initial base prediction
#
print ('start running example to start from an initial prediction')
# specify parameters via map; definitions are the same as the C++ version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# train xgboost for 1 round
bst = xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value instead of the transformed prediction in set_base_margin.
# Predicting with output_margin=True will always give you margin values before the logistic transformation.
ptrain = bst.predict(dtrain, output_margin=True)
ptest = bst.predict(dtest, output_margin=True)
dtrain.set_base_margin(ptrain)
dtest.set_base_margin(ptest)

print ('this is the result of running from the initial prediction')
bst = xgb.train( param, dtrain, 1, watchlist )
63
demo/guide-python/cross_validation.py
Executable file
@ -0,0 +1,63 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

### load data and do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
num_round = 2

print ('running cross validation')
# do cross validation; this will print the result as
# [iteration]  metric_name:mean_value+std_value
# std_value is the standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 0)

print ('running cross validation, disable standard deviation display')
# do cross validation; this will print the result as
# [iteration]  metric_name:mean_value
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 0, show_stdv = False)

print ('running cross validation, with preprocessing function')
# define the preprocessing function
# it is used to return the preprocessed training and test data, and the parameter;
# we can use this to do weight rescaling, etc.
# as an example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label==1)
    param['scale_pos_weight'] = ratio
    return (dtrain, dtest, param)

# do cross validation; for each fold
# the dtrain, dtest, param will be passed into fpreproc,
# then the return value of fpreproc will be used to generate
# the results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'auc'}, seed = 0, fpreproc = fpreproc)

###
# you can also do cross validation with a customized loss function
# See custom_objective.py
##
print ('running cross validation, with customized loss function')
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0-preds)
    return grad, hess
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

param = {'max_depth':2, 'eta':1, 'silent':1}
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
       obj = logregobj, feval=evalerror)

44
demo/guide-python/custom_objective.py
Executable file
@ -0,0 +1,44 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
###
# advanced: customized loss function
#
print ('start running example to use customized objective function')

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')

# note: for a customized objective function, we leave objective as default
# note: what we are getting is the margin value in prediction
# you must know what you are doing
param = {'max_depth':2, 'eta':1, 'silent':1 }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2

# user-defined objective function: given the prediction, return gradient and second order gradient
# this is log-likelihood loss
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0-preds)
    return grad, hess

# user-defined evaluation function: return a pair metric_name, result
# NOTE: with a customized loss function, the default prediction value is the margin,
# which may keep the built-in evaluation metrics from working properly.
# For example, with logistic loss the prediction is the score before the logistic transformation;
# the built-in evaluation error assumes the input is after the logistic transformation.
# Keep this in mind when you use the customization; you may need to write a customized evaluation function.
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # return a pair metric_name, result
    # since preds are margins (before the logistic transformation, cutoff at 0)
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

# training with customized objective; we can also do step-by-step training,
# simply look at xgboost.py's implementation of train
bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
32
demo/guide-python/generalized_linear_model.py
Executable file
32
demo/guide-python/generalized_linear_model.py
Executable file
@ -0,0 +1,32 @@
#!/usr/bin/python
import sys
sys.path.append('../../wrapper')
import xgboost as xgb
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we are using a linear model instead of trees for our boosters
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias, which is the L2 regularizer on the bias term
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
         'alpha': 0.0001, 'lambda': 1 }

# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# so parallelization can affect convergence in certain cases
# setting eta to a smaller value, e.g. 0.5, can make the optimization more stable
# param['eta'] = 1

##
# the rest of the settings are the same
##
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
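As a sketch of the objective this configuration fits (our notation; the implementation's exact scaling of the penalty terms may differ):

```
\min_{w,\,b}\; \sum_i \ell\bigl(y_i,\; w^\top x_i + b\bigr)
\;+\; \alpha\,\lVert w \rVert_1 \;+\; \frac{\lambda}{2}\,\lVert w \rVert_2^2
```

where alpha and lambda are the 'alpha' and 'lambda' entries of param, \ell is the logistic loss selected by binary:logistic, and lambda_bias would add an analogous L2 term on the bias b.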
22
demo/guide-python/predict_first_ntree.py
Executable file
@@ -0,0 +1,22 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

### load data and do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)

print ('start testing prediction from first n trees')
### predict using first 1 tree
label = dtest.get_label()
ypred1 = bst.predict(dtest, ntree_limit=1)
# by default, we predict using all the trees
ypred2 = bst.predict(dtest)
print ('error of ypred1=%f' % (np.sum((ypred1>0.5)!=label) /float(len(label))))
print ('error of ypred2=%f' % (np.sum((ypred2>0.5)!=label) /float(len(label))))
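To see how the error evolves as trees are added, you can sweep `ntree_limit` over the boosting rounds; a small sketch reusing the variables defined above (the sweep loop is ours, not part of the demo):

```python
# evaluate error using only the first k trees, for k = 1..num_round
for k in range(1, num_round + 1):
    ypred_k = bst.predict(dtest, ntree_limit=k)
    err_k = np.sum((ypred_k > 0.5) != label) / float(len(label))
    print('error using first %d tree(s) = %f' % (k, err_k))
```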
7
demo/guide-python/runall.sh
Executable file
@@ -0,0 +1,7 @@
#!/bin/bash
python basic_walkthrough.py
python custom_objective.py
python boost_from_prediction.py
python generalized_linear_model.py
python cross_validation.py
rm -rf *~ *.model *.buffer
@@ -10,6 +10,7 @@ This script will achieve about 3.600 AMS score on the public leaderboard. To get start
cd ../..
make
```

2. Put training.csv and test.csv in the folder './data' (you can create a symbolic link)

3. Run ./run.sh
@@ -21,5 +22,5 @@ speedtest.py compares xgboost's speed on this dataset with sklearn.GBM

Using R module
=====
* Alternatively, you can run using R, higgs-train.R and higgs-pred.R
* Alternatively, you can run using R, higgs-train.R and higgs-pred.R.
39
demo/kaggle-higgs/higgs-cv.py
Executable file
@@ -0,0 +1,39 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

### load data and do training
train = np.loadtxt('./data/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )
label = train[:,32]
data = train[:,1:31]
weight = train[:,31]
dtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
param = {'max_depth':6, 'eta':0.1, 'silent':1, 'objective':'binary:logitraw', 'nthread':4}
num_round = 120

print ('running cross validation, with preprocessing function')
# define the preprocessing function:
# it returns the preprocessed training data, test data, and parameters;
# we can use this to do weight rescaling, etc.
# as an example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label==1)
    param['scale_pos_weight'] = ratio
    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()
    sum_weight = sum(wtrain) + sum(wtest)
    wtrain *= sum_weight / sum(wtrain)
    wtest *= sum_weight / sum(wtest)
    dtrain.set_weight(wtrain)
    dtest.set_weight(wtest)
    return (dtrain, dtest, param)

# do cross validation; for each fold,
# dtrain, dtest, and param will be passed into fpreproc,
# and the return value of fpreproc will be used to generate
# the results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'ams@0.15', 'auc'}, seed = 0, fpreproc = fpreproc)
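One property worth noting: after fpreproc runs, the train-side and test-side weights of a fold each sum to the total weight of the full data, so an absolute-scale metric like ams@0.15 sees a comparable weight mass regardless of the split. A quick check of that invariant (a sketch; `fold_train` and `fold_test` are hypothetical stand-ins for the DMatrix pair that xgb.cv constructs internally for one fold):

```python
# hypothetical fold pair -- inside xgb.cv these are constructed for you
ftrain, ftest, fparam = fpreproc(fold_train, fold_test, dict(param))
wtr, wte = ftrain.get_weight(), ftest.get_weight()
# both sides now carry the full dataset's total weight
assert abs(sum(wtr) - sum(wte)) < 1e-6 * sum(wtr)
```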
@@ -1,5 +1,6 @@
# include xgboost library, must set chdir=TRUE
source("../../wrapper/xgboost.R", chdir=TRUE)
# install xgboost package, see R-package in root folder
require(xgboost)
require(methods)

modelfile <- "higgs.model"
outfile <- "higgs.pred.csv"
@@ -8,8 +9,8 @@ data <- as.matrix(dtest[2:31])
idx <- dtest[[1]]

xgmat <- xgb.DMatrix(data, missing = -999.0)
bst <- xgb.Booster(params=list("nthread"=16), modelfile=modelfile)
ypred <- xgb.predict(bst, xgmat)
bst <- xgb.load(modelfile=modelfile)
ypred <- predict(bst, xgmat)

rorder <- rank(ypred, ties.method="first")
@@ -1,5 +1,7 @@
# include xgboost library, must set chdir=TRUE
source("../../wrapper/xgboost.R", chdir=TRUE)
# install xgboost package, see R-package in root folder
require(xgboost)
require(methods)

testsize <- 550000

dtrain <- read.csv("data/training.csv", header=TRUE)
@@ -12,7 +14,7 @@ sumwpos <- sum(weight * (label==1.0))
sumwneg <- sum(weight * (label==0.0))
print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))

xgmat <- xgb.DMatrix(data, info = list(label=label, weight=weight), missing = -999.0)
xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
param <- list("objective" = "binary:logitraw",
              "scale_pos_weight" = sumwneg / sumwpos,
              "bst:eta" = 0.1,
71
demo/kaggle-higgs/speedtest.R
Normal file
@@ -0,0 +1,71 @@
# install xgboost package, see R-package in root folder
require(xgboost)
require(gbm)
require(methods)

testsize <- 550000

dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001)

# gbm.time = system.time({
#   gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
#                    interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1,
#                    verbose = TRUE)
# })
# print(gbm.time)
# Test result: 761.48 secs

dtrain[33] <- dtrain[33] == "s"
label <- as.numeric(dtrain[[33]])
data <- as.matrix(dtrain[2:31])
weight <- as.numeric(dtrain[[32]]) * testsize / length(label)

sumwpos <- sum(weight * (label==1.0))
sumwneg <- sum(weight * (label==0.0))
print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))

xgboost.time = list()
threads = c(1,2,4,8,16)
for (i in 1:length(threads)){
  thread = threads[i]
  xgboost.time[[i]] = system.time({
    xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
    param <- list("objective" = "binary:logitraw",
                  "scale_pos_weight" = sumwneg / sumwpos,
                  "bst:eta" = 0.1,
                  "bst:max_depth" = 6,
                  "eval_metric" = "auc",
                  "eval_metric" = "ams@0.15",
                  "silent" = 1,
                  "nthread" = thread)
    watchlist <- list("train" = xgmat)
    nround = 120
    print ("loading data end, start to boost trees")
    bst = xgb.train(param, xgmat, nround, watchlist );
    # save out model
    xgb.save(bst, "higgs.model")
    print ('finish training')
  })
}

xgboost.time
# [[1]]
#    user  system elapsed
#  444.98    1.96  450.22
#
# [[2]]
#    user  system elapsed
#  188.15    0.82  102.41
#
# [[3]]
#    user  system elapsed
#  143.29    0.79   44.18
#
# [[4]]
#    user  system elapsed
#  176.60    1.45   34.04
#
# [[5]]
#    user  system elapsed
#  180.15    2.85   35.26
@@ -13,10 +13,10 @@ Project Logical Layout

File Naming Convention
=======
* The project is templatized, to make it easy to adjust the input data structure.
* .h files are data structures and interfaces, which are needed to use the functions in that layer.
* -inl.hpp files are implementations of the interface, like the cpp files in most projects.
  - You only need to understand the interface file to understand the usage of that layer
* In each folder, there can be a .cpp file that compiles the module of that layer

How to Hack the Code
======
340
src/data.h
@@ -7,16 +7,8 @@
 */
#include <cstdio>
#include <vector>
#include <limits>
#include <climits>
#include <cstring>
#include <algorithm>
#include "utils/io.h"
#include "utils/omp.h"
#include "utils/utils.h"
#include "utils/iterator.h"
#include "utils/random.h"
#include "utils/matrix_csr.h"

namespace xgboost {
/*!
@@ -70,12 +62,12 @@ struct SparseBatch {
  /*! \brief an entry of sparse vector */
  struct Entry {
    /*! \brief feature index */
    bst_uint findex;
    bst_uint index;
    /*! \brief feature value */
    bst_float fvalue;
    // default constructor
    Entry(void) {}
    Entry(bst_uint findex, bst_float fvalue) : findex(findex), fvalue(fvalue) {}
    Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {}
    /*! \brief reversely compare feature values */
    inline static bool CmpValue(const Entry &a, const Entry &b) {
      return a.fvalue < b.fvalue;
@@ -86,7 +78,7 @@ struct SparseBatch {
    /*! \brief pointer to the elements */
    const Entry *data;
    /*! \brief length of the instance */
    const bst_uint length;
    bst_uint length;
    /*! \brief constructor */
    Inst(const Entry *data, bst_uint length) : data(data), length(length) {}
    /*! \brief get i-th pair in the sparse vector */
@@ -96,298 +88,72 @@ struct SparseBatch {
  };
  /*! \brief batch size */
  size_t size;
};
/*! \brief read-only row batch, used to access rows continuously */
struct RowBatch : public SparseBatch {
  /*! \brief the offset of rowid of this batch */
  size_t base_rowid;
  /*! \brief array[size+1], row pointer of each of the elements */
  const size_t *row_ptr;
  /*! \brief array[row_ptr.back()], content of the sparse element */
  const size_t *ind_ptr;
  /*! \brief array[ind_ptr.back()], content of the sparse element */
  const Entry *data_ptr;
  /*! \brief get i-th row from the batch */
  inline Inst operator[](size_t i) const {
    return Inst(data_ptr + row_ptr[i], static_cast<bst_uint>(row_ptr[i+1] - row_ptr[i]));
    return Inst(data_ptr + ind_ptr[i], static_cast<bst_uint>(ind_ptr[i+1] - ind_ptr[i]));
  }
};

/**
 * \brief This is an interface convention via template, defining the way to access features;
 *  the column access rule is defined by template for efficiency purposes,
 *  row access is defined by an iterator of sparse batches
 * \tparam Derived type of actual implementation
/*!
 * \brief read-only column batch, used to access columns;
 *  the columns are not required to be continuous
 */
template<typename Derived>
class FMatrixInterface {
struct ColBatch : public SparseBatch {
  /*! \brief column index of each column in the data */
  const bst_uint *col_index;
  /*! \brief pointer to the column data */
  const Inst *col_data;
  /*! \brief get i-th column from the batch */
  inline Inst operator[](size_t i) const {
    return col_data[i];
  }
};
/**
 * \brief interface of feature matrix, needed for tree construction;
 *  this interface defines two ways to access features:
 *  row access is defined by an iterator of RowBatch,
 *  col access is optional, checked by HaveColAccess, and defined by an iterator of ColBatch
 */
class IFMatrix {
 public:
  /*! \brief example iterator over one column */
  struct ColIter{
    /*!
     * \brief move to next position
     * \return whether there is an element in the next position
     */
    inline bool Next(void);
    /*! \return row index of current position */
    inline bst_uint rindex(void) const;
    /*! \return feature value in current position */
    inline bst_float fvalue(void) const;
  };
  /*! \brief backward iterator over column */
  struct ColBackIter : public ColIter {};
 public:
  // column access is needed by some of the tree construction algorithms;
  // the interface only needs to guarantee the row iterator;
  // the column iterator is active when ColIterator is called, row_iter can be disabled
  /*! \brief get the row iterator associated with FMatrix */
  virtual utils::IIterator<RowBatch> *RowIterator(void) = 0;
  /*! \brief get column iterator */
  virtual utils::IIterator<ColBatch> *ColIterator(void) = 0;
  /*!
   * \brief get column iterator, the columns must be sorted by feature value
   * \param cidx column index
   * \return column iterator
   * \brief get the column iterator associated with FMatrix with a subset of column features
   * \param fset the list of column indices that must be contained in the returned column iterator
   * \return the column iterator, initialized so that it reads the elements in fset
   */
  inline ColIter GetSortedCol(size_t cidx) const;
  /*!
   * \brief get column backward iterator, starts from biggest fvalue, and iterates back
   * \param cidx column index
   * \return reverse column iterator
   */
  inline ColBackIter GetReverseSortedCol(size_t cidx) const;
  /*!
   * \brief get number of columns
   * \return number of columns
   */
  inline size_t NumCol(void) const;
  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) = 0;
  /*!
   * \brief check if column access is supported; if not, initialize column access
   * \param max_rows maximum number of rows allowed in constructor
   * \param subsample subsample ratio when generating column access
   */
  inline void InitColAccess(void);
  virtual void InitColAccess(float subsample) = 0;
  // the following are column meta data, should be able to answer them fast
  /*! \return whether column access is enabled */
  inline bool HaveColAccess(void) const;
  /*! \brief return #entries-in-col */
  inline size_t GetColSize(size_t cidx) const;
  /*!
   * \brief return #entries-in-col / #rows
   * \param cidx column index
   *  this function is used to help speedup;
   *  it does not necessarily need to be implemented -- if not sure, return 0.0
   * \return column density
   */
  inline float GetColDensity(size_t cidx) const;
  /*! \brief get the row iterator associated with FMatrix */
  inline utils::IIterator<SparseBatch>* RowIterator(void) const;
};

/*!
 * \brief sparse matrix that supports column access, CSC
 */
class FMatrixS : public FMatrixInterface<FMatrixS>{
 public:
  typedef SparseBatch::Entry Entry;
  /*! \brief column iterator */
  struct ColIter{
    const Entry *dptr_, *end_;
    ColIter(const Entry* begin, const Entry* end)
        :dptr_(begin), end_(end) {}
    inline bool Next(void) {
      if (dptr_ == end_) {
        return false;
      } else {
        ++dptr_; return true;
      }
    }
    inline bst_uint rindex(void) const {
      return dptr_->findex;
    }
    inline bst_float fvalue(void) const {
      return dptr_->fvalue;
    }
  };
  /*! \brief reverse column iterator */
  struct ColBackIter : public ColIter {
    ColBackIter(const Entry* dptr, const Entry* end) : ColIter(dptr, end) {}
    // shadows ColIter::Next
    inline bool Next(void) {
      if (dptr_ == end_) {
        return false;
      } else {
        --dptr_; return true;
      }
    }
  };
  /*! \brief constructor */
  FMatrixS(void) {
    iter_ = NULL;
  }
  // destructor
  ~FMatrixS(void) {
    if (iter_ != NULL) delete iter_;
  }
  /*! \return whether column access is enabled */
  inline bool HaveColAccess(void) const {
    return col_ptr_.size() != 0;
  }
  /*! \brief get number of columns */
  inline size_t NumCol(void) const {
    utils::Check(this->HaveColAccess(), "NumCol:need column access");
    return col_ptr_.size() - 1;
  }
  /*! \brief get number of buffered rows */
  inline const std::vector<bst_uint> buffered_rowset(void) const {
    return buffered_rowset_;
  }
  /*! \brief get col sorted iterator */
  inline ColIter GetSortedCol(size_t cidx) const {
    utils::Assert(cidx < this->NumCol(), "col id exceed bound");
    return ColIter(&col_data_[0] + col_ptr_[cidx] - 1,
                   &col_data_[0] + col_ptr_[cidx + 1] - 1);
  }
  /*!
   * \brief get reversed col iterator;
   *  this function will be deprecated at some point
   */
  inline ColBackIter GetReverseSortedCol(size_t cidx) const {
    utils::Assert(cidx < this->NumCol(), "col id exceed bound");
    return ColBackIter(&col_data_[0] + col_ptr_[cidx + 1],
                       &col_data_[0] + col_ptr_[cidx]);
  }
  /*! \brief get col size */
  inline size_t GetColSize(size_t cidx) const {
    return col_ptr_[cidx+1] - col_ptr_[cidx];
  }
  /*! \brief get column density */
  inline float GetColDensity(size_t cidx) const {
    size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
  }
  inline void InitColAccess(float pkeep = 1.0f) {
    if (this->HaveColAccess()) return;
    this->InitColData(pkeep);
  }
  /*!
   * \brief get the row iterator associated with FMatrix;
   *  this function is not threadsafe, it returns the iterator stored in FMatrixS
   */
  inline utils::IIterator<SparseBatch>* RowIterator(void) const {
    iter_->BeforeFirst();
    return iter_;
  }
  /*! \brief set iterator */
  inline void set_iter(utils::IIterator<SparseBatch> *iter) {
    this->iter_ = iter;
  }
  /*!
   * \brief save column access data into stream
   * \param fo output stream to save to
   */
  inline void SaveColAccess(utils::IStream &fo) const {
    fo.Write(buffered_rowset_);
    if (buffered_rowset_.size() != 0) {
      SaveBinary(fo, col_ptr_, col_data_);
    }
  }
  /*!
   * \brief load column access data from stream
   * \param fi input stream to load from
   */
  inline void LoadColAccess(utils::IStream &fi) {
    utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
    if (buffered_rowset_.size() != 0) {
      LoadBinary(fi, &col_ptr_, &col_data_);
    }
  }
  /*!
   * \brief save data to binary stream
   * \param fo output stream
   * \param ptr pointer data
   * \param data data content
   */
  inline static void SaveBinary(utils::IStream &fo,
                                const std::vector<size_t> &ptr,
                                const std::vector<SparseBatch::Entry> &data) {
    size_t nrow = ptr.size() - 1;
    fo.Write(&nrow, sizeof(size_t));
    fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
    if (data.size() != 0) {
      fo.Write(&data[0], data.size() * sizeof(SparseBatch::Entry));
    }
  }
  /*!
   * \brief load data from binary stream
   * \param fi input stream
   * \param out_ptr pointer data
   * \param out_data data content
   */
  inline static void LoadBinary(utils::IStream &fi,
                                std::vector<size_t> *out_ptr,
                                std::vector<SparseBatch::Entry> *out_data) {
    size_t nrow;
    utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
    out_ptr->resize(nrow + 1);
    utils::Check(fi.Read(&(*out_ptr)[0], out_ptr->size() * sizeof(size_t)) != 0,
                 "invalid input file format");
    out_data->resize(out_ptr->back());
    if (out_data->size() != 0) {
      utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(SparseBatch::Entry)) != 0,
                    "invalid input file format");
    }
  }

 protected:
  /*!
   * \brief initialize column data
   * \param pkeep probability to keep a row
   */
  inline void InitColData(float pkeep) {
    buffered_rowset_.clear();
    // note: this part of the code is serial; todo: parallelize this transformer
    utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_);
    builder.InitBudget(0);
    // start working
    iter_->BeforeFirst();
    while (iter_->Next()) {
      const SparseBatch &batch = iter_->Value();
      for (size_t i = 0; i < batch.size; ++i) {
        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
          SparseBatch::Inst inst = batch[i];
          for (bst_uint j = 0; j < inst.length; ++j) {
            builder.AddBudget(inst[j].findex);
          }
        }
      }
    }
    builder.InitStorage();

    iter_->BeforeFirst();
    size_t ktop = 0;
    while (iter_->Next()) {
      const SparseBatch &batch = iter_->Value();
      for (size_t i = 0; i < batch.size; ++i) {
        if (ktop < buffered_rowset_.size() &&
            buffered_rowset_[ktop] == batch.base_rowid+i) {
          ++ktop;
          SparseBatch::Inst inst = batch[i];
          for (bst_uint j = 0; j < inst.length; ++j) {
            builder.PushElem(inst[j].findex,
                             Entry((bst_uint)(batch.base_rowid+i),
                                   inst[j].fvalue));
          }
        }
      }
    }

    // sort columns
    bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
    #pragma omp parallel for schedule(static)
    for (bst_omp_uint i = 0; i < ncol; ++i) {
      std::sort(&col_data_[0] + col_ptr_[i],
                &col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
    }
  }

 private:
  // --- data structure used to support InitColAccess --
  utils::IIterator<SparseBatch> *iter_;
  /*! \brief list of row indices that are buffered */
  std::vector<bst_uint> buffered_rowset_;
  /*! \brief column pointer of CSC format */
  std::vector<size_t> col_ptr_;
  /*! \brief column data in CSC format */
  std::vector<SparseBatch::Entry> col_data_;
  virtual bool HaveColAccess(void) const = 0;
  /*! \return number of columns in the FMatrix */
  virtual size_t NumCol(void) const = 0;
  /*! \brief get number of non-missing entries in column */
  virtual size_t GetColSize(size_t cidx) const = 0;
  /*! \brief get column density */
  virtual float GetColDensity(size_t cidx) const = 0;
  /*! \brief reference of buffered rowset */
  virtual const std::vector<bst_uint> &buffered_rowset(void) const = 0;
  // virtual destructor
  virtual ~IFMatrix(void){}
};
} // namespace xgboost
#endif  // XGBOOST_DATA_H
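The two-pass pattern InitColData uses (count per-column budgets, then fill storage) is easier to see in a few lines of Python. A sketch of the same idea on plain lists (the names are ours, not from the source):

```python
def csr_to_csc(row_ptr, col_idx, values, num_cols):
    """Two-pass CSR -> CSC conversion: count per-column budgets, then fill."""
    nnz = len(values)
    # pass 1: count entries per column (AddBudget in the C++ builder)
    col_ptr = [0] * (num_cols + 1)
    for c in col_idx:
        col_ptr[c + 1] += 1
    for c in range(num_cols):           # prefix sum gives column offsets
        col_ptr[c + 1] += col_ptr[c]
    # pass 2: fill storage, advancing a per-column write head (PushElem)
    head = list(col_ptr[:num_cols])
    csc_rows = [0] * nnz
    csc_vals = [0.0] * nnz
    for r in range(len(row_ptr) - 1):
        for k in range(row_ptr[r], row_ptr[r + 1]):
            c = col_idx[k]
            csc_rows[head[c]] = r
            csc_vals[head[c]] = values[k]
            head[c] += 1
    return col_ptr, csc_rows, csc_vals

# 2x3 matrix [[1, 0, 2], [0, 3, 0]] in CSR form
col_ptr, rows, vals = csr_to_csc([0, 2, 3], [0, 2, 1], [1.0, 2.0, 3.0], 3)
# col_ptr == [0, 1, 2, 3], rows == [0, 1, 0], vals == [1.0, 3.0, 2.0]
```

xgboost's builder does the same with SparseCSRMBuilder, and then additionally sorts each column by feature value for the tree updaters.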
@@ -18,13 +18,13 @@ namespace gbm {
 * \brief gradient boosted linear model
 * \tparam FMatrix the data type the updater takes
 */
template<typename FMatrix>
class GBLinear : public IGradBooster<FMatrix> {
class GBLinear : public IGradBooster {
 public:
  virtual ~GBLinear(void) {
  }
  // set model parameters
  virtual void SetParam(const char *name, const char *val) {
    using namespace std;
    if (!strncmp(name, "bst:", 4)) {
      param.SetParam(name + 4, val);
    }
@@ -41,13 +41,12 @@ class GBLinear : public IGradBooster<FMatrix> {
  virtual void InitModel(void) {
    model.InitModel();
  }
  virtual void DoBoost(const FMatrix &fmat,
  virtual void DoBoost(IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) {
    this->InitFeatIndex(fmat);
    std::vector<bst_gpair> &gpair = *in_gpair;
    const int ngroup = model.param.num_output_group;
    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
    // for all the output groups
    for (int gid = 0; gid < ngroup; ++gid) {
      double sum_grad = 0.0, sum_hess = 0.0;
@@ -72,45 +71,52 @@ class GBLinear : public IGradBooster<FMatrix> {
        }
      }
    }
    // number of features
    const bst_omp_uint nfeat = static_cast<bst_omp_uint>(feat_index.size());
    #pragma omp parallel for schedule(static)
    for (bst_omp_uint i = 0; i < nfeat; ++i) {
      const bst_uint fid = feat_index[i];
      for (int gid = 0; gid < ngroup; ++gid) {
        double sum_grad = 0.0, sum_hess = 0.0;
        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
          const float v = it.fvalue();
          bst_gpair &p = gpair[it.rindex() * ngroup + gid];
          if (p.hess < 0.0f) continue;
          sum_grad += p.grad * v;
          sum_hess += p.hess * v * v;
        }
        float &w = model[fid][gid];
        bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
        w += dw;
        // update grad values
        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
          bst_gpair &p = gpair[it.rindex() * ngroup + gid];
          if (p.hess < 0.0f) continue;
          p.grad += p.hess * it.fvalue() * dw;
    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
    while (iter->Next()) {
      // number of features
      const ColBatch &batch = iter->Value();
      const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nfeat; ++i) {
        const bst_uint fid = batch.col_index[i];
        ColBatch::Inst col = batch[i];
        for (int gid = 0; gid < ngroup; ++gid) {
          double sum_grad = 0.0, sum_hess = 0.0;
          for (bst_uint j = 0; j < col.length; ++j) {
            const float v = col[j].fvalue;
            bst_gpair &p = gpair[col[j].index * ngroup + gid];
            if (p.hess < 0.0f) continue;
            sum_grad += p.grad * v;
            sum_hess += p.hess * v * v;
          }
          float &w = model[fid][gid];
          bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
          w += dw;
          // update grad values
          for (bst_uint j = 0; j < col.length; ++j) {
            bst_gpair &p = gpair[col[j].index * ngroup + gid];
            if (p.hess < 0.0f) continue;
            p.grad += p.hess * col[j].fvalue * dw;
          }
        }
      }
    }
  }

  virtual void Predict(const FMatrix &fmat,
  virtual void Predict(IFMatrix *p_fmat,
                       int64_t buffer_offset,
                       const BoosterInfo &info,
                       std::vector<float> *out_preds) {
                       std::vector<float> *out_preds,
                       unsigned ntree_limit = 0) {
    utils::Check(ntree_limit == 0,
                 "GBLinear::Predict ntrees is only valid for gbtree predictor");
    std::vector<float> &preds = *out_preds;
    preds.resize(0);
    // start collecting the predictions
    utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
    iter->BeforeFirst();
    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
    const int ngroup = model.param.num_output_group;
    while (iter->Next()) {
      const SparseBatch &batch = iter->Value();
      const RowBatch &batch = iter->Value();
      utils::Assert(batch.base_rowid * ngroup == preds.size(),
                    "base_rowid is not set correctly");
      // output convention: nrow * k, where nrow is number of rows
@@ -134,23 +140,11 @@ class GBLinear : public IGradBooster<FMatrix> {
  }

 protected:
  inline void InitFeatIndex(const FMatrix &fmat) {
    if (feat_index.size() != 0) return;
    // initialize feature index
    unsigned ncol = static_cast<unsigned>(fmat.NumCol());
    feat_index.reserve(ncol);
    for (unsigned i = 0; i < ncol; ++i) {
      if (fmat.GetColSize(i) != 0) {
        feat_index.push_back(i);
      }
    }
    random::Shuffle(feat_index);
  }
  inline void Pred(const SparseBatch::Inst &inst, float *preds) {
  inline void Pred(const RowBatch::Inst &inst, float *preds) {
    for (int gid = 0; gid < model.param.num_output_group; ++gid) {
      float psum = model.bias()[gid];
      for (bst_uint i = 0; i < inst.length; ++i) {
        psum += inst[i].fvalue * model[inst[i].findex][gid];
        psum += inst[i].fvalue * model[inst[i].index][gid];
      }
      preds[gid] = psum;
    }
@@ -173,6 +167,7 @@ class GBLinear : public IGradBooster<FMatrix> {
      learning_rate = 1.0f;
    }
    inline void SetParam(const char *name, const char *val) {
      using namespace std;
      // sync-names
      if (!strcmp("eta", name)) learning_rate = static_cast<float>(atof(val));
      if (!strcmp("lambda", name)) reg_lambda = static_cast<float>(atof(val));
@@ -214,9 +209,10 @@ class GBLinear : public IGradBooster<FMatrix> {
    Param(void) {
      num_feature = 0;
      num_output_group = 1;
      memset(reserved, 0, sizeof(reserved));
      std::memset(reserved, 0, sizeof(reserved));
    }
    inline void SetParam(const char *name, const char *val) {
      using namespace std;
      if (!strcmp(name, "bst:num_feature")) num_feature = atoi(val);
      if (!strcmp(name, "num_output_group")) num_output_group = atoi(val);
    }
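The inner update in DoBoost above is one coordinate-descent step per (feature, output group) pair. Sketching it in our own notation (CalcDelta's exact regularization handling lives in the Param class and is not shown in this hunk): accumulate over the feature's column

```
G = \sum_i g_i\, x_i, \qquad H = \sum_i h_i\, x_i^2,
```

then step the weight by

```
\Delta w = \eta \cdot \mathrm{CalcDelta}(G, H, w),
\qquad \text{e.g. with a pure L2 penalty: } \Delta w = -\,\eta\,\frac{G + \lambda w}{H + \lambda},
```

and the trailing loop (p.grad += p.hess * fvalue * dw) refreshes every gradient in the column, so later coordinates in the sweep see the updated weight.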
19
src/gbm/gbm.cpp
Normal file
@@ -0,0 +1,19 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <cstring>
#include "./gbm.h"
#include "./gbtree-inl.hpp"
#include "./gblinear-inl.hpp"

namespace xgboost {
namespace gbm {
IGradBooster* CreateGradBooster(const char *name) {
  using namespace std;
  if (!strcmp("gbtree", name)) return new GBTree();
  if (!strcmp("gblinear", name)) return new GBLinear();
  utils::Error("unknown booster type: %s", name);
  return NULL;
}
} // namespace gbm
} // namespace xgboost
@@ -7,6 +7,7 @@
 */
#include <vector>
#include "../data.h"
#include "../utils/io.h"
#include "../utils/fmap.h"

namespace xgboost {
@@ -14,9 +15,7 @@ namespace xgboost {
namespace gbm {
/*!
 * \brief interface of gradient boosting model
 * \tparam FMatrix the data type the updater takes
 */
template<typename FMatrix>
class IGradBooster {
 public:
  /*!
@@ -41,28 +40,31 @@ class IGradBooster {
  virtual void InitModel(void) = 0;
  /*!
   * \brief perform an update to the model (boosting)
   * \param fmat feature matrix that provides access to features
   * \param p_fmat feature matrix that provides access to features
   * \param info meta information about training
   * \param in_gpair address of the gradient pair statistics of the data;
   *  the booster may change the content of gpair
   */
  virtual void DoBoost(const FMatrix &fmat,
  virtual void DoBoost(IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) = 0;
  /*!
   * \brief generate predictions for the given feature matrix
   * \param fmat feature matrix
   * \param p_fmat feature matrix
   * \param buffer_offset buffer index offset of these instances; if it equals -1,
   *  this means we do not have a buffer index allocated to the gbm;
   *  a buffer index is assigned to each instance that requires repeated prediction,
   *  and the size of the buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size")
   * \param info extra side information that may be needed for prediction
   * \param out_preds output vector to hold the predictions
   * \param ntree_limit limit the number of trees used in prediction; when it equals 0,
   *  we do not limit the number of trees; this parameter is only valid for gbtree, not for gblinear
   */
  virtual void Predict(const FMatrix &fmat,
  virtual void Predict(IFMatrix *p_fmat,
                       int64_t buffer_offset,
                       const BoosterInfo &info,
                       std::vector<float> *out_preds) = 0;
                       std::vector<float> *out_preds,
                       unsigned ntree_limit = 0) = 0;
  /*!
   * \brief dump the model in text format
   * \param fmap feature map that may help give interpretations of features
@@ -73,21 +75,11 @@ class IGradBooster {
  // destructor
  virtual ~IGradBooster(void){}
};
} // namespace gbm
} // namespace xgboost

#include "gbtree-inl.hpp"
#include "gblinear-inl.hpp"

namespace xgboost {
namespace gbm {
template<typename FMatrix>
inline IGradBooster<FMatrix>* CreateGradBooster(const char *name) {
  if (!strcmp("gbtree", name)) return new GBTree<FMatrix>();
  if (!strcmp("gblinear", name)) return new GBLinear<FMatrix>();
  utils::Error("unknown booster type: %s", name);
  return NULL;
}
/*!
 * \brief create a gradient booster from a given name
 * \param name name of the gradient booster
 */
IGradBooster* CreateGradBooster(const char *name);
} // namespace gbm
} // namespace xgboost
#endif  // XGBOOST_GBM_GBM_H_
@@ -9,21 +9,21 @@
#include <utility>
#include <string>
#include "./gbm.h"
#include "../utils/omp.h"
#include "../tree/updater.h"

namespace xgboost {
namespace gbm {
/*!
 * \brief gradient boosted trees
 * \tparam FMatrix the data type the updater takes
 */
template<typename FMatrix>
class GBTree : public IGradBooster<FMatrix> {
class GBTree : public IGradBooster {
 public:
  virtual ~GBTree(void) {
    this->Clear();
  }
  virtual void SetParam(const char *name, const char *val) {
    using namespace std;
    if (!strncmp(name, "bst:", 4)) {
      cfg.push_back(std::make_pair(std::string(name+4), std::string(val)));
      // set into updaters, if already initialized
@@ -82,12 +82,12 @@ class GBTree : public IGradBooster<FMatrix> {
    utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
    utils::Assert(trees.size() == 0, "GBTree: model already initialized");
  }
  virtual void DoBoost(const FMatrix &fmat,
  virtual void DoBoost(IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) {
    const std::vector<bst_gpair> &gpair = *in_gpair;
    if (mparam.num_output_group == 1) {
      this->BoostNewTrees(gpair, fmat, info, 0);
      this->BoostNewTrees(gpair, p_fmat, info, 0);
    } else {
      const int ngroup = mparam.num_output_group;
      utils::Check(gpair.size() % ngroup == 0,
@@ -99,14 +99,15 @@ class GBTree : public IGradBooster<FMatrix> {
        for (bst_omp_uint i = 0; i < nsize; ++i) {
          tmp[i] = gpair[i * ngroup + gid];
        }
        this->BoostNewTrees(tmp, fmat, info, gid);
        this->BoostNewTrees(tmp, p_fmat, info, gid);
      }
    }
  }
  virtual void Predict(const FMatrix &fmat,
  virtual void Predict(IFMatrix *p_fmat,
                       int64_t buffer_offset,
                       const BoosterInfo &info,
                       std::vector<float> *out_preds) {
                       std::vector<float> *out_preds,
                       unsigned ntree_limit = 0) {
    int nthread;
    #pragma omp parallel
    {
@@ -118,17 +119,13 @@ class GBTree : public IGradBooster<FMatrix> {
    }

    std::vector<float> &preds = *out_preds;
    preds.resize(0);
    const size_t stride = info.num_row * mparam.num_output_group;
    preds.resize(stride * (mparam.size_leaf_vector+1));
    // start collecting the predictions
    utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
    iter->BeforeFirst();
    while (iter->Next()) {
      const SparseBatch &batch = iter->Value();
      utils::Assert(batch.base_rowid * mparam.num_output_group == preds.size(),
                    "base_rowid is not set correctly");
      // output convention: nrow * k, where nrow is number of rows
      // k is number of groups
      preds.resize(preds.size() + batch.size * mparam.num_output_group);
      const RowBatch &batch = iter->Value();
      // parallel over local batch
      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
      #pragma omp parallel for schedule(static)
@@ -136,13 +133,14 @@ class GBTree : public IGradBooster<FMatrix> {
        const int tid = omp_get_thread_num();
        tree::RegTree::FVec &feats = thread_temp[tid];
        int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
        const unsigned root_idx = info.GetRoot(ridx);
        utils::Assert(static_cast<size_t>(ridx) < info.num_row, "data row index exceed bound");
        // loop over output groups
        for (int gid = 0; gid < mparam.num_output_group; ++gid) {
          preds[ridx * mparam.num_output_group + gid] =
              this->Pred(batch[i],
                         buffer_offset < 0 ? -1 : buffer_offset+ridx,
                         gid, root_idx, &feats);
          this->Pred(batch[i],
                     buffer_offset < 0 ? -1 : buffer_offset + ridx,
                     gid, info.GetRoot(ridx), &feats,
                     &preds[ridx * mparam.num_output_group + gid], stride,
                     ntree_limit);
        }
      }
    }
@@ -174,20 +172,20 @@ class GBTree : public IGradBooster<FMatrix> {
    updaters.clear();
    std::string tval = tparam.updater_seq;
    char *pstr;
    pstr = strtok(&tval[0], ",");
    pstr = std::strtok(&tval[0], ",");
    while (pstr != NULL) {
      updaters.push_back(tree::CreateUpdater<FMatrix>(pstr));
      updaters.push_back(tree::CreateUpdater(pstr));
      for (size_t j = 0; j < cfg.size(); ++j) {
        // set parameters
        updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
      }
      pstr = strtok(NULL, ",");
      pstr = std::strtok(NULL, ",");
    }
    tparam.updater_initialized = 1;
  }
  // boost new trees for a specific output group
  inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
                            const FMatrix &fmat,
                            IFMatrix *p_fmat,
                            const BoosterInfo &info,
                            int bst_group) {
    this->InitUpdater();
@@ -202,7 +200,7 @@ class GBTree : public IGradBooster<FMatrix> {
    }
    // update the trees
    for (size_t i = 0; i < updaters.size(); ++i) {
      updaters[i]->Update(gpair, fmat, info, new_trees);
      updaters[i]->Update(gpair, p_fmat, info, new_trees);
    }
    // push back to model
    for (size_t i = 0; i < new_trees.size(); ++i) {
@@ -212,34 +210,53 @@ class GBTree : public IGradBooster<FMatrix> {
    mparam.num_trees += tparam.num_parallel_tree;
  }
  // make a prediction for a single instance
  inline float Pred(const SparseBatch::Inst &inst,
                    int64_t buffer_index,
                    int bst_group,
                    unsigned root_index,
                    tree::RegTree::FVec *p_feats) {
  inline void Pred(const RowBatch::Inst &inst,
                   int64_t buffer_index,
                   int bst_group,
                   unsigned root_index,
                   tree::RegTree::FVec *p_feats,
                   float *out_pred, size_t stride, unsigned ntree_limit) {
    size_t itop = 0;
    float psum = 0.0f;
    // sum of leaf vector
    std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
    const int64_t bid = mparam.BufferOffset(buffer_index, bst_group);
    // number of valid trees
    unsigned treeleft = ntree_limit == 0 ? std::numeric_limits<unsigned>::max() : ntree_limit;
    // load buffered results if any
    if (bid >= 0) {
    if (bid >= 0 && ntree_limit == 0) {
      itop = pred_counter[bid];
      psum = pred_buffer[bid];
      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
        vec_psum[i] = pred_buffer[bid + i + 1];
      }
    }
    if (itop != trees.size()) {
      p_feats->Fill(inst);
      for (size_t i = itop; i < trees.size(); ++i) {
        if (tree_info[i] == bst_group) {
          psum += trees[i]->Predict(*p_feats, root_index);
          int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
          psum += (*trees[i])[tid].leaf_value();
          for (int j = 0; j < mparam.size_leaf_vector; ++j) {
            vec_psum[j] += trees[i]->leafvec(tid)[j];
          }
          if (--treeleft == 0) break;
        }
      }
      p_feats->Drop(inst);
    }
    // update the buffered results
    if (bid >= 0) {
    if (bid >= 0 && ntree_limit == 0) {
      pred_counter[bid] = static_cast<unsigned>(trees.size());
      pred_buffer[bid] = psum;
      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
        pred_buffer[bid + i + 1] = vec_psum[i];
      }
    }
    out_pred[0] = psum;
    for (int i = 0; i < mparam.size_leaf_vector; ++i) {
      out_pred[stride * (i + 1)] = vec_psum[i];
    }
    return psum;
  }
  // --- data structure ---
  /*! \brief training parameters */
@@ -263,6 +280,7 @@ class GBTree : public IGradBooster<FMatrix> {
      updater_initialized = 0;
    }
    inline void SetParam(const char *name, const char *val){
      using namespace std;
      if (!strcmp(name, "updater") &&
          strcmp(updater_seq.c_str(), val) != 0) {
        updater_seq = val;
@@ -292,15 +310,18 @@ class GBTree : public IGradBooster<FMatrix> {
     *  suppose we have n instances and k groups, the output will be k*n
     */
    int num_output_group;
    /*! \brief size of leaf vector needed in tree */
    int size_leaf_vector;
    /*! \brief reserved parameters */
    int reserved[32];
    int reserved[31];
    /*! \brief constructor */
    ModelParam(void) {
      num_trees = 0;
      num_roots = num_feature = 0;
      num_pbuffer = 0;
      num_output_group = 1;
      memset(reserved, 0, sizeof(reserved));
      size_leaf_vector = 0;
      std::memset(reserved, 0, sizeof(reserved));
    }
    /*!
     * \brief set parameters from outside
@@ -308,14 +329,16 @@ class GBTree : public IGradBooster<FMatrix> {
     * \param val value of the parameter
     */
    inline void SetParam(const char *name, const char *val) {
      using namespace std;
      if (!strcmp("num_pbuffer", name)) num_pbuffer = atol(val);
      if (!strcmp("num_output_group", name)) num_output_group = atol(val);
      if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
      if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
      if (!strcmp("bst:size_leaf_vector", name)) size_leaf_vector = atoi(val);
    }
    /*! \return size of prediction buffer actually needed */
    inline size_t PredBufferSize(void) const {
      return num_output_group * num_pbuffer;
      return num_output_group * num_pbuffer * (size_leaf_vector + 1);
    }
    /*!
     * \brief get the buffer offset given a buffer index and group id
@@ -324,7 +347,7 @@ class GBTree : public IGradBooster<FMatrix> {
    inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const {
      if (buffer_index < 0) return -1;
      utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
      return buffer_index + num_pbuffer * bst_group;
      return (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
    }
  };
  // training parameters
@@ -345,7 +368,7 @@ class GBTree : public IGradBooster<FMatrix> {
  // temporary storage per thread
  std::vector<tree::RegTree::FVec> thread_temp;
  // the updaters that can be applied to each tree
  std::vector< tree::IUpdater<FMatrix>* > updaters;
  std::vector<tree::IUpdater*> updaters;
};

} // namespace gbm
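The buffering logic in Pred above is essentially memoization of partial sums: each buffered row remembers how many trees its cached sum already covers, and prediction resumes from there. Note the cache is bypassed whenever ntree_limit is nonzero, since a truncated sum must never be written back. A small sketch of the idea (the names are ours; `trees` is a list of per-tree leaf-value functions):

```python
# pred_counter[bid]: number of trees already folded into pred_buffer[bid]
def cached_pred(trees, x, bid, pred_counter, pred_buffer, ntree_limit=0):
    use_cache = (bid >= 0) and (ntree_limit == 0)
    itop = pred_counter[bid] if use_cache else 0   # resume point
    psum = pred_buffer[bid] if use_cache else 0.0
    left = ntree_limit if ntree_limit > 0 else len(trees)
    for tree in trees[itop:]:
        psum += tree(x)        # leaf value of this tree for row x
        left -= 1
        if left == 0:
            break
    if use_cache:              # only full sums are safe to memoize
        pred_counter[bid] = len(trees)
        pred_buffer[bid] = psum
    return psum
```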
@@ -13,7 +13,7 @@ namespace xgboost {
/*! \brief namespace related to data format */
namespace io {
/*! \brief DMatrix object that the I/O module supports save/load for */
typedef learner::DMatrix<FMatrixS> DataMatrix;
typedef learner::DMatrix DataMatrix;
/*!
 * \brief load DataMatrix from stream
 * \param fname file name to be loaded
@@ -16,6 +16,7 @@
#include "../utils/utils.h"
#include "../learner/dmatrix.h"
#include "./io.h"
#include "./simple_fmatrix-inl.hpp"

namespace xgboost {
namespace io {
@@ -24,11 +25,16 @@ class DMatrixSimple : public DataMatrix {
 public:
  // constructor
  DMatrixSimple(void) : DataMatrix(kMagic) {
    this->fmat.set_iter(new OneBatchIter(this));
    fmat_ = new FMatrixS(new OneBatchIter(this));
    this->Clear();
  }
  // virtual destructor
  virtual ~DMatrixSimple(void) {}
  virtual ~DMatrixSimple(void) {
    delete fmat_;
  }
  virtual IFMatrix *fmat(void) const {
    return fmat_;
  }
  /*! \brief clear the storage */
  inline void Clear(void) {
    row_ptr_.clear();
@@ -41,15 +47,17 @@ class DMatrixSimple : public DataMatrix {
    this->info = src.info;
    this->Clear();
    // clone data content of this matrix
    utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator();
    utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
    iter->BeforeFirst();
    while (iter->Next()) {
      const SparseBatch &batch = iter->Value();
      const RowBatch &batch = iter->Value();
      for (size_t i = 0; i < batch.size; ++i) {
        SparseBatch::Inst inst = batch[i];
        RowBatch::Inst inst = batch[i];
        row_data_.resize(row_data_.size() + inst.length);
        memcpy(&row_data_[row_ptr_.back()], inst.data,
               sizeof(SparseBatch::Entry) * inst.length);
        if (inst.length != 0) {
          std::memcpy(&row_data_[row_ptr_.back()], inst.data,
                      sizeof(RowBatch::Entry) * inst.length);
        }
        row_ptr_.push_back(row_ptr_.back() + inst.length);
      }
    }
@@ -59,10 +67,10 @@ class DMatrixSimple : public DataMatrix {
   * \param feats features
   * \return the index of the added row
   */
  inline size_t AddRow(const std::vector<SparseBatch::Entry> &feats) {
  inline size_t AddRow(const std::vector<RowBatch::Entry> &feats) {
    for (size_t i = 0; i < feats.size(); ++i) {
      row_data_.push_back(feats[i]);
      info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].findex+1));
      info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].index+1));
    }
    row_ptr_.push_back(row_ptr_.back() + feats.size());
    info.info.num_row += 1;
@@ -74,14 +82,15 @@ class DMatrixSimple : public DataMatrix {
   * \param silent whether to print information or not
   */
  inline void LoadText(const char* fname, bool silent = false) {
    using namespace std;
    this->Clear();
    FILE* file = utils::FopenCheck(fname, "r");
    float label; bool init = true;
    char tmp[1024];
    std::vector<SparseBatch::Entry> feats;
    std::vector<RowBatch::Entry> feats;
    while (fscanf(file, "%s", tmp) == 1) {
      SparseBatch::Entry e;
      if (sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) {
      RowBatch::Entry e;
      if (sscanf(tmp, "%u:%f", &e.index, &e.fvalue) == 2) {
        feats.push_back(e);
      } else {
        if (!init) {
@@ -98,8 +107,10 @@ class DMatrixSimple : public DataMatrix {
    this->AddRow(feats);

    if (!silent) {
      printf("%lux%lu matrix with %lu entries is loaded from %s\n",
             info.num_row(), info.num_col(), row_data_.size(), fname);
      utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n",
                    static_cast<unsigned long>(info.num_row()),
                    static_cast<unsigned long>(info.num_col()),
                    static_cast<unsigned long>(row_data_.size()), fname);
    }
    fclose(file);
    // try to load in an additional file
@@ -125,7 +136,7 @@ class DMatrixSimple : public DataMatrix {
   * \return whether loading succeeded
   */
  inline bool LoadBinary(const char* fname, bool silent = false) {
    FILE *fp = fopen64(fname, "rb");
    std::FILE *fp = fopen64(fname, "rb");
    if (fp == NULL) return false;
    utils::FileStream fs(fp);
    this->LoadBinary(fs, silent, fname);
@@ -139,24 +150,26 @@ class DMatrixSimple : public DataMatrix {
   * \param fname file name, used to print messages
   */
  inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {
    int magic;
    utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
    utils::Check(magic == kMagic, "invalid format, magic number mismatch");
    int tmagic;
    utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
    utils::Check(tmagic == kMagic, "invalid format, magic number mismatch");

    info.LoadBinary(fs);
    FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);
    fmat.LoadColAccess(fs);
    fmat_->LoadColAccess(fs);

    if (!silent) {
      printf("%lux%lu matrix with %lu entries is loaded",
             info.num_row(), info.num_col(), row_data_.size());
      utils::Printf("%lux%lu matrix with %lu entries is loaded",
                    static_cast<unsigned long>(info.num_row()),
                    static_cast<unsigned long>(info.num_col()),
                    static_cast<unsigned long>(row_data_.size()));
      if (fname != NULL) {
        printf(" from %s\n", fname);
        utils::Printf(" from %s\n", fname);
      } else {
        printf("\n");
        utils::Printf("\n");
      }
      if (info.group_ptr.size() != 0) {
        printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
        utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
      }
    }
  }
@@ -167,19 +180,22 @@ class DMatrixSimple : public DataMatrix {
   */
  inline void SaveBinary(const char* fname, bool silent = false) const {
    utils::FileStream fs(utils::FopenCheck(fname, "wb"));
    int magic = kMagic;
    fs.Write(&magic, sizeof(magic));
    int tmagic = kMagic;
    fs.Write(&tmagic, sizeof(tmagic));

    info.SaveBinary(fs);
    FMatrixS::SaveBinary(fs, row_ptr_, row_data_);
    fmat.SaveColAccess(fs);
    fmat_->SaveColAccess(fs);
    fs.Close();

    if (!silent) {
      printf("%lux%lu matrix with %lu entries is saved to %s\n",
             info.num_row(), info.num_col(), row_data_.size(), fname);
      utils::Printf("%lux%lu matrix with %lu entries is saved to %s\n",
                    static_cast<unsigned long>(info.num_row()),
                    static_cast<unsigned long>(info.num_col()),
                    static_cast<unsigned long>(row_data_.size()), fname);
      if (info.group_ptr.size() != 0) {
        printf("data contains %lu groups\n", info.group_ptr.size()-1);
        utils::Printf("data contains %u groups\n",
                      static_cast<unsigned>(info.group_ptr.size()-1));
      }
    }
  }
@@ -193,6 +209,7 @@ class DMatrixSimple : public DataMatrix {
   * \param savebuffer whether to save the binary buffer if the input is text
   */
  inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true) {
    using namespace std;
    size_t len = strlen(fname);
    if (len > 8 && !strcmp(fname + len - 7, ".buffer")) {
      if (!this->LoadBinary(fname, silent)) {
@@ -201,7 +218,7 @@ class DMatrixSimple : public DataMatrix {
      return;
    }
    char bname[1024];
    snprintf(bname, sizeof(bname), "%s.buffer", fname);
    utils::SPrintf(bname, sizeof(bname), "%s.buffer", fname);
    if (!this->LoadBinary(bname, silent)) {
      this->LoadText(fname, silent);
      if (savebuffer) this->SaveBinary(bname, silent);
@@ -211,13 +228,15 @@ class DMatrixSimple : public DataMatrix {
  /*! \brief row pointer of CSR sparse storage */
  std::vector<size_t> row_ptr_;
  /*! \brief data in the rows */
  std::vector<SparseBatch::Entry> row_data_;
  std::vector<RowBatch::Entry> row_data_;
  /*! \brief the real fmatrix */
  FMatrixS *fmat_;
  /*! \brief magic number used to identify DMatrix */
  static const int kMagic = 0xffffab01;

 protected:
  // one-batch iterator that returns the content of the matrix
  struct OneBatchIter: utils::IIterator<SparseBatch> {
  struct OneBatchIter: utils::IIterator<RowBatch> {
    explicit OneBatchIter(DMatrixSimple *parent)
        : at_first_(true), parent_(parent) {}
    virtual ~OneBatchIter(void) {}
@@ -229,11 +248,11 @@ class DMatrixSimple : public DataMatrix {
      at_first_ = false;
      batch_.size = parent_->row_ptr_.size() - 1;
      batch_.base_rowid = 0;
      batch_.row_ptr = &parent_->row_ptr_[0];
      batch_.data_ptr = &parent_->row_data_[0];
      batch_.ind_ptr = BeginPtr(parent_->row_ptr_);
      batch_.data_ptr = BeginPtr(parent_->row_data_);
      return true;
    }
    virtual const SparseBatch &Value(void) const {
    virtual const RowBatch &Value(void) const {
      return batch_;
    }

@@ -243,8 +262,8 @@ class DMatrixSimple : public DataMatrix {
    // pointer to parent
    DMatrixSimple *parent_;
    // temporary space for the batch
    SparseBatch batch_;
  };
    RowBatch batch_;
  };
};
} // namespace io
} // namespace xgboost
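CacheLoad's strategy, in miniature: treat `<fname>.buffer` as a binary cache of the parsed text file, load it when present, and (re)create it otherwise. A Python sketch of the same pattern (pickle is purely a stand-in for the binary format; the names are ours):

```python
import os
import pickle

def cache_load(fname, parse_text, savebuffer=True):
    """Load fname via its '.buffer' binary cache, creating the cache if needed."""
    if fname.endswith('.buffer'):          # caller asked for the cache directly
        with open(fname, 'rb') as f:
            return pickle.load(f)
    bname = fname + '.buffer'
    if os.path.exists(bname):              # fast path: binary cache hit
        with open(bname, 'rb') as f:
            return pickle.load(f)
    data = parse_text(fname)               # slow path: parse the text file
    if savebuffer:                         # persist for next time
        with open(bname, 'wb') as f:
            pickle.dump(data, f)
    return data
```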
242
src/io/simple_fmatrix-inl.hpp
Normal file
@@ -0,0 +1,242 @@
#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
/*!
 * \file simple_fmatrix-inl.hpp
 * \brief the input data structure for gradient boosting
 * \author Tianqi Chen
 */
#include "../data.h"
#include "../utils/utils.h"
#include "../utils/random.h"
#include "../utils/omp.h"
#include "../utils/matrix_csr.h"
namespace xgboost {
namespace io {
/*!
 * \brief sparse matrix that supports column access, CSC
 */
class FMatrixS : public IFMatrix{
 public:
  typedef SparseBatch::Entry Entry;
  /*! \brief constructor */
  FMatrixS(utils::IIterator<RowBatch> *iter) {
    this->iter_ = iter;
  }
  // destructor
  virtual ~FMatrixS(void) {
    if (iter_ != NULL) delete iter_;
  }
  /*! \return whether column access is enabled */
  virtual bool HaveColAccess(void) const {
    return col_ptr_.size() != 0;
  }
  /*! \brief get number of columns */
  virtual size_t NumCol(void) const {
    utils::Check(this->HaveColAccess(), "NumCol:need column access");
    return col_ptr_.size() - 1;
  }
  /*! \brief get number of buffered rows */
  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
    return buffered_rowset_;
  }
  /*! \brief get column size */
  virtual size_t GetColSize(size_t cidx) const {
    return col_ptr_[cidx+1] - col_ptr_[cidx];
  }
  /*! \brief get column density */
  virtual float GetColDensity(size_t cidx) const {
    size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
  }
  virtual void InitColAccess(float pkeep = 1.0f) {
    if (this->HaveColAccess()) return;
    this->InitColData(pkeep);
  }
  /*!
   * \brief get the row iterator associated with FMatrix
   */
  virtual utils::IIterator<RowBatch>* RowIterator(void) {
    iter_->BeforeFirst();
    return iter_;
  }
  /*!
   * \brief get the column based iterator
   */
  virtual utils::IIterator<ColBatch>* ColIterator(void) {
    size_t ncol = this->NumCol();
    col_iter_.col_index_.resize(ncol);
    for (size_t i = 0; i < ncol; ++i) {
      col_iter_.col_index_[i] = static_cast<bst_uint>(i);
    }
    col_iter_.SetBatch(col_ptr_, col_data_);
    return &col_iter_;
  }
  /*!
   * \brief column based iterator over a subset of features
   */
  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
    col_iter_.col_index_ = fset;
    col_iter_.SetBatch(col_ptr_, col_data_);
    return &col_iter_;
  }
  /*!
   * \brief save column access data into stream
   * \param fo output stream to save to
   */
  inline void SaveColAccess(utils::IStream &fo) const {
    fo.Write(buffered_rowset_);
    if (buffered_rowset_.size() != 0) {
      SaveBinary(fo, col_ptr_, col_data_);
    }
  }
  /*!
   * \brief load column access data from stream
   * \param fi input stream to load from
   */
  inline void LoadColAccess(utils::IStream &fi) {
    utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
    if (buffered_rowset_.size() != 0) {
      LoadBinary(fi, &col_ptr_, &col_data_);
    }
  }
  /*!
   * \brief save data to binary stream
   * \param fo output stream
   * \param ptr pointer data
   * \param data data content
   */
  inline static void SaveBinary(utils::IStream &fo,
                                const std::vector<size_t> &ptr,
                                const std::vector<RowBatch::Entry> &data) {
    size_t nrow = ptr.size() - 1;
    fo.Write(&nrow, sizeof(size_t));
    fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t));
    if (data.size() != 0) {
      fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry));
    }
  }
  /*!
   * \brief load data from binary stream
   * \param fi input stream
   * \param out_ptr pointer data
   * \param out_data data content
   */
  inline static void LoadBinary(utils::IStream &fi,
                                std::vector<size_t> *out_ptr,
                                std::vector<RowBatch::Entry> *out_data) {
    size_t nrow;
    utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
    out_ptr->resize(nrow + 1);
    utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0,
                 "invalid input file format");
    out_data->resize(out_ptr->back());
    if (out_data->size() != 0) {
      utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0,
                    "invalid input file format");
    }
  }

 protected:
  /*!
   * \brief initialize column data
   * \param pkeep probability to keep a row
   */
  inline void InitColData(float pkeep) {
    buffered_rowset_.clear();
    // note: this part of the code is serial; todo: parallelize this transformer
    utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
    builder.InitBudget(0);
    // start working
    iter_->BeforeFirst();
    while (iter_->Next()) {
      const RowBatch &batch = iter_->Value();
      for (size_t i = 0; i < batch.size; ++i) {
        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
          RowBatch::Inst inst = batch[i];
          for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
builder.AddBudget(inst[j].index);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
|
||||
iter_->BeforeFirst();
|
||||
size_t ktop = 0;
|
||||
while (iter_->Next()) {
|
||||
const RowBatch &batch = iter_->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
if (ktop < buffered_rowset_.size() &&
|
||||
buffered_rowset_[ktop] == batch.base_rowid+i) {
|
||||
++ktop;
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
builder.PushElem(inst[j].index,
|
||||
Entry((bst_uint)(batch.base_rowid+i),
|
||||
inst[j].fvalue));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// sort columns
|
||||
bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < ncol; ++i) {
|
||||
std::sort(&col_data_[0] + col_ptr_[i],
|
||||
&col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// one batch iterator that return content in the matrix
|
||||
struct OneBatchIter: utils::IIterator<ColBatch> {
|
||||
OneBatchIter(void) : at_first_(true){}
|
||||
virtual ~OneBatchIter(void) {}
|
||||
virtual void BeforeFirst(void) {
|
||||
at_first_ = true;
|
||||
}
|
||||
virtual bool Next(void) {
|
||||
if (!at_first_) return false;
|
||||
at_first_ = false;
|
||||
return true;
|
||||
}
|
||||
virtual const ColBatch &Value(void) const {
|
||||
return batch_;
|
||||
}
|
||||
inline void SetBatch(const std::vector<size_t> &ptr,
|
||||
const std::vector<ColBatch::Entry> &data) {
|
||||
batch_.size = col_index_.size();
|
||||
col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL,0));
|
||||
for (size_t i = 0; i < col_data_.size(); ++i) {
|
||||
const bst_uint ridx = col_index_[i];
|
||||
col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx],
|
||||
static_cast<bst_uint>(ptr[ridx+1] - ptr[ridx]));
|
||||
}
|
||||
batch_.col_index = BeginPtr(col_index_);
|
||||
batch_.col_data = BeginPtr(col_data_);
|
||||
this->BeforeFirst();
|
||||
}
|
||||
// data content
|
||||
std::vector<bst_uint> col_index_;
|
||||
std::vector<ColBatch::Inst> col_data_;
|
||||
// whether is at first
|
||||
bool at_first_;
|
||||
// temporal space for batch
|
||||
ColBatch batch_;
|
||||
};
|
||||
// --- data structure used to support InitColAccess --
|
||||
// column iterator
|
||||
OneBatchIter col_iter_;
|
||||
// row iterator
|
||||
utils::IIterator<RowBatch> *iter_;
|
||||
/*! \brief list of row index that are buffered */
|
||||
std::vector<bst_uint> buffered_rowset_;
|
||||
/*! \brief column pointer of CSC format */
|
||||
std::vector<size_t> col_ptr_;
|
||||
/*! \brief column datas in CSC format */
|
||||
std::vector<ColBatch::Entry> col_data_;
|
||||
};
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
|
||||
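The two-pass construction in InitColData above is a standard row-to-column (CSR-to-CSC) transpose: the first pass over the row iterator counts entries per column (AddBudget), the second fills them in (PushElem), and each column is finally sorted by feature value so the tree builder can scan it in order. A minimal sketch of how a consumer might drive the resulting column access — the variable names dmat and it are illustrative, not part of the commit:

// hypothetical usage sketch of the new column access interface,
// assuming the headers above are included and dmat->fmat() returns an FMatrixS
xgboost::IFMatrix *fmat = dmat->fmat();
fmat->InitColAccess(1.0f);  // buffer all rows (pkeep = 1.0)
xgboost::utils::IIterator<xgboost::ColBatch> *it = fmat->ColIterator();
while (it->Next()) {
  const xgboost::ColBatch &batch = it->Value();
  for (size_t i = 0; i < batch.size; ++i) {
    // batch[i] is one column, sorted by feature value
    for (xgboost::bst_uint j = 0; j < batch[i].length; ++j) {
      // batch[i][j].index is the row id, batch[i][j].fvalue the feature value
    }
  }
}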
@ -7,8 +7,9 @@
 * \author Tianqi Chen
 */
#include <vector>
#include <cstring>
#include "../data.h"

#include "../utils/io.h"
namespace xgboost {
namespace learner {
/*!
@ -89,6 +90,7 @@ struct MetaInfo {
  }
  // try to load group information from file, if it exists
  inline bool TryLoadGroup(const char* fname, bool silent = false) {
    using namespace std;
    FILE *fi = fopen64(fname, "r");
    if (fi == NULL) return false;
    group_ptr.push_back(0);
@ -97,12 +99,14 @@ struct MetaInfo {
      group_ptr.push_back(group_ptr.back()+nline);
    }
    if (!silent) {
      printf("%lu groups are loaded from %s\n", group_ptr.size()-1, fname);
      utils::Printf("%u groups are loaded from %s\n",
                    static_cast<unsigned>(group_ptr.size()-1), fname);
    }
    fclose(fi);
    return true;
  }
  inline std::vector<float>& GetFloatInfo(const char *field) {
    using namespace std;
    if (!strcmp(field, "label")) return labels;
    if (!strcmp(field, "weight")) return weights;
    if (!strcmp(field, "base_margin")) return base_margin;
@ -113,6 +117,7 @@ struct MetaInfo {
    return ((MetaInfo*)this)->GetFloatInfo(field);
  }
  inline std::vector<unsigned> &GetUIntInfo(const char *field) {
    using namespace std;
    if (!strcmp(field, "root_index")) return info.root_index;
    if (!strcmp(field, "fold_index")) return info.fold_index;
    utils::Error("unknown field %s", field);
@ -123,15 +128,16 @@ struct MetaInfo {
  }
  // try to load weight information from file, if it exists
  inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) {
    std::vector<float> &weights = this->GetFloatInfo(field);
    using namespace std;
    std::vector<float> &data = this->GetFloatInfo(field);
    FILE *fi = fopen64(fname, "r");
    if (fi == NULL) return false;
    float wt;
    while (fscanf(fi, "%f", &wt) == 1) {
      weights.push_back(wt);
      data.push_back(wt);
    }
    if (!silent) {
      printf("loading %s from %s\n", field, fname);
      utils::Printf("loading %s from %s\n", field, fname);
    }
    fclose(fi);
    return true;
@ -142,7 +148,6 @@ struct MetaInfo {
 * \brief data object used for learning,
 * \tparam FMatrix type of feature data source
 */
template<typename FMatrix>
struct DMatrix {
  /*!
   * \brief magic number associated with this object
@ -151,8 +156,6 @@ struct DMatrix {
  const int magic;
  /*! \brief meta information about the dataset */
  MetaInfo info;
  /*! \brief feature matrix about data content */
  FMatrix fmat;
  /*!
   * \brief cache pointer to verify if the data structure is cached in some learner
   * used to verify if DMatrix is cached
@ -160,6 +163,8 @@ struct DMatrix {
  void *cache_learner_ptr_;
  /*! \brief default constructor */
  explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
  /*! \brief get feature matrix about data content */
  virtual IFMatrix *fmat(void) const = 0;
  // virtual destructor
  virtual ~DMatrix(void){}
};
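For context, the group file read by TryLoadGroup above is one integer per line, each giving the number of consecutive instances in one query group; group_ptr accumulates them into CSR-style offsets. A sketch with illustrative values (not part of the commit):

// hypothetical train.group containing the lines "3", "2", "4"
// after TryLoadGroup: group_ptr = {0, 3, 5, 9}
// instances [0,3) belong to group 0, [3,5) to group 1, [5,9) to group 2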
@ -8,8 +8,8 @@
#include <vector>
#include <utility>
#include <string>
#include <climits>
#include <cmath>
#include <climits>
#include <algorithm>
#include "./evaluation.h"
#include "./helper_utils.h"
@ -24,9 +24,12 @@ template<typename Derived>
struct EvalEWiseBase : public IEvaluator {
  virtual float Eval(const std::vector<float> &preds,
                     const MetaInfo &info) const {
    utils::Check(preds.size() == info.labels.size(),
    utils::Check(info.labels.size() != 0, "label set cannot be empty");
    utils::Check(preds.size() % info.labels.size() == 0,
                 "label and prediction size not match");
    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());

    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());

    float sum = 0.0, wsum = 0.0;
    #pragma omp parallel for reduction(+: sum, wsum) schedule(static)
    for (bst_omp_uint i = 0; i < ndata; ++i) {
@ -99,17 +102,58 @@ struct EvalMatchError : public EvalEWiseBase<EvalMatchError> {
  }
};

/*! \brief cross-validated test metric (ctest) */
struct EvalCTest: public IEvaluator {
  EvalCTest(IEvaluator *base, const char *name)
      : base_(base), name_(name) {}
  virtual ~EvalCTest(void) {
    delete base_;
  }
  virtual const char *Name(void) const {
    return name_.c_str();
  }
  virtual float Eval(const std::vector<float> &preds,
                     const MetaInfo &info) const {
    utils::Check(preds.size() % info.labels.size() == 0,
                 "label and prediction size not match");
    size_t ngroup = preds.size() / info.labels.size() - 1;
    const unsigned ndata = static_cast<unsigned>(info.labels.size());
    utils::Check(ngroup > 1, "pred size does not meet requirement");
    utils::Check(ndata == info.info.fold_index.size(), "need fold index");
    double wsum = 0.0;
    for (size_t k = 0; k < ngroup; ++k) {
      std::vector<float> tpred;
      MetaInfo tinfo;
      for (unsigned i = 0; i < ndata; ++i) {
        if (info.info.fold_index[i] == k) {
          tpred.push_back(preds[i + (k + 1) * ndata]);
          tinfo.labels.push_back(info.labels[i]);
          tinfo.weights.push_back(info.GetWeight(i));
        }
      }
      wsum += base_->Eval(tpred, tinfo);
    }
    return static_cast<float>(wsum / ngroup);
  }

 private:
  IEvaluator *base_;
  std::string name_;
};

/*! \brief AMS: also records best threshold */
struct EvalAMS : public IEvaluator {
 public:
  explicit EvalAMS(const char *name) {
    name_ = name;
    // note: ams@0 will automatically select which ratio to go
    utils::Check(sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
    utils::Check(std::sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
  }
  virtual float Eval(const std::vector<float> &preds,
                     const MetaInfo &info) const {
    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
    using namespace std;
    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());

    utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
    std::vector< std::pair<float, unsigned> > rec(ndata);

@ -140,7 +184,7 @@ struct EvalAMS : public IEvaluator {
      }
    }
    if (ntop == ndata) {
      fprintf(stderr, "\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
      utils::Printf("\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
      return static_cast<float>(tams);
    } else {
      return static_cast<float>(sqrt(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp)));
@ -159,6 +203,7 @@ struct EvalAMS : public IEvaluator {
struct EvalPrecisionRatio : public IEvaluator{
 public:
  explicit EvalPrecisionRatio(const char *name) : name_(name) {
    using namespace std;
    if (sscanf(name, "apratio@%f", &ratio_) == 1) {
      use_ap = 1;
    } else {
@ -168,9 +213,11 @@ struct EvalPrecisionRatio : public IEvaluator{
  }
  virtual float Eval(const std::vector<float> &preds,
                     const MetaInfo &info) const {
    utils::Assert(preds.size() == info.labels.size(), "label size predict size not match");
    utils::Check(info.labels.size() != 0, "label set cannot be empty");
    utils::Assert(preds.size() % info.labels.size() == 0,
                  "label size predict size not match");
    std::vector< std::pair<float, unsigned> > rec;
    for (size_t j = 0; j < preds.size(); ++j) {
    for (size_t j = 0; j < info.labels.size(); ++j) {
      rec.push_back(std::make_pair(preds[j], static_cast<unsigned>(j)));
    }
    std::sort(rec.begin(), rec.end(), CmpFirst);
@ -206,10 +253,14 @@ struct EvalPrecisionRatio : public IEvaluator{
struct EvalAuc : public IEvaluator {
  virtual float Eval(const std::vector<float> &preds,
                     const MetaInfo &info) const {
    utils::Check(preds.size() == info.labels.size(), "label size predict size not match");
    std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(preds.size());
    utils::Check(info.labels.size() != 0, "label set cannot be empty");
    utils::Check(preds.size() % info.labels.size() == 0,
                 "label size predict size not match");
    std::vector<unsigned> tgptr(2, 0);
    tgptr[1] = static_cast<unsigned>(info.labels.size());

    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
    utils::Check(gptr.back() == preds.size(),
    utils::Check(gptr.back() == info.labels.size(),
                 "EvalAuc: group structure must match number of prediction");
    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
    // sum statistics
@ -293,6 +344,7 @@ struct EvalRankList : public IEvaluator {

 protected:
  explicit EvalRankList(const char *name) {
    using namespace std;
    name_ = name;
    minus_ = false;
    if (sscanf(name, "%*[^@]@%u[-]?", &topn_) != 1) {
@ -339,7 +391,7 @@ struct EvalNDCG : public EvalRankList{
    for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) {
      const unsigned rel = rec[i].second;
      if (rel != 0) {
        sumdcg += ((1 << rel) - 1) / log(i + 2.0);
        sumdcg += ((1 << rel) - 1) / std::log(i + 2.0);
      }
    }
    return static_cast<float>(sumdcg);
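The new EvalCTest wrapper assumes the prediction vector is laid out in (ngroup + 1) contiguous blocks of ndata entries: block 0 holds the bagged prediction, and block k+1 holds the prediction of the model that evaluates fold k. A comment sketch of the indexing it relies on, with illustrative sizes (ndata = 4, ngroup = 2):

// layout assumed by EvalCTest (illustrative sizes):
//   preds = [ bagged[0..3] | fold0[0..3] | fold1[0..3] ]
// for instance i scored under fold k (info.info.fold_index[i] == k):
//   preds[i + (k + 1) * ndata]   // prediction from the fold-k model
// the base metric is evaluated on each fold's subset and the results averaged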
@ -36,6 +36,7 @@ struct IEvaluator{
namespace xgboost {
namespace learner {
inline IEvaluator* CreateEvaluator(const char *name) {
  using namespace std;
  if (!strcmp(name, "rmse")) return new EvalRMSE();
  if (!strcmp(name, "error")) return new EvalError();
  if (!strcmp(name, "merror")) return new EvalMatchError();
@ -45,7 +46,9 @@ inline IEvaluator* CreateEvaluator(const char *name) {
  if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
  if (!strncmp(name, "pratio@", 7)) return new EvalPrecisionRatio(name);
  if (!strncmp(name, "map", 3)) return new EvalMAP(name);
  if (!strncmp(name, "ndcg", 3)) return new EvalNDCG(name);
  if (!strncmp(name, "ndcg", 4)) return new EvalNDCG(name);
  if (!strncmp(name, "ct-", 3)) return new EvalCTest(CreateEvaluator(name+3), name);

  utils::Error("unknown evaluation metric type: %s", name);
  return NULL;
}
@ -54,6 +57,7 @@ inline IEvaluator* CreateEvaluator(const char *name) {
class EvalSet{
 public:
  inline void AddEval(const char *name) {
    using namespace std;
    for (size_t i = 0; i < evals_.size(); ++i) {
      if (!strcmp(name, evals_[i]->Name())) return;
    }
@ -71,11 +75,14 @@ class EvalSet{
    for (size_t i = 0; i < evals_.size(); ++i) {
      float res = evals_[i]->Eval(preds, info);
      char tmp[1024];
      snprintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
      utils::SPrintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
      result += tmp;
    }
    return result;
  }
  inline size_t Size(void) const {
    return evals_.size();
  }

 private:
  std::vector<const IEvaluator*> evals_;
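Since metric names are resolved recursively, the new ct- prefix composes with any base metric registered in the factory above. A hedged usage sketch (names as accepted by CreateEvaluator; the surrounding setup is assumed):

// sketch: composing metrics through the factory
using namespace xgboost::learner;
IEvaluator *e1 = CreateEvaluator("ndcg@10");  // NDCG truncated at 10
IEvaluator *e2 = CreateEvaluator("ct-rmse");  // cross-validated RMSE, built as
                                              // EvalCTest(CreateEvaluator("rmse"), "ct-rmse")
delete e1; delete e2;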
@ -7,6 +7,7 @@
 */
#include <utility>
#include <vector>
#include <cmath>
#include <algorithm>
namespace xgboost {
namespace learner {
@ -21,7 +21,6 @@ namespace learner {
 * \brief learner that performs gradient boosting on a specific objective function,
 *  doing both training and prediction
 */
template<typename FMatrix>
class BoostLearner {
 public:
  BoostLearner(void) {
@ -44,7 +43,7 @@ class BoostLearner {
   *  data matrices to continue training otherwise it will cause error
   * \param mats array of pointers to matrix whose prediction result need to be cached
   */
  inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) {
  inline void SetCacheData(const std::vector<DMatrix*>& mats) {
    // estimate feature bound
    unsigned num_feature = 0;
    // assign buffer index
@ -64,13 +63,14 @@ class BoostLearner {
    }
    char str_temp[25];
    if (num_feature > mparam.num_feature) {
      snprintf(str_temp, sizeof(str_temp), "%u", num_feature);
      utils::SPrintf(str_temp, sizeof(str_temp), "%u", num_feature);
      this->SetParam("bst:num_feature", str_temp);
    }
    snprintf(str_temp, sizeof(str_temp), "%lu", buffer_size);
    utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
                   static_cast<unsigned long>(buffer_size));
    this->SetParam("num_pbuffer", str_temp);
    if (!silent) {
      printf("buffer_size=%ld\n", buffer_size);
      utils::Printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
    }
  }
  /*!
@ -79,6 +79,7 @@ class BoostLearner {
   * \param val value of the parameter
   */
  inline void SetParam(const char *name, const char *val) {
    using namespace std;
    // in this version, bst: prefix is no longer required
    if (strncmp(name, "bst:", 4) != 0) {
      std::string n = "bst:"; n += name;
@ -158,18 +159,18 @@ class BoostLearner {
   *  if not, initialize it
   * \param p_train pointer to the matrix used by training
   */
  inline void CheckInit(DMatrix<FMatrix> *p_train) {
    p_train->fmat.InitColAccess(prob_buffer_row);
  inline void CheckInit(DMatrix *p_train) {
    p_train->fmat()->InitColAccess(prob_buffer_row);
  }
  /*!
   * \brief update the model for one iteration
   * \param iter current iteration number
   * \param p_train pointer to the data matrix
   */
  inline void UpdateOneIter(int iter, const DMatrix<FMatrix> &train) {
  inline void UpdateOneIter(int iter, const DMatrix &train) {
    this->PredictRaw(train, &preds_);
    obj_->GetGradient(preds_, train.info, iter, &gpair_);
    gbm_->DoBoost(train.fmat, train.info.info, &gpair_);
    gbm_->DoBoost(train.fmat(), train.info.info, &gpair_);
  }
  /*!
   * \brief evaluate the model for specific iteration
@ -179,11 +180,11 @@ class BoostLearner {
   * \return a string corresponding to the evaluation result
   */
  inline std::string EvalOneIter(int iter,
                                 const std::vector<const DMatrix<FMatrix>*> &evals,
                                 const std::vector<const DMatrix*> &evals,
                                 const std::vector<std::string> &evname) {
    std::string res;
    char tmp[256];
    snprintf(tmp, sizeof(tmp), "[%d]", iter);
    utils::SPrintf(tmp, sizeof(tmp), "[%d]", iter);
    res = tmp;
    for (size_t i = 0; i < evals.size(); ++i) {
      this->PredictRaw(*evals[i], &preds_);
@ -198,7 +199,7 @@ class BoostLearner {
   * \param metric name of metric
   * \return a pair of <evaluation name, result>
   */
  std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) {
  std::pair<std::string, float> Evaluate(const DMatrix &data, std::string metric) {
    if (metric == "auto") metric = obj_->DefaultEvalMetric();
    IEvaluator *ev = CreateEvaluator(metric.c_str());
    this->PredictRaw(data, &preds_);
@ -212,11 +213,14 @@ class BoostLearner {
   * \param data input data
   * \param output_margin whether to only predict margin value instead of transformed prediction
   * \param out_preds output vector that stores the prediction
   * \param ntree_limit limit the number of trees used in prediction;
   *   when it equals 0, all the trees are used
   */
  inline void Predict(const DMatrix<FMatrix> &data,
  inline void Predict(const DMatrix &data,
                      bool output_margin,
                      std::vector<float> *out_preds) const {
    this->PredictRaw(data, out_preds);
                      std::vector<float> *out_preds,
                      unsigned ntree_limit = 0) const {
    this->PredictRaw(data, out_preds, ntree_limit);
    if (!output_margin) {
      obj_->PredTransform(out_preds);
    }
@ -235,22 +239,27 @@ class BoostLearner {
    if (obj_ != NULL) return;
    utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
    obj_ = CreateObjFunction(name_obj_.c_str());
    gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str());
    gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
    for (size_t i = 0; i < cfg_.size(); ++i) {
      obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
      gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
    }
    evaluator_.AddEval(obj_->DefaultEvalMetric());
    if (evaluator_.Size() == 0) {
      evaluator_.AddEval(obj_->DefaultEvalMetric());
    }
  }
  /*!
   * \brief get un-transformed prediction
   * \param data training data matrix
   * \param out_preds output vector that stores the prediction
   * \param ntree_limit limit the number of trees used in prediction;
   *   when it equals 0, all the trees are used
   */
  inline void PredictRaw(const DMatrix<FMatrix> &data,
                         std::vector<float> *out_preds) const {
    gbm_->Predict(data.fmat, this->FindBufferOffset(data),
                  data.info.info, out_preds);
  inline void PredictRaw(const DMatrix &data,
                         std::vector<float> *out_preds,
                         unsigned ntree_limit = 0) const {
    gbm_->Predict(data.fmat(), this->FindBufferOffset(data),
                  data.info.info, out_preds, ntree_limit);
    // add base margin
    std::vector<float> &preds = *out_preds;
    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
@ -284,7 +293,7 @@ class BoostLearner {
      base_score = 0.5f;
      num_feature = 0;
      num_class = 0;
      memset(reserved, 0, sizeof(reserved));
      std::memset(reserved, 0, sizeof(reserved));
    }
    /*!
     * \brief set parameters from outside
@ -292,6 +301,7 @@ class BoostLearner {
     * \param val value of the parameter
     */
    inline void SetParam(const char *name, const char *val) {
      using namespace std;
      if (!strcmp("base_score", name)) base_score = static_cast<float>(atof(val));
      if (!strcmp("num_class", name)) num_class = atoi(val);
      if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
@ -307,7 +317,7 @@ class BoostLearner {
  // model parameter
  ModelParam mparam;
  // gbm model that backs everything
  gbm::IGradBooster<FMatrix> *gbm_;
  gbm::IGradBooster *gbm_;
  // name of gbm model used for training
  std::string name_gbm_;
  // objective function
@ -324,14 +334,14 @@ class BoostLearner {
 private:
  // cache entry object that helps handle feature caching
  struct CacheEntry {
    const DMatrix<FMatrix> *mat_;
    const DMatrix *mat_;
    size_t buffer_offset_;
    size_t num_row_;
    CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row)
    CacheEntry(const DMatrix *mat, size_t buffer_offset, size_t num_row)
        :mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
  };
  // find internal buffer offset for certain matrix, if not exist, return -1
  inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const {
  inline int64_t FindBufferOffset(const DMatrix &mat) const {
    for (size_t i = 0; i < cache_.size(); ++i) {
      if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
        if (cache_[i].num_row_ == mat.info.num_row()) {
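The new ntree_limit parameter threads through Predict into the booster, which makes early-stopping-style evaluation possible without retraining. A hedged sketch — learner and dmat are assumed to be set up elsewhere, and 0 keeps the old all-trees behaviour:

// sketch: predicting with a truncated ensemble (assumed setup)
std::vector<float> preds;
learner.Predict(*dmat, false, &preds);      // all trees, transformed output
learner.Predict(*dmat, true, &preds, 50);   // margin from the first 50 trees only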
@ -6,9 +6,9 @@
 * \author Tianqi Chen, Kailong Chen
 */
#include <vector>
#include <cmath>
#include <algorithm>
#include <utility>
#include <cmath>
#include <functional>
#include "../data.h"
#include "./objective.h"
@ -37,7 +37,7 @@ struct LossType {
      case kLogisticRaw:
      case kLinearSquare: return x;
      case kLogisticClassify:
      case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
      case kLogisticNeglik: return 1.0f / (1.0f + std::exp(-x));
      default: utils::Error("unknown loss_type"); return 0.0f;
    }
  }
@ -50,7 +50,7 @@ struct LossType {
  inline float FirstOrderGradient(float predt, float label) const {
    switch (loss_type) {
      case kLinearSquare: return predt - label;
      case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
      case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt));
      case kLogisticClassify:
      case kLogisticNeglik: return predt - label;
      default: utils::Error("unknown loss_type"); return 0.0f;
@ -65,7 +65,7 @@ struct LossType {
  inline float SecondOrderGradient(float predt, float label) const {
    switch (loss_type) {
      case kLinearSquare: return 1.0f;
      case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
      case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt));
      case kLogisticClassify:
      case kLogisticNeglik: return predt * (1 - predt);
      default: utils::Error("unknown loss_type"); return 0.0f;
@ -80,7 +80,7 @@ struct LossType {
        loss_type == kLogisticNeglik ) {
      utils::Check(base_score > 0.0f && base_score < 1.0f,
                   "base_score must be in (0,1) for logistic loss");
      base_score = -logf(1.0f / base_score - 1.0f);
      base_score = -std::log(1.0f / base_score - 1.0f);
    }
    return base_score;
  }
@ -101,6 +101,7 @@ class RegLossObj : public IObjFunction{
  }
  virtual ~RegLossObj(void) {}
  virtual void SetParam(const char *name, const char *val) {
    using namespace std;
    if (!strcmp("scale_pos_weight", name)) {
      scale_pos_weight = static_cast<float>(atof(val));
    }
@ -123,7 +124,7 @@ class RegLossObj : public IObjFunction{
      float p = loss.PredTransform(preds[i]);
      float w = info.GetWeight(j);
      if (info.labels[j] == 1.0f) w *= scale_pos_weight;
      gpair[j] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
      gpair[i] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
                           loss.SecondOrderGradient(p, info.labels[j]) * w);
    }
  }
@ -156,6 +157,7 @@ class SoftmaxMultiClassObj : public IObjFunction {
  }
  virtual ~SoftmaxMultiClassObj(void) {}
  virtual void SetParam(const char *name, const char *val) {
    using namespace std;
    if (!strcmp( "num_class", name )) nclass = atoi(val);
  }
  virtual void GetGradient(const std::vector<float> &preds,
@ -247,6 +249,7 @@ class LambdaRankObj : public IObjFunction {
  }
  virtual ~LambdaRankObj(void) {}
  virtual void SetParam(const char *name, const char *val) {
    using namespace std;
    if (!strcmp( "loss_type", name )) loss.loss_type = atoi(val);
    if (!strcmp( "fix_list_weight", name)) fix_list_weight = static_cast<float>(atof(val));
    if (!strcmp( "num_pairsample", name)) num_pairsample = atoi(val);
@ -419,8 +422,8 @@ class LambdaRankObjNDCG : public LambdaRankObj {
    for (size_t i = 0; i < pairs.size(); ++i) {
      unsigned pos_idx = pairs[i].pos_index;
      unsigned neg_idx = pairs[i].neg_index;
      float pos_loginv = 1.0f / logf(pos_idx + 2.0f);
      float neg_loginv = 1.0f / logf(neg_idx + 2.0f);
      float pos_loginv = 1.0f / std::log(pos_idx + 2.0f);
      float neg_loginv = 1.0f / std::log(neg_idx + 2.0f);
      int pos_label = static_cast<int>(sorted_list[pos_idx].label);
      int neg_label = static_cast<int>(sorted_list[neg_idx].label);
      float original =
@ -438,7 +441,7 @@ class LambdaRankObjNDCG : public LambdaRankObj {
    for (size_t i = 0; i < labels.size(); ++i) {
      const unsigned rel = static_cast<unsigned>(labels[i]);
      if (rel != 0) {
        sumdcg += ((1 << rel) - 1) / logf(static_cast<float>(i + 2));
        sumdcg += ((1 << rel) - 1) / std::log(static_cast<float>(i + 2));
      }
    }
    return static_cast<float>(sumdcg);
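As a reminder of why FirstOrderGradient and SecondOrderGradient can fall through from kLogisticRaw to the shared return: with the transformed prediction p = 1/(1 + e^{-x}) and the negative log-likelihood l(y, x) = -[y log p + (1-y) log(1-p)], the derivatives with respect to the margin x are

\frac{\partial l}{\partial x} = p - y, \qquad \frac{\partial^2 l}{\partial x^2} = p\,(1 - p)

so once the raw margin has been mapped to p, the same predt - label and predt * (1 - predt) expressions serve all the logistic variants.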
@ -67,6 +67,7 @@ namespace xgboost {
namespace learner {
/*! \brief factory function to create objective function by name */
inline IObjFunction* CreateObjFunction(const char *name) {
  using namespace std;
  if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare);
  if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik);
  if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify);
@ -53,7 +53,7 @@ class TreeModel {
    Param(void) {
      max_depth = 0;
      size_leaf_vector = 0;
      memset(reserved, 0, sizeof(reserved));
      std::memset(reserved, 0, sizeof(reserved));
    }
    /*!
     * \brief set parameters from outside
@ -61,6 +61,7 @@ class TreeModel {
     * \param val value of the parameter
     */
    inline void SetParam(const char *name, const char *val) {
      using namespace std;
      if (!strcmp("num_roots", name)) num_roots = atoi(val);
      if (!strcmp("num_feature", name)) num_feature = atoi(val);
      if (!strcmp("size_leaf_vector", name)) size_leaf_vector = atoi(val);
@ -272,6 +273,7 @@ class TreeModel {
    param.num_nodes = param.num_roots;
    nodes.resize(param.num_nodes);
    stats.resize(param.num_nodes);
    leaf_vector.resize(param.num_nodes * param.size_leaf_vector, 0.0f);
    for (int i = 0; i < param.num_nodes; i ++) {
      nodes[i].set_leaf(0.0f);
      nodes[i].set_parent(-1);
@ -289,6 +291,9 @@ class TreeModel {
                 "TreeModel: wrong format");
    utils::Check(fi.Read(&stats[0], sizeof(NodeStat) * stats.size()) > 0,
                 "TreeModel: wrong format");
    if (param.size_leaf_vector != 0) {
      utils::Check(fi.Read(&leaf_vector), "TreeModel: wrong format");
    }
    // chg deleted nodes
    deleted_nodes.resize(0);
    for (int i = param.num_roots; i < param.num_nodes; i ++) {
@ -309,6 +314,7 @@ class TreeModel {
    fo.Write(&param, sizeof(Param));
    fo.Write(&nodes[0], sizeof(Node) * nodes.size());
    fo.Write(&stats[0], sizeof(NodeStat) * nodes.size());
    if (param.size_leaf_vector != 0) fo.Write(leaf_vector);
  }
  /*!
   * \brief add child nodes to node
@ -486,15 +492,15 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
    std::fill(data.begin(), data.end(), e);
  }
  /*! \brief fill the vector with sparse vector */
  inline void Fill(const SparseBatch::Inst &inst) {
  inline void Fill(const RowBatch::Inst &inst) {
    for (bst_uint i = 0; i < inst.length; ++i) {
      data[inst[i].findex].fvalue = inst[i].fvalue;
      data[inst[i].index].fvalue = inst[i].fvalue;
    }
  }
  /*! \brief drop the trace after fill, must be called after fill */
  inline void Drop(const SparseBatch::Inst &inst) {
  inline void Drop(const RowBatch::Inst &inst) {
    for (bst_uint i = 0; i < inst.length; ++i) {
      data[inst[i].findex].flag = -1;
      data[inst[i].index].flag = -1;
    }
  }
  /*! \brief get ith value */
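With the optional leaf vector added, the on-disk layout of a tree becomes: the fixed-size Param block, then the node array, then the per-node statistics, and — only when size_leaf_vector is non-zero — the serialized leaf_vector. A comment sketch of the layout, using the field names from the code above:

// binary tree model layout (sketch):
//   Param                  sizeof(Param) bytes
//   Node[num_nodes]        sizeof(Node) * num_nodes bytes
//   NodeStat[num_nodes]    sizeof(NodeStat) * num_nodes bytes
//   leaf_vector            present only if param.size_leaf_vector != 0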
151 src/tree/param.h
@ -22,10 +22,10 @@ struct TrainParam{
  //----- the rest parameters are less important ----
  // minimum amount of hessian(weight) allowed in a child
  float min_child_weight;
  // weight decay parameter used to control leaf fitting
  // L2 regularization factor
  float reg_lambda;
  // reg method
  int reg_method;
  // L1 regularization factor
  float reg_alpha;
  // default direction choice
  int default_direction;
  // whether we want to do subsample
@ -36,6 +36,8 @@ struct TrainParam{
  float colsample_bytree;
  // speed optimization for dense column
  float opt_dense_col;
  // leaf vector size
  int size_leaf_vector;
  // number of threads to be used for tree construction,
  // if OpenMP is enabled, if equals 0, use system default
  int nthread;
@ -45,13 +47,14 @@ struct TrainParam{
    min_child_weight = 1.0f;
    max_depth = 6;
    reg_lambda = 1.0f;
    reg_method = 2;
    reg_alpha = 0.0f;
    default_direction = 0;
    subsample = 1.0f;
    colsample_bytree = 1.0f;
    colsample_bylevel = 1.0f;
    opt_dense_col = 1.0f;
    nthread = 0;
    size_leaf_vector = 0;
  }
  /*!
   * \brief set parameters from outside
@ -59,19 +62,22 @@ struct TrainParam{
   * \param val value of the parameter
   */
  inline void SetParam(const char *name, const char *val) {
    using namespace std;
    // sync-names
    if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val));
    if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val));
    if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val));
    if (!strcmp(name, "alpha")) reg_alpha = static_cast<float>(atof(val));
    if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val));
    if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
    if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
    if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
    if (!strcmp(name, "reg_method")) reg_method = atoi(val);
    if (!strcmp(name, "reg_alpha")) reg_alpha = static_cast<float>(atof(val));
    if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
    if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
    if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast<float>(atof(val));
    if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
    if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
    if (!strcmp(name, "max_depth")) max_depth = atoi(val);
    if (!strcmp(name, "nthread")) nthread = atoi(val);
    if (!strcmp(name, "default_direction")) {
@ -82,31 +88,31 @@ struct TrainParam{
  }
  // calculate the cost of loss function
  inline double CalcGain(double sum_grad, double sum_hess) const {
    if (sum_hess < min_child_weight) {
      return 0.0;
    if (sum_hess < min_child_weight) return 0.0;
    if (reg_alpha == 0.0f) {
      return Sqr(sum_grad) / (sum_hess + reg_lambda);
    } else {
      return Sqr(ThresholdL1(sum_grad, reg_alpha)) / (sum_hess + reg_lambda);
    }
    switch (reg_method) {
      case 1 : return Sqr(ThresholdL1(sum_grad, reg_lambda)) / sum_hess;
      case 2 : return Sqr(sum_grad) / (sum_hess + reg_lambda);
      case 3 : return
          Sqr(ThresholdL1(sum_grad, 0.5 * reg_lambda)) /
          (sum_hess + 0.5 * reg_lambda);
      default: return Sqr(sum_grad) / sum_hess;
    }
  // calculate cost of loss function with four statistics
  inline double CalcGain(double sum_grad, double sum_hess,
                         double test_grad, double test_hess) const {
    double w = CalcWeight(sum_grad, sum_hess);
    double ret = test_grad * w + 0.5 * (test_hess + reg_lambda) * Sqr(w);
    if (reg_alpha == 0.0f) {
      return - 2.0 * ret;
    } else {
      return - 2.0 * (ret + reg_alpha * std::abs(w));
    }
  }
  // calculate weight given the statistics
  inline double CalcWeight(double sum_grad, double sum_hess) const {
    if (sum_hess < min_child_weight) {
      return 0.0;
    if (sum_hess < min_child_weight) return 0.0;
    if (reg_alpha == 0.0f) {
      return -sum_grad / (sum_hess + reg_lambda);
    } else {
    switch (reg_method) {
      case 1: return - ThresholdL1(sum_grad, reg_lambda) / sum_hess;
      case 2: return - sum_grad / (sum_hess + reg_lambda);
      case 3: return
          - ThresholdL1(sum_grad, 0.5 * reg_lambda) /
          (sum_hess + 0.5 * reg_lambda);
      default: return - sum_grad / sum_hess;
    }
      return -ThresholdL1(sum_grad, reg_alpha) / (sum_hess + reg_lambda);
    }
  }
  /*! \brief whether need forward small to big search: default right */
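In the new formulation the structure score behind CalcGain and CalcWeight is the standard second-order one: with G and H the sums of gradients and hessians in a node, the optimal leaf weight and its gain under L2 (lambda) and L1 (alpha) regularization are

w^* = -\frac{T_\alpha(G)}{H + \lambda}, \qquad \mathrm{gain} = \frac{T_\alpha(G)^2}{H + \lambda}, \qquad T_\alpha(G) = \operatorname{sign}(G)\,\max(|G| - \alpha, 0)

which reduces to w* = -G/(H+lambda) and gain = G^2/(H+lambda) when alpha is zero — exactly the two branches in the code above, with the old reg_method switch replaced by this single rule.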
@ -153,6 +159,9 @@ struct GradStats {
  inline void Clear(void) {
    sum_grad = sum_hess = 0.0f;
  }
  /*! \brief check if necessary information is ready */
  inline static void CheckInfo(const BoosterInfo &info) {
  }
  /*!
   * \brief accumulate statistics,
   * \param gpair the vector storing the gradient statistics
@ -188,14 +197,88 @@ struct GradStats {
  }
  /*! \brief set leaf vector value based on statistics */
  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
  }
 protected:
  // constructor to allow inheritance
  GradStats(void) {}
  /*! \brief add statistics to the data */
  inline void Add(double grad, double hess) {
    sum_grad += grad; sum_hess += hess;
  }
};

/*! \brief vectorized cv statistics */
template<unsigned vsize>
struct CVGradStats : public GradStats {
  // additional statistics
  GradStats train[vsize], valid[vsize];
  // constructor
  explicit CVGradStats(const TrainParam &param) {
    utils::Check(param.size_leaf_vector == vsize,
                 "CVGradStats: vsize must match size_leaf_vector");
    this->Clear();
  }
  /*! \brief check if necessary information is ready */
  inline static void CheckInfo(const BoosterInfo &info) {
    utils::Check(info.fold_index.size() != 0,
                 "CVGradStats: require fold_index");
  }
  /*! \brief clear the statistics */
  inline void Clear(void) {
    GradStats::Clear();
    for (unsigned i = 0; i < vsize; ++i) {
      train[i].Clear(); valid[i].Clear();
    }
  }
  inline void Add(const std::vector<bst_gpair> &gpair,
                  const BoosterInfo &info,
                  bst_uint ridx) {
    GradStats::Add(gpair[ridx].grad, gpair[ridx].hess);
    const size_t step = info.fold_index.size();
    for (unsigned i = 0; i < vsize; ++i) {
      const bst_gpair &b = gpair[(i + 1) * step + ridx];
      if (info.fold_index[ridx] == i) {
        valid[i].Add(b.grad, b.hess);
      } else {
        train[i].Add(b.grad, b.hess);
      }
    }
  }
  /*! \brief calculate gain of the solution */
  inline double CalcGain(const TrainParam &param) const {
    double ret = 0.0;
    for (unsigned i = 0; i < vsize; ++i) {
      ret += param.CalcGain(train[i].sum_grad,
                            train[i].sum_hess,
                            vsize * valid[i].sum_grad,
                            vsize * valid[i].sum_hess);
    }
    return ret / vsize;
  }
  /*! \brief add statistics to the data */
  inline void Add(const CVGradStats &b) {
    GradStats::Add(b);
    for (unsigned i = 0; i < vsize; ++i) {
      train[i].Add(b.train[i]);
      valid[i].Add(b.valid[i]);
    }
  }
  /*! \brief set current value to a - b */
  inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
    GradStats::SetSubstract(a, b);
    for (int i = 0; i < vsize; ++i) {
      train[i].SetSubstract(a.train[i], b.train[i]);
      valid[i].SetSubstract(a.valid[i], b.valid[i]);
    }
  }
  /*! \brief set leaf vector value based on statistics */
  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
    for (int i = 0; i < vsize; ++i) {
      vec[i] = param.learning_rate *
          param.CalcWeight(train[i].sum_grad, train[i].sum_hess);
    }
  }
};
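CVGradStats assumes the gradient vector is stacked the same way the ct- metrics assume predictions are: block 0 carries the gradients of the full model and block i+1 those of fold i, each of length fold_index.size(). A comment sketch of what Add reads for one row, with illustrative vsize = 2:

// gpair layout assumed by CVGradStats (sketch, vsize = 2, step = #rows):
//   gpair = [ full[0..step) | fold0[0..step) | fold1[0..step) ]
// Add(gpair, info, ridx):
//   full stats   <- gpair[ridx]
//   fold i stats <- gpair[(i + 1) * step + ridx], routed to valid[i]
//                   when fold_index[ridx] == i, otherwise to train[i]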
/*!
 * \brief statistics that is helpful to store
 * and represent a split solution for the tree
@ -216,11 +299,11 @@ struct SplitEntry{
   * \param loss_chg the loss reduction get through the split
   * \param split_index the feature index where the split is on
   */
  inline bool NeedReplace(bst_float loss_chg, unsigned split_index) const {
  inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
    if (this->split_index() <= split_index) {
      return loss_chg > this->loss_chg;
      return new_loss_chg > this->loss_chg;
    } else {
      return !(this->loss_chg > loss_chg);
      return !(this->loss_chg > new_loss_chg);
    }
  }
  /*!
@ -246,13 +329,13 @@ struct SplitEntry{
   * \param default_left whether the missing value goes to left
   * \return whether the proposed split is better and can replace current split
   */
  inline bool Update(bst_float loss_chg, unsigned split_index,
                     float split_value, bool default_left) {
    if (this->NeedReplace(loss_chg, split_index)) {
      this->loss_chg = loss_chg;
  inline bool Update(bst_float new_loss_chg, unsigned split_index,
                     float new_split_value, bool default_left) {
    if (this->NeedReplace(new_loss_chg, split_index)) {
      this->loss_chg = new_loss_chg;
      if (default_left) split_index |= (1U << 31);
      this->sindex = split_index;
      this->split_value = split_value;
      this->split_value = new_split_value;
      return true;
    } else {
      return false;
21 src/tree/updater.cpp Normal file
@ -0,0 +1,21 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <cstring>
#include "./updater.h"
#include "./updater_prune-inl.hpp"
#include "./updater_refresh-inl.hpp"
#include "./updater_colmaker-inl.hpp"

namespace xgboost {
namespace tree {
IUpdater* CreateUpdater(const char *name) {
  using namespace std;
  if (!strcmp(name, "prune")) return new TreePruner();
  if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
  if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
  utils::Error("unknown updater:%s", name);
  return NULL;
}

}  // namespace tree
}  // namespace xgboost
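Moving the factory out of the header and behind a plain declaration means the template-heavy updaters are now instantiated once, in this translation unit, and callers only see the abstract IUpdater interface. A hedged usage sketch — gpair, p_fmat, info and trees are assumed to be prepared by the caller:

// sketch: constructing and applying an updater by name
xgboost::tree::IUpdater *up = xgboost::tree::CreateUpdater("grow_colmaker");
up->SetParam("max_depth", "6");
up->Update(gpair, p_fmat, info, trees);  // grows each tree in `trees`
delete up;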
@ -14,9 +14,7 @@ namespace xgboost {
namespace tree {
/*!
 * \brief interface of tree update module, that performs update of a tree
 * \tparam FMatrix the data type updater taking
 */
template<typename FMatrix>
class IUpdater {
 public:
  /*!
@ -28,7 +26,7 @@ class IUpdater {
  /*!
   * \brief perform update to the tree models
   * \param gpair the gradient pair statistics of the data
   * \param fmat feature matrix that provides access to features
   * \param p_fmat feature matrix that provides access to features
   * \param info extra side information that may be needed, such as root index
   * \param trees pointer to the trees to be updated; the updater will change the content of the trees
   *   note: all the trees in the vector are updated, with the same statistics,
@ -36,36 +34,18 @@ class IUpdater {
   *   there can be multiple trees when we train random forest style model
   */
  virtual void Update(const std::vector<bst_gpair> &gpair,
                      const FMatrix &fmat,
                      IFMatrix *p_fmat,
                      const BoosterInfo &info,
                      const std::vector<RegTree*> &trees) = 0;
  // destructor
  virtual ~IUpdater(void) {}
};

}  // namespace tree
}  // namespace xgboost

#include "./updater_prune-inl.hpp"
#include "./updater_refresh-inl.hpp"
#include "./updater_colmaker-inl.hpp"

namespace xgboost {
namespace tree {
/*!
 * \brief create a updater based on name
 * \param name name of updater
 * \return return the updater instance
 */
template<typename FMatrix>
inline IUpdater<FMatrix>* CreateUpdater(const char *name) {
  if (!strcmp(name, "prune")) return new TreePruner<FMatrix>();
  if (!strcmp(name, "refresh")) return new TreeRefresher<FMatrix, GradStats>();
  if (!strcmp(name, "grow_colmaker")) return new ColMaker<FMatrix, GradStats>();
  utils::Error("unknown updater:%s", name);
  return NULL;
}

IUpdater* CreateUpdater(const char *name);
}  // namespace tree
}  // namespace xgboost
#endif  // XGBOOST_TREE_UPDATER_H_
@ -15,8 +15,8 @@
namespace xgboost {
namespace tree {
/*! \brief pruner that prunes a tree after growing finishes */
template<typename FMatrix, typename TStats>
class ColMaker: public IUpdater<FMatrix> {
template<typename TStats>
class ColMaker: public IUpdater {
 public:
  virtual ~ColMaker(void) {}
  // set training parameter
@ -24,16 +24,17 @@ class ColMaker: public IUpdater<FMatrix> {
    param.SetParam(name, val);
  }
  virtual void Update(const std::vector<bst_gpair> &gpair,
                      const FMatrix &fmat,
                      IFMatrix *p_fmat,
                      const BoosterInfo &info,
                      const std::vector<RegTree*> &trees) {
    TStats::CheckInfo(info);
    // rescale learning rate according to size of trees
    float lr = param.learning_rate;
    param.learning_rate = lr / trees.size();
    // build tree
    for (size_t i = 0; i < trees.size(); ++i) {
      Builder builder(param);
      builder.Update(gpair, fmat, info, trees[i]);
      builder.Update(gpair, p_fmat, info, trees[i]);
    }
    param.learning_rate = lr;
  }
@ -76,23 +77,22 @@ class ColMaker: public IUpdater<FMatrix> {
    explicit Builder(const TrainParam &param) : param(param) {}
    // update one tree, growing
    virtual void Update(const std::vector<bst_gpair> &gpair,
                        const FMatrix &fmat,
                        IFMatrix *p_fmat,
                        const BoosterInfo &info,
                        RegTree *p_tree) {
      this->InitData(gpair, fmat, info.root_index, *p_tree);
      this->InitNewNode(qexpand, gpair, fmat, info, *p_tree);

      this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
      this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
      for (int depth = 0; depth < param.max_depth; ++depth) {
        this->FindSplit(depth, this->qexpand, gpair, fmat, info, p_tree);
        this->ResetPosition(this->qexpand, fmat, *p_tree);
        this->UpdateQueueExpand(*p_tree, &this->qexpand);
        this->InitNewNode(qexpand, gpair, fmat, info, *p_tree);
        this->FindSplit(depth, qexpand_, gpair, p_fmat, info, p_tree);
        this->ResetPosition(qexpand_, p_fmat, *p_tree);
        this->UpdateQueueExpand(*p_tree, &qexpand_);
        this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
        // if nothing left to be expanded, break
        if (qexpand.size() == 0) break;
        if (qexpand_.size() == 0) break;
      }
      // set all the rest expanding nodes to leaf
      for (size_t i = 0; i < qexpand.size(); ++i) {
        const int nid = qexpand[i];
      for (size_t i = 0; i < qexpand_.size(); ++i) {
        const int nid = qexpand_[i];
        (*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
      }
      // remember auxiliary statistics in the tree node
@ -107,7 +107,7 @@ class ColMaker: public IUpdater<FMatrix> {
   private:
    // initialize temp data structure
    inline void InitData(const std::vector<bst_gpair> &gpair,
                         const FMatrix &fmat,
                         const IFMatrix &fmat,
                         const std::vector<unsigned> &root_index, const RegTree &tree) {
      utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree");
      const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
@ -137,8 +137,7 @@ class ColMaker: public IUpdater<FMatrix> {
            if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1;
          }
        }
      }

      }
      {
        // initialize feature index
        unsigned ncol = static_cast<unsigned>(fmat.NumCol());
@ -166,16 +165,16 @@ class ColMaker: public IUpdater<FMatrix> {
        snode.reserve(256);
      }
      {// expand query
        qexpand.reserve(256); qexpand.clear();
        qexpand_.reserve(256); qexpand_.clear();
        for (int i = 0; i < tree.param.num_roots; ++i) {
          qexpand.push_back(i);
          qexpand_.push_back(i);
        }
      }
    }
    /*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */
    inline void InitNewNode(const std::vector<int> &qexpand,
                            const std::vector<bst_gpair> &gpair,
                            const FMatrix &fmat,
                            const IFMatrix &fmat,
                            const BoosterInfo &info,
                            const RegTree &tree) {
      {// setup statistics space for each tree node
@ -222,24 +221,26 @@ class ColMaker: public IUpdater<FMatrix> {
      qexpand = newnodes;
    }
    // enumerate the split values of specific feature
    template<typename Iter>
    inline void EnumerateSplit(Iter it, unsigned fid,
    inline void EnumerateSplit(const ColBatch::Entry *begin,
                               const ColBatch::Entry *end,
                               int d_step,
                               bst_uint fid,
                               const std::vector<bst_gpair> &gpair,
                               const BoosterInfo &info,
                               std::vector<ThreadEntry> &temp,
                               bool is_forward_search) {
                               std::vector<ThreadEntry> &temp) {
      const std::vector<int> &qexpand = qexpand_;
      // clear all the temp statistics
      for (size_t j = 0; j < qexpand.size(); ++j) {
        temp[qexpand[j]].stats.Clear();
      }
      // left statistics
      TStats c(param);
      while (it.Next()) {
        const bst_uint ridx = it.rindex();
      for (const ColBatch::Entry *it = begin; it != end; it += d_step) {
        const bst_uint ridx = it->index;
        const int nid = position[ridx];
        if (nid < 0) continue;
        // start working
        const float fvalue = it.fvalue();
        const float fvalue = it->fvalue;
        // get the statistics of nid
        ThreadEntry &e = temp[nid];
        // test if first hit, this is fine, because we set 0 during init
@ -248,11 +249,11 @@ class ColMaker: public IUpdater<FMatrix> {
          e.last_fvalue = fvalue;
        } else {
          // try to find a split
          if (fabsf(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
          if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
            c.SetSubstract(snode[nid].stats, e.stats);
            if (c.sum_hess >= param.min_child_weight) {
              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
              e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search);
              e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
            }
          }
          // update the statistics
@ -267,38 +268,46 @@ class ColMaker: public IUpdater<FMatrix> {
        c.SetSubstract(snode[nid].stats, e.stats);
        if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
          bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
          const float delta = is_forward_search ? rt_eps : -rt_eps;
          e.best.Update(loss_chg, fid, e.last_fvalue + delta, !is_forward_search);
          const float delta = d_step == +1 ? rt_eps : -rt_eps;
          e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
        }
      }
    }
    // find splits at current level, do split per level
    inline void FindSplit(int depth, const std::vector<int> &qexpand,
    inline void FindSplit(int depth,
                          const std::vector<int> &qexpand,
                          const std::vector<bst_gpair> &gpair,
                          const FMatrix &fmat,
                          IFMatrix *p_fmat,
                          const BoosterInfo &info,
                          RegTree *p_tree) {
      std::vector<unsigned> feat_set = feat_index;
      std::vector<bst_uint> feat_set = feat_index;
      if (param.colsample_bylevel != 1.0f) {
        random::Shuffle(feat_set);
        unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
        utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
        feat_set.resize(n);
      }
      // start enumeration
      const bst_omp_uint nsize = static_cast<bst_omp_uint>(feat_set.size());
#if defined(_OPENMP)
      const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
#endif
      #pragma omp parallel for schedule(dynamic, batch_size)
      for (bst_omp_uint i = 0; i < nsize; ++i) {
        const unsigned fid = feat_set[i];
        const int tid = omp_get_thread_num();
        if (param.need_forward_search(fmat.GetColDensity(fid))) {
          this->EnumerateSplit(fmat.GetSortedCol(fid), fid, gpair, info, stemp[tid], true);
        }
        if (param.need_backward_search(fmat.GetColDensity(fid))) {
          this->EnumerateSplit(fmat.GetReverseSortedCol(fid), fid, gpair, info, stemp[tid], false);
      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
      while (iter->Next()) {
        const ColBatch &batch = iter->Value();
        // start enumeration
        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#if defined(_OPENMP)
        const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
#endif
        #pragma omp parallel for schedule(dynamic, batch_size)
        for (bst_omp_uint i = 0; i < nsize; ++i) {
          const bst_uint fid = batch.col_index[i];
          const int tid = omp_get_thread_num();
          const ColBatch::Inst c = batch[i];
          if (param.need_forward_search(p_fmat->GetColDensity(fid))) {
            this->EnumerateSplit(c.data, c.data + c.length, +1,
                                 fid, gpair, info, stemp[tid]);
          }
          if (param.need_backward_search(p_fmat->GetColDensity(fid))) {
            this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
                                 fid, gpair, info, stemp[tid]);
          }
        }
      }
      // after this each thread's stemp will get the best candidates, aggregate results
@ -318,8 +327,8 @@ class ColMaker: public IUpdater<FMatrix> {
|
||||
}
|
||||
}
|
||||
// reset position of each data points after split is created in the tree
|
||||
inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) {
|
||||
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
|
||||
inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
|
||||
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
|
||||
// step 1, set default direct nodes to default, and leaf nodes to -1
|
||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
@ -343,22 +352,28 @@ class ColMaker: public IUpdater<FMatrix> {
|
||||
}
|
||||
std::sort(fsplits.begin(), fsplits.end());
|
||||
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
|
||||
// start put things into right place
|
||||
const bst_omp_uint nfeats = static_cast<bst_omp_uint>(fsplits.size());
|
||||
#pragma omp parallel for schedule(dynamic, 1)
|
||||
for (bst_omp_uint i = 0; i < nfeats; ++i) {
|
||||
const unsigned fid = fsplits[i];
|
||||
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
|
||||
const bst_uint ridx = it.rindex();
|
||||
int nid = position[ridx];
|
||||
if (nid == -1) continue;
|
||||
// go back to parent, correct those who are not default
|
||||
nid = tree[nid].parent();
|
||||
if (tree[nid].split_index() == fid) {
|
||||
if (it.fvalue() < tree[nid].split_cond()) {
|
||||
position[ridx] = tree[nid].cleft();
|
||||
} else {
|
||||
position[ridx] = tree[nid].cright();
|
||||
|
||||
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
|
||||
while (iter->Next()) {
|
||||
const ColBatch &batch = iter->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
ColBatch::Inst col = batch[i];
|
||||
const bst_uint fid = batch.col_index[i];
|
||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
||||
const bst_uint ridx = col[j].index;
|
||||
const float fvalue = col[j].fvalue;
|
||||
int nid = position[ridx];
|
||||
if (nid == -1) continue;
|
||||
// go back to parent, correct those who are not default
|
||||
nid = tree[nid].parent();
|
||||
if (tree[nid].split_index() == fid) {
|
||||
if (fvalue < tree[nid].split_cond()) {
|
||||
position[ridx] = tree[nid].cleft();
|
||||
} else {
|
||||
position[ridx] = tree[nid].cright();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -369,7 +384,7 @@ class ColMaker: public IUpdater<FMatrix> {
|
||||
// number of omp thread used during training
|
||||
int nthread;
|
||||
// Per feature: shuffle index of each feature index
|
||||
std::vector<unsigned> feat_index;
|
||||
std::vector<bst_uint> feat_index;
|
||||
// Instance Data: current node position in the tree of each instance
|
||||
std::vector<int> position;
|
||||
// PerThread x PerTreeNode: statistics for per thread construction
|
||||
@ -377,7 +392,7 @@ class ColMaker: public IUpdater<FMatrix> {
|
||||
/*! \brief TreeNode Data: statistics for each constructed node */
|
||||
std::vector<NodeEntry> snode;
|
||||
/*! \brief queue of nodes to be expanded */
|
||||
std::vector<int> qexpand;
|
||||
std::vector<int> qexpand_;
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
@ -12,18 +12,18 @@
namespace xgboost {
namespace tree {
/*! \brief pruner that prunes a tree after growing finishes */
template<typename FMatrix>
class TreePruner: public IUpdater<FMatrix> {
class TreePruner: public IUpdater {
public:
virtual ~TreePruner(void) {}
// set training parameter
virtual void SetParam(const char *name, const char *val) {
using namespace std;
param.SetParam(name, val);
if (!strcmp(name, "silent")) silent = atoi(val);
}
// update the tree, do pruning
virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) {
// rescale learning rate according to size of trees
@ -64,8 +64,8 @@ class TreePruner: public IUpdater<FMatrix> {
}
}
if (silent == 0) {
printf("tree pruning end, %d roots, %d extra nodes, %d pruned nodes, max_depth=%d\n",
tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth());
utils::Printf("tree pruning end, %d roots, %d extra nodes, %d pruned nodes, max_depth=%d\n",
tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth());
}
}

@ -75,7 +75,6 @@ class TreePruner: public IUpdater<FMatrix> {
// training parameter
TrainParam param;
};

} // namespace tree
} // namespace xgboost
#endif  // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_

@ -9,12 +9,13 @@
#include <limits>
#include "./param.h"
#include "./updater.h"
#include "../utils/omp.h"

namespace xgboost {
namespace tree {
/*! \brief refresher that refreshes the statistics of a tree after growing finishes */
template<typename FMatrix, typename TStats>
class TreeRefresher: public IUpdater<FMatrix> {
template<typename TStats>
class TreeRefresher: public IUpdater {
public:
virtual ~TreeRefresher(void) {}
// set training parameter
@ -23,16 +24,16 @@ class TreeRefresher: public IUpdater<FMatrix> {
}
// update the tree, do refresh
virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) {
const std::vector<RegTree*> &trees) {
if (trees.size() == 0) return;
// number of threads
int nthread;
// thread temporary space
std::vector< std::vector<TStats> > stemp;
std::vector<RegTree::FVec> fvec_temp;
// setup temp space for each thread
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
@ -50,16 +51,16 @@ class TreeRefresher: public IUpdater<FMatrix> {
fvec_temp[tid].Init(trees[0]->param.num_feature);
}
// start accumulating statistics
utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
const SparseBatch &batch = iter->Value();
const RowBatch &batch = iter->Value();
utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
"too large batch size ");
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nbatch; ++i) {
SparseBatch::Inst inst = batch[i];
RowBatch::Inst inst = batch[i];
const int tid = omp_get_thread_num();
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
RegTree::FVec &feats = fvec_temp[tid];
@ -126,8 +127,6 @@ class TreeRefresher: public IUpdater<FMatrix> {
this->Refresh(gstats, tree[nid].cright(), p_tree);
}
}
// number of threads in the data
int nthread;
// training parameter
TrainParam param;
};

@ -24,15 +24,15 @@ class FeatMap {
// function definitions
/*! \brief load feature map from text format */
inline void LoadText(const char *fname) {
FILE *fi = utils::FopenCheck(fname, "r");
std::FILE *fi = utils::FopenCheck(fname, "r");
this->LoadText(fi);
fclose(fi);
std::fclose(fi);
}
/*! \brief load feature map from text format */
inline void LoadText(FILE *fi) {
inline void LoadText(std::FILE *fi) {
int fid;
char fname[1256], ftype[1256];
while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) {
while (std::fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) {
this->PushBack(fid, fname, ftype);
}
}
@ -62,6 +62,7 @@ class FeatMap {

private:
inline static Type GetType(const char *tname) {
using namespace std;
if (!strcmp("i", tname)) return kIndicator;
if (!strcmp("q", tname)) return kQuantitive;
if (!strcmp("int", tname)) return kInteger;

@ -91,22 +91,21 @@ class IStream {
/*! \brief implementation of file i/o stream */
class FileStream : public IStream {
private:
FILE *fp;
std::FILE *fp;
public:
explicit FileStream(FILE *fp) {
this->fp = fp;
explicit FileStream(std::FILE *fp) : fp(fp) {
}
virtual size_t Read(void *ptr, size_t size) {
return fread(ptr, size, 1, fp);
return std::fread(ptr, size, 1, fp);
}
virtual void Write(const void *ptr, size_t size) {
fwrite(ptr, size, 1, fp);
std::fwrite(ptr, size, 1, fp);
}
inline void Seek(size_t pos) {
fseek(fp, 0, SEEK_SET);
std::fseek(fp, 0, SEEK_SET);
}
inline void Close(void) {
fclose(fp);
std::fclose(fp);
}
};

@ -9,13 +9,8 @@
#include <omp.h>
#else
#ifndef DISABLE_OPENMP
#ifndef _MSC_VER
#warning "OpenMP is not available, compile to single thread code."\
"You may want to upgrade your compiler to enable OpenMP support,"\
"to get benefit of multi-threading."
#else
// TODO add warning for msvc
#endif
// use pragma message instead of warning
#pragma message ("Warning: OpenMP is not available, xgboost will be compiled into single-thread code. Use OpenMP-enabled compiler to get benefit of multi-threading")
#endif
inline int omp_get_thread_num() { return 0; }
inline int omp_get_num_threads() { return 1; }

@ -16,30 +16,21 @@
/*! namespace of PRNG */
namespace xgboost {
namespace random {

#ifndef XGBOOST_CUSTOMIZE_PRNG_
/*! \brief seed the PRNG */
inline void Seed(uint32_t seed) {
inline void Seed(unsigned seed) {
srand(seed);
}
/*! \brief return a real number uniform in [0,1) */
inline double NextDouble(void) {
/*! \brief basic function, uniform */
inline double Uniform(void) {
return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX)+1.0);
}
/*! \brief return a real number uniform in (0,1) */
inline double NextDouble2(void) {
return (static_cast<double>(rand()) + 1.0) / (static_cast<double>(RAND_MAX)+2.0);
}

/*! \brief return a random number */
inline uint32_t NextUInt32(void) {
return (uint32_t)rand();
}
/*! \brief return a random number in [0, n) */
inline uint32_t NextUInt32(uint32_t n) {
return (uint32_t)floor(NextDouble() * n);
}
/*! \brief return x~N(0,1) */
inline double SampleNormal() {
inline double Normal(void) {
double x, y, s;
do {
x = 2 * NextDouble2() - 1.0;
@ -49,22 +40,24 @@ inline double SampleNormal() {

return x * sqrt(-2.0 * log(s) / s);
}
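(Editorial note: Normal() above, and SampleNormal2D below, implement the Marsaglia polar method. A sketch of the identity it relies on, assuming (x, y) is uniform on the unit disk after the rejection loop and s = x² + y²:

$$s \sim U(0,1), \qquad x\sqrt{\frac{-2\ln s}{s}} \ \text{ and } \ y\sqrt{\frac{-2\ln s}{s}} \ \text{ are independent } \mathcal{N}(0,1).$$

SampleNormal2D returns both coordinates so no draw is wasted; the single-sample version simply discards one of them.)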
#else
// include declarations, to be implemented
void Seed(unsigned seed);
double Uniform(void);
double Normal(void);
#endif

/*! \brief return iid x,y ~N(0,1) */
inline void SampleNormal2D(double &xx, double &yy) {
double x, y, s;
do {
x = 2 * NextDouble2() - 1.0;
y = 2 * NextDouble2() - 1.0;
s = x*x + y*y;
} while (s >= 1.0 || s == 0.0);
double t = sqrt(-2.0 * log(s) / s);
xx = x * t;
yy = y * t;
/*! \brief return a real number uniform in [0,1) */
inline double NextDouble(void) {
return Uniform();
}
/*! \brief return a random number in [0, n) */
inline uint32_t NextUInt32(uint32_t n) {
return (uint32_t)std::floor(NextDouble() * n);
}
/*! \brief return x~N(mu,sigma^2) */
inline double SampleNormal(double mu, double sigma) {
return SampleNormal() * sigma + mu;
return Normal() * sigma + mu;
}
/*! \brief return 1 with probability p, coin flip */
inline int SampleBinary(double p) {
@ -90,7 +83,7 @@ struct Random{
inline void Seed(unsigned sd) {
this->rseed = sd;
#if defined(_MSC_VER)||defined(_WIN32)
srand(rseed);
::xgboost::random::Seed(sd);
#endif
}
/*! \brief return a real number uniform in [0,1) */
@ -98,8 +91,8 @@ struct Random{
// use rand instead of rand_r in windows, for MSVC it is fine since rand is threadsafe
// For cygwin and mingw, this can slow down parallelism, but rand_r is only used in objective-inl.hpp, so it won't affect speed in general
// todo, replace with another PRNG
#if defined(_MSC_VER)||defined(_WIN32)
return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX) + 1.0);
#if defined(_MSC_VER)||defined(_WIN32)||defined(XGBOOST_STRICT_CXX98_)
return Uniform();
#else
return static_cast<double>(rand_r(&rseed)) / (static_cast<double>(RAND_MAX) + 1.0);
#endif

@ -7,10 +7,18 @@
*/
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstdarg>
#include <string>
#include <cstdlib>
#include <vector>

#ifndef XGBOOST_STRICT_CXX98_
#include <cstdarg>
#endif

#if !defined(__GNUC__)
#define fopen64 std::fopen
#endif
#ifdef _MSC_VER
#define fopen64 fopen
// NOTE: sprintf_s is not equivalent to snprintf,
// but they behave the same on success, which is sufficient for our case
#define snprintf sprintf_s
@ -18,19 +26,18 @@
#else
#ifdef _FILE_OFFSET_BITS
#if _FILE_OFFSET_BITS == 32
#warning "FILE OFFSET BITS defined to be 32 bit"
#pragma message ("Warning: FILE OFFSET BITS defined to be 32 bit")
#endif
#endif

#ifdef __APPLE__
#ifdef __APPLE__
#define off64_t off_t
#define fopen64 fopen
#define fopen64 std::fopen
#endif

#define _FILE_OFFSET_BITS 64
extern "C" {
#include <sys/types.h>
};
}
#endif

#ifdef _MSC_VER
@ -46,10 +53,11 @@ typedef long int64_t;
namespace xgboost {
/*! \brief namespace for helper utils of the project */
namespace utils {
/*! \brief error message buffer length */
const int kErrorBuffer = 1 << 12;

#ifndef XGBOOST_CUSTOMIZE_ERROR_
/*! \brief error message buffer length */
const int kPrintBuffer = 1 << 12;

#ifndef XGBOOST_CUSTOMIZE_MSG_
/*!
* \brief handling of Assert error, caused by inappropriate input
* \param msg error message
@ -66,19 +74,50 @@ inline void HandleCheckError(const char *msg) {
fprintf(stderr, "%s\n", msg);
exit(-1);
}
inline void HandlePrint(const char *msg) {
printf("%s", msg);
}
#else
#ifndef XGBOOST_STRICT_CXX98_
// include declarations, someone must implement this
void HandleAssertError(const char *msg);
void HandleCheckError(const char *msg);
void HandlePrint(const char *msg);
#endif
#endif
#ifdef XGBOOST_STRICT_CXX98_
// these function pointers are to be assigned
extern "C" void (*Printf)(const char *fmt, ...);
extern "C" int (*SPrintf)(char *buf, size_t size, const char *fmt, ...);
extern "C" void (*Assert)(int exp, const char *fmt, ...);
extern "C" void (*Check)(int exp, const char *fmt, ...);
extern "C" void (*Error)(const char *fmt, ...);
#else
/*! \brief printf, print message to the console */
inline void Printf(const char *fmt, ...) {
std::string msg(kPrintBuffer, '\0');
va_list args;
va_start(args, fmt);
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
va_end(args);
HandlePrint(msg.c_str());
}
/*! \brief portable version of snprintf */
inline int SPrintf(char *buf, size_t size, const char *fmt, ...) {
va_list args;
va_start(args, fmt);
int ret = vsnprintf(buf, size, fmt, args);
va_end(args);
return ret;
}

/*! \brief assert a condition is true, use this to handle debug information */
inline void Assert(bool exp, const char *fmt, ...) {
if (!exp) {
std::string msg(kErrorBuffer, '\0');
std::string msg(kPrintBuffer, '\0');
va_list args;
va_start(args, fmt);
vsnprintf(&msg[0], kErrorBuffer, fmt, args);
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
va_end(args);
HandleAssertError(msg.c_str());
}
@ -87,10 +126,10 @@ inline void Assert(bool exp, const char *fmt, ...) {
/*!\brief same as assert, but this is intended to be used as message for user*/
inline void Check(bool exp, const char *fmt, ...) {
if (!exp) {
std::string msg(kErrorBuffer, '\0');
std::string msg(kPrintBuffer, '\0');
va_list args;
va_start(args, fmt);
vsnprintf(&msg[0], kErrorBuffer, fmt, args);
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
va_end(args);
HandleCheckError(msg.c_str());
}
@ -99,22 +138,41 @@ inline void Check(bool exp, const char *fmt, ...) {
/*! \brief report error message, same as check */
inline void Error(const char *fmt, ...) {
{
std::string msg(kErrorBuffer, '\0');
std::string msg(kPrintBuffer, '\0');
va_list args;
va_start(args, fmt);
vsnprintf(&msg[0], kErrorBuffer, fmt, args);
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
va_end(args);
HandleCheckError(msg.c_str());
}
}
#endif

/*! \brief replace fopen, report error when the file open fails */
inline FILE *FopenCheck(const char *fname, const char *flag) {
FILE *fp = fopen64(fname, flag);
inline std::FILE *FopenCheck(const char *fname, const char *flag) {
std::FILE *fp = fopen64(fname, flag);
Check(fp != NULL, "can not open file \"%s\"\n", fname);
return fp;
}

} // namespace utils
// easy utils that can be directly accessed in xgboost
/*! \brief get the beginning address of a vector */
template<typename T>
inline T *BeginPtr(std::vector<T> &vec) {
if (vec.size() == 0) {
return NULL;
} else {
return &vec[0];
}
}
/*! \brief get the beginning address of a vector */
template<typename T>
inline const T *BeginPtr(const std::vector<T> &vec) {
if (vec.size() == 0) {
return NULL;
} else {
return &vec[0];
}
}
} // namespace xgboost
#endif  // XGBOOST_UTILS_UTILS_H_

@ -50,6 +50,7 @@ class BoostLearnTask{
if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
if (!strcmp("num_round", name)) num_round = atoi(val);
if (!strcmp("pred_margin", name)) pred_margin = atoi(val);
if (!strcmp("ntree_limit", name)) ntree_limit = atoi(val);
if (!strcmp("save_period", name)) save_period = atoi(val);
if (!strcmp("eval_train", name)) eval_train = atoi(val);
if (!strcmp("task", name)) task = val;
@ -79,6 +80,7 @@ class BoostLearnTask{
save_period = 0;
eval_train = 0;
pred_margin = 0;
ntree_limit = 0;
dump_model_stats = 0;
task = "train";
model_in = "NULL";
@ -186,7 +188,7 @@ class BoostLearnTask{
inline void TaskPred(void) {
std::vector<float> preds;
if (!silent) printf("start prediction...\n");
learner.Predict(*data, pred_margin != 0, &preds);
learner.Predict(*data, pred_margin != 0, &preds, ntree_limit);
if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
FILE *fo = utils::FopenCheck(name_pred.c_str(), "w");
for (size_t i = 0; i < preds.size(); i++) {
@ -217,6 +219,8 @@ class BoostLearnTask{
std::string task;
/*! \brief name of predict file */
std::string name_pred;
/*!\brief limit number of trees in prediction */
int ntree_limit;
/*!\brief whether to directly output margin value */
int pred_margin;
/*! \brief whether dump statistics along with model */
@ -234,7 +238,7 @@ class BoostLearnTask{
std::vector<io::DataMatrix*> deval;
std::vector<const io::DataMatrix*> devalall;
utils::FeatMap fmap;
learner::BoostLearner<FMatrixS> learner;
learner::BoostLearner learner;
};
}

@ -1,4 +1,4 @@
The solution has been created with Visual Studio Express 2013.
The solution has been created with Visual Studio Express 2010.
Make sure to compile the Release version, unless you need to debug the code
(and in the latter case modify the path in xgboost.py from release to test).
Note that there are two projects in one solution; both need to be compiled before you can use the standalone executable from the command line.

@ -1,11 +1,9 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Express 2013 for Windows Desktop
VisualStudioVersion = 12.0.30723.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost.vcxproj", "{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}"
Microsoft Visual Studio Solution File, Format Version 11.00
# Visual Studio 2010
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost.vcxproj", "{19766C3F-7508-49D0-BAAC-0988FCC9970C}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_wrapper\xgboost_wrapper.vcxproj", "{2E1AF937-28BB-4832-B916-309C9A0F6C4F}"
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_wrapper\xgboost_wrapper.vcxproj", "{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
@ -15,22 +13,21 @@ Global
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|Win32.ActiveCfg = Debug|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|Win32.Build.0 = Debug|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|x64.ActiveCfg = Debug|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|x64.Build.0 = Debug|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|Win32.ActiveCfg = Release|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|Win32.Build.0 = Release|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|x64.ActiveCfg = Release|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|x64.Build.0 = Release|x64
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|Win32.ActiveCfg = Debug|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|Win32.Build.0 = Debug|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|x64.ActiveCfg = Debug|x64
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|x64.Build.0 = Debug|x64
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|Win32.ActiveCfg = Release|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|Win32.Build.0 = Release|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|x64.ActiveCfg = Release|x64
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|x64.Build.0 = Release|x64
{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.ActiveCfg = Debug|Win32
{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.Build.0 = Debug|Win32
{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.ActiveCfg = Release|x64
{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.Build.0 = Release|x64
{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.ActiveCfg = Release|Win32
{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.Build.0 = Release|Win32
{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.ActiveCfg = Release|x64
{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.Build.0 = Release|x64
{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.ActiveCfg = Debug|Win32
{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.Build.0 = Debug|Win32
{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.ActiveCfg = Debug|Win32
{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.ActiveCfg = Release|Win32
{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.Build.0 = Release|Win32
{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.ActiveCfg = Release|x64
{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
@ -18,8 +18,14 @@
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\gbm\gbm.cpp" />
<ClCompile Include="..\..\src\io\io.cpp" />
<ClCompile Include="..\..\src\tree\updater.cpp" />
<ClCompile Include="..\..\src\xgboost_main.cpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}</ProjectGuid>
<ProjectGuid>{19766C3F-7508-49D0-BAAC-0988FCC9970C}</ProjectGuid>
<RootNamespace>xgboost</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
@ -27,27 +33,23 @@
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
@ -111,10 +113,6 @@
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\..\src\io\io.cpp" />
<ClCompile Include="..\..\src\xgboost_main.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>

@ -30,17 +30,17 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>

@ -1,126 +0,0 @@
# include xgboost library, must set chdir=TRUE
source("../xgboost.R", chdir=TRUE)

# helper function to read libsvm format
# this is badly written: it loads the data as dense and then converts to sparse
# use this for demo purposes only
# adapted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
content <- readLines(fname)
nline <- length(content)
label <- numeric(nline)
mat <- matrix(0, nline, maxcol+1)
for (i in 1:nline) {
arr <- as.vector(strsplit(content[i], " ")[[1]])
label[i] <- as.numeric(arr[[1]])
for (j in 2:length(arr)) {
kv <- strsplit(arr[j], ":")[[1]]
# to avoid 0 index
findex <- as.integer(kv[1]) + 1
fvalue <- as.numeric(kv[2])
mat[i,findex] <- fvalue
}
}
mat <- as(mat, "sparseMatrix")
return(list(label=label, data=mat))
}

# test code here
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
watchlist <- list("eval"=dtest,"train"=dtrain)
# train the xgboost model
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
# make prediction
preds <- xgb.predict(bst, dtest)
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
# print error rate
print(paste("error=",err))

# dump model
xgb.dump(bst, "dump.raw.txt")
# dump model with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")

# save dmatrix into binary buffer
succ <- xgb.save(dtest, "dtest.buffer")
# save model into file
succ <- xgb.save(bst, "xgb.model")
# load model and data in
bst2 <- xgb.Booster(modelfile="xgb.model")
dtest2 <- xgb.DMatrix("dtest.buffer")
preds2 <- xgb.predict(bst2, dtest2)
# assert they are the same
stopifnot(sum(abs(preds2-preds)) == 0)

###
# build dmatrix from sparseMatrix
###
print ('start running example of build DMatrix from R.sparseMatrix')
csc <- read.libsvm("agaricus.txt.train", 126)
label <- csc$label
data <- csc$data
dtrain <- xgb.DMatrix(data, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)

###
# build dmatrix from dense matrix
###
print ('start running example of build DMatrix from R.Matrix')
mat = as.matrix(data)
dtrain <- xgb.DMatrix(mat, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)

###
# advanced: customized loss function
#
print("start running example to use customized objective function")
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1)
# user-defined objective function: given prediction, return gradient and second-order gradient
# this is log-likelihood loss
logregobj <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
preds <- 1.0 / (1.0 + exp(-preds))
grad <- preds - labels
hess <- preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
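(Editorial note, a standard derivation rather than part of the demo itself: with margin z, p = 1 / (1 + exp(-z)) and label y in {0, 1}, the negative log-likelihood and its derivatives with respect to z are

$$L(z) = -\bigl[y\ln p + (1-y)\ln(1-p)\bigr], \qquad \frac{\partial L}{\partial z} = p - y, \qquad \frac{\partial^2 L}{\partial z^2} = p(1-p),$$

which is exactly grad <- preds - labels and hess <- preds * (1.0 - preds) above.)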
# user-defined evaluation function, return a list(metric="metric-name", value="metric-value")
# NOTE: when you use a customized loss function, the default prediction value is the margin
# this may make built-in evaluation metrics not function properly
# for example, with logistic loss the prediction is the score before the logistic transformation
# while the built-in evaluation error assumes input after the logistic transformation
# Keep this in mind when you use the customization; you may need to write a customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}

# training with customized objective; we can also do step-by-step training
# simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)

###
# advanced: start from an initial base prediction
#
print ("start running example to start from an initial prediction")
# specify parameters via map, definitions are the same as the C++ version
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
# train xgboost for 1 round
bst <- xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value, not the transformed prediction, in set_base_margin
# predicting with outputmargin=TRUE always gives margin values before the logistic transformation
ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
succ <- xgb.setinfo(dtest, "base_margin", ptest)
print ("this is result of running from initial prediction")
bst <- xgb.train( param, dtrain, 1, watchlist )
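(Editorial note on what base_margin does here, standard boosting bookkeeping stated for clarity: once margins m_i are attached to the DMatrix, later rounds fit trees on top of them, so the predicted margin and probability become

$$\hat{z}_i = m_i + \sum_{t=1}^{T} f_t(x_i), \qquad \hat{p}_i = \frac{1}{1+e^{-\hat{z}_i}},$$

which is why the raw margin from outputmargin=TRUE, not the transformed probability, must be fed back in.)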
@ -2,14 +2,11 @@ Wrapper of XGBoost
=====
This folder provides wrappers of xgboost for other languages

Python
=====
* To make the python module, type ```make``` in the root directory of the project
* Refer to the walk through example in [python-example/demo.py](python-example/demo.py)
* Refer also to the walk through example in [demo folder](../demo/guide-python), and the minimal sketch below
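For orientation, a minimal sketch of the Python wrapper in action, condensed from the demo.py shown later in this diff; it assumes the demo's agaricus data files are in the working directory and that the wrapper directory is importable:

```python
import sys
sys.path.append('../')   # path to the wrapper, adjust as needed
import xgboost as xgb

# data can be libsvm text or an xgboost binary buffer
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')

# parameter names mirror the C++ configuration
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}

# train two rounds, printing eval/train error each round
bst = xgb.train(param, dtrain, 2, [(dtest, 'eval'), (dtrain, 'train')])
preds = bst.predict(dtest)
```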
R
=====
* To make the R wrapper, type ```make R``` in the root directory of the project
* The R module needs Rinternals.h; find its path on your system and add it to CPLUS_INCLUDE_PATH in the Makefile
* Refer to the walk through example in [R-example/demo.R](R-example/demo.R)
* See [R-package](../R-package)

@ -1,3 +0,0 @@
Example of using xgboost from Python; the data is generated from demo/binary_classification, in libsvm format

For usage, see demo.py and the comments in it

File diff suppressed because it is too large
File diff suppressed because it is too large
@ -1,121 +0,0 @@
#!/usr/bin/python
import sys
import numpy as np
import scipy.sparse
# append the path to xgboost, you may need to change the following line
# alternatively, you can add the path to the PYTHONPATH environment variable
sys.path.append('../')
import xgboost as xgb

### simple example
# load file from text file, also binary buffer generated by xgboost
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')

# specify parameters via map, definitions are the same as the C++ version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }

# specify validation sets to watch performance
evallist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, evallist)

# this is prediction
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
bst.save_model('0001.model')
# dump model
bst.dump_model('dump.raw.txt')
# dump model with feature map
bst.dump_model('dump.nice.txt','featmap.txt')

# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
bst.save_model('xgb.model')
# load model and data in
bst2 = xgb.Booster(model_file='xgb.model')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2)
# assert they are the same
assert np.sum(np.abs(preds2-preds)) == 0

###
# build dmatrix from scipy.sparse
print ('start running example of build DMatrix from scipy.sparse')
labels = []
row = []; col = []; dat = []
i = 0
for l in open('agaricus.txt.train'):
arr = l.split()
labels.append( int(arr[0]))
for it in arr[1:]:
k,v = it.split(':')
row.append(i); col.append(int(k)); dat.append(float(v))
i += 1
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix( csr )
dtrain.set_label(labels)
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )

print ('start running example of build DMatrix from numpy array')
# NOTE: npymat is a numpy array; it is converted into scipy.sparse.csr_matrix internally, then converted to DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix( npymat)
dtrain.set_label(labels)
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )

###
# advanced: customized loss function
#
print ('start running example to use customized objective function')

# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param = {'max_depth':2, 'eta':1, 'silent':1 }

# user-defined objective function: given prediction, return gradient and second-order gradient
# this is log-likelihood loss
def logregobj(preds, dtrain):
labels = dtrain.get_label()
preds = 1.0 / (1.0 + np.exp(-preds))
grad = preds - labels
hess = preds * (1.0-preds)
return grad, hess

# user-defined evaluation function, return a pair metric_name, result
# NOTE: when you use a customized loss function, the default prediction value is the margin
# this may make built-in evaluation metrics not function properly
# for example, with logistic loss the prediction is the score before the logistic transformation
# while the built-in evaluation error assumes input after the logistic transformation
# Keep this in mind when you use the customization; you may need to write a customized evaluation function
def evalerror(preds, dtrain):
labels = dtrain.get_label()
# return a pair metric_name, result
# since preds are margins (before logistic transformation, cutoff at 0)
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

# training with customized objective; we can also do step-by-step training
# simply look at xgboost.py's implementation of train
bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror)

###
# advanced: start from an initial base prediction
#
print ('start running example to start from an initial prediction')
# specify parameters via map, definitions are the same as the C++ version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# train xgboost for 1 round
bst = xgb.train( param, dtrain, 1, evallist )
# Note: we need the margin value, not the transformed prediction, in set_base_margin
# predicting with output_margin=True always gives margin values before the logistic transformation
ptrain = bst.predict(dtrain, output_margin=True)
ptest = bst.predict(dtest, output_margin=True)
dtrain.set_base_margin(ptrain)
dtest.set_base_margin(ptest)

print ('this is result of running from initial prediction')
bst = xgb.train( param, dtrain, 1, evallist )
@ -1,126 +0,0 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i
@ -1,222 +0,0 @@
# depends on matrix
succ <- require("Matrix")
if (!succ) {
stop("xgboost depends on Matrix library")
}
# load in library
dyn.load("./libxgboostR.so")

# constructing DMatrix
xgb.DMatrix <- function(data, info=list(), missing=0.0) {
if (typeof(data) == "character") {
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE))
} else if(is.matrix(data)) {
handle <- .Call("XGDMatrixCreateFromMat_R", data, missing)
} else if(class(data) == "dgCMatrix") {
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x)
} else {
stop(paste("xgb.DMatrix: does not support construction from ", typeof(data)))
}
dmat <- structure(handle, class="xgb.DMatrix")
if (length(info) != 0) {
for (i in 1:length(info)) {
p <- info[i]
xgb.setinfo(dmat, names(p), p[[1]])
}
}
return(dmat)
}
# get information from dmatrix
xgb.getinfo <- function(dmat, name) {
if (typeof(name) != "character") {
stop("xgb.getinfo: name must be character")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.getinfo: first argument dtrain must be xgb.DMatrix");
}
if (name != "label" &&
name != "weight" &&
name != "base_margin" ) {
stop(paste("xgb.getinfo: unknown info name", name))
}
ret <- .Call("XGDMatrixGetInfo_R", dmat, name)
return(ret)
}
# set information into dmatrix, this mutates the dmatrix
xgb.setinfo <- function(dmat, name, info) {
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
}
if (name == "label") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
return(TRUE)
}
if (name == "weight") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
return(TRUE)
}
if (name == "base_margin") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
return(TRUE)
}
if (name == "group") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info))
return(TRUE)
}
stop(paste("xgb.setinfo: unknown info name", name))
return(FALSE)
}
# construct a Booster from cachelist
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (typeof(cachelist) != "list") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
for (dm in cachelist) {
if (class(dm) != "xgb.DMatrix") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
}
handle <- .Call("XGBoosterCreate_R", cachelist)
.Call("XGBoosterSetParam_R", handle, "seed", "0")
if (length(params) != 0) {
for (i in 1:length(params)) {
p <- params[i]
.Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
}
}
if (!is.null(modelfile)) {
if (typeof(modelfile) != "character"){
stop("xgb.Booster: modelfile must be character");
}
.Call("XGBoosterLoadModel_R", handle, modelfile)
}
return(structure(handle, class="xgb.Booster"))
}
# train a model using given parameters
xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
if (typeof(params) != "list") {
stop("xgb.train: first argument params must be list");
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.train: second argument dtrain must be xgb.DMatrix");
}
bst <- xgb.Booster(params, append(watchlist,dtrain))
for (i in 1:nrounds) {
if (is.null(obj)) {
succ <- xgb.iter.update(bst, dtrain, i-1)
} else {
pred <- xgb.predict(bst, dtrain)
gpair <- obj(pred, dtrain)
succ <- xgb.iter.boost(bst, dtrain, gpair)
}
if (length(watchlist) != 0) {
if (is.null(feval)) {
msg <- xgb.iter.eval(bst, watchlist, i-1)
cat(msg); cat("\n")
} else {
cat("["); cat(i); cat("]");
for (j in 1:length(watchlist)) {
w <- watchlist[j]
if (length(names(w)) == 0) {
stop("xgb.eval: a name tag must be present for every element in watchlist")
}
ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
cat(":"); cat(ret$value)
}
cat("\n")
}
}
}
return(bst)
}
# save model or DMatrix to file
xgb.save <- function(handle, fname) {
if (typeof(fname) != "character") {
stop("xgb.save: fname must be character");
}
if (class(handle) == "xgb.Booster") {
.Call("XGBoosterSaveModel_R", handle, fname);
return(TRUE)
}
if (class(handle) == "xgb.DMatrix") {
.Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE))
return(TRUE)
}
stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
return(FALSE)
}
# predict
xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
if (class(booster) != "xgb.Booster") {
stop("xgb.predict: first argument must be type xgb.Booster")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.predict: second argument must be type xgb.DMatrix")
}
ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin))
return(ret)
}
# dump model
xgb.dump <- function(booster, fname, fmap = "") {
if (class(booster) != "xgb.Booster") {
stop("xgb.dump: first argument must be type xgb.Booster")
}
if (typeof(fname) != "character"){
stop("xgb.dump: second argument must be type character")
}
.Call("XGBoosterDumpModel_R", booster, fname, fmap)
return(TRUE)
}
##--------------------------------------
# the following are low-level iteration functions, not needed
# unless you want to drive training step by step yourself
#---------------------------------------
# iteratively update booster with dtrain
xgb.iter.update <- function(booster, dtrain, iter) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain)
return(TRUE)
}
# iteratively update booster with customized statistics
xgb.iter.boost <- function(booster, dtrain, gpair) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.boost: first argument must be type xgb.Booster")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.boost: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess)
return(TRUE)
}
# iteratively evaluate one iteration
xgb.iter.eval <- function(booster, watchlist, iter) {
if (class(booster) != "xgb.Booster") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
if (typeof(watchlist) != "list") {
stop("xgb.eval: only accepts list of DMatrix as watchlist")
}
for (w in watchlist) {
if (class(w) != "xgb.DMatrix") {
stop("xgb.eval: watch list can only contain xgb.DMatrix")
}
}
evnames <- list()
if (length(watchlist) != 0) {
for (i in 1:length(watchlist)) {
w <- watchlist[i]
if (length(names(w)) == 0) {
stop("xgb.eval: a name tag must be present for every element in watchlist")
}
evnames <- append(evnames, names(w))
}
}
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames)
return(msg)
}
Some files were not shown because too many files have changed in this diff