commit 909a61edac
Merge branch 'master' of https://github.com/tqchen/xgboost into tqchen-master

Conflicts:
	README.md

.gitignore
@@ -6,12 +6,15 @@
 # Compiled Dynamic libraries
 *.so
 *.dylib
+*.page
 # Compiled Static libraries
 *.lai
 *.la
 *.a
 *~
+*.Rcheck
+*.rds
+*.tar.gz
 *txt*
 *conf
 *buffer
@@ -40,3 +43,4 @@ Debug
 *x64
 *dump
 *save
+*csv
@@ -11,7 +11,7 @@ xgboost-0.2x
 * Weighted samples instances
 * Initial version of pairwise rank

-xgboost-unity
+xgboost-0.3
 =====
 * Faster tree construction module
   - Allows subsample columns during tree construction via ```bst:col_samplebytree=ratio```
Makefile
@@ -1,32 +1,32 @@
 export CC = gcc
 export CXX = g++
 export LDFLAGS= -pthread -lm
-# note for R module
-# add include path to Rinternals.h here
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -pedantic

 ifeq ($(no_omp),1)
-	export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -DDISABLE_OPENMP
+	CFLAGS += -DDISABLE_OPENMP
 else
-	export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
+	CFLAGS += -fopenmp
 endif

-# expose these flags to R CMD SHLIB
-export PKG_CPPFLAGS = $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_
-
 # specify tensor path
 BIN = xgboost
-OBJ =
+OBJ = updater.o gbm.o io.o
 SLIB = wrapper/libxgboostwrapper.so
-RLIB = wrapper/libxgboostR.so
-.PHONY: clean all R
-
-all: $(BIN) wrapper/libxgboostwrapper.so
-R: wrapper/libxgboostR.so
-
-xgboost: src/xgboost_main.cpp src/io/io.cpp src/data.h src/tree/*.h src/tree/*.hpp src/gbm/*.h src/gbm/*.hpp src/utils/*.h src/learner/*.h src/learner/*.hpp
+.PHONY: clean all python Rpack
+
+all: $(BIN) $(OBJ) $(SLIB)
+
+python: wrapper/libxgboostwrapper.so

 # now the wrapper takes in two files. io and wrapper part
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/io/io.cpp src/*.h src/*/*.hpp src/*/*.h
-wrapper/libxgboostR.so: wrapper/xgboost_wrapper.cpp wrapper/xgboost_R.cpp src/io/io.cpp src/*.h src/*/*.hpp src/*/*.h
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
+updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h
+gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
+io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
+xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)

 $(BIN) :
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
@@ -34,14 +34,31 @@ $(BIN) :
 $(SLIB) :
 	$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)

-$(RLIB) :
-	R CMD SHLIB -c -o $@ $(filter %.cpp %.o %.c, $^)
-
 $(OBJ) :
 	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

 install:
 	cp -f -r $(BIN) $(INSTALL_PATH)

+Rpack:
+	make clean
+	rm -rf xgboost xgboost*.tar.gz
+	cp -r R-package xgboost
+	rm -rf xgboost/inst/examples/*.buffer
+	rm -rf xgboost/inst/examples/*.model
+	rm -rf xgboost/inst/examples/dump*
+	rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
+	rm -rf xgboost/demo/*.model xgboost/demo/*.buffer
+	cp -r src xgboost/src/src
+	mkdir xgboost/src/wrapper
+	cp wrapper/xgboost_wrapper.h xgboost/src/wrapper
+	cp wrapper/xgboost_wrapper.cpp xgboost/src/wrapper
+	cp ./LICENSE xgboost
+	cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
+	cat R-package/src/Makevars.win|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars.win
+	R CMD build xgboost
+	rm -rf xgboost
+	R CMD check --as-cran xgboost*.tar.gz
+
 clean:
-	$(RM) $(OBJ) $(BIN) $(SLIB) $(RLIB) *~ */*~ */*/*~
+	$(RM) $(OBJ) $(BIN) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
R-package/DESCRIPTION
@@ -1,12 +1,20 @@
 Package: xgboost
 Type: Package
-Title: R wrapper of xgboost
-Version: 0.3-0
+Title: eXtreme Gradient Boosting
+Version: 0.3-1
 Date: 2014-08-23
-Author: Tianqi Chen
-Maintainer: Tianqi Chen <tianqi.tchen@gmail.com>
-Description: xgboost
-License: See LICENSE file
+Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>
+Maintainer: Tong He <hetong007@gmail.com>
+Description: This package is a R wrapper of xgboost, which is short for eXtreme
+    Gradient Boosting. It is an efficient and scalable implementation of
+    gradient boosting framework. The package includes efficient linear model
+    solver and tree learning algorithms. The package can automatically do
+    parallel computation with OpenMP, and it can be more than 10 times faster
+    than existing gradient boosting packages such as gbm. It supports various
+    objective functions, including regression, classification and ranking. The
+    package is made to be extensible, so that users are also allowed to define
+    their own objectives easily.
+License: Apache License (== 2.0) | file LICENSE
 URL: https://github.com/tqchen/xgboost
 BugReports: https://github.com/tqchen/xgboost/issues
 Depends:
R-package/NAMESPACE
@@ -1,10 +1,15 @@
-importClassesFrom("Matrix", dgCMatrix, dgeMatrix)
-export(xgboost)
+# Generated by roxygen2 (4.0.1): do not edit by hand
+export(getinfo)
+export(slice)
 export(xgb.DMatrix)
-export(xgb.getinfo)
-exportMethods(predict)
-export(xgb.train)
-export(xgb.save)
-export(xgb.load)
+export(xgb.DMatrix.save)
 export(xgb.dump)
+export(xgb.load)
+export(xgb.save)
+export(xgb.train)
+export(xgboost)
+exportMethods(predict)
+import(methods)
+importClassesFrom(Matrix,dgCMatrix)
+importClassesFrom(Matrix,dgeMatrix)
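The NAMESPACE above is now generated by roxygen2 rather than edited by hand. A minimal sketch of regenerating it from the `#'` comments in `R/` (assuming roxygen2 ~4.0 is installed; the call below is illustrative, not part of this commit):

```r
# run from the repository root; rewrites R-package/NAMESPACE and man/
library(roxygen2)
roxygenise("R-package")
```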
R-package/R/getinfo.xgb.DMatrix.R (new file)
@@ -0,0 +1,38 @@
+setClass('xgb.DMatrix')
+
+#' Get information of an xgb.DMatrix object
+#'
+#' Get information of an xgb.DMatrix object
+#'
+#' @examples
+#' data(iris)
+#' iris[,5] <- as.numeric(iris[,5])
+#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
+#' labels <- getinfo(dtrain, "label")
+#' @rdname getinfo
+#' @export
+#'
+getinfo <- function(object, ...){
+  UseMethod("getinfo")
+}
+
+#' @param object Object of class "xgb.DMatrix"
+#' @param name the name of the field to get
+#' @param ... other parameters
+#' @rdname getinfo
+#' @method getinfo xgb.DMatrix
+setMethod("getinfo", signature = "xgb.DMatrix",
+          definition = function(object, name) {
+            if (typeof(name) != "character") {
+              stop("xgb.getinfo: name must be character")
+            }
+            if (class(object) != "xgb.DMatrix") {
+              stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
+            }
+            if (name != "label" && name != "weight" && name != "base_margin") {
+              stop(paste("xgb.getinfo: unknown info name", name))
+            }
+            ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
+            return(ret)
+          })
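A quick sketch of the new `getinfo` generic, adapted from the roxygen examples in this file (assumes the package as built from this commit):

```r
library(xgboost)

data(iris)
iris[, 5] <- as.numeric(iris[, 5])
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = iris[, 5])

# "label", "weight" and "base_margin" are the accepted field names
labels <- getinfo(dtrain, "label")
stopifnot(length(labels) == nrow(iris))
```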
@@ -1,16 +1,37 @@
-#' @export
 setClass("xgb.Booster")

+#' Predict method for eXtreme Gradient Boosting model
+#'
+#' Predicted values based on xgboost model object.
+#'
+#' @param object Object of class "xgb.Boost"
+#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
+#'   \code{xgb.DMatrix}.
+#' @param outputmargin whether the prediction should be shown in the original
+#'   value of sum of functions, when outputmargin=TRUE, the prediction is
+#'   untransformed margin value. In logistic regression, outputmargin=T will
+#'   output value before logistic transformation.
+#' @param ntreelimit limit number of trees used in prediction, this parameter is only valid for gbtree, but not for gblinear.
+#'   set it to be value bigger than 0. It will use all trees by default.
+#' @examples
+#' data(iris)
+#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+#' pred <- predict(bst, as.matrix(iris[,1:4]))
 #' @export
-setMethod("predict",
-          signature = "xgb.Booster",
-          definition = function(object, newdata, outputmargin = FALSE)
-          {
+#'
+setMethod("predict", signature = "xgb.Booster",
+          definition = function(object, newdata, outputmargin = FALSE, ntreelimit = NULL) {
            if (class(newdata) != "xgb.DMatrix") {
-             newdata = xgb.DMatrix(newdata)
+             newdata <- xgb.DMatrix(newdata)
            }
-           ret <- .Call("XGBoosterPredict_R", object, newdata,
-                        as.integer(outputmargin), PACKAGE="xgboost")
+           if (is.null(ntreelimit)) {
+             ntreelimit <- 0
+           } else {
+             if (ntreelimit < 1){
+               stop("predict: ntreelimit must be equal to or greater than 1")
+             }
+           }
+           ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(outputmargin), as.integer(ntreelimit), PACKAGE = "xgboost")
            return(ret)
          })
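The rewritten predict method adds `ntreelimit`: `NULL` is mapped to 0 internally and uses all trees, while a value of 1 or more restricts prediction to the first n trees (gbtree only). A hedged sketch built on the example above:

```r
library(xgboost)

data(iris)
bst <- xgboost(as.matrix(iris[, 1:4]), as.numeric(iris[, 5]), nrounds = 2)

pred.all <- predict(bst, as.matrix(iris[, 1:4]))                  # both trees
pred.one <- predict(bst, as.matrix(iris[, 1:4]), ntreelimit = 1)  # first tree only
# ntreelimit < 1 stops with an error rather than silently using all trees
```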
R-package/R/slice.xgb.DMatrix.R (new file)
@@ -0,0 +1,33 @@
+setClass('xgb.DMatrix')
+
+#' Get a new DMatrix containing the specified rows of
+#' orginal xgb.DMatrix object
+#'
+#' Get a new DMatrix containing the specified rows of
+#' orginal xgb.DMatrix object
+#'
+#' @examples
+#' data(iris)
+#' iris[,5] <- as.numeric(iris[,5])
+#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
+#' dsub <- slice(dtrain, 1:3)
+#' @rdname slice
+#' @export
+#'
+slice <- function(object, ...){
+  UseMethod("slice")
+}
+
+#' @param object Object of class "xgb.DMatrix"
+#' @param idxset a integer vector of indices of rows needed
+#' @param ... other parameters
+#' @rdname slice
+#' @method slice xgb.DMatrix
+setMethod("slice", signature = "xgb.DMatrix",
+          definition = function(object, idxset, ...) {
+            if (class(object) != "xgb.DMatrix") {
+              stop("slice: first argument dtrain must be xgb.DMatrix")
+            }
+            ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
+            return(structure(ret, class = "xgb.DMatrix"))
+          })
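The new `slice` generic returns a fresh `xgb.DMatrix` holding only the requested rows, which is handy for building validation folds; a sketch adapted from the examples above:

```r
library(xgboost)

data(iris)
iris[, 5] <- as.numeric(iris[, 5])
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = iris[, 5])

# new DMatrix containing rows 1-100 only
dsub <- slice(dtrain, 1:100)
```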
@@ -1,30 +1,37 @@
+#' @importClassesFrom Matrix dgCMatrix dgeMatrix
+#' @import methods
+
 # depends on matrix
 .onLoad <- function(libname, pkgname) {
-  library.dynam("xgboost", pkgname, libname);
+  library.dynam("xgboost", pkgname, libname)
 }
 .onUnload <- function(libpath) {
-  library.dynam.unload("xgboost", libpath);
+  library.dynam.unload("xgboost", libpath)
 }

 # set information into dmatrix, this mutate dmatrix
 xgb.setinfo <- function(dmat, name, info) {
   if (class(dmat) != "xgb.DMatrix") {
-    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
+    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
   }
   if (name == "label") {
-    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
+          PACKAGE = "xgboost")
     return(TRUE)
   }
   if (name == "weight") {
-    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
+          PACKAGE = "xgboost")
     return(TRUE)
   }
   if (name == "base_margin") {
-    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
+          PACKAGE = "xgboost")
     return(TRUE)
   }
   if (name == "group") {
-    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), PACKAGE="xgboost")
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info),
+          PACKAGE = "xgboost")
     return(TRUE)
   }
   stop(paste("xgb.setinfo: unknown info name", name))
@@ -42,16 +49,16 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
     }
   }
   handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost")
-  .Call("XGBoosterSetParam_R", handle, "seed", "0", PACKAGE="xgboost")
   if (length(params) != 0) {
     for (i in 1:length(params)) {
       p <- params[i]
-      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p), PACKAGE="xgboost")
+      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p),
+            PACKAGE = "xgboost")
     }
   }
   if (!is.null(modelfile)) {
     if (typeof(modelfile) != "character") {
-      stop("xgb.Booster: modelfile must be character");
+      stop("xgb.Booster: modelfile must be character")
     }
     .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
   }
@@ -67,14 +74,13 @@ xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
   if (class(dmat) != "xgb.DMatrix") {
     stop("xgb.predict: second argument must be type xgb.DMatrix")
   }
-  ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin), PACKAGE="xgboost")
+  ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin),
+               PACKAGE = "xgboost")
   return(ret)
 }

-##--------------------------------------
-# the following are low level iteratively function, not needed
-# if you do not want to use them
-#---------------------------------------
+## ----the following are low level iteratively function, not needed if
+## you do not want to use them ---------------------------------------

 # iteratively update booster with dtrain
 xgb.iter.update <- function(booster, dtrain, iter) {
@@ -84,7 +90,8 @@ xgb.iter.update <- function(booster, dtrain, iter) {
   if (class(dtrain) != "xgb.DMatrix") {
     stop("xgb.iter.update: second argument must be type xgb.DMatrix")
   }
-  .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, PACKAGE="xgboost")
+  .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
+        PACKAGE = "xgboost")
   return(TRUE)
 }

@@ -96,7 +103,8 @@ xgb.iter.boost <- function(booster, dtrain, gpair) {
   if (class(dtrain) != "xgb.DMatrix") {
     stop("xgb.iter.update: second argument must be type xgb.DMatrix")
   }
-  .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE="xgboost")
+  .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess,
+        PACKAGE = "xgboost")
   return(TRUE)
 }

@@ -113,8 +121,8 @@ xgb.iter.eval <- function(booster, watchlist, iter) {
       stop("xgb.eval: watch list can only contain xgb.DMatrix")
     }
   }
-  evnames <- list()
   if (length(watchlist) != 0) {
+    evnames <- list()
     for (i in 1:length(watchlist)) {
       w <- watchlist[i]
       if (length(names(w)) == 0) {
@@ -122,7 +130,10 @@ xgb.iter.eval <- function(booster, watchlist, iter) {
       }
       evnames <- append(evnames, names(w))
     }
+    msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
+                 evnames, PACKAGE = "xgboost")
+  } else {
+    msg <- ""
   }
-  msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames, PACKAGE="xgboost")
   return(msg)
 }
@@ -1,21 +1,44 @@
-# constructing DMatrix
-xgb.DMatrix <- function(data, missing=0.0, ...) {
+#' Contruct xgb.DMatrix object
+#'
+#' Contruct xgb.DMatrix object from dense matrix, sparse matrix or local file.
+#'
+#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character
+#'   indicating the data file.
+#' @param info a list of information of the xgb.DMatrix object
+#' @param missing Missing is only used when input is dense matrix, pick a float
+#    value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
+#
+#' @param ... other information to pass to \code{info}.
+#'
+#' @examples
+#' data(iris)
+#' iris[,5] <- as.numeric(iris[,5])
+#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
+#' xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
+#' dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
+#' @export
+#'
+xgb.DMatrix <- function(data, info = list(), missing = 0, ...) {
   if (typeof(data) == "character") {
-    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE="xgboost")
+    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE),
+                    PACKAGE = "xgboost")
   } else if (is.matrix(data)) {
-    handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, PACKAGE="xgboost")
+    handle <- .Call("XGDMatrixCreateFromMat_R", data, missing,
+                    PACKAGE = "xgboost")
   } else if (class(data) == "dgCMatrix") {
-    handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, PACKAGE="xgboost")
+    handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x,
+                    PACKAGE = "xgboost")
   } else {
-    stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data)))
+    stop(paste("xgb.DMatrix: does not support to construct from ",
+               typeof(data)))
   }
   dmat <- structure(handle, class = "xgb.DMatrix")

-  info = list(...)
+  info <- append(info, list(...))
   if (length(info) == 0)
     return(dmat)
   for (i in 1:length(info)) {
-    p = info[i]
+    p <- info[i]
     xgb.setinfo(dmat, names(p), p[[1]])
   }
   return(dmat)
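`xgb.DMatrix` now also takes an `info` list (merged with `...`), and accepts a dense matrix, a `dgCMatrix` or a file path. A minimal sketch of the three construction routes (assumes the package as built from this commit):

```r
library(xgboost)
library(Matrix)

data(iris)
X <- as.matrix(iris[, 1:4])
y <- as.numeric(iris[, 5])

dtrain  <- xgb.DMatrix(X, label = y)                   # dense matrix
dsparse <- xgb.DMatrix(as(X, "dgCMatrix"), label = y)  # sparse matrix
dinfo   <- xgb.DMatrix(X, info = list(label = y))      # label via the info list
```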
R-package/R/xgb.DMatrix.save.R (new file)
@@ -0,0 +1,27 @@
+#' Save xgb.DMatrix object to binary file
+#'
+#' Save xgb.DMatrix object to binary file
+#'
+#' @param DMatrix the model object.
+#' @param fname the name of the binary file.
+#'
+#' @examples
+#' data(iris)
+#' iris[,5] <- as.numeric(iris[,5])
+#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
+#' xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
+#' dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
+#' @export
+#'
+xgb.DMatrix.save <- function(DMatrix, fname) {
+  if (typeof(fname) != "character") {
+    stop("xgb.save: fname must be character")
+  }
+  if (class(DMatrix) == "xgb.DMatrix") {
+    .Call("XGDMatrixSaveBinary_R", DMatrix, fname, as.integer(FALSE),
+          PACKAGE = "xgboost")
+    return(TRUE)
+  }
+  stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
+  return(FALSE)
+}
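Round-tripping a DMatrix through its binary format, taken from the roxygen examples above:

```r
library(xgboost)

data(iris)
iris[, 5] <- as.numeric(iris[, 5])
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = iris[, 5])

xgb.DMatrix.save(dtrain, "iris.xgb.DMatrix")  # write binary buffer
dtrain <- xgb.DMatrix("iris.xgb.DMatrix")     # reload it
```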
@@ -1,11 +1,29 @@
-# dump model
-xgb.dump <- function(booster, fname, fmap = "") {
-  if (class(booster) != "xgb.Booster") {
+#' Save xgboost model to text file
+#'
+#' Save a xgboost model to text file. Could be parsed later.
+#'
+#' @param model the model object.
+#' @param fname the name of the binary file.
+#' @param fmap feature map file representing the type of feature.
+#'   Detailed description could be found at
+#'   \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
+#'   Run inst/examples/demo.R for the result and inst/examples/featmap.txt
+#'   for example Format.
+#'
+#'
+#' @examples
+#' data(iris)
+#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+#' xgb.dump(bst, 'iris.xgb.model.dump')
+#' @export
+#'
+xgb.dump <- function(model, fname, fmap = "") {
+  if (class(model) != "xgb.Booster") {
     stop("xgb.dump: first argument must be type xgb.Booster")
   }
   if (typeof(fname) != "character") {
     stop("xgb.dump: second argument must be type character")
   }
-  .Call("XGBoosterDumpModel_R", booster, fname, fmap, PACKAGE="xgboost")
+  .Call("XGBoosterDumpModel_R", model, fname, fmap, PACKAGE = "xgboost")
   return(TRUE)
 }
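`xgb.dump` writes the fitted trees as text; with a feature map file the dump uses feature names instead of indices. A sketch taken from the example above (the optional `fmap` argument is omitted here):

```r
library(xgboost)

data(iris)
bst <- xgboost(as.matrix(iris[, 1:4]), as.numeric(iris[, 5]), nrounds = 2)

# plain text dump of the boosted trees
xgb.dump(bst, "iris.xgb.model.dump")
```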
@@ -1,16 +0,0 @@
-# get information from dmatrix
-xgb.getinfo <- function(dmat, name) {
-  if (typeof(name) != "character") {
-    stop("xgb.getinfo: name must be character")
-  }
-  if (class(dmat) != "xgb.DMatrix") {
-    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
-  }
-  if (name != "label" &&
-      name != "weight" &&
-      name != "base_margin" ) {
-    stop(paste("xgb.getinfo: unknown info name", name))
-  }
-  ret <- .Call("XGDMatrixGetInfo_R", dmat, name, PACKAGE="xgboost")
-  return(ret)
-}
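The removed `xgb.getinfo(dmat, name)` is superseded by the exported `getinfo` generic; existing calls migrate one-to-one:

```r
library(xgboost)

data(iris)
dtest <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = as.numeric(iris[, 5]))

# previously: labels <- xgb.getinfo(dtest, "label")
labels <- getinfo(dtest, "label")
```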
@@ -1,5 +1,19 @@
+#' Load xgboost model from binary file
+#'
+#' Load xgboost model from the binary model file
+#'
+#' @param modelfile the name of the binary file.
+#'
+#' @examples
+#' data(iris)
+#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+#' xgb.save(bst, 'iris.xgb.model')
+#' bst <- xgb.load('iris.xgb.model')
+#' pred <- predict(bst, as.matrix(iris[,1:4]))
+#' @export
+#'
 xgb.load <- function(modelfile) {
   if (is.null(modelfile))
-    stop('xgb.load: modelfile cannot be NULL')
+    stop("xgb.load: modelfile cannot be NULL")
   xgb.Booster(modelfile = modelfile)
 }
@@ -1,16 +1,27 @@
-# save model or DMatrix to file
-xgb.save <- function(handle, fname) {
+#' Save xgboost model to binary file
+#'
+#' Save xgboost model from xgboost or xgb.train
+#'
+#' @param model the model object.
+#' @param fname the name of the binary file.
+#'
+#' @examples
+#' data(iris)
+#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+#' xgb.save(bst, 'iris.xgb.model')
+#' bst <- xgb.load('iris.xgb.model')
+#' pred <- predict(bst, as.matrix(iris[,1:4]))
+#' @export
+#'
+xgb.save <- function(model, fname) {
   if (typeof(fname) != "character") {
     stop("xgb.save: fname must be character")
   }
-  if (class(handle) == "xgb.Booster") {
-    .Call("XGBoosterSaveModel_R", handle, fname, PACKAGE="xgboost")
+  if (class(model) == "xgb.Booster") {
+    .Call("XGBoosterSaveModel_R", model, fname, PACKAGE = "xgboost")
     return(TRUE)
   }
-  if (class(handle) == "xgb.DMatrix") {
-    .Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE), PACKAGE="xgboost")
-    return(TRUE)
-  }
-  stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
+  stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save
+       xgb.DMatrix object.")
   return(FALSE)
 }
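`xgb.save` now rejects DMatrix input (use `xgb.DMatrix.save` for that) and pairs with `xgb.load` for a model round trip, as in the roxygen examples:

```r
library(xgboost)

data(iris)
bst <- xgboost(as.matrix(iris[, 1:4]), as.numeric(iris[, 5]), nrounds = 2)

xgb.save(bst, "iris.xgb.model")     # binary model file
bst2 <- xgb.load("iris.xgb.model")  # reload
pred <- predict(bst2, as.matrix(iris[, 1:4]))
```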
@@ -1,11 +1,78 @@
-# train a model using given parameters
-xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
+#' eXtreme Gradient Boosting Training
+#'
+#' The training function of xgboost
+#'
+#' @param params the list of parameters. Commonly used ones are:
+#' \itemize{
+#'   \item \code{objective} objective function, common ones are
+#'   \itemize{
+#'     \item \code{reg:linear} linear regression
+#'     \item \code{binary:logistic} logistic regression for classification
+#'   }
+#'   \item \code{eta} step size of each boosting step
+#'   \item \code{max_depth} maximum depth of the tree
+#'   \item \code{nthread} number of thread used in training, if not set, all threads are used
+#' }
+#'
+#'   See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
+#'   further details. See also inst/examples/demo.R for walkthrough example in R.
+#' @param dtrain takes an \code{xgb.DMatrix} as the input.
+#' @param nrounds the max number of iterations
+#' @param watchlist what information should be printed when \code{verbose=1} or
+#'   \code{verbose=2}. Watchlist is used to specify validation set monitoring
+#'   during training. For example user can specify
+#'   watchlist=list(validation1=mat1, validation2=mat2) to watch
+#'   the performance of each round's model on mat1 and mat2
+#'
+#' @param obj customized objective function. Returns gradient and second order
+#'   gradient with given prediction and dtrain,
+#' @param feval custimized evaluation function. Returns
+#'   \code{list(metric='metric-name', value='metric-value')} with given
+#'   prediction and dtrain,
+#' @param ... other parameters to pass to \code{params}.
+#'
+#' @details
+#' This is the training function for xgboost.
+#'
+#' Parallelization is automatically enabled if OpenMP is present.
+#' Number of threads can also be manually specified via "nthread" parameter.
+#'
+#' This function only accepts an \code{xgb.DMatrix} object as the input.
+#' It supports advanced features such as watchlist, customized objective function,
+#' therefore it is more flexible than \code{\link{xgboost}}.
+#'
+#'
+#' @examples
+#' data(iris)
+#' iris[,5] <- as.numeric(iris[,5])
+#' dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
+#' dtest <- dtrain
+#' watchlist <- list(eval = dtest, train = dtrain)
+#' param <- list(max_depth = 2, eta = 1, silent = 1)
+#' logregobj <- function(preds, dtrain) {
+#'    labels <- getinfo(dtrain, "label")
+#'    preds <- 1/(1 + exp(-preds))
+#'    grad <- preds - labels
+#'    hess <- preds * (1 - preds)
+#'    return(list(grad = grad, hess = hess))
+#' }
+#' evalerror <- function(preds, dtrain) {
+#'   labels <- getinfo(dtrain, "label")
+#'   err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
+#'   return(list(metric = "error", value = err))
+#' }
+#' bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
+#' @export
+#'
+xgb.train <- function(params=list(), dtrain, nrounds, watchlist = list(),
+                      obj = NULL, feval = NULL, ...) {
   if (typeof(params) != "list") {
-    stop("xgb.train: first argument params must be list");
+    stop("xgb.train: first argument params must be list")
   }
   if (class(dtrain) != "xgb.DMatrix") {
-    stop("xgb.train: second argument dtrain must be xgb.DMatrix");
+    stop("xgb.train: second argument dtrain must be xgb.DMatrix")
   }
+  params = append(params, list(...))
   bst <- xgb.Booster(params, append(watchlist, dtrain))
   for (i in 1:nrounds) {
     if (is.null(obj)) {
@@ -18,17 +85,24 @@ xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, fe
     if (length(watchlist) != 0) {
       if (is.null(feval)) {
         msg <- xgb.iter.eval(bst, watchlist, i - 1)
-        cat(msg); cat("\n")
+        cat(msg)
+        cat("\n")
       } else {
-        cat("["); cat(i); cat("]");
+        cat("[")
+        cat(i)
+        cat("]")
         for (j in 1:length(watchlist)) {
           w <- watchlist[j]
           if (length(names(w)) == 0) {
             stop("xgb.eval: name tag must be presented for every elements in watchlist")
           }
           ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
-          cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
-          cat(":"); cat(ret$value)
+          cat("\t")
+          cat(names(w))
+          cat("-")
+          cat(ret$metric)
+          cat(":")
+          cat(ret$value)
         }
         cat("\n")
       }
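`xgb.train` keeps the custom objective/metric hooks; the roxygen example above condenses to the following runnable sketch (the iris labels are used only to exercise the interface):

```r
library(xgboost)

data(iris)
iris[, 5] <- as.numeric(iris[, 5])
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = iris[, 5])
watchlist <- list(train = dtrain)
param <- list(max_depth = 2, eta = 1, silent = 1)

# custom logistic objective: gradient and hessian of the log loss
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  list(grad = preds - labels, hess = preds * (1 - preds))
}
# custom metric: error rate computed on the margin scale
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  list(metric = "error", value = sum(labels != (preds > 0)) / length(labels))
}

bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, logregobj, evalerror)
```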
@@ -1,49 +1,71 @@
-# Main function for xgboost-package
-xgboost = function(x=NULL,y=NULL,DMatrix=NULL, file=NULL, validation=NULL,
-                   nrounds=10, obj=NULL, feval=NULL, margin=NULL, verbose = T, ...)
-{
-  if (!is.null(DMatrix))
-    dtrain = DMatrix
-  else
-  {
-    if (is.null(x) && is.null(y))
-    {
-      if (is.null(file))
-        stop('xgboost need input data, either R objects, local files or DMatrix object.')
-      dtrain = xgb.DMatrix(file)
-    }
-    else
-      dtrain = xgb.DMatrix(x, label=y)
-    if (!is.null(margin))
-    {
-      succ <- xgb.setinfo(dtrain, "base_margin", margin)
-      if (!succ)
-        warning('Attemp to use margin failed.')
-    }
+#' eXtreme Gradient Boosting (Tree) library
+#'
+#' A simple interface for xgboost in R
+#'
+#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
+#'   \code{xgb.DMatrix}.
+#' @param label the response variable. User should not set this field,
+#    if data is local data file or \code{xgb.DMatrix}.
+#' @param params the list of parameters. Commonly used ones are:
+#' \itemize{
+#'   \item \code{objective} objective function, common ones are
+#'   \itemize{
+#'     \item \code{reg:linear} linear regression
+#'     \item \code{binary:logistic} logistic regression for classification
+#'   }
+#'   \item \code{eta} step size of each boosting step
+#'   \item \code{max_depth} maximum depth of the tree
+#'   \item \code{nthread} number of thread used in training, if not set, all threads are used
+#' }
+#'
+#'   See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
+#'   further details. See also inst/examples/demo.R for walkthrough example in R.
+#' @param nrounds the max number of iterations
+#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
+#'   information of performance. If 2, xgboost will print information of both
+#'   performance and construction progress information
+#' @param ... other parameters to pass to \code{params}.
+#'
+#' @details
+#' This is the modeling function for xgboost.
+#'
+#' Parallelization is automatically enabled if OpenMP is present.
+#' Number of threads can also be manually specified via "nthread" parameter
+#'
+#' @examples
+#' data(iris)
+#' bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
+#' pred <- predict(bst, as.matrix(iris[,1:4]))
+#' @export
+#'
+xgboost <- function(data = NULL, label = NULL, params = list(), nrounds,
+                    verbose = 1, ...) {
+  inClass <- class(data)
+  if (inClass == "dgCMatrix" || inClass == "matrix") {
+    if (is.null(label))
+      stop("xgboost: need label when data is a matrix")
+    dtrain <- xgb.DMatrix(data, label = label)
+  } else {
+    if (!is.null(label))
+      warning("xgboost: label will be ignored.")
+    if (inClass == "character")
+      dtrain <- xgb.DMatrix(data) else if (inClass == "xgb.DMatrix")
+      dtrain <- data else stop("xgboost: Invalid input of data")
   }
-  params = list(...)
-  watchlist=list()
-  if (verbose)
-  {
-    if (!is.null(validation))
-    {
-      if (class(validation)!='xgb.DMatrix')
-        dtest = xgb.DMatrix(validation)
-      else
-        dtest = validation
-      watchlist = list(eval=dtest,train=dtrain)
-    }
-    else
-      watchlist = list(train=dtrain)
-  }
-  bst <- xgb.train(params, dtrain, nrounds, watchlist, obj, feval)
+
+  if (verbose > 1) {
+    silent <- 0
+  } else {
+    silent <- 1
+  }
+
+  params <- append(params, list(silent = silent))
+  params <- append(params, list(...))
+
+  if (verbose > 0)
+    watchlist <- list(train = dtrain) else watchlist <- list()
+
+  bst <- xgb.train(params, dtrain, nrounds, watchlist)
+
   return(bst)
 }
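The `xgboost()` entry point now dispatches on the class of `data`; a short sketch of the accepted inputs, following the examples in this diff:

```r
library(xgboost)

data(iris)
# dense matrix plus label (a dgCMatrix works the same way)
bst <- xgboost(as.matrix(iris[, 1:4]), as.numeric(iris[, 5]), nrounds = 2)
pred <- predict(bst, as.matrix(iris[, 1:4]))

# an xgb.DMatrix (or a file path) carries its own label, so label stays NULL
dtrain <- xgb.DMatrix(as.matrix(iris[, 1:4]), label = as.numeric(iris[, 5]))
bst2 <- xgboost(data = dtrain, nrounds = 2, verbose = 0)
```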
@@ -1,10 +1,21 @@
-This is subfolder for experimental version of R package.
-
-Not yet ready.
-
-Installation:
+# R package for xgboost.
+
+## Installation
+
+For up-to-date version(which is recommended), please install from github. Windows user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.

 ```r
 require(devtools)
 install_github('xgboost','tqchen',subdir='R-package')
 ```
+
+For stable version on CRAN, please run
+
+```r
+install.packages('xgboost')
+```
+
+## Examples
+
+* Please visit [demo](https://github.com/tqchen/xgboost/blob/master/R-package/inst/examples/demo.R) for walk throughe example.
+* See also the [example scripts](https://github.com/tqchen/xgboost/tree/master/demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R) on this dataset.
@@ -1,133 +0,0 @@
-require(xgboost)
-require(methods)
-
-# helper function to read libsvm format
-# this is very badly written, load in dense, and convert to sparse
-# use this only for demo purpose
-# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
-read.libsvm = function(fname, maxcol) {
-  content = readLines(fname)
-  nline = length(content)
-  label = numeric(nline)
-  mat = matrix(0, nline, maxcol+1)
-  for (i in 1:nline) {
-    arr = as.vector(strsplit(content[i], " ")[[1]])
-    label[i] = as.numeric(arr[[1]])
-    for (j in 2:length(arr)) {
-      kv = strsplit(arr[j], ":")[[1]]
-      # to avoid 0 index
-      findex = as.integer(kv[1]) + 1
-      fvalue = as.numeric(kv[2])
-      mat[i,findex] = fvalue
-    }
-  }
-  mat = as(mat, "sparseMatrix")
-  return(list(label=label, data=mat))
-}
-
-############################
-# Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
-############################
-
-# Directly read in local file
-dtrain = xgb.DMatrix('agaricus.txt.train')
-class(dtrain)
-
-# read file in R
-csc = read.libsvm("agaricus.txt.train", 126)
-y = csc$label
-x = csc$data
-
-# x as Sparse Matrix
-class(x)
-dtrain = xgb.DMatrix(x, label=y)
-
-# x as dense matrix
-dense.x = as.matrix(x)
-dtrain = xgb.DMatrix(dense.x, label=y)
-
-############################
-# Test xgboost with local file, sparse matrix and dense matrix in R.
-############################
-
-# Test with DMatrix object
-bst = xgboost(DMatrix=dtrain, max_depth=2, eta=1, silent=1, objective='binary:logistic')
-
-# Test with local file
-bst = xgboost(file='agaricus.txt.train', max_depth=2, eta=1, silent=1, objective='binary:logistic')
-
-# Test with Sparse Matrix
-bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')
-
-# Test with dense Matrix
-bst = xgboost(x = dense.x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')
-
-# Test with validation set
-bst = xgboost(file='agaricus.txt.train', validation='agaricus.txt.test',
-              max_depth=2, eta=1, silent=1, objective='binary:logistic')
-
-############################
-# Test predict
-############################
-
-# Prediction with DMatrix object
-dtest = xgb.DMatrix('agaricus.txt.test')
-pred = predict(bst, dtest)
-
-# Prediction with local test file
-pred = predict(bst, 'agaricus.txt.test')
-
-# Prediction with Sparse Matrix
-csc = read.libsvm("agaricus.txt.test", 126)
-test.y = csc$label
-test.x = csc$data
-pred = predict(bst, test.x)
-
-# Extrac label with xgb.getinfo
-labels = xgb.getinfo(dtest, "label")
-err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
-print(paste("error=",err))
-
-############################
-# Save and load model to hard disk
-############################
-
-# save model to binary local file
-xgb.save(bst, 'model.save')
-
-# load binary model to R
-bst = xgb.load('model.save')
-pred = predict(bst, test.x)
-
-# save model to text file
-xgb.dump(bst, 'model.dump')
-
-############################
-# Customized objective and evaluation function
-############################
-
-# user define objective function, given prediction, return gradient and second order gradient
-# this is loglikelihood loss
-logregobj = function(preds, dtrain) {
-  labels = xgb.getinfo(dtrain, "label")
-  preds = 1.0 / (1.0 + exp(-preds))
-  grad = preds - labels
-  hess = preds * (1.0-preds)
-  return(list(grad=grad, hess=hess))
-}
-# user defined evaluation function, return a list(metric="metric-name", value="metric-value")
-# NOTE: when you do customized loss function, the default prediction value is margin
-# this may make buildin evalution metric not function properly
-# for example, we are doing logistic loss, the prediction is score before logistic transformation
-# the buildin evaluation error assumes input is after logistic transformation
-# Take this in mind when you use the customization, and maybe you need write customized evaluation function
-evalerror = function(preds, dtrain) {
-  labels = xgb.getinfo(dtrain, "label")
-  err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
-  return(list(metric="error", value=err))
-}
-
-bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic',
-              obj=logregobj, feval=evalerror)
@@ -1,127 +0,0 @@
-# load xgboost library
-require(xgboost)
-require(methods)
-
-# helper function to read libsvm format
-# this is very badly written, load in dense, and convert to sparse
-# use this only for demo purpose
-# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
-read.libsvm <- function(fname, maxcol) {
-  content <- readLines(fname)
-  nline <- length(content)
-  label <- numeric(nline)
-  mat <- matrix(0, nline, maxcol+1)
-  for (i in 1:nline) {
-    arr <- as.vector(strsplit(content[i], " ")[[1]])
-    label[i] <- as.numeric(arr[[1]])
-    for (j in 2:length(arr)) {
-      kv <- strsplit(arr[j], ":")[[1]]
-      # to avoid 0 index
-      findex <- as.integer(kv[1]) + 1
-      fvalue <- as.numeric(kv[2])
-      mat[i,findex] <- fvalue
-    }
-  }
-  mat <- as(mat, "sparseMatrix")
-  return(list(label=label, data=mat))
-}
-
-# test code here
-dtrain <- xgb.DMatrix("agaricus.txt.train")
-dtest <- xgb.DMatrix("agaricus.txt.test")
-param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
-watchlist <- list("eval"=dtest,"train"=dtrain)
-# training xgboost model
-bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
-# make prediction
-preds <- xgb.predict(bst, dtest)
-labels <- xgb.getinfo(dtest, "label")
-err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
-# print error rate
-print(paste("error=",err))
-
-# dump model
-xgb.dump(bst, "dump.raw.txt")
-# dump model with feature map
-xgb.dump(bst, "dump.nice.txt", "featmap.txt")
-
-# save dmatrix into binary buffer
-succ <- xgb.save(dtest, "dtest.buffer")
-# save model into file
-succ <- xgb.save(bst, "xgb.model")
-# load model and data in
-bst2 <- xgb.Booster(modelfile="xgb.model")
-dtest2 <- xgb.DMatrix("dtest.buffer")
-preds2 <- xgb.predict(bst2, dtest2)
-# assert they are the same
-stopifnot(sum(abs(preds2-preds)) == 0)
-
-###
-# build dmatrix from sparseMatrix
-###
-print ('start running example of build DMatrix from R.sparseMatrix')
-csc <- read.libsvm("agaricus.txt.train", 126)
-label <- csc$label
-data <- csc$data
-dtrain <- xgb.DMatrix(data, info=list(label=label) )
-watchlist <- list("eval"=dtest,"train"=dtrain)
-bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
-
-###
-# build dmatrix from dense matrix
-###
-print ('start running example of build DMatrix from R.Matrix')
-mat = as.matrix(data)
-dtrain <- xgb.DMatrix(mat, info=list(label=label) )
-watchlist <- list("eval"=dtest,"train"=dtrain)
-bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
-
-###
-# advanced: cutomsized loss function
-#
-print("start running example to used cutomized objective function")
-# note: for customized objective function, we leave objective as default
-# note: what we are getting is margin value in prediction
-# you must know what you are doing
-param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1)
-# user define objective function, given prediction, return gradient and second order gradient
-# this is loglikelihood loss
-logregobj <- function(preds, dtrain) {
-  labels <- xgb.getinfo(dtrain, "label")
-  preds <- 1.0 / (1.0 + exp(-preds))
-  grad <- preds - labels
-  hess <- preds * (1.0-preds)
-  return(list(grad=grad, hess=hess))
-}
-# user defined evaluation function, return a list(metric="metric-name", value="metric-value")
-# NOTE: when you do customized loss function, the default prediction value is margin
-# this may make buildin evalution metric not function properly
-# for example, we are doing logistic loss, the prediction is score before logistic transformation
-# the buildin evaluation error assumes input is after logistic transformation
-# Take this in mind when you use the customization, and maybe you need write customized evaluation function
-evalerror <- function(preds, dtrain) {
-  labels <- xgb.getinfo(dtrain, "label")
-  err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
-  return(list(metric="error", value=err))
-}
-
-# training with customized objective, we can also do step by step training
-# simply look at xgboost.py"s implementation of train
-bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
-
-###
-# advanced: start from a initial base prediction
-#
-print ("start running example to start from a initial prediction")
-# specify parameters via map, definition are same as c++ version
-param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
-# train xgboost for 1 round
-bst <- xgb.train( param, dtrain, 1, watchlist )
-# Note: we need the margin value instead of transformed prediction in set_base_margin
-# do predict with output_margin=True, will always give you margin values before logistic transformation
-ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
-ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
-succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
-succ <- xgb.setinfo(dtest, "base_margin", ptest)
-print ("this is result of running from initial prediction")
-bst <- xgb.train( param, dtrain, 1, watchlist )
@@ -1,103 +1,153 @@
 require(xgboost)
 require(methods)

-# helper function to read libsvm format
-# this is very badly written, load in dense, and convert to sparse
-# use this only for demo purpose
-# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
-read.libsvm = function(fname, maxcol) {
-  content = readLines(fname)
-  nline = length(content)
-  label = numeric(nline)
-  mat = matrix(0, nline, maxcol+1)
+# helper function to read libsvm format this is very badly written, load in dense, and convert to sparse
+# use this only for demo purpose adopted from
+# https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
+read.libsvm <- function(fname, maxcol) {
+  content <- readLines(fname)
+  nline <- length(content)
+  label <- numeric(nline)
+  mat <- matrix(0, nline, maxcol + 1)
   for (i in 1:nline) {
-    arr = as.vector(strsplit(content[i], " ")[[1]])
-    label[i] = as.numeric(arr[[1]])
+    arr <- as.vector(strsplit(content[i], " ")[[1]])
+    label[i] <- as.numeric(arr[[1]])
     for (j in 2:length(arr)) {
-      kv = strsplit(arr[j], ":")[[1]]
+      kv <- strsplit(arr[j], ":")[[1]]
       # to avoid 0 index
-      findex = as.integer(kv[1]) + 1
-      fvalue = as.numeric(kv[2])
-      mat[i,findex] = fvalue
+      findex <- as.integer(kv[1]) + 1
+      fvalue <- as.numeric(kv[2])
+      mat[i, findex] <- fvalue
     }
   }
-  mat = as(mat, "sparseMatrix")
+  mat <- as(mat, "sparseMatrix")
   return(list(label = label, data = mat))
 }

-# Parameter setting
+############################ Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
+
+# Directly read in local file
 dtrain <- xgb.DMatrix("agaricus.txt.train")
-dtest <- xgb.DMatrix("agaricus.txt.test")
-param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
-watchlist = list("eval"=dtest,"train"=dtrain)
-
-###########################
-# Train from local file
-###########################
-
-# Training
-bst = xgboost(file='agaricus.txt.train',params=param,watchlist=watchlist)
-# Prediction
-pred = predict(bst, 'agaricus.txt.test')
-# Performance
-labels = xgb.getinfo(dtest, "label")
-err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
-print(paste("error=",err))
-
-###########################
-# Train from R object
-###########################
-
-csc = read.libsvm("agaricus.txt.train", 126)
-y = csc$label
-x = csc$data
+class(dtrain)
+
+# read file in R
+csc <- read.libsvm("agaricus.txt.train", 126)
+y <- csc$label
+x <- csc$data
+
 # x as Sparse Matrix
 class(x)
+dtrain <- xgb.DMatrix(x, label = y)

-# Training
-bst = xgboost(x,y,params=param,watchlist=watchlist)
-# Prediction
-pred = predict(bst, 'agaricus.txt.test')
-# Performance
-labels = xgb.getinfo(dtest, "label")
-err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
+# x as dense matrix
+dense.x <- as.matrix(x)
+dtrain <- xgb.DMatrix(dense.x, label = y)
+
+############################ Test xgboost with local file, sparse matrix and dense matrix in R.
+
+# Test with DMatrix object
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic")
+
+# Verbose = 0,1,2
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 0)
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 1)
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic", verbose = 2)
+
+# Test with local file
+bst <- xgboost(data = "agaricus.txt.train", max_depth = 2, eta = 1,nround = 2,
+               objective = "binary:logistic")
+
+# Test with Sparse Matrix
+bst <- xgboost(data = x, label = y, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic")
+
+# Test with dense Matrix
+bst <- xgboost(data = dense.x, label = y, max_depth = 2, eta = 1, nround = 2,
+               objective = "binary:logistic")
+
+############################ Test predict
+
+# Prediction with DMatrix object
+dtest <- xgb.DMatrix("agaricus.txt.test")
+pred <- predict(bst, dtest)
+
+# Prediction with local test file
+pred <- predict(bst, "agaricus.txt.test")
+
+# Prediction with Sparse Matrix
+csc <- read.libsvm("agaricus.txt.test", 126)
+test.y <- csc$label
+test.x <- csc$data
+pred <- predict(bst, test.x)
+
+# Extrac label with getinfo
+labels <- getinfo(dtest, "label")
+err <- as.numeric(sum(as.integer(pred > 0.5) != labels))/length(labels)
 print(paste("error=", err))

-# Training with dense matrix
-x = as.matrix(x)
-bst = xgboost(x,y,params=param,watchlist=watchlist)
-
-###########################
-# Train with customization
-###########################
-
-# user define objective function, given prediction, return gradient and second order gradient
-# this is loglikelihood loss
-logregobj = function(preds, dtrain) {
-  labels = xgb.getinfo(dtrain, "label")
-  preds = 1.0 / (1.0 + exp(-preds))
-  grad = preds - labels
-  hess = preds * (1.0-preds)
+############################ Save and load model to hard disk
+
+# save model to binary local file
+xgb.save(bst, "xgboost.model")
+
+# load binary model to R
+bst <- xgb.load("xgboost.model")
+pred <- predict(bst, test.x)
+
+# save model to text file
+xgb.dump(bst, "dump.raw.txt")
+# save model to text file, with feature map
+xgb.dump(bst, "dump.nice.txt", "featmap.txt")
+
+# save a DMatrix object to hard disk
+xgb.DMatrix.save(dtrain, "dtrain.buffer")
+
+# load a DMatrix object to R
+dtrain <- xgb.DMatrix("dtrain.buffer")
+
+############################ More flexible training function xgb.train
+
+param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
+watchlist <- list(eval = dtest, train = dtrain)
+
+# training xgboost model
+bst <- xgb.train(param, dtrain, nround = 2, watchlist = watchlist)
+
+############################ cutomsized loss function
+
+param <- list(max_depth = 2, eta = 1, silent = 1)
+
+# note: for customized objective function, we leave objective as default note: what we are getting is
+# margin value in prediction you must know what you are doing
+
+# user define objective function, given prediction, return gradient and second order gradient this is
+# loglikelihood loss
+logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1/(1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
   return(list(grad = grad, hess = hess))
 }
-# user defined evaluation function, return a list(metric="metric-name", value="metric-value")
-# NOTE: when you do customized loss function, the default prediction value is margin
-# this may make buildin evalution metric not function properly
+# user defined evaluation function, return a list(metric='metric-name', value='metric-value') NOTE: when
+# you do customized loss function, the default prediction value is margin this may make buildin
+# evalution metric not function properly for example, we are doing logistic loss, the prediction is
|
||||||
# for example, we are doing logistic loss, the prediction is score before logistic transformation
|
# score before logistic transformation the buildin evaluation error assumes input is after logistic
|
||||||
# the buildin evaluation error assumes input is after logistic transformation
|
# transformation Take this in mind when you use the customization, and maybe you need write customized
|
||||||
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
|
# evaluation function
|
||||||
evalerror = function(preds, dtrain) {
|
evalerror <- function(preds, dtrain) {
|
||||||
labels = xgb.getinfo(dtrain, "label")
|
labels <- getinfo(dtrain, "label")
|
||||||
err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
|
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
|
||||||
return(list(metric = "error", value = err))
|
return(list(metric = "error", value = err))
|
||||||
}
|
}
|
||||||
|
|
||||||
bst = xgboost(x,y,params=param,watchlist=watchlist,obj=logregobj, feval=evalerror)
|
# training with customized objective, we can also do step by step training simply look at xgboost.py's
|
||||||
|
# implementation of train
|
||||||
|
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
|
||||||
|
|
||||||
############################
|
|
||||||
# Train with previous result
|
|
||||||
############################
|
|
||||||
|
|
||||||
bst = xgboost(x,y,params=param,watchlist=watchlist)
|
|
||||||
pred = predict(bst, 'agaricus.txt.train', outputmargin=TRUE)
|
|
||||||
bst2 = xgboost(x,y,params=param,watchlist=watchlist,margin=pred)
|
|
||||||
|
|||||||
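The customized objective above returns the analytic gradient and hessian of the logistic loss. A minimal sketch (not part of this commit) that checks logregobj's gradient against a central finite difference of that loss, in plain base R:

logloss <- function(margin, label) {
  p <- 1/(1 + exp(-margin))
  -(label * log(p) + (1 - label) * log(1 - p))
}
margin <- 0.3; label <- 1; eps <- 1e-6
numeric.grad <- (logloss(margin + eps, label) - logloss(margin - eps, label))/(2 * eps)
p <- 1/(1 + exp(-margin))
stopifnot(abs(numeric.grad - (p - label)) < 1e-6)  # grad in logregobj is p - label
# hess in logregobj is p * (1 - p), the derivative of p - label w.r.t. the margin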
28 R-package/man/getinfo.Rd Normal file
@@ -0,0 +1,28 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{getinfo}
\alias{getinfo}
\alias{getinfo,xgb.DMatrix-method}
\title{Get information of an xgb.DMatrix object}
\usage{
getinfo(object, ...)

\S4method{getinfo}{xgb.DMatrix}(object, name)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}

\item{name}{the name of the field to get}

\item{...}{other parameters}
}
\description{
Get information of an xgb.DMatrix object
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
labels <- getinfo(dtrain, "label")
}
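As a quick illustration (a sketch, not part of this commit), getinfo can pull the stored label back out of an xgb.DMatrix, for example to check class balance before fitting:

data(iris)
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label = as.numeric(iris[,5]))
table(getinfo(dtrain, "label"))  # counts per stored class label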
32 R-package/man/predict-xgb.Booster-method.Rd Normal file
@@ -0,0 +1,32 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method}
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\S4method{predict}{xgb.Booster}(object, newdata, outputmargin = FALSE,
  ntreelimit = NULL)
}
\arguments{
\item{object}{Object of class "xgb.Booster"}

\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}

\item{outputmargin}{whether the prediction should be shown in the original
value of the sum of functions. When outputmargin=TRUE, the prediction is the
untransformed margin value; in logistic regression, outputmargin=TRUE
outputs the value before the logistic transformation.}

\item{ntreelimit}{limit the number of trees used in prediction; only valid
for gbtree, not for gblinear. Set it to a value bigger than 0; all trees
are used by default.}
}
\description{
Predicted values based on xgboost model object.
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
pred <- predict(bst, as.matrix(iris[,1:4]))
}
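A short usage sketch of the new ntreelimit argument (not from this commit; bst and dtest are assumed from the demo walkthrough above). Truncating the ensemble at prediction time is handy for inspecting how accuracy builds up tree by tree:

pred.first.tree <- predict(bst, dtest, ntreelimit = 1)  # use only the first tree
pred.full <- predict(bst, dtest)                        # default: all trees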
30 R-package/man/slice.Rd Normal file
@@ -0,0 +1,30 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{slice}
\alias{slice}
\alias{slice,xgb.DMatrix-method}
\title{Get a new DMatrix containing the specified rows of
the original xgb.DMatrix object}
\usage{
slice(object, ...)

\S4method{slice}{xgb.DMatrix}(object, idxset, ...)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}

\item{idxset}{an integer vector of indices of the rows needed}

\item{...}{other parameters}
}
\description{
Get a new DMatrix containing the specified rows of
the original xgb.DMatrix object
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
dsub <- slice(dtrain, 1:3)
}
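Because slice carries the stored label along with the selected rows, it is enough for simple hold-out splits; a minimal sketch (not part of this commit), continuing the iris example:

data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label = iris[,5])
idx <- sample(nrow(iris), 100)                             # random training rows
dfit <- slice(dtrain, idx)                                 # training fold
dhold <- slice(dtrain, setdiff(seq_len(nrow(iris)), idx))  # hold-out fold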
28 R-package/man/xgb.DMatrix.Rd Normal file
@@ -0,0 +1,28 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Construct xgb.DMatrix object}
\usage{
xgb.DMatrix(data, info = list(), missing = 0, ...)
}
\arguments{
\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character
string indicating the data file.}

\item{info}{a list of information of the xgb.DMatrix object}

\item{missing}{only used when the input is a dense matrix: a float value
whose occurrences in \code{data} are treated as missing.}

\item{...}{other information to pass to \code{info}.}
}
\description{
Construct xgb.DMatrix object from dense matrix, sparse matrix or local file.
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
}
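A sketch of the missing argument (values chosen for illustration, not from this commit): with dense input, entries equal to missing are treated as absent, much as zero entries are skipped in sparse input:

m <- matrix(c(1, -999, 3, 4), nrow = 2)  # -999 marks a missing cell
dtrain <- xgb.DMatrix(m, label = c(0, 1), missing = -999)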
23 R-package/man/xgb.DMatrix.save.Rd Normal file
@@ -0,0 +1,23 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file}
\usage{
xgb.DMatrix.save(DMatrix, fname)
}
\arguments{
\item{DMatrix}{the xgb.DMatrix object to save.}

\item{fname}{the name of the binary file.}
}
\description{
Save xgb.DMatrix object to binary file
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
xgb.DMatrix.save(dtrain, 'iris.xgb.DMatrix')
dtrain <- xgb.DMatrix('iris.xgb.DMatrix')
}
27 R-package/man/xgb.dump.Rd Normal file
@@ -0,0 +1,27 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.dump}
\alias{xgb.dump}
\title{Save xgboost model to text file}
\usage{
xgb.dump(model, fname, fmap = "")
}
\arguments{
\item{model}{the model object.}

\item{fname}{the name of the text file.}

\item{fmap}{feature map file representing the type of each feature.
A detailed description can be found at
\url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
Run inst/examples/demo.R for the result and see inst/examples/featmap.txt
for an example of the format.}
}
\description{
Save an xgboost model to a text file. The dump can be parsed later.
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.dump(bst, 'iris.xgb.model.dump')
}
21 R-package/man/xgb.load.Rd Normal file
@@ -0,0 +1,21 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.load}
\alias{xgb.load}
\title{Load xgboost model from binary file}
\usage{
xgb.load(modelfile)
}
\arguments{
\item{modelfile}{the name of the binary file.}
}
\description{
Load xgboost model from the binary model file
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.save(bst, 'iris.xgb.model')
bst <- xgb.load('iris.xgb.model')
pred <- predict(bst, as.matrix(iris[,1:4]))
}
23 R-package/man/xgb.save.Rd Normal file
@@ -0,0 +1,23 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.save}
\alias{xgb.save}
\title{Save xgboost model to binary file}
\usage{
xgb.save(model, fname)
}
\arguments{
\item{model}{the model object.}

\item{fname}{the name of the binary file.}
}
\description{
Save an xgboost model produced by \code{xgboost} or \code{xgb.train}
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
xgb.save(bst, 'iris.xgb.model')
bst <- xgb.load('iris.xgb.model')
pred <- predict(bst, as.matrix(iris[,1:4]))
}
78 R-package/man/xgb.train.Rd Normal file
@@ -0,0 +1,78 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.train}
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}
\usage{
xgb.train(params = list(), dtrain, nrounds, watchlist = list(),
  obj = NULL, feval = NULL, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
\itemize{
  \item \code{objective} objective function, common ones are
  \itemize{
    \item \code{reg:linear} linear regression
    \item \code{binary:logistic} logistic regression for classification
  }
  \item \code{eta} step size of each boosting step
  \item \code{max_depth} maximum depth of the tree
  \item \code{nthread} number of threads used in training; if not set, all threads are used
}

See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
further details. See also inst/examples/demo.R for a walkthrough example in R.}

\item{dtrain}{takes an \code{xgb.DMatrix} as the input.}

\item{nrounds}{the max number of iterations}

\item{watchlist}{what information should be printed when \code{verbose=1} or
\code{verbose=2}. Watchlist is used to specify validation set monitoring
during training. For example the user can specify
watchlist=list(validation1=mat1, validation2=mat2) to watch
the performance of each round's model on mat1 and mat2.}

\item{obj}{customized objective function. Returns gradient and second order
gradient with given prediction and dtrain.}

\item{feval}{customized evaluation function. Returns
\code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain.}

\item{...}{other parameters to pass to \code{params}.}
}
\description{
The training function of xgboost
}
\details{
This is the training function for xgboost.

Parallelization is automatically enabled if OpenMP is present.
The number of threads can also be manually specified via the "nthread" parameter.

This function only accepts an \code{xgb.DMatrix} object as the input.
It supports advanced features such as watchlist and customized objective functions,
and is therefore more flexible than \code{\link{xgboost}}.
}
\examples{
data(iris)
iris[,5] <- as.numeric(iris[,5])
dtrain <- xgb.DMatrix(as.matrix(iris[,1:4]), label=iris[,5])
dtest <- dtrain
watchlist <- list(eval = dtest, train = dtrain)
param <- list(max_depth = 2, eta = 1, silent = 1)
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
  return(list(metric = "error", value = err))
}
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
}
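Per the \code{...} argument above, extra named arguments are folded into params. A sketch (not from this commit, reusing dtrain from the example above) of two calls that are intended to be equivalent:

bst1 <- xgb.train(list(max_depth = 2, eta = 1), dtrain, nrounds = 2)
bst2 <- xgb.train(list(), dtrain, nrounds = 2, max_depth = 2, eta = 1)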
52 R-package/man/xgboost.Rd Normal file
@@ -0,0 +1,52 @@
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgboost}
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
\usage{
xgboost(data = NULL, label = NULL, params = list(), nrounds,
  verbose = 1, ...)
}
\arguments{
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}

\item{label}{the response variable. The user should not set this field
when \code{data} is a local data file or an \code{xgb.DMatrix}.}

\item{params}{the list of parameters. Commonly used ones are:
\itemize{
  \item \code{objective} objective function, common ones are
  \itemize{
    \item \code{reg:linear} linear regression
    \item \code{binary:logistic} logistic regression for classification
  }
  \item \code{eta} step size of each boosting step
  \item \code{max_depth} maximum depth of the tree
  \item \code{nthread} number of threads used in training; if not set, all threads are used
}

See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
further details. See also inst/examples/demo.R for a walkthrough example in R.}

\item{nrounds}{the max number of iterations}

\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
information of performance. If 2, xgboost will print information of both
performance and construction progress.}

\item{...}{other parameters to pass to \code{params}.}
}
\description{
A simple interface for xgboost in R
}
\details{
This is the modeling function for xgboost.

Parallelization is automatically enabled if OpenMP is present.
The number of threads can also be manually specified via the "nthread" parameter.
}
\examples{
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]), nrounds = 2)
pred <- predict(bst, as.matrix(iris[,1:4]))
}
@@ -1,28 +1,9 @@
+# package root
+PKGROOT=../../
 # _*_ mode: Makefile; _*_
-export CC = gcc
-export CXX = g++
-
-# expose these flags to R CMD SHLIB
-PKG_CPPFLAGS = -O3 -Wno-unknown-pragmas -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS)
+PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
+PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
 PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
+OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
-
-ifeq ($(no_omp),1)
-	PKG_CPPFLAGS += -DDISABLE_OPENMP
-endif
-
-CXXOBJ= xgboost_wrapper.o xgboost_io.o
-OBJECTS= xgboost_R.o $(CXXOBJ)
-
-.PHONY: all clean
-all: $(SHLIB)
-$(SHLIB): $(OBJECTS)
-
-xgboost_wrapper.o: ../../wrapper/xgboost_wrapper.cpp
-xgboost_io.o: ../../src/io/io.cpp
-
-$(CXXOBJ) :
-	$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
-
-clean:
-	rm -rf *.so *.o *~ *.dll
@@ -1,32 +1,7 @@
+# package root
+PKGROOT=../../
 # _*_ mode: Makefile; _*_
-export CC = gcc
-export CXX = g++
-
-# expose these flags to R CMD SHLIB
-PKG_CPPFLAGS = -O3 -Wno-unknown-pragmas -DXGBOOST_CUSTOMIZE_ERROR_ -fopenmp -fPIC $(SHLIB_OPENMP_CFLAGS)
+PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
+PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
 PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
+OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
-# add flag to build native code even in cross compiler
-ifeq "$(WIN)" "64"
-	PKG_CPPFLAGS += -m64
-endif
-
-ifeq ($(no_omp),1)
-	PKG_CPPFLAGS += -DDISABLE_OPENMP
-endif
-
-CXXOBJ= xgboost_wrapper.o xgboost_io.o
-OBJECTS= xgboost_R.o $(CXXOBJ)
-
-.PHONY: all clean
-all: $(SHLIB)
-$(SHLIB): $(OBJECTS)
-
-xgboost_wrapper.o: ../../wrapper/xgboost_wrapper.cpp
-xgboost_io.o: ../../src/io/io.cpp
-
-$(CXXOBJ) :
-	$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
-
-clean:
-	rm -rf *.so *.o *~ *.dll
@@ -2,25 +2,54 @@
 #include <string>
 #include <utility>
 #include <cstring>
+#include <cstdio>
 #include "xgboost_R.h"
-#include "../../wrapper/xgboost_wrapper.h"
-#include "../../src/utils/utils.h"
-#include "../../src/utils/omp.h"
-#include "../../src/utils/matrix_csr.h"
+#include "wrapper/xgboost_wrapper.h"
+#include "src/utils/utils.h"
+#include "src/utils/omp.h"
+using namespace std;
 
 using namespace xgboost;
 
+extern "C" {
+  void XGBoostAssert_R(int exp, const char *fmt, ...);
+  void XGBoostCheck_R(int exp, const char *fmt, ...);
+  int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...);
+}
+
 // implements error handling
 namespace xgboost {
 namespace utils {
-void HandleAssertError(const char *msg) {
-  error("%s", msg);
-}
-void HandleCheckError(const char *msg) {
-  error("%s", msg);
+extern "C" {
+  void (*Printf)(const char *fmt, ...) = Rprintf;
+  int (*SPrintf)(char *buf, size_t size, const char *fmt, ...) = XGBoostSPrintf_R;
+  void (*Assert)(int exp, const char *fmt, ...) = XGBoostAssert_R;
+  void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R;
+  void (*Error)(const char *fmt, ...) = error;
 }
 }  // namespace utils
+
+namespace random {
+void Seed(unsigned seed) {
+  warning("parameter seed is ignored, please set random seed using set.seed");
+}
+double Uniform(void) {
+  return unif_rand();
+}
+double Normal(void) {
+  return norm_rand();
+}
+}  // namespace random
 }  // namespace xgboost
+
+// call before wrapper starts
+inline void _WrapperBegin(void) {
+  GetRNGstate();
+}
+// call after wrapper ends
+inline void _WrapperEnd(void) {
+  PutRNGstate();
+}
+
 extern "C" {
   void _DMatrixFinalizer(SEXP ext) {
     if (R_ExternalPtrAddr(ext) == NULL) return;
@@ -28,14 +57,17 @@ extern "C" {
     R_ClearExternalPtr(ext);
   }
   SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
+    _WrapperBegin();
     void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent));
     SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
     R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
     UNPROTECT(1);
+    _WrapperEnd();
     return ret;
   }
   SEXP XGDMatrixCreateFromMat_R(SEXP mat,
                                 SEXP missing) {
+    _WrapperBegin();
     SEXP dim = getAttrib(mat, R_DimSymbol);
     int nrow = INTEGER(dim)[0];
     int ncol = INTEGER(dim)[1];
@@ -47,55 +79,64 @@ extern "C" {
         data[i * ncol +j] = din[i + nrow * j];
       }
     }
-    void *handle = XGDMatrixCreateFromMat(&data[0], nrow, ncol, asReal(missing));
+    void *handle = XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing));
     SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
     R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
     UNPROTECT(1);
+    _WrapperEnd();
     return ret;
   }
   SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
                                 SEXP indices,
                                 SEXP data) {
-    const int *col_ptr = INTEGER(indptr);
-    const int *row_index = INTEGER(indices);
-    const double *col_data = REAL(data);
-    int ncol = length(indptr) - 1;
+    _WrapperBegin();
+    const int *p_indptr = INTEGER(indptr);
+    const int *p_indices = INTEGER(indices);
+    const double *p_data = REAL(data);
+    int nindptr = length(indptr);
     int ndata = length(data);
-    // transform into CSR format
-    std::vector<bst_ulong> row_ptr;
-    std::vector< std::pair<unsigned, float> > csr_data;
-    utils::SparseCSRMBuilder<std::pair<unsigned,float>, false, bst_ulong> builder(row_ptr, csr_data);
-    builder.InitBudget();
-    for (int i = 0; i < ncol; ++i) {
-      for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
-        builder.AddBudget(row_index[j]);
-      }
-    }
-    builder.InitStorage();
-    for (int i = 0; i < ncol; ++i) {
-      for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
-        builder.PushElem(row_index[j], std::make_pair(i, col_data[j]));
-      }
-    }
-    utils::Assert(csr_data.size() == static_cast<size_t>(ndata), "BUG CreateFromCSC");
-    std::vector<float> row_data(ndata);
-    std::vector<unsigned> col_index(ndata);
+    std::vector<bst_ulong> col_ptr_(nindptr);
+    std::vector<unsigned> indices_(ndata);
+    std::vector<float> data_(ndata);
+    for (int i = 0; i < nindptr; ++i) {
+      col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
+    }
     #pragma omp parallel for schedule(static)
     for (int i = 0; i < ndata; ++i) {
-      col_index[i] = csr_data[i].first;
-      row_data[i] = csr_data[i].second;
+      indices_[i] = static_cast<unsigned>(p_indices[i]);
+      data_[i] = static_cast<float>(p_data[i]);
     }
-    void *handle = XGDMatrixCreateFromCSR(&row_ptr[0], &col_index[0], &row_data[0], row_ptr.size(), ndata );
+    void *handle = XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
+                                          BeginPtr(data_), nindptr, ndata);
     SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
     R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
     UNPROTECT(1);
+    _WrapperEnd();
+    return ret;
+  }
+  SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
+    _WrapperBegin();
+    int len = length(idxset);
+    std::vector<int> idxvec(len);
+    for (int i = 0; i < len; ++i) {
+      idxvec[i] = INTEGER(idxset)[i] - 1;
+    }
+    void *res = XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len);
+    SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));
+    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
+    UNPROTECT(1);
+    _WrapperEnd();
     return ret;
   }
   void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
+    _WrapperBegin();
     XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
                         CHAR(asChar(fname)), asInteger(silent));
+    _WrapperEnd();
  }
   void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
+    _WrapperBegin();
     int len = length(array);
     const char *name = CHAR(asChar(field));
     if (!strcmp("group", name)) {
@@ -104,7 +145,8 @@ extern "C" {
       for (int i = 0; i < len; ++i) {
        vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
       }
-      XGDMatrixSetGroup(R_ExternalPtrAddr(handle), &vec[0], len);
+      XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len);
+      _WrapperEnd();
       return;
     }
     {
@@ -115,10 +157,12 @@ extern "C" {
      }
      XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
                            CHAR(asChar(field)),
-                           &vec[0], len);
+                           BeginPtr(vec), len);
    }
+    _WrapperEnd();
  }
  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
+    _WrapperBegin();
    bst_ulong olen;
    const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
                                             CHAR(asChar(field)), &olen);
@@ -127,6 +171,7 @@ extern "C" {
      REAL(ret)[i] = res[i];
    }
    UNPROTECT(1);
+    _WrapperEnd();
    return ret;
  }
  // functions related to booster
@@ -136,28 +181,35 @@ extern "C" {
    R_ClearExternalPtr(ext);
  }
  SEXP XGBoosterCreate_R(SEXP dmats) {
+    _WrapperBegin();
    int len = length(dmats);
    std::vector<void*> dvec;
    for (int i = 0; i < len; ++i){
      dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
    }
-    void *handle = XGBoosterCreate(&dvec[0], dvec.size());
+    void *handle = XGBoosterCreate(BeginPtr(dvec), dvec.size());
    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
    R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
    UNPROTECT(1);
+    _WrapperEnd();
    return ret;
  }
  void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
+    _WrapperBegin();
    XGBoosterSetParam(R_ExternalPtrAddr(handle),
                      CHAR(asChar(name)),
                      CHAR(asChar(val)));
+    _WrapperEnd();
  }
  void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
+    _WrapperBegin();
    XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
                           asInteger(iter),
                           R_ExternalPtrAddr(dtrain));
+    _WrapperEnd();
  }
  void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
+    _WrapperBegin();
    utils::Check(length(grad) == length(hess), "gradient and hess must have same length");
    int len = length(grad);
    std::vector<float> tgrad(len), thess(len);
@@ -168,9 +220,11 @@ extern "C" {
    }
    XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
                          R_ExternalPtrAddr(dtrain),
-                          &tgrad[0], &thess[0], len);
+                          BeginPtr(tgrad), BeginPtr(thess), len);
+    _WrapperEnd();
  }
  SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
+    _WrapperBegin();
    utils::Check(length(dmats) == length(evnames), "dmats and evnams must have same length");
    int len = length(dmats);
    std::vector<void*> vec_dmats;
@@ -185,28 +239,37 @@ extern "C" {
    }
    return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
                                         asInteger(iter),
-                                         &vec_dmats[0], &vec_sptr[0], len));
+                                         BeginPtr(vec_dmats), BeginPtr(vec_sptr), len));
+    _WrapperEnd();
  }
-  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin) {
+  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit) {
+    _WrapperBegin();
    bst_ulong olen;
    const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
                                        R_ExternalPtrAddr(dmat),
                                        asInteger(output_margin),
+                                        asInteger(ntree_limit),
                                        &olen);
    SEXP ret = PROTECT(allocVector(REALSXP, olen));
    for (size_t i = 0; i < olen; ++i) {
      REAL(ret)[i] = res[i];
    }
    UNPROTECT(1);
+    _WrapperEnd();
    return ret;
  }
  void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
+    _WrapperBegin();
    XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
+    _WrapperEnd();
  }
  void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
+    _WrapperBegin();
    XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
+    _WrapperEnd();
  }
  void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
+    _WrapperBegin();
    bst_ulong olen;
    const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
                                          CHAR(asChar(fmap)),
@@ -217,5 +280,6 @@ extern "C" {
      fprintf(fo, "%s", res[i]);
    }
    fclose(fo);
+    _WrapperEnd();
  }
 }
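The _WrapperBegin/_WrapperEnd pair added above routes random number generation through R (GetRNGstate/PutRNGstate with unif_rand/norm_rand), and Seed now warns that the engine's own seed parameter is ignored. An R-side sketch of the intended consequence (an assumption, not from this commit; dtrain as in the demo walkthrough): set.seed should govern any stochastic parts of training, such as column subsampling:

set.seed(11)  # R's RNG now drives xgboost's random draws
bst.a <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
                 objective = "binary:logistic", verbose = 0)
set.seed(11)  # same seed should reproduce the same stochastic choices
bst.b <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
                 objective = "binary:logistic", verbose = 0)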
@@ -7,6 +7,7 @@
  */
 extern "C" {
 #include <Rinternals.h>
+#include <R_ext/Random.h>
 }
 
 extern "C" {
@@ -36,6 +37,13 @@ extern "C" {
  SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
                                SEXP indices,
                                SEXP data);
+  /*!
+   * \brief create a new dmatrix from sliced content of existing matrix
+   * \param handle instance of data matrix to be sliced
+   * \param idxset index set
+   * \return a sliced new matrix
+   */
+  SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset);
  /*!
   * \brief load a data matrix into binary file
   * \param handle a instance of data matrix
@@ -99,8 +107,9 @@ extern "C" {
   * \param handle handle
   * \param dmat data matrix
   * \param output_margin whether only output raw margin value
+   * \param ntree_limit limit number of trees used in prediction
   */
-  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin);
+  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit);
  /*!
   * \brief load model from existing file
   * \param handle handle
@@ -120,5 +129,5 @@ extern "C" {
   * \param fmap name to fmap can be empty string
   */
  void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap);
-};
+}
 #endif  // XGBOOST_WRAPPER_R_H_
33 R-package/src/xgboost_assert.c Normal file
@@ -0,0 +1,33 @@
#include <stdio.h>
#include <stdarg.h>
#include <Rinternals.h>

// implements error handling
void XGBoostAssert_R(int exp, const char *fmt, ...) {
  char buf[1024];
  if (exp == 0) {
    va_list args;
    va_start(args, fmt);
    vsprintf(buf, fmt, args);
    va_end(args);
    error("AssertError:%s\n", buf);
  }
}
void XGBoostCheck_R(int exp, const char *fmt, ...) {
  char buf[1024];
  if (exp == 0) {
    va_list args;
    va_start(args, fmt);
    vsprintf(buf, fmt, args);
    va_end(args);
    error("%s\n", buf);
  }
}
int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...) {
  int ret;
  va_list args;
  va_start(args, fmt);
  ret = vsnprintf(buf, size, fmt, args);
  va_end(args);
  return ret;
}
212 R-package/vignettes/xgboost.Rnw Normal file
@@ -0,0 +1,212 @@
\documentclass{article}
\RequirePackage{url}
\usepackage{hyperref}
\RequirePackage{amsmath}
\RequirePackage{natbib}
\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry}

\makeatletter
% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting}
%\VignetteKeywords{xgboost, gbm, gradient boosting machines}
%\VignettePackage{xgboost}
% \VignetteEngine{knitr::knitr}
\makeatother

\begin{document}
%\SweaveOpts{concordance=TRUE}

<<knitropts,echo=FALSE,message=FALSE>>=
if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE)
@

%
<<prelim,echo=FALSE>>=
xgboost.version = '0.3-0'
@
%

\begin{center}
\vspace*{6\baselineskip}
\rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt}
\rule{\textwidth}{0.4pt}\\[2\baselineskip]
{\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip]
\rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt}
\rule{\textwidth}{1.6pt}\\[2\baselineskip]
{\Large Tianqi Chen, Tong He}\\[\baselineskip]
{\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip]
{\large \today}\par
\vfill
\end{center}

\thispagestyle{empty}

\clearpage

\setcounter{page}{1}

\section{Introduction}

This is an introductory document for using the \verb@xgboost@ package in R.

\verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an efficient
and scalable implementation of the gradient boosting framework of \citep{friedman2001greedy}.
The package includes an efficient linear model solver and a tree learning algorithm.
It supports various objective functions, including regression, classification
and ranking. The package is made to be extensible, so that users can also define their own objectives easily. It has several features:
\begin{enumerate}
  \item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
  Windows and Linux, with OpenMP. It is generally over 10 times faster than
  \verb@gbm@.}
  \item{Input Type: }{\verb@xgboost@ takes several types of input data:}
  \begin{itemize}
    \item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@}
    \item{Sparse Matrix: }{R's sparse matrix \verb@Matrix::dgCMatrix@}
    \item{Data File: }{Local data files}
    \item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.}
  \end{itemize}
  \item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster
  and linear booster, and is optimized for sparse input.}
  \item{Customization: }{\verb@xgboost@ supports customized objective functions
  and evaluation functions.}
  \item{Performance: }{\verb@xgboost@ has better performance on several different
  datasets.}
\end{enumerate}


\section{Example with iris}

In this section, we will illustrate some common usage of \verb@xgboost@.

<<Training and prediction with iris>>=
library(xgboost)
data(iris)
bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]),
               nrounds = 5)
xgb.save(bst, 'model.save')
bst = xgb.load('model.save')
pred <- predict(bst, as.matrix(iris[,1:4]))
@

\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
\verb@predict@ does prediction on the model.

Here we can save the model to a binary local file, and load it when needed.
We can't inspect the trees inside. However we have another function to save the
model in plain text.
<<Dump Model>>=
xgb.dump(bst, 'model.dump')
@

The output looks like

\begin{verbatim}
booster[0]:
0:[f2<2.45] yes=1,no=2,missing=1
1:leaf=0.147059
2:[f3<1.65] yes=3,no=4,missing=3
3:leaf=0.464151
4:leaf=0.722449
booster[1]:
0:[f2<2.45] yes=1,no=2,missing=1
1:leaf=0.103806
2:[f2<4.85] yes=3,no=4,missing=3
3:leaf=0.316341
4:leaf=0.510365
\end{verbatim}

It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
It speeds up \verb@xgboost@, and is needed for advanced features such as
training from an initial prediction value and weighted training instances.

We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
<<xgb.DMatrix>>=
iris.mat <- as.matrix(iris[,1:4])
iris.label <- as.numeric(iris[,5])
diris <- xgb.DMatrix(iris.mat, label = iris.label)
class(diris)
getinfo(diris,'label')
@

We can also save the matrix to a binary file, then load it simply with
\verb@xgb.DMatrix@:
<<save model>>=
xgb.DMatrix.save(diris, 'iris.xgb.DMatrix')
diris = xgb.DMatrix('iris.xgb.DMatrix')
@

\section{Advanced Examples}

The function \verb@xgboost@ is a simple function with fewer parameters, in order
to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires users to read the documentation a bit more carefully.

\verb@xgb.train@ only accepts an \verb@xgb.DMatrix@ object as its input, and it supports advanced features such as custom objective and evaluation functions.

<<Customized loss function>>=
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}

evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- sqrt(mean((preds-labels)^2))
  return(list(metric = "RMSE", value = err))
}

dtest <- slice(diris,1:100)
watchlist <- list(eval = dtest, train = diris)
param <- list(max_depth = 2, eta = 1, silent = 1)

bst <- xgb.train(param, diris, nround = 2, watchlist, logregobj, evalerror)
@

The gradient and second order gradient are required as the output of the customized
objective function.

We also have \verb@slice@ for row extraction. It is useful in
cross-validation.

For a walkthrough demo, please see \verb@R-package/inst/examples/demo.R@.

\section{The Higgs Boson competition}

We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs
Boson Machine Learning Challenge}.

Here are the instructions to make a submission:
\begin{enumerate}
\item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets}
and extract them to \verb@data/@.
\item Run scripts under \verb@xgboost/demo/kaggle-higgs/@:
\href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{higgs-train.R}
and \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-pred.R}{higgs-pred.R}.
The computation will take less than a minute on an Intel i7.
\item Go to the \href{http://www.kaggle.com/c/higgs-boson/submissions/attach}{submission page}
and submit your result.
\end{enumerate}

We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script}
to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@.
The training set contains 350000 records and 30 features.

\verb@xgboost@ can automatically do parallel computation. On a machine with an Intel
i7-4700MQ and 24GB of memory, we found that \verb@xgboost@ takes about 35 seconds, which is about 20 times faster
than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was
still about two times faster than \verb@gbm@.

Meanwhile, the result from \verb@xgboost@ reaches
\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a
single model. This result stands in the
\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
competition.

\bibliographystyle{jss}
\nocite{*} % list uncited references
\bibliography{xgboost}

\end{document}
20 R-package/vignettes/xgboost.bib Normal file
@@ -0,0 +1,20 @@
@article{friedman2001greedy,
  title={Greedy function approximation: a gradient boosting machine},
  author={Friedman, Jerome H},
  journal={Annals of Statistics},
  pages={1189--1232},
  year={2001},
  publisher={JSTOR}
}

@article{friedman2000additive,
  title={Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors)},
  author={Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert and others},
  journal={Annals of Statistics},
  volume={28},
  number={2},
  pages={337--407},
  year={2000},
  publisher={Institute of Mathematical Statistics}
}
26 README.md
@@ -1,26 +0,0 @@
-This is a Fork of XGBoost from https://github.com/tqchen/xgboost
-
-In the main repo you already find 2 windows projects for the porting of the executable and the python library.
-
-Here you have:
-
-1) a c# dll wrapper, meaning the passage from unmanaged to managed code, in https://github.com/giuliohome/xgboost/tree/master/windows/xgboost_sharp_wrapper
-
-2) the c# Higgs Kaggle demo, instead of the python one (actually you will get a higher score with the c# version, due to some changes I've made) in https://github.com/giuliohome/xgboost/tree/master/windows/kaggle_higgs_demo
-
-Start the demo from the root folder like this:
-
-bin\x64\Debug\kaggle_higgs_demo.exe training_path.csv test_path.csv sharp_pred.csv NFoldCV NRound
-
-NFoldCV: 0 => no cv , 5 = 5-fold-cv, 10 = 10-fold-cv :-)
-
-3) 5 fold cv implementation in c# for the demo: you see inline cv ams while training (of course on a completely separate set)
-
-In my latest commit I've added
-
-4) parallel execution of n-fold cv, on top of dotnet multithreading
-
-5) double inputted model training, stopping at a configured ams objective
27 demo/README.md Normal file
@@ -0,0 +1,27 @@
XGBoost Examples
====
This folder contains all the example code using xgboost.

* Contributions of examples and benchmarks are more than welcome!
* If you'd like to share how you use xgboost to solve your problem, send a pull request :)

Features Walkthrough
====
This is a list of short code examples introducing different functionalities of xgboost and its wrappers.
* Basic walkthrough of wrappers [python](guide-python/basic_walkthrough.py)
* Customize loss function, and evaluation metric [python](guide-python/custom_objective.py)
* Boosting from existing prediction [python](guide-python/boost_from_prediction.py)
* Predicting using first n trees [python](guide-python/predict_first_ntree.py)
* Generalized Linear Model [python](guide-python/generalized_linear_model.py)
* Cross validation [python](guide-python/cross_validation.py)

Basic Examples by Tasks
====
* [Binary classification](binary_classification)
* [Multiclass classification](multiclass_classification)
* [Regression](regression)
* [Learning to Rank](rank)

Benchmarks
====
* [Starter script for Kaggle Higgs Boson](kaggle-higgs)
2 demo/data/README.md Normal file
@@ -0,0 +1,2 @@
This folder contains the processed example datasets used by the demos.
Copyright of the datasets belongs to the original copyright holder.
3 demo/guide-R/README.md Normal file
@@ -0,0 +1,3 @@
XGBoost R Feature Walkthrough
====
To be finished
5 demo/guide-R/runall.sh Executable file
@@ -0,0 +1,5 @@
#!/bin/bash
# todo
Rscript basic_walkthrough.R
Rscript custom_objective.R
Rscript boost_from_prediction.R
8 demo/guide-python/README.md Normal file
@@ -0,0 +1,8 @@
XGBoost Python Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.py)
* [Customize loss function, and evaluation metric](custom_objective.py)
* [Boosting from existing prediction](boost_from_prediction.py)
* [Predicting using first n trees](predict_first_ntree.py)
* [Generalized Linear Model](generalized_linear_model.py)
* [Cross validation](cross_validation.py)
76 demo/guide-python/basic_walkthrough.py (Executable file)
@ -0,0 +1,76 @@
#!/usr/bin/python
import sys
import numpy as np
import scipy.sparse
# append the path to xgboost; you may need to change the following line
# alternatively, you can add the path to the PYTHONPATH environment variable
sys.path.append('../../wrapper')
import xgboost as xgb

### simple example
# load from text file; also works with binary buffers generated by xgboost
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')

# specify parameters via map; definitions are the same as in the C++ version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }

# specify validation sets to watch performance
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)

# this is prediction
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
bst.save_model('0001.model')
# dump model
bst.dump_model('dump.raw.txt')
# dump model with feature map
bst.dump_model('dump.nice.txt','../data/featmap.txt')

# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
bst.save_model('xgb.model')
# load model and data back in
bst2 = xgb.Booster(model_file='xgb.model')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2)
# assert they are the same
assert np.sum(np.abs(preds2-preds)) == 0

###
# build dmatrix from scipy.sparse
print ('start running example of building DMatrix from scipy.sparse CSR matrix')
labels = []
row = []; col = []; dat = []
i = 0
for l in open('../data/agaricus.txt.train'):
    arr = l.split()
    labels.append( int(arr[0]))
    for it in arr[1:]:
        k,v = it.split(':')
        row.append(i); col.append(int(k)); dat.append(float(v))
    i += 1
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix( csr, label = labels )
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )

print ('start running example of building DMatrix from scipy.sparse CSC matrix')
# we can also construct from a csc matrix
csc = scipy.sparse.csc_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix(csc, label=labels)
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )

print ('start running example of building DMatrix from numpy array')
# NOTE: npymat is a numpy array; internally it will be converted into a
# scipy.sparse.csr_matrix, then into a DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix(npymat, label = labels)
watchlist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, watchlist )
26 demo/guide-python/boost_from_prediction.py (Executable file)
@ -0,0 +1,26 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
watchlist = [(dtest,'eval'), (dtrain,'train')]
###
# advanced: start from an initial base prediction
#
print ('start running example to start from an initial prediction')
# specify parameters via map; definitions are the same as in the C++ version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# train xgboost for 1 round
bst = xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value instead of the transformed prediction in set_base_margin
# predicting with output_margin=True will always give you margin values before the logistic transformation
ptrain = bst.predict(dtrain, output_margin=True)
ptest = bst.predict(dtest, output_margin=True)
dtrain.set_base_margin(ptrain)
dtest.set_base_margin(ptest)

print ('this is the result of running from the initial prediction')
bst = xgb.train( param, dtrain, 1, watchlist )
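To see why the margin, not the transformed probability, must be fed back: for binary:logistic the model output is a raw score (margin) that is squashed to a probability only at the end. Continuing training means boosting on top of the accumulated margin; a sketch of the relation assumed here:

$$
\hat{p}(x) = \sigma\Big(m_0(x) + \sum_{t=1}^{k} f_t(x)\Big), \qquad \sigma(z) = \frac{1}{1 + e^{-z}},
$$

where $m_0$ is the base margin set via set_base_margin and each $f_t$ is one boosted tree. Passing $\hat{p}$ itself as $m_0$ would apply the sigmoid twice.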
63 demo/guide-python/cross_validation.py (Executable file)
@ -0,0 +1,63 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

### load data in and do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
num_round = 2

print ('running cross validation')
# do cross validation; this will print the result out as
# [iteration] metric_name:mean_value+std_value
# std_value is the standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 0)

print ('running cross validation, disable standard deviation display')
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 0, show_stdv = False)

print ('running cross validation, with preprocessing function')
# define the preprocessing function;
# it returns the preprocessed training data, test data, and parameters;
# we can use this to do weight rescaling, etc.
# as an example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label==1)
    param['scale_pos_weight'] = ratio
    return (dtrain, dtest, param)

# do cross validation; for each fold
# dtrain, dtest, param will be passed into fpreproc,
# then the return value of fpreproc will be used to generate
# the results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'auc'}, seed = 0, fpreproc = fpreproc)

###
# you can also do cross validation with a customized loss function
# see custom_objective.py
##
print ('running cross validation, with customized loss function')
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0-preds)
    return grad, hess
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

param = {'max_depth':2, 'eta':1, 'silent':1}
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
       obj = logregobj, feval=evalerror)
44 demo/guide-python/custom_objective.py (Executable file)
@ -0,0 +1,44 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
###
# advanced: customized loss function
#
print ('start running example to use customized objective function')

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')

# note: for a customized objective function, we leave the objective as default
# note: what we are getting is the margin value in prediction
# you must know what you are doing
param = {'max_depth':2, 'eta':1, 'silent':1 }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2

# user-defined objective function: given the prediction, return the gradient and second order gradient
# this is log likelihood loss
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0-preds)
    return grad, hess

# user-defined evaluation function: returns a pair metric_name, result
# NOTE: when you use a customized loss function, the default prediction value is the margin;
# this may make the built-in evaluation metrics not function properly.
# for example, here we are doing logistic loss: the prediction is the score before the logistic
# transformation, while the built-in evaluation error assumes the input is after it.
# keep this in mind when you use the customization, and you may need to write a customized
# evaluation function
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # return a pair metric_name, result
    # since preds are margins (before the logistic transformation, cutoff at 0)
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

# training with a customized objective; we can also do step-by-step training
# simply look at xgboost.py's implementation of train
bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
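The gradient and hessian returned by logregobj come from differentiating the logistic loss with respect to the margin; a short derivation, in the same notation as the script ($p = \sigma(m)$ the predicted probability, $y$ the label, $m$ the margin):

$$
\ell(y, m) = -\big[y \log p + (1-y)\log(1-p)\big], \qquad p = \sigma(m),
$$

$$
\frac{\partial \ell}{\partial m} = p - y, \qquad \frac{\partial^2 \ell}{\partial m^2} = p(1-p),
$$

which is exactly the grad and hess computed element-wise over preds.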
32 demo/guide-python/generalized_linear_model.py (Executable file)
@ -0,0 +1,32 @@
#!/usr/bin/python
import sys
sys.path.append('../../wrapper')
import xgboost as xgb
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we are using a linear model instead of trees for our boosters
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias, which is the L2 regularizer on the bias term
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
         'alpha': 0.0001, 'lambda': 1 }

# normally, you do not need to set eta (step size);
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# so parallelization can affect convergence in certain cases.
# setting eta to a smaller value, e.g. 0.5, can make the optimization more stable
# param['eta'] = 1

##
# the rest of the settings are the same
##
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
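Written out, the gblinear objective these comments describe combines the training loss with both regularizers; up to the implementation's exact scaling constants (an assumption here, not shown in this file):

$$
\min_{w,\,b}\; \sum_{i} L\big(y_i,\; w^\top x_i + b\big) \;+\; \lambda \lVert w \rVert_2^2 \;+\; \alpha \lVert w \rVert_1 \;+\; \lambda_{\text{bias}}\, b^2,
$$

with $L$ the logistic loss chosen by objective, lambda and alpha the keys set in param, and lambda_bias the optional penalty on the bias term.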
22 demo/guide-python/predict_first_ntree.py (Executable file)
@ -0,0 +1,22 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

### load data in and do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)

print ('start testing prediction from first n trees')
### predict using first 1 tree
label = dtest.get_label()
ypred1 = bst.predict(dtest, ntree_limit=1)
# by default, we predict using all the trees
ypred2 = bst.predict(dtest)
print ('error of ypred1=%f' % (np.sum((ypred1>0.5)!=label) /float(len(label))))
print ('error of ypred2=%f' % (np.sum((ypred2>0.5)!=label) /float(len(label))))
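What ntree_limit does is truncate the additive ensemble. Assuming the binary:logistic objective here, so the summed margin is passed through the sigmoid, and $K$ trained trees of which $k$ are kept:

$$
\hat{y}^{(k)}(x) = \sigma\!\Big(\sum_{t=1}^{k} f_t(x)\Big), \qquad k \le K,
$$

so ypred1 above is the prediction of the first tree alone, while ypred2 uses all num_round trees.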
7 demo/guide-python/runall.sh (Executable file)
@ -0,0 +1,7 @@
#!/bin/bash
python basic_walkthrough.py
python custom_objective.py
python boost_from_prediction.py
python generalized_linear_model.py
python cross_validation.py
rm -rf *~ *.model *.buffer
@ -10,6 +10,7 @@ This script will achieve about 3.600 AMS score on the public leaderboard. To get started:
 cd ../..
 make
 ```
 
 2. Put training.csv and test.csv in the folder './data' (you can create a symbolic link)
 
 3. Run ./run.sh
@ -21,5 +22,5 @@ speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
 
 Using R module
 =====
-* Alternatively, you can run using R, higgs-train.R and higgs-pred.R
+* Alternatively, you can run using R, higgs-train.R and higgs-pred.R.
39 demo/kaggle-higgs/higgs-cv.py (Executable file)
@ -0,0 +1,39 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

### load data in and do training
train = np.loadtxt('./data/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )
label = train[:,32]
data = train[:,1:31]
weight = train[:,31]
dtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
param = {'max_depth':6, 'eta':0.1, 'silent':1, 'objective':'binary:logitraw', 'nthread':4}
num_round = 120

print ('running cross validation, with preprocessing function')
# define the preprocessing function;
# it returns the preprocessed training data, test data, and parameters;
# we can use this to do weight rescaling, etc.
# as an example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label==1)
    param['scale_pos_weight'] = ratio
    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()
    sum_weight = sum(wtrain) + sum(wtest)
    wtrain *= sum_weight / sum(wtrain)
    wtest *= sum_weight / sum(wtest)
    dtrain.set_weight(wtrain)
    dtest.set_weight(wtest)
    return (dtrain, dtest, param)

# do cross validation; for each fold
# dtrain, dtest, param will be passed into fpreproc,
# then the return value of fpreproc will be used to generate
# the results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'ams@0.15', 'auc'}, seed = 0, fpreproc = fpreproc)
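The weight rescaling inside fpreproc keeps the AMS evaluation on the same scale as the full dataset: each fold's weights are scaled up so they sum to the combined train-plus-test weight. In symbols (a sketch of what the code above does):

$$
w_{\text{train}} \leftarrow w_{\text{train}}\cdot \frac{\sum w_{\text{train}} + \sum w_{\text{test}}}{\sum w_{\text{train}}}, \qquad
w_{\text{test}} \leftarrow w_{\text{test}}\cdot \frac{\sum w_{\text{train}} + \sum w_{\text{test}}}{\sum w_{\text{test}}}.
$$

Without this, the signal and background weights $s$ and $b$ entering ams@0.15 would shrink with the fold size and understate the significance.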
@ -1,5 +1,6 @@
-# include xgboost library, must set chdir=TRUE
-source("../../wrapper/xgboost.R", chdir=TRUE)
+# install xgboost package, see R-package in root folder
+require(xgboost)
+require(methods)
 
 modelfile <- "higgs.model"
 outfile <- "higgs.pred.csv"
@ -8,8 +9,8 @@ data <- as.matrix(dtest[2:31])
 idx <- dtest[[1]]
 
 xgmat <- xgb.DMatrix(data, missing = -999.0)
-bst <- xgb.Booster(params=list("nthread"=16), modelfile=modelfile)
-ypred <- xgb.predict(bst, xgmat)
+bst <- xgb.load(modelfile=modelfile)
+ypred <- predict(bst, xgmat)
 
 rorder <- rank(ypred, ties.method="first")
@ -1,5 +1,7 @@
-# include xgboost library, must set chdir=TRUE
-source("../../wrapper/xgboost.R", chdir=TRUE)
+# install xgboost package, see R-package in root folder
+require(xgboost)
+require(methods)
 
 testsize <- 550000
 
 dtrain <- read.csv("data/training.csv", header=TRUE)
@ -12,7 +14,7 @@ sumwpos <- sum(weight * (label==1.0))
 sumwneg <- sum(weight * (label==0.0))
 print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))
 
-xgmat <- xgb.DMatrix(data, info = list(label=label, weight=weight), missing = -999.0)
+xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
 param <- list("objective" = "binary:logitraw",
               "scale_pos_weight" = sumwneg / sumwpos,
               "bst:eta" = 0.1,
71 demo/kaggle-higgs/speedtest.R (Normal file)
@ -0,0 +1,71 @@
# install xgboost package, see R-package in root folder
require(xgboost)
require(gbm)
require(methods)

testsize <- 550000

dtrain <- read.csv("data/training.csv", header=TRUE, nrows=350001)

# gbm.time = system.time({
#   gbm.model <- gbm(Label ~ ., data = dtrain[, -c(1,32)], n.trees = 120,
#                    interaction.depth = 6, shrinkage = 0.1, bag.fraction = 1,
#                    verbose = TRUE)
# })
# print(gbm.time)
# Test result: 761.48 secs

dtrain[33] <- dtrain[33] == "s"
label <- as.numeric(dtrain[[33]])
data <- as.matrix(dtrain[2:31])
weight <- as.numeric(dtrain[[32]]) * testsize / length(label)

sumwpos <- sum(weight * (label==1.0))
sumwneg <- sum(weight * (label==0.0))
print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))

xgboost.time = list()
threads = c(1,2,4,8,16)
for (i in 1:length(threads)){
  thread = threads[i]
  xgboost.time[[i]] = system.time({
    xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
    param <- list("objective" = "binary:logitraw",
                  "scale_pos_weight" = sumwneg / sumwpos,
                  "bst:eta" = 0.1,
                  "bst:max_depth" = 6,
                  "eval_metric" = "auc",
                  "eval_metric" = "ams@0.15",
                  "silent" = 1,
                  "nthread" = thread)
    watchlist <- list("train" = xgmat)
    nround = 120
    print ("loading data end, start to boost trees")
    bst = xgb.train(param, xgmat, nround, watchlist);
    # save out model
    xgb.save(bst, "higgs.model")
    print ('finish training')
  })
}

xgboost.time
# [[1]]
#    user  system elapsed
#  444.98    1.96  450.22
#
# [[2]]
#    user  system elapsed
#  188.15    0.82  102.41
#
# [[3]]
#    user  system elapsed
#  143.29    0.79   44.18
#
# [[4]]
#    user  system elapsed
#  176.60    1.45   34.04
#
# [[5]]
#    user  system elapsed
#  180.15    2.85   35.26
@ -13,10 +13,10 @@ Project Logical Layout
 
 File Naming Convention
 =======
-* The project is templatized, to make it easy to adjust the input data structure.
 * .h files are data structures and interfaces, which are needed to use functions in that layer.
 * -inl.hpp files are implementations of interfaces, like the cpp files in most projects.
   - You only need to understand the interface file to understand the usage of that layer
+* In each folder, there can be a .cpp file that compiles the module of that layer
 
 How to Hack the Code
 ======
334 src/data.h
@ -7,16 +7,8 @@
 */
 #include <cstdio>
 #include <vector>
-#include <limits>
-#include <climits>
-#include <cstring>
-#include <algorithm>
-#include "utils/io.h"
-#include "utils/omp.h"
 #include "utils/utils.h"
 #include "utils/iterator.h"
-#include "utils/random.h"
-#include "utils/matrix_csr.h"
 
 namespace xgboost {
 /*!
@ -70,12 +62,12 @@ struct SparseBatch {
   /*! \brief an entry of sparse vector */
   struct Entry {
     /*! \brief feature index */
-    bst_uint findex;
+    bst_uint index;
     /*! \brief feature value */
     bst_float fvalue;
     // default constructor
     Entry(void) {}
-    Entry(bst_uint findex, bst_float fvalue) : findex(findex), fvalue(fvalue) {}
+    Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {}
     /*! \brief reversely compare feature values */
     inline static bool CmpValue(const Entry &a, const Entry &b) {
       return a.fvalue < b.fvalue;
@ -86,7 +78,7 @@ struct SparseBatch {
     /*! \brief pointer to the elements */
     const Entry *data;
     /*! \brief length of the instance */
-    const bst_uint length;
+    bst_uint length;
     /*! \brief constructor */
     Inst(const Entry *data, bst_uint length) : data(data), length(length) {}
     /*! \brief get i-th pair in the sparse vector */
@ -96,298 +88,72 @@ struct SparseBatch {
   };
   /*! \brief batch size */
   size_t size;
+};
+/*! \brief read-only row batch, used to access rows continuously */
+struct RowBatch : public SparseBatch {
   /*! \brief the offset of rowid of this batch */
   size_t base_rowid;
   /*! \brief array[size+1], row pointer of each of the elements */
-  const size_t *row_ptr;
+  const size_t *ind_ptr;
-  /*! \brief array[row_ptr.back()], content of the sparse element */
+  /*! \brief array[ind_ptr.back()], content of the sparse element */
   const Entry *data_ptr;
   /*! \brief get i-th row from the batch */
   inline Inst operator[](size_t i) const {
-    return Inst(data_ptr + row_ptr[i], static_cast<bst_uint>(row_ptr[i+1] - row_ptr[i]));
+    return Inst(data_ptr + ind_ptr[i], static_cast<bst_uint>(ind_ptr[i+1] - ind_ptr[i]));
   }
 };
 
-/**
- * \brief This is an interface convention via template, defining the way to access features;
- *  the column access rule is defined by template, for efficiency purposes,
- *  row access is defined by an iterator of sparse batches
- * \tparam Derived type of actual implementation
- */
-template<typename Derived>
-class FMatrixInterface {
- public:
-  /*! \brief example iterator over one column */
-  struct ColIter{
-    /*!
-     * \brief move to next position
-     * \return whether there is an element in the next position
-     */
-    inline bool Next(void);
-    /*! \return row index of current position */
-    inline bst_uint rindex(void) const;
-    /*! \return feature value in current position */
-    inline bst_float fvalue(void) const;
-  };
-  /*! \brief backward iterator over column */
-  struct ColBackIter : public ColIter {};
- public:
-  // column access is needed by some of the tree construction algorithms
-  /*!
-   * \brief get column iterator, the columns must be sorted by feature value
-   * \param cidx column index
-   * \return column iterator
-   */
-  inline ColIter GetSortedCol(size_t cidx) const;
-  /*!
-   * \brief get column backward iterator, starts from the biggest fvalue and iterates back
-   * \param cidx column index
-   * \return reverse column iterator
-   */
-  inline ColBackIter GetReverseSortedCol(size_t cidx) const;
-  /*!
-   * \brief get number of columns
-   * \return number of columns
-   */
-  inline size_t NumCol(void) const;
-  /*!
-   * \brief check if column access is supported; if not, initialize column access
-   * \param max_rows maximum number of rows allowed in constructor
-   */
-  inline void InitColAccess(void);
-  /*! \return whether column access is enabled */
-  inline bool HaveColAccess(void) const;
-  /*! \brief return #entries-in-col */
-  inline size_t GetColSize(size_t cidx) const;
-  /*!
-   * \brief return #entries-in-col / #rows
-   * \param cidx column index
-   *  this function is used to help speedup;
-   *  does not necessarily need to be implemented: if not sure, return 0.0
-   * \return column density
-   */
-  inline float GetColDensity(size_t cidx) const;
-  /*! \brief get the row iterator associated with FMatrix */
-  inline utils::IIterator<SparseBatch>* RowIterator(void) const;
-};
-
-/*!
- * \brief sparse matrix that supports column access, CSC
- */
-class FMatrixS : public FMatrixInterface<FMatrixS>{
- public:
-  typedef SparseBatch::Entry Entry;
-  /*! \brief column iterator */
-  struct ColIter{
-    const Entry *dptr_, *end_;
-    ColIter(const Entry* begin, const Entry* end)
-        :dptr_(begin), end_(end) {}
-    inline bool Next(void) {
-      if (dptr_ == end_) {
-        return false;
-      } else {
-        ++dptr_; return true;
-      }
-    }
-    inline bst_uint rindex(void) const {
-      return dptr_->findex;
-    }
-    inline bst_float fvalue(void) const {
-      return dptr_->fvalue;
-    }
-  };
-  /*! \brief reverse column iterator */
-  struct ColBackIter : public ColIter {
-    ColBackIter(const Entry* dptr, const Entry* end) : ColIter(dptr, end) {}
-    // shadows ColIter::Next
-    inline bool Next(void) {
-      if (dptr_ == end_) {
-        return false;
-      } else {
-        --dptr_; return true;
-      }
-    }
-  };
-  /*! \brief constructor */
-  FMatrixS(void) {
-    iter_ = NULL;
-  }
-  // destructor
-  ~FMatrixS(void) {
-    if (iter_ != NULL) delete iter_;
-  }
-  /*! \return whether column access is enabled */
-  inline bool HaveColAccess(void) const {
-    return col_ptr_.size() != 0;
-  }
-  /*! \brief get number of columns */
-  inline size_t NumCol(void) const {
-    utils::Check(this->HaveColAccess(), "NumCol:need column access");
-    return col_ptr_.size() - 1;
-  }
-  /*! \brief get number of buffered rows */
-  inline const std::vector<bst_uint> buffered_rowset(void) const {
-    return buffered_rowset_;
-  }
-  /*! \brief get col sorted iterator */
-  inline ColIter GetSortedCol(size_t cidx) const {
-    utils::Assert(cidx < this->NumCol(), "col id exceed bound");
-    return ColIter(&col_data_[0] + col_ptr_[cidx] - 1,
-                   &col_data_[0] + col_ptr_[cidx + 1] - 1);
-  }
-  /*!
-   * \brief get reversed col iterator;
-   *  this function will be deprecated at some point
-   */
-  inline ColBackIter GetReverseSortedCol(size_t cidx) const {
-    utils::Assert(cidx < this->NumCol(), "col id exceed bound");
-    return ColBackIter(&col_data_[0] + col_ptr_[cidx + 1],
-                       &col_data_[0] + col_ptr_[cidx]);
-  }
-  /*! \brief get col size */
-  inline size_t GetColSize(size_t cidx) const {
-    return col_ptr_[cidx+1] - col_ptr_[cidx];
-  }
-  /*! \brief get column density */
-  inline float GetColDensity(size_t cidx) const {
-    size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
-    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
-  }
-  inline void InitColAccess(float pkeep = 1.0f) {
-    if (this->HaveColAccess()) return;
-    this->InitColData(pkeep);
-  }
-  /*!
-   * \brief get the row iterator associated with FMatrix;
-   *  this function is not threadsafe, it returns the iterator stored in FMatrixS
-   */
-  inline utils::IIterator<SparseBatch>* RowIterator(void) const {
-    iter_->BeforeFirst();
-    return iter_;
-  }
-  /*! \brief set iterator */
-  inline void set_iter(utils::IIterator<SparseBatch> *iter) {
-    this->iter_ = iter;
-  }
-  /*!
-   * \brief save column access data into stream
-   * \param fo output stream to save to
-   */
-  inline void SaveColAccess(utils::IStream &fo) const {
-    fo.Write(buffered_rowset_);
-    if (buffered_rowset_.size() != 0) {
-      SaveBinary(fo, col_ptr_, col_data_);
-    }
-  }
-  /*!
-   * \brief load column access data from stream
-   * \param fi input stream to load from
-   */
-  inline void LoadColAccess(utils::IStream &fi) {
-    utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
-    if (buffered_rowset_.size() != 0) {
-      LoadBinary(fi, &col_ptr_, &col_data_);
-    }
-  }
-  /*!
-   * \brief save data to binary stream
-   * \param fo output stream
-   * \param ptr pointer data
-   * \param data data content
-   */
-  inline static void SaveBinary(utils::IStream &fo,
-                                const std::vector<size_t> &ptr,
-                                const std::vector<SparseBatch::Entry> &data) {
-    size_t nrow = ptr.size() - 1;
-    fo.Write(&nrow, sizeof(size_t));
-    fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
-    if (data.size() != 0) {
-      fo.Write(&data[0], data.size() * sizeof(SparseBatch::Entry));
-    }
-  }
-  /*!
-   * \brief load data from binary stream
-   * \param fi input stream
-   * \param out_ptr pointer data
-   * \param out_data data content
-   */
-  inline static void LoadBinary(utils::IStream &fi,
-                                std::vector<size_t> *out_ptr,
-                                std::vector<SparseBatch::Entry> *out_data) {
-    size_t nrow;
-    utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
-    out_ptr->resize(nrow + 1);
-    utils::Check(fi.Read(&(*out_ptr)[0], out_ptr->size() * sizeof(size_t)) != 0,
-                 "invalid input file format");
-    out_data->resize(out_ptr->back());
-    if (out_data->size() != 0) {
-      utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(SparseBatch::Entry)) != 0,
-                    "invalid input file format");
-    }
-  }
-
- protected:
-  /*!
-   * \brief initialize column data
-   * \param pkeep probability to keep a row
-   */
-  inline void InitColData(float pkeep) {
-    buffered_rowset_.clear();
-    // note: this part of the code is serial; todo: parallelize this transformer
-    utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_);
-    builder.InitBudget(0);
-    // start working
-    iter_->BeforeFirst();
-    while (iter_->Next()) {
-      const SparseBatch &batch = iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
-          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
-          SparseBatch::Inst inst = batch[i];
-          for (bst_uint j = 0; j < inst.length; ++j) {
-            builder.AddBudget(inst[j].findex);
-          }
-        }
-      }
-    }
-    builder.InitStorage();
-    iter_->BeforeFirst();
-    size_t ktop = 0;
-    while (iter_->Next()) {
-      const SparseBatch &batch = iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        if (ktop < buffered_rowset_.size() &&
-            buffered_rowset_[ktop] == batch.base_rowid+i) {
-          ++ktop;
-          SparseBatch::Inst inst = batch[i];
-          for (bst_uint j = 0; j < inst.length; ++j) {
-            builder.PushElem(inst[j].findex,
-                             Entry((bst_uint)(batch.base_rowid+i),
-                                   inst[j].fvalue));
-          }
-        }
-      }
-    }
-    // sort columns
-    bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ncol; ++i) {
-      std::sort(&col_data_[0] + col_ptr_[i],
-                &col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
-    }
-  }
-
- private:
-  // --- data structures used to support InitColAccess ---
-  utils::IIterator<SparseBatch> *iter_;
-  /*! \brief list of row indices that are buffered */
-  std::vector<bst_uint> buffered_rowset_;
-  /*! \brief column pointer of CSC format */
-  std::vector<size_t> col_ptr_;
-  /*! \brief column data in CSC format */
-  std::vector<SparseBatch::Entry> col_data_;
+/*!
+ * \brief read-only column batch, used to access columns;
+ *  the columns are not required to be continuous
+ */
+struct ColBatch : public SparseBatch {
+  /*! \brief column index of each of the columns in the data */
+  const bst_uint *col_index;
+  /*! \brief pointer to the column data */
+  const Inst *col_data;
+  /*! \brief get i-th column from the batch */
+  inline Inst operator[](size_t i) const {
+    return col_data[i];
+  }
+};
+/**
+ * \brief interface of feature matrix, needed for tree construction;
+ *  this interface defines two ways to access features:
+ *  row access is defined by the iterator of RowBatch,
+ *  col access is optional, checked by HaveColAccess, and defined by the iterator of ColBatch
+ */
+class IFMatrix {
+ public:
+  // the interface only needs to guarantee the row iterator;
+  // the column iter is active when ColIterator is called, row_iter can be disabled
+  /*! \brief get the row iterator associated with FMatrix */
+  virtual utils::IIterator<RowBatch> *RowIterator(void) = 0;
+  /*! \brief get column iterator */
+  virtual utils::IIterator<ColBatch> *ColIterator(void) = 0;
+  /*!
+   * \brief get the column iterator associated with FMatrix with a subset of column features
+   * \param fset the list of column indices that must be contained in the returned column iterator
+   * \return the column iterator, initialized so that it reads the elements in fset
+   */
+  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) = 0;
+  /*!
+   * \brief check if column access is supported; if not, initialize column access
+   * \param subsample subsample ratio when generating column access
+   */
+  virtual void InitColAccess(float subsample) = 0;
+  // the following are column meta data; should be able to answer them fast
+  /*! \return whether column access is enabled */
+  virtual bool HaveColAccess(void) const = 0;
+  /*! \return number of columns in the FMatrix */
+  virtual size_t NumCol(void) const = 0;
+  /*! \brief get number of non-missing entries in column */
+  virtual size_t GetColSize(size_t cidx) const = 0;
+  /*! \brief get column density */
+  virtual float GetColDensity(size_t cidx) const = 0;
+  /*! \brief reference of buffered rowset */
+  virtual const std::vector<bst_uint> &buffered_rowset(void) const = 0;
+  // virtual destructor
+  virtual ~IFMatrix(void){}
 };
 } // namespace xgboost
 #endif  // XGBOOST_DATA_H
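The RowBatch and ColBatch layouts above are plain CSR/CSC indexing. A small Python sketch of the access pattern the new interface encodes (illustrative only; the names mirror the C++ fields, not a real API):

```python
# CSR-style row access, mirroring RowBatch::operator[]:
# row i of the batch is the slice data[ind_ptr[i] : ind_ptr[i+1]]
ind_ptr = [0, 2, 5, 6]                 # array[size+1] of row offsets
data = [(0, 1.0), (3, 0.5),            # (feature index, value) entries
        (1, 2.0), (2, 4.0), (3, 1.5),
        (0, 3.0)]

def row(i):
    return data[ind_ptr[i]:ind_ptr[i + 1]]

print(row(1))  # [(1, 2.0), (2, 4.0), (3, 1.5)]

# ColBatch is the transposed view: col_index names which columns are
# present, and col_data[i] holds the (row index, value) pairs of column i,
# kept sorted by feature value so tree construction can scan split candidates.
```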
@ -18,13 +18,13 @@ namespace gbm {
  * \brief gradient boosted linear model
  * \tparam FMatrix the data type updater taking
  */
-template<typename FMatrix>
-class GBLinear : public IGradBooster<FMatrix> {
+class GBLinear : public IGradBooster {
  public:
   virtual ~GBLinear(void) {
   }
   // set model parameters
   virtual void SetParam(const char *name, const char *val) {
+    using namespace std;
     if (!strncmp(name, "bst:", 4)) {
       param.SetParam(name + 4, val);
     }
@ -41,13 +41,12 @@ class GBLinear : public IGradBooster {
   virtual void InitModel(void) {
     model.InitModel();
   }
-  virtual void DoBoost(const FMatrix &fmat,
+  virtual void DoBoost(IFMatrix *p_fmat,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) {
-    this->InitFeatIndex(fmat);
     std::vector<bst_gpair> &gpair = *in_gpair;
     const int ngroup = model.param.num_output_group;
-    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
     // for all the output group
     for (int gid = 0; gid < ngroup; ++gid) {
       double sum_grad = 0.0, sum_hess = 0.0;
@ -72,16 +71,20 @@ class GBLinear : public IGradBooster {
       }
     }
   }
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    while (iter->Next()) {
     // number of features
-    const bst_omp_uint nfeat = static_cast<bst_omp_uint>(feat_index.size());
+      const ColBatch &batch = iter->Value();
+      const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
     #pragma omp parallel for schedule(static)
     for (bst_omp_uint i = 0; i < nfeat; ++i) {
-      const bst_uint fid = feat_index[i];
+      const bst_uint fid = batch.col_index[i];
+      ColBatch::Inst col = batch[i];
       for (int gid = 0; gid < ngroup; ++gid) {
         double sum_grad = 0.0, sum_hess = 0.0;
-        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
-          const float v = it.fvalue();
-          bst_gpair &p = gpair[it.rindex() * ngroup + gid];
+        for (bst_uint j = 0; j < col.length; ++j) {
+          const float v = col[j].fvalue;
+          bst_gpair &p = gpair[col[j].index * ngroup + gid];
           if (p.hess < 0.0f) continue;
           sum_grad += p.grad * v;
           sum_hess += p.hess * v * v;
@ -90,27 +93,30 @@ class GBLinear : public IGradBooster {
         bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
         w += dw;
         // update grad value
-        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
-          bst_gpair &p = gpair[it.rindex() * ngroup + gid];
+        for (bst_uint j = 0; j < col.length; ++j) {
+          bst_gpair &p = gpair[col[j].index * ngroup + gid];
           if (p.hess < 0.0f) continue;
-          p.grad += p.hess * it.fvalue() * dw;
+          p.grad += p.hess * col[j].fvalue * dw;
+        }
         }
       }
     }
   }
 
-  virtual void Predict(const FMatrix &fmat,
+  virtual void Predict(IFMatrix *p_fmat,
                        int64_t buffer_offset,
                        const BoosterInfo &info,
-                       std::vector<float> *out_preds) {
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit = 0) {
+    utils::Check(ntree_limit == 0,
+                 "GBLinear::Predict ntrees is only valid for gbtree predictor");
     std::vector<float> &preds = *out_preds;
     preds.resize(0);
     // start collecting the prediction
-    utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
-    iter->BeforeFirst();
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
     const int ngroup = model.param.num_output_group;
     while (iter->Next()) {
-      const SparseBatch &batch = iter->Value();
+      const RowBatch &batch = iter->Value();
       utils::Assert(batch.base_rowid * ngroup == preds.size(),
                     "base_rowid is not set correctly");
       // output convention: nrow * k, where nrow is number of rows
@ -134,23 +140,11 @@ class GBLinear : public IGradBooster {
   }
 
  protected:
-  inline void InitFeatIndex(const FMatrix &fmat) {
-    if (feat_index.size() != 0) return;
-    // initialize feature index
-    unsigned ncol = static_cast<unsigned>(fmat.NumCol());
-    feat_index.reserve(ncol);
-    for (unsigned i = 0; i < ncol; ++i) {
-      if (fmat.GetColSize(i) != 0) {
-        feat_index.push_back(i);
-      }
-    }
-    random::Shuffle(feat_index);
-  }
-  inline void Pred(const SparseBatch::Inst &inst, float *preds) {
+  inline void Pred(const RowBatch::Inst &inst, float *preds) {
     for (int gid = 0; gid < model.param.num_output_group; ++gid) {
       float psum = model.bias()[gid];
       for (bst_uint i = 0; i < inst.length; ++i) {
-        psum += inst[i].fvalue * model[inst[i].findex][gid];
+        psum += inst[i].fvalue * model[inst[i].index][gid];
       }
       preds[gid] = psum;
     }
@ -173,6 +167,7 @@ class GBLinear : public IGradBooster {
       learning_rate = 1.0f;
     }
     inline void SetParam(const char *name, const char *val) {
+      using namespace std;
       // sync-names
       if (!strcmp("eta", name)) learning_rate = static_cast<float>(atof(val));
       if (!strcmp("lambda", name)) reg_lambda = static_cast<float>(atof(val));
@ -214,9 +209,10 @@ class GBLinear : public IGradBooster {
     Param(void) {
       num_feature = 0;
       num_output_group = 1;
-      memset(reserved, 0, sizeof(reserved));
+      std::memset(reserved, 0, sizeof(reserved));
     }
     inline void SetParam(const char *name, const char *val) {
+      using namespace std;
       if (!strcmp(name, "bst:num_feature")) num_feature = atoi(val);
       if (!strcmp(name, "num_output_group")) num_output_group = atoi(val);
     }
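Each weight update in GBLinear::DoBoost is one coordinate descent step on the regularized quadratic approximation of the loss. With the accumulated first and second order statistics over a column's non-missing entries, $G_f = \sum_i g_i x_{if}$ and $H_f = \sum_i h_i x_{if}^2$, a plausible form of the delta computed by param.CalcDelta is (the exact handling of the L1 term alpha is not shown in this diff, so treat this as a sketch):

$$
\Delta w_f = -\,\eta \cdot \frac{G_f + \lambda\, w_f}{H_f + \lambda},
$$

after which the cached gradients are corrected in place, $g_i \leftarrow g_i + h_i\, x_{if}\, \Delta w_f$, which is exactly what the second inner loop over col does.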
19 src/gbm/gbm.cpp (Normal file)
@ -0,0 +1,19 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <cstring>
#include "./gbm.h"
#include "./gbtree-inl.hpp"
#include "./gblinear-inl.hpp"

namespace xgboost {
namespace gbm {
IGradBooster* CreateGradBooster(const char *name) {
  using namespace std;
  if (!strcmp("gbtree", name)) return new GBTree();
  if (!strcmp("gblinear", name)) return new GBLinear();
  utils::Error("unknown booster type: %s", name);
  return NULL;
}
}  // namespace gbm
}  // namespace xgboost
@ -7,6 +7,7 @@
  */
 #include <vector>
 #include "../data.h"
+#include "../utils/io.h"
 #include "../utils/fmap.h"
 
 namespace xgboost {
@ -14,9 +15,7 @@ namespace xgboost {
 namespace gbm {
 /*!
  * \brief interface of gradient boosting model
- * \tparam FMatrix the data type updater taking
  */
-template<typename FMatrix>
 class IGradBooster {
  public:
   /*!
@ -41,28 +40,31 @@ class IGradBooster {
   virtual void InitModel(void) = 0;
   /*!
    * \brief perform update to the model (boosting)
-   * \param fmat feature matrix that provides access to features
+   * \param p_fmat feature matrix that provides access to features
    * \param info meta information about training
    * \param in_gpair address of the gradient pair statistics of the data;
    *  the booster may change the content of gpair
    */
-  virtual void DoBoost(const FMatrix &fmat,
+  virtual void DoBoost(IFMatrix *p_fmat,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) = 0;
   /*!
    * \brief generate predictions for the given feature matrix
-   * \param fmat feature matrix
+   * \param p_fmat feature matrix
    * \param buffer_offset buffer index offset of these instances; if it equals -1,
    *  this means we do not have a buffer index allocated to the gbm;
    *  a buffer index is assigned to each instance that requires repetitive prediction;
    *  the size of the buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size")
    * \param info extra side information that may be needed for prediction
    * \param out_preds output vector to hold the predictions
+   * \param ntree_limit limit the number of trees used in prediction; when it equals 0,
+   *  we do not limit the number of trees; this parameter is only valid for gbtree, not for gblinear
    */
-  virtual void Predict(const FMatrix &fmat,
+  virtual void Predict(IFMatrix *p_fmat,
                        int64_t buffer_offset,
                        const BoosterInfo &info,
-                       std::vector<float> *out_preds) = 0;
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit = 0) = 0;
   /*!
    * \brief dump the model in text format
    * \param fmap feature map that may help give interpretations of features
@ -73,21 +75,11 @@ class IGradBooster {
   // destructor
   virtual ~IGradBooster(void){}
 };
-} // namespace gbm
-} // namespace xgboost
-
-#include "gbtree-inl.hpp"
-#include "gblinear-inl.hpp"
-
-namespace xgboost {
-namespace gbm {
-template<typename FMatrix>
-inline IGradBooster<FMatrix>* CreateGradBooster(const char *name) {
-  if (!strcmp("gbtree", name)) return new GBTree<FMatrix>();
-  if (!strcmp("gblinear", name)) return new GBLinear<FMatrix>();
-  utils::Error("unknown booster type: %s", name);
-  return NULL;
-}
+/*!
+ * \brief create a gradient booster from given name
+ * \param name name of gradient booster
+ */
+IGradBooster* CreateGradBooster(const char *name);
 } // namespace gbm
 } // namespace xgboost
 #endif  // XGBOOST_GBM_GBM_H_
@ -9,21 +9,21 @@
|
|||||||
#include <utility>
|
#include <utility>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include "./gbm.h"
|
#include "./gbm.h"
|
||||||
|
#include "../utils/omp.h"
|
||||||
#include "../tree/updater.h"
|
#include "../tree/updater.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace gbm {
|
namespace gbm {
|
||||||
/*!
|
/*!
|
||||||
* \brief gradient boosted tree
|
* \brief gradient boosted tree
|
||||||
* \tparam FMatrix the data type updater taking
|
|
||||||
*/
|
*/
|
||||||
template<typename FMatrix>
|
class GBTree : public IGradBooster {
|
||||||
class GBTree : public IGradBooster<FMatrix> {
|
|
||||||
public:
|
public:
|
||||||
virtual ~GBTree(void) {
|
virtual ~GBTree(void) {
|
||||||
this->Clear();
|
this->Clear();
|
||||||
}
|
}
|
||||||
virtual void SetParam(const char *name, const char *val) {
|
virtual void SetParam(const char *name, const char *val) {
|
||||||
|
using namespace std;
|
||||||
if (!strncmp(name, "bst:", 4)) {
|
if (!strncmp(name, "bst:", 4)) {
|
||||||
cfg.push_back(std::make_pair(std::string(name+4), std::string(val)));
|
cfg.push_back(std::make_pair(std::string(name+4), std::string(val)));
|
||||||
// set into updaters, if already intialized
|
// set into updaters, if already intialized
|
||||||
@ -82,12 +82,12 @@ class GBTree : public IGradBooster<FMatrix> {
|
|||||||
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
|
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
|
||||||
utils::Assert(trees.size() == 0, "GBTree: model already initialized");
|
utils::Assert(trees.size() == 0, "GBTree: model already initialized");
|
||||||
}
|
}
|
||||||
virtual void DoBoost(const FMatrix &fmat,
|
virtual void DoBoost(IFMatrix *p_fmat,
|
||||||
const BoosterInfo &info,
|
const BoosterInfo &info,
|
||||||
std::vector<bst_gpair> *in_gpair) {
|
std::vector<bst_gpair> *in_gpair) {
|
||||||
const std::vector<bst_gpair> &gpair = *in_gpair;
|
const std::vector<bst_gpair> &gpair = *in_gpair;
|
||||||
if (mparam.num_output_group == 1) {
|
if (mparam.num_output_group == 1) {
|
||||||
this->BoostNewTrees(gpair, fmat, info, 0);
|
this->BoostNewTrees(gpair, p_fmat, info, 0);
|
||||||
} else {
|
} else {
|
||||||
const int ngroup = mparam.num_output_group;
|
const int ngroup = mparam.num_output_group;
|
||||||
utils::Check(gpair.size() % ngroup == 0,
|
utils::Check(gpair.size() % ngroup == 0,
|
||||||
@ -99,14 +99,15 @@ class GBTree : public IGradBooster<FMatrix> {
|
|||||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||||
tmp[i] = gpair[i * ngroup + gid];
|
tmp[i] = gpair[i * ngroup + gid];
|
||||||
}
|
}
|
||||||
this->BoostNewTrees(tmp, fmat, info, gid);
|
this->BoostNewTrees(tmp, p_fmat, info, gid);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
virtual void Predict(const FMatrix &fmat,
|
virtual void Predict(IFMatrix *p_fmat,
|
||||||
int64_t buffer_offset,
|
int64_t buffer_offset,
|
||||||
const BoosterInfo &info,
|
const BoosterInfo &info,
|
||||||
std::vector<float> *out_preds) {
|
std::vector<float> *out_preds,
|
||||||
|
unsigned ntree_limit = 0) {
|
||||||
int nthread;
|
int nthread;
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
@ -118,17 +119,13 @@ class GBTree : public IGradBooster<FMatrix> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<float> &preds = *out_preds;
|
std::vector<float> &preds = *out_preds;
|
||||||
preds.resize(0);
|
const size_t stride = info.num_row * mparam.num_output_group;
|
||||||
|
preds.resize(stride * (mparam.size_leaf_vector+1));
|
||||||
// start collecting the prediction
|
// start collecting the prediction
|
||||||
utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
|
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
|
||||||
iter->BeforeFirst();
|
iter->BeforeFirst();
|
||||||
while (iter->Next()) {
|
while (iter->Next()) {
|
||||||
const SparseBatch &batch = iter->Value();
|
const RowBatch &batch = iter->Value();
|
||||||
utils::Assert(batch.base_rowid * mparam.num_output_group == preds.size(),
|
|
||||||
"base_rowid is not set correctly");
|
|
||||||
// output convention: nrow * k, where nrow is number of rows
|
|
||||||
// k is number of group
|
|
||||||
preds.resize(preds.size() + batch.size * mparam.num_output_group);
|
|
||||||
// parallel over local batch
|
// parallel over local batch
|
||||||
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
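Note on the new output layout: instead of growing `preds` batch by batch, Predict now sizes the whole buffer up front as `stride * (size_leaf_vector + 1)` floats, with `stride = num_row * num_output_group`. A minimal standalone sketch of the indexing convention (sizes illustrative, not from the diff):

```cpp
#include <vector>

int main() {
  // illustrative sizes: 4 rows, 3 output groups, 1 extra leaf-vector slot
  const size_t num_row = 4, num_output_group = 3, size_leaf_vector = 1;
  const size_t stride = num_row * num_output_group;
  std::vector<float> preds(stride * (size_leaf_vector + 1), 0.0f);
  // the scalar prediction for row 2, group 1 lives in the first stride block
  const size_t ridx = 2, gid = 1;
  float *out = &preds[ridx * num_output_group + gid];
  out[0] = 0.5f;           // what Pred writes to out_pred[0]
  out[stride * 1] = 1.0f;  // leaf-vector entry i lands at out_pred[stride * (i + 1)]
  return 0;
}
```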
@@ -136,13 +133,14 @@ class GBTree : public IGradBooster<FMatrix> {
       const int tid = omp_get_thread_num();
       tree::RegTree::FVec &feats = thread_temp[tid];
       int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
-      const unsigned root_idx = info.GetRoot(ridx);
+      utils::Assert(static_cast<size_t>(ridx) < info.num_row, "data row index exceed bound");
       // loop over output groups
       for (int gid = 0; gid < mparam.num_output_group; ++gid) {
-        preds[ridx * mparam.num_output_group + gid] =
         this->Pred(batch[i],
                    buffer_offset < 0 ? -1 : buffer_offset + ridx,
-                   gid, root_idx, &feats);
+                   gid, info.GetRoot(ridx), &feats,
+                   &preds[ridx * mparam.num_output_group + gid], stride,
+                   ntree_limit);
       }
     }
   }
@@ -174,20 +172,20 @@ class GBTree : public IGradBooster<FMatrix> {
     updaters.clear();
     std::string tval = tparam.updater_seq;
     char *pstr;
-    pstr = strtok(&tval[0], ",");
+    pstr = std::strtok(&tval[0], ",");
     while (pstr != NULL) {
-      updaters.push_back(tree::CreateUpdater<FMatrix>(pstr));
+      updaters.push_back(tree::CreateUpdater(pstr));
       for (size_t j = 0; j < cfg.size(); ++j) {
         // set parameters
         updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
       }
-      pstr = strtok(NULL, ",");
+      pstr = std::strtok(NULL, ",");
     }
     tparam.updater_initialized = 1;
   }
   // do group specific group
   inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
-                            const FMatrix &fmat,
+                            IFMatrix *p_fmat,
                             const BoosterInfo &info,
                             int bst_group) {
     this->InitUpdater();
@@ -202,7 +200,7 @@ class GBTree : public IGradBooster<FMatrix> {
     }
     // update the trees
     for (size_t i = 0; i < updaters.size(); ++i) {
-      updaters[i]->Update(gpair, fmat, info, new_trees);
+      updaters[i]->Update(gpair, p_fmat, info, new_trees);
     }
     // push back to model
     for (size_t i = 0; i < new_trees.size(); ++i) {
@@ -212,34 +210,53 @@ class GBTree : public IGradBooster<FMatrix> {
     mparam.num_trees += tparam.num_parallel_tree;
   }
   // make a prediction for a single instance
-  inline float Pred(const SparseBatch::Inst &inst,
+  inline void Pred(const RowBatch::Inst &inst,
                    int64_t buffer_index,
                    int bst_group,
                    unsigned root_index,
-                   tree::RegTree::FVec *p_feats) {
+                   tree::RegTree::FVec *p_feats,
+                   float *out_pred, size_t stride, unsigned ntree_limit) {
     size_t itop = 0;
     float psum = 0.0f;
+    // sum of leaf vector
+    std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
     const int64_t bid = mparam.BufferOffset(buffer_index, bst_group);
+    // number of valid trees
+    unsigned treeleft = ntree_limit == 0 ? std::numeric_limits<unsigned>::max() : ntree_limit;
     // load buffered results if any
-    if (bid >= 0) {
+    if (bid >= 0 && ntree_limit == 0) {
       itop = pred_counter[bid];
       psum = pred_buffer[bid];
+      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+        vec_psum[i] = pred_buffer[bid + i + 1];
+      }
     }
     if (itop != trees.size()) {
       p_feats->Fill(inst);
       for (size_t i = itop; i < trees.size(); ++i) {
         if (tree_info[i] == bst_group) {
-          psum += trees[i]->Predict(*p_feats, root_index);
+          int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
+          psum += (*trees[i])[tid].leaf_value();
+          for (int j = 0; j < mparam.size_leaf_vector; ++j) {
+            vec_psum[j] += trees[i]->leafvec(tid)[j];
+          }
+          if(--treeleft == 0) break;
         }
       }
       p_feats->Drop(inst);
     }
     // updated the buffered results
-    if (bid >= 0) {
+    if (bid >= 0 && ntree_limit == 0) {
       pred_counter[bid] = static_cast<unsigned>(trees.size());
       pred_buffer[bid] = psum;
+      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+        pred_buffer[bid + i + 1] = vec_psum[i];
+      }
     }
-    return psum;
+    out_pred[0] = psum;
+    for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+      out_pred[stride * (i + 1)] = vec_psum[i];
+    }
   }
   // --- data structure ---
   /*! \brief training parameters */
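The `ntree_limit` argument acts as a per-group tree budget inside Pred: 0 keeps the old behavior (all trees, with the incremental prediction buffer), while a nonzero value caps how many trees of the requested group contribute and bypasses the buffer entirely, since a truncated sum must not be cached. A standalone sketch of the countdown, with tree evaluation stubbed out by an array:

```cpp
#include <cstdio>
#include <limits>

int main() {
  const unsigned ntree_limit = 2;            // 0 would mean "use every tree"
  const int tree_group[] = {0, 1, 0, 0, 1};  // stand-in for tree_info[]
  const float leaf[] = {0.1f, 9.0f, 0.2f, 0.4f, 9.0f};
  const int bst_group = 0;
  unsigned treeleft = ntree_limit == 0
      ? std::numeric_limits<unsigned>::max() : ntree_limit;
  float psum = 0.0f;
  for (int i = 0; i < 5; ++i) {
    if (tree_group[i] != bst_group) continue;  // other groups never count
    psum += leaf[i];
    if (--treeleft == 0) break;                // budget applies to this group only
  }
  std::printf("psum = %g\n", psum);  // 0.3: just the first two group-0 trees
  return 0;
}
```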
@@ -263,6 +280,7 @@ class GBTree : public IGradBooster<FMatrix> {
       updater_initialized = 0;
     }
     inline void SetParam(const char *name, const char *val){
+      using namespace std;
       if (!strcmp(name, "updater") &&
           strcmp(updater_seq.c_str(), val) != 0) {
         updater_seq = val;
@@ -292,15 +310,18 @@ class GBTree : public IGradBooster<FMatrix> {
      * suppose we have n instance and k group, output will be k*n
      */
     int num_output_group;
+    /*! \brief size of leaf vector needed in tree */
+    int size_leaf_vector;
     /*! \brief reserved parameters */
-    int reserved[32];
+    int reserved[31];
     /*! \brief constructor */
     ModelParam(void) {
       num_trees = 0;
       num_roots = num_feature = 0;
       num_pbuffer = 0;
       num_output_group = 1;
-      memset(reserved, 0, sizeof(reserved));
+      size_leaf_vector = 0;
+      std::memset(reserved, 0, sizeof(reserved));
     }
     /*!
      * \brief set parameters from outside
@@ -308,14 +329,16 @@ class GBTree : public IGradBooster<FMatrix> {
      * \param val value of the parameter
      */
     inline void SetParam(const char *name, const char *val) {
+      using namespace std;
       if (!strcmp("num_pbuffer", name)) num_pbuffer = atol(val);
       if (!strcmp("num_output_group", name)) num_output_group = atol(val);
       if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
       if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
+      if (!strcmp("bst:size_leaf_vector", name)) size_leaf_vector = atoi(val);
     }
     /*! \return size of prediction buffer actually needed */
     inline size_t PredBufferSize(void) const {
-      return num_output_group * num_pbuffer;
+      return num_output_group * num_pbuffer * (size_leaf_vector + 1);
     }
     /*!
      * \brief get the buffer offset given a buffer index and group id
@@ -324,7 +347,7 @@ class GBTree : public IGradBooster<FMatrix> {
     inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const {
       if (buffer_index < 0) return -1;
       utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
-      return buffer_index + num_pbuffer * bst_group;
+      return (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
     }
   };
   // training parameter
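Worked numbers for the enlarged prediction buffer: each (row, group) slot now holds `size_leaf_vector + 1` floats, so both `PredBufferSize` and `BufferOffset` scale by that factor. A standalone check with illustrative values:

```cpp
#include <cassert>
#include <stdint.h>

int main() {
  const int64_t num_pbuffer = 100;  // buffered rows
  const int num_output_group = 3;   // output groups
  const int size_leaf_vector = 2;   // extra cached floats per prediction
  // PredBufferSize(): 3 * 100 * (2 + 1) = 900 floats in total
  assert(num_output_group * num_pbuffer * (size_leaf_vector + 1) == 900);
  // BufferOffset(buffer_index = 5, bst_group = 1): (5 + 100 * 1) * 3 = 315
  const int64_t bid = (5 + num_pbuffer * 1) * (size_leaf_vector + 1);
  assert(bid == 315);  // pred_buffer[315] = psum, [316..317] = leaf vector
  return 0;
}
```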
@@ -345,7 +368,7 @@ class GBTree : public IGradBooster<FMatrix> {
   // temporal storage for per thread
   std::vector<tree::RegTree::FVec> thread_temp;
   // the updaters that can be applied to each of tree
-  std::vector< tree::IUpdater<FMatrix>* > updaters;
+  std::vector<tree::IUpdater*> updaters;
 };
 
 } // namespace gbm
@@ -13,7 +13,7 @@ namespace xgboost {
 /*! \brief namespace related to data format */
 namespace io {
 /*! \brief DMatrix object that I/O module support save/load */
-typedef learner::DMatrix<FMatrixS> DataMatrix;
+typedef learner::DMatrix DataMatrix;
 /*!
  * \brief load DataMatrix from stream
  * \param fname file name to be loaded
@@ -16,6 +16,7 @@
 #include "../utils/utils.h"
 #include "../learner/dmatrix.h"
 #include "./io.h"
+#include "./simple_fmatrix-inl.hpp"
 
 namespace xgboost {
 namespace io {
@@ -24,11 +25,16 @@ class DMatrixSimple : public DataMatrix {
  public:
   // constructor
   DMatrixSimple(void) : DataMatrix(kMagic) {
-    this->fmat.set_iter(new OneBatchIter(this));
+    fmat_ = new FMatrixS(new OneBatchIter(this));
     this->Clear();
   }
   // virtual destructor
-  virtual ~DMatrixSimple(void) {}
+  virtual ~DMatrixSimple(void) {
+    delete fmat_;
+  }
+  virtual IFMatrix *fmat(void) const {
+    return fmat_;
+  }
   /*! \brief clear the storage */
   inline void Clear(void) {
     row_ptr_.clear();
@@ -41,15 +47,17 @@ class DMatrixSimple : public DataMatrix {
     this->info = src.info;
     this->Clear();
     // clone data content in thos matrix
-    utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
-      const SparseBatch &batch = iter->Value();
+      const RowBatch &batch = iter->Value();
       for (size_t i = 0; i < batch.size; ++i) {
-        SparseBatch::Inst inst = batch[i];
+        RowBatch::Inst inst = batch[i];
         row_data_.resize(row_data_.size() + inst.length);
-        memcpy(&row_data_[row_ptr_.back()], inst.data,
-               sizeof(SparseBatch::Entry) * inst.length);
+        if (inst.length != 0) {
+          std::memcpy(&row_data_[row_ptr_.back()], inst.data,
+                      sizeof(RowBatch::Entry) * inst.length);
+        }
         row_ptr_.push_back(row_ptr_.back() + inst.length);
       }
     }
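The new `inst.length != 0` guard is not cosmetic: for an empty row, `&row_data_[row_ptr_.back()]` indexes one past the end of the vector, and `std::memcpy` from such a pointer (or from a null `inst.data`) is undefined behavior even with a zero byte count. A minimal sketch of the same pattern:

```cpp
#include <cstring>
#include <vector>

// append n ints from src; the guard mirrors the inst.length check above
void Append(std::vector<int> *dst, const int *src, size_t n) {
  const size_t old_size = dst->size();
  dst->resize(old_size + n);
  if (n != 0) {  // never form &(*dst)[old_size] or touch src when n == 0
    std::memcpy(&(*dst)[old_size], src, sizeof(int) * n);
  }
}

int main() {
  std::vector<int> dst;
  Append(&dst, NULL, 0);  // safe: the guard skips the memcpy entirely
  const int row[] = {1, 2, 3};
  Append(&dst, row, 3);
  return dst.size() == 3 ? 0 : 1;
}
```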
@@ -59,10 +67,10 @@ class DMatrixSimple : public DataMatrix {
    * \param feats features
    * \return the index of added row
    */
-  inline size_t AddRow(const std::vector<SparseBatch::Entry> &feats) {
+  inline size_t AddRow(const std::vector<RowBatch::Entry> &feats) {
     for (size_t i = 0; i < feats.size(); ++i) {
       row_data_.push_back(feats[i]);
-      info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].findex+1));
+      info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].index+1));
     }
     row_ptr_.push_back(row_ptr_.back() + feats.size());
     info.info.num_row += 1;
@@ -74,14 +82,15 @@ class DMatrixSimple : public DataMatrix {
    * \param silent whether print information or not
    */
   inline void LoadText(const char* fname, bool silent = false) {
+    using namespace std;
     this->Clear();
     FILE* file = utils::FopenCheck(fname, "r");
     float label; bool init = true;
     char tmp[1024];
-    std::vector<SparseBatch::Entry> feats;
+    std::vector<RowBatch::Entry> feats;
     while (fscanf(file, "%s", tmp) == 1) {
-      SparseBatch::Entry e;
-      if (sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) {
+      RowBatch::Entry e;
+      if (sscanf(tmp, "%u:%f", &e.index, &e.fvalue) == 2) {
         feats.push_back(e);
       } else {
         if (!init) {
@@ -98,8 +107,10 @@ class DMatrixSimple : public DataMatrix {
     this->AddRow(feats);
 
     if (!silent) {
-      printf("%lux%lu matrix with %lu entries is loaded from %s\n",
-             info.num_row(), info.num_col(), row_data_.size(), fname);
+      utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n",
+                    static_cast<unsigned long>(info.num_row()),
+                    static_cast<unsigned long>(info.num_col()),
+                    static_cast<unsigned long>(row_data_.size()), fname);
     }
     fclose(file);
     // try to load in additional file
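For reference, LoadText tokenizes with `fscanf("%s")` and treats any token matching `index:value` as a feature entry, with other tokens handled as labels starting a new row (the familiar LibSVM-style layout, as far as the hunks above show). A standalone sketch of the same sscanf dispatch:

```cpp
#include <cstdio>

int main() {
  // one text row as whitespace-separated tokens: label, then index:value pairs
  const char *tokens[] = {"1", "0:1.5", "3:2.0"};
  for (int t = 0; t < 3; ++t) {
    unsigned idx; float val;
    if (std::sscanf(tokens[t], "%u:%f", &idx, &val) == 2) {
      std::printf("feature %u = %g\n", idx, val);
    } else {
      std::printf("label token: %s\n", tokens[t]);  // "1" matches only the %u
    }
  }
  return 0;
}
```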
@@ -125,7 +136,7 @@ class DMatrixSimple : public DataMatrix {
    * \return whether loading is success
    */
   inline bool LoadBinary(const char* fname, bool silent = false) {
-    FILE *fp = fopen64(fname, "rb");
+    std::FILE *fp = fopen64(fname, "rb");
     if (fp == NULL) return false;
     utils::FileStream fs(fp);
     this->LoadBinary(fs, silent, fname);
@@ -139,24 +150,26 @@ class DMatrixSimple : public DataMatrix {
    * \param fname file name, used to print message
    */
   inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {
-    int magic;
-    utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
-    utils::Check(magic == kMagic, "invalid format,magic number mismatch");
+    int tmagic;
+    utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
+    utils::Check(tmagic == kMagic, "invalid format,magic number mismatch");
 
     info.LoadBinary(fs);
     FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);
-    fmat.LoadColAccess(fs);
+    fmat_->LoadColAccess(fs);
 
     if (!silent) {
-      printf("%lux%lu matrix with %lu entries is loaded",
-             info.num_row(), info.num_col(), row_data_.size());
+      utils::Printf("%lux%lu matrix with %lu entries is loaded",
+                    static_cast<unsigned long>(info.num_row()),
+                    static_cast<unsigned long>(info.num_col()),
+                    static_cast<unsigned long>(row_data_.size()));
       if (fname != NULL) {
-        printf(" from %s\n", fname);
+        utils::Printf(" from %s\n", fname);
      } else {
-        printf("\n");
+        utils::Printf("\n");
      }
      if (info.group_ptr.size() != 0) {
-        printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
+        utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
      }
    }
  }
@@ -167,19 +180,22 @@ class DMatrixSimple : public DataMatrix {
    */
   inline void SaveBinary(const char* fname, bool silent = false) const {
     utils::FileStream fs(utils::FopenCheck(fname, "wb"));
-    int magic = kMagic;
-    fs.Write(&magic, sizeof(magic));
+    int tmagic = kMagic;
+    fs.Write(&tmagic, sizeof(tmagic));
 
     info.SaveBinary(fs);
     FMatrixS::SaveBinary(fs, row_ptr_, row_data_);
-    fmat.SaveColAccess(fs);
+    fmat_->SaveColAccess(fs);
     fs.Close();
 
     if (!silent) {
-      printf("%lux%lu matrix with %lu entries is saved to %s\n",
-             info.num_row(), info.num_col(), row_data_.size(), fname);
+      utils::Printf("%lux%lu matrix with %lu entries is saved to %s\n",
+                    static_cast<unsigned long>(info.num_row()),
+                    static_cast<unsigned long>(info.num_col()),
+                    static_cast<unsigned long>(row_data_.size()), fname);
       if (info.group_ptr.size() != 0) {
-        printf("data contains %lu groups\n", info.group_ptr.size()-1);
+        utils::Printf("data contains %u groups\n",
+                      static_cast<unsigned>(info.group_ptr.size()-1));
       }
     }
   }
@@ -193,6 +209,7 @@ class DMatrixSimple : public DataMatrix {
    * \param savebuffer whether do save binary buffer if it is text
    */
   inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true) {
+    using namespace std;
     size_t len = strlen(fname);
     if (len > 8 && !strcmp(fname + len - 7, ".buffer")) {
       if (!this->LoadBinary(fname, silent)) {
@@ -201,7 +218,7 @@ class DMatrixSimple : public DataMatrix {
       return;
     }
     char bname[1024];
-    snprintf(bname, sizeof(bname), "%s.buffer", fname);
+    utils::SPrintf(bname, sizeof(bname), "%s.buffer", fname);
     if (!this->LoadBinary(bname, silent)) {
       this->LoadText(fname, silent);
       if (savebuffer) this->SaveBinary(bname, silent);
@@ -211,13 +228,15 @@ class DMatrixSimple : public DataMatrix {
   /*! \brief row pointer of CSR sparse storage */
   std::vector<size_t> row_ptr_;
   /*! \brief data in the row */
-  std::vector<SparseBatch::Entry> row_data_;
+  std::vector<RowBatch::Entry> row_data_;
+  /*! \brief the real fmatrix */
+  FMatrixS *fmat_;
   /*! \brief magic number used to identify DMatrix */
   static const int kMagic = 0xffffab01;
 
  protected:
   // one batch iterator that return content in the matrix
-  struct OneBatchIter: utils::IIterator<SparseBatch> {
+  struct OneBatchIter: utils::IIterator<RowBatch> {
     explicit OneBatchIter(DMatrixSimple *parent)
         : at_first_(true), parent_(parent) {}
     virtual ~OneBatchIter(void) {}
@@ -229,11 +248,11 @@ class DMatrixSimple : public DataMatrix {
       at_first_ = false;
       batch_.size = parent_->row_ptr_.size() - 1;
       batch_.base_rowid = 0;
-      batch_.row_ptr = &parent_->row_ptr_[0];
-      batch_.data_ptr = &parent_->row_data_[0];
+      batch_.ind_ptr = BeginPtr(parent_->row_ptr_);
+      batch_.data_ptr = BeginPtr(parent_->row_data_);
       return true;
     }
-    virtual const SparseBatch &Value(void) const {
+    virtual const RowBatch &Value(void) const {
       return batch_;
     }
 
@@ -243,7 +262,7 @@ class DMatrixSimple : public DataMatrix {
     // pointer to parient
     DMatrixSimple *parent_;
     // temporal space for batch
-    SparseBatch batch_;
+    RowBatch batch_;
   };
 };
 } // namespace io
src/io/simple_fmatrix-inl.hpp (new file, 242 lines)
@@ -0,0 +1,242 @@
+#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
+#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
+/*!
+ * \file simple_fmatrix-inl.hpp
+ * \brief the input data structure for gradient boosting
+ * \author Tianqi Chen
+ */
+#include "../data.h"
+#include "../utils/utils.h"
+#include "../utils/random.h"
+#include "../utils/omp.h"
+#include "../utils/matrix_csr.h"
+namespace xgboost {
+namespace io {
+/*!
+ * \brief sparse matrix that support column access, CSC
+ */
+class FMatrixS : public IFMatrix{
+ public:
+  typedef SparseBatch::Entry Entry;
+  /*! \brief constructor */
+  FMatrixS(utils::IIterator<RowBatch> *iter) {
+    this->iter_ = iter;
+  }
+  // destructor
+  virtual ~FMatrixS(void) {
+    if (iter_ != NULL) delete iter_;
+  }
+  /*! \return whether column access is enabled */
+  virtual bool HaveColAccess(void) const {
+    return col_ptr_.size() != 0;
+  }
+  /*! \brief get number of colmuns */
+  virtual size_t NumCol(void) const {
+    utils::Check(this->HaveColAccess(), "NumCol:need column access");
+    return col_ptr_.size() - 1;
+  }
+  /*! \brief get number of buffered rows */
+  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
+    return buffered_rowset_;
+  }
+  /*! \brief get column size */
+  virtual size_t GetColSize(size_t cidx) const {
+    return col_ptr_[cidx+1] - col_ptr_[cidx];
+  }
+  /*! \brief get column density */
+  virtual float GetColDensity(size_t cidx) const {
+    size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
+    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
+  }
+  virtual void InitColAccess(float pkeep = 1.0f) {
+    if (this->HaveColAccess()) return;
+    this->InitColData(pkeep);
+  }
+  /*!
+   * \brief get the row iterator associated with FMatrix
+   */
+  virtual utils::IIterator<RowBatch>* RowIterator(void) {
+    iter_->BeforeFirst();
+    return iter_;
+  }
+  /*!
+   * \brief get the column based iterator
+   */
+  virtual utils::IIterator<ColBatch>* ColIterator(void) {
+    size_t ncol = this->NumCol();
+    col_iter_.col_index_.resize(ncol);
+    for (size_t i = 0; i < ncol; ++i) {
+      col_iter_.col_index_[i] = static_cast<bst_uint>(i);
+    }
+    col_iter_.SetBatch(col_ptr_, col_data_);
+    return &col_iter_;
+  }
+  /*!
+   * \brief colmun based iterator
+   */
+  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
+    col_iter_.col_index_ = fset;
+    col_iter_.SetBatch(col_ptr_, col_data_);
+    return &col_iter_;
+  }
+  /*!
+   * \brief save column access data into stream
+   * \param fo output stream to save to
+   */
+  inline void SaveColAccess(utils::IStream &fo) const {
+    fo.Write(buffered_rowset_);
+    if (buffered_rowset_.size() != 0) {
+      SaveBinary(fo, col_ptr_, col_data_);
+    }
+  }
+  /*!
+   * \brief load column access data from stream
+   * \param fo output stream to load from
+   */
+  inline void LoadColAccess(utils::IStream &fi) {
+    utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
+    if (buffered_rowset_.size() != 0) {
+      LoadBinary(fi, &col_ptr_, &col_data_);
+    }
+  }
+  /*!
+   * \brief save data to binary stream
+   * \param fo output stream
+   * \param ptr pointer data
+   * \param data data content
+   */
+  inline static void SaveBinary(utils::IStream &fo,
+                                const std::vector<size_t> &ptr,
+                                const std::vector<RowBatch::Entry> &data) {
+    size_t nrow = ptr.size() - 1;
+    fo.Write(&nrow, sizeof(size_t));
+    fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t));
+    if (data.size() != 0) {
+      fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry));
+    }
+  }
+  /*!
+   * \brief load data from binary stream
+   * \param fi input stream
+   * \param out_ptr pointer data
+   * \param out_data data content
+   */
+  inline static void LoadBinary(utils::IStream &fi,
+                                std::vector<size_t> *out_ptr,
+                                std::vector<RowBatch::Entry> *out_data) {
+    size_t nrow;
+    utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
+    out_ptr->resize(nrow + 1);
+    utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0,
+                 "invalid input file format");
+    out_data->resize(out_ptr->back());
+    if (out_data->size() != 0) {
+      utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0,
+                    "invalid input file format");
+    }
+  }
+
+ protected:
+  /*!
+   * \brief intialize column data
+   * \param pkeep probability to keep a row
+   */
+  inline void InitColData(float pkeep) {
+    buffered_rowset_.clear();
+    // note: this part of code is serial, todo, parallelize this transformer
+    utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
+    builder.InitBudget(0);
+    // start working
+    iter_->BeforeFirst();
+    while (iter_->Next()) {
+      const RowBatch &batch = iter_->Value();
+      for (size_t i = 0; i < batch.size; ++i) {
+        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
+          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
+          RowBatch::Inst inst = batch[i];
+          for (bst_uint j = 0; j < inst.length; ++j) {
+            builder.AddBudget(inst[j].index);
+          }
+        }
+      }
+    }
+    builder.InitStorage();
+
+    iter_->BeforeFirst();
+    size_t ktop = 0;
+    while (iter_->Next()) {
+      const RowBatch &batch = iter_->Value();
+      for (size_t i = 0; i < batch.size; ++i) {
+        if (ktop < buffered_rowset_.size() &&
+            buffered_rowset_[ktop] == batch.base_rowid+i) {
+          ++ktop;
+          RowBatch::Inst inst = batch[i];
+          for (bst_uint j = 0; j < inst.length; ++j) {
+            builder.PushElem(inst[j].index,
+                             Entry((bst_uint)(batch.base_rowid+i),
+                                   inst[j].fvalue));
+          }
+        }
+      }
+    }
+    // sort columns
+    bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < ncol; ++i) {
+      std::sort(&col_data_[0] + col_ptr_[i],
+                &col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
+    }
+  }
+
+ private:
+  // one batch iterator that return content in the matrix
+  struct OneBatchIter: utils::IIterator<ColBatch> {
+    OneBatchIter(void) : at_first_(true){}
+    virtual ~OneBatchIter(void) {}
+    virtual void BeforeFirst(void) {
+      at_first_ = true;
+    }
+    virtual bool Next(void) {
+      if (!at_first_) return false;
+      at_first_ = false;
+      return true;
+    }
+    virtual const ColBatch &Value(void) const {
+      return batch_;
+    }
+    inline void SetBatch(const std::vector<size_t> &ptr,
+                         const std::vector<ColBatch::Entry> &data) {
+      batch_.size = col_index_.size();
+      col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL,0));
+      for (size_t i = 0; i < col_data_.size(); ++i) {
+        const bst_uint ridx = col_index_[i];
+        col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx],
+                                         static_cast<bst_uint>(ptr[ridx+1] - ptr[ridx]));
+      }
+      batch_.col_index = BeginPtr(col_index_);
+      batch_.col_data = BeginPtr(col_data_);
+      this->BeforeFirst();
+    }
+    // data content
+    std::vector<bst_uint> col_index_;
+    std::vector<ColBatch::Inst> col_data_;
+    // whether is at first
+    bool at_first_;
+    // temporal space for batch
+    ColBatch batch_;
+  };
+  // --- data structure used to support InitColAccess --
+  // column iterator
+  OneBatchIter col_iter_;
+  // row iterator
+  utils::IIterator<RowBatch> *iter_;
+  /*! \brief list of row index that are buffered */
+  std::vector<bst_uint> buffered_rowset_;
+  /*! \brief column pointer of CSC format */
+  std::vector<size_t> col_ptr_;
+  /*! \brief column datas in CSC format */
+  std::vector<ColBatch::Entry> col_data_;
+};
+} // namespace io
+} // namespace xgboost
+#endif  // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
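InitColData above is the classic two-pass CSR-to-CSC transpose: pass one counts a per-column budget, pass two scatters the entries, and each column is then sorted by value. A self-contained sketch of the same pattern without the builder helper (generic names, not the xgboost types):

```cpp
#include <cstddef>
#include <utility>
#include <vector>

struct Entry { unsigned row; float value; };

// rows[r] lists the (column, value) pairs of row r; outputs CSC arrays
void BuildCSC(const std::vector<std::vector<std::pair<unsigned, float> > > &rows,
              std::vector<size_t> *col_ptr, std::vector<Entry> *col_data) {
  size_t ncol = 0;
  for (size_t r = 0; r < rows.size(); ++r)
    for (size_t j = 0; j < rows[r].size(); ++j)
      if (rows[r][j].first + 1 > ncol) ncol = rows[r][j].first + 1;
  // pass 1: budget entries per column, then prefix-sum into offsets
  col_ptr->assign(ncol + 1, 0);
  for (size_t r = 0; r < rows.size(); ++r)
    for (size_t j = 0; j < rows[r].size(); ++j)
      ++(*col_ptr)[rows[r][j].first + 1];
  for (size_t c = 0; c < ncol; ++c) (*col_ptr)[c + 1] += (*col_ptr)[c];
  // pass 2: scatter each entry into its column segment
  col_data->resize(col_ptr->back());
  std::vector<size_t> fill(col_ptr->begin(), col_ptr->end() - 1);
  for (size_t r = 0; r < rows.size(); ++r)
    for (size_t j = 0; j < rows[r].size(); ++j) {
      Entry e; e.row = static_cast<unsigned>(r); e.value = rows[r][j].second;
      (*col_data)[fill[rows[r][j].first]++] = e;
    }
}
```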
@@ -7,8 +7,9 @@
  * \author Tianqi Chen
  */
 #include <vector>
+#include <cstring>
 #include "../data.h"
+#include "../utils/io.h"
 namespace xgboost {
 namespace learner {
 /*!
@@ -89,6 +90,7 @@ struct MetaInfo {
   }
   // try to load group information from file, if exists
   inline bool TryLoadGroup(const char* fname, bool silent = false) {
+    using namespace std;
     FILE *fi = fopen64(fname, "r");
     if (fi == NULL) return false;
     group_ptr.push_back(0);
@@ -97,12 +99,14 @@ struct MetaInfo {
       group_ptr.push_back(group_ptr.back()+nline);
     }
     if (!silent) {
-      printf("%lu groups are loaded from %s\n", group_ptr.size()-1, fname);
+      utils::Printf("%u groups are loaded from %s\n",
+                    static_cast<unsigned>(group_ptr.size()-1), fname);
     }
     fclose(fi);
     return true;
   }
   inline std::vector<float>& GetFloatInfo(const char *field) {
+    using namespace std;
     if (!strcmp(field, "label")) return labels;
     if (!strcmp(field, "weight")) return weights;
     if (!strcmp(field, "base_margin")) return base_margin;
@@ -113,6 +117,7 @@ struct MetaInfo {
     return ((MetaInfo*)this)->GetFloatInfo(field);
   }
   inline std::vector<unsigned> &GetUIntInfo(const char *field) {
+    using namespace std;
     if (!strcmp(field, "root_index")) return info.root_index;
     if (!strcmp(field, "fold_index")) return info.fold_index;
     utils::Error("unknown field %s", field);
@@ -123,15 +128,16 @@ struct MetaInfo {
   }
   // try to load weight information from file, if exists
   inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) {
-    std::vector<float> &weights = this->GetFloatInfo(field);
+    using namespace std;
+    std::vector<float> &data = this->GetFloatInfo(field);
     FILE *fi = fopen64(fname, "r");
     if (fi == NULL) return false;
     float wt;
     while (fscanf(fi, "%f", &wt) == 1) {
-      weights.push_back(wt);
+      data.push_back(wt);
     }
     if (!silent) {
-      printf("loading %s from %s\n", field, fname);
+      utils::Printf("loading %s from %s\n", field, fname);
     }
     fclose(fi);
     return true;
@@ -142,7 +148,6 @@ struct MetaInfo {
  * \brief data object used for learning,
  * \tparam FMatrix type of feature data source
  */
-template<typename FMatrix>
 struct DMatrix {
   /*!
    * \brief magic number associated with this object
@@ -151,8 +156,6 @@ struct DMatrix {
   const int magic;
   /*! \brief meta information about the dataset */
   MetaInfo info;
-  /*! \brief feature matrix about data content */
-  FMatrix fmat;
   /*!
    * \brief cache pointer to verify if the data structure is cached in some learner
    * used to verify if DMatrix is cached
@@ -160,6 +163,8 @@ struct DMatrix {
   void *cache_learner_ptr_;
   /*! \brief default constructor */
   explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
+  /*! \brief get feature matrix about data content */
+  virtual IFMatrix *fmat(void) const = 0;
   // virtual destructor
   virtual ~DMatrix(void){}
 };
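A stub model of the resulting ownership, compilable on its own: DMatrix now only promises an IFMatrix through the new pure-virtual accessor, and DMatrixSimple owns the concrete FMatrixS (class names from the diff, bodies reduced to stubs):

```cpp
#include <cstdio>

struct IFMatrix {
  virtual void InitColAccess(float pkeep) = 0;
  virtual ~IFMatrix() {}
};
struct FMatrixS : public IFMatrix {
  virtual void InitColAccess(float pkeep) { std::printf("pkeep=%g\n", pkeep); }
};
struct DMatrix {
  virtual IFMatrix *fmat(void) const = 0;  // the new accessor
  virtual ~DMatrix() {}
};
struct DMatrixSimple : public DMatrix {
  FMatrixS *fmat_;
  DMatrixSimple() : fmat_(new FMatrixS()) {}
  virtual ~DMatrixSimple() { delete fmat_; }
  virtual IFMatrix *fmat(void) const { return fmat_; }
};

int main() {
  DMatrixSimple dmat;
  dmat.fmat()->InitColAccess(1.0f);  // call sites go through the interface
  return 0;
}
```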
@@ -8,8 +8,8 @@
 #include <vector>
 #include <utility>
 #include <string>
-#include <climits>
 #include <cmath>
+#include <climits>
 #include <algorithm>
 #include "./evaluation.h"
 #include "./helper_utils.h"
@@ -24,9 +24,12 @@ template<typename Derived>
 struct EvalEWiseBase : public IEvaluator {
   virtual float Eval(const std::vector<float> &preds,
                      const MetaInfo &info) const {
-    utils::Check(preds.size() == info.labels.size(),
+    utils::Check(info.labels.size() != 0, "label set cannot be empty");
+    utils::Check(preds.size() % info.labels.size() == 0,
                  "label and prediction size not match");
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
 
     float sum = 0.0, wsum = 0.0;
     #pragma omp parallel for reduction(+: sum, wsum) schedule(static)
     for (bst_omp_uint i = 0; i < ndata; ++i) {
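The relaxed shape check is what lets one evaluator accept layouts where predictions come in several blocks of `labels.size()` entries (multi-group output, and the fold-wise `ct-` wrapper added below); only the first block is scored here, since `ndata` now derives from the labels. An illustrative check:

```cpp
#include <cassert>
#include <vector>

int main() {
  std::vector<float> labels(4, 0.0f);
  std::vector<float> preds(12, 0.0f);         // three blocks of four entries
  assert(labels.size() != 0);
  assert(preds.size() % labels.size() == 0);  // passes where == would not
  const unsigned ndata = static_cast<unsigned>(labels.size());
  assert(ndata == 4);  // the element-wise loop runs over labels, not preds
  return 0;
}
```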
@@ -99,17 +102,58 @@ struct EvalMatchError : public EvalEWiseBase<EvalMatchError> {
   }
 };
 
+/*! \brief ctest */
+struct EvalCTest: public IEvaluator {
+  EvalCTest(IEvaluator *base, const char *name)
+      : base_(base), name_(name) {}
+  virtual ~EvalCTest(void) {
+    delete base_;
+  }
+  virtual const char *Name(void) const {
+    return name_.c_str();
+  }
+  virtual float Eval(const std::vector<float> &preds,
+                     const MetaInfo &info) const {
+    utils::Check(preds.size() % info.labels.size() == 0,
+                 "label and prediction size not match");
+    size_t ngroup = preds.size() / info.labels.size() - 1;
+    const unsigned ndata = static_cast<unsigned>(info.labels.size());
+    utils::Check(ngroup > 1, "pred size does not meet requirement");
+    utils::Check(ndata == info.info.fold_index.size(), "need fold index");
+    double wsum = 0.0;
+    for (size_t k = 0; k < ngroup; ++k) {
+      std::vector<float> tpred;
+      MetaInfo tinfo;
+      for (unsigned i = 0; i < ndata; ++i) {
+        if (info.info.fold_index[i] == k) {
+          tpred.push_back(preds[i + (k + 1) * ndata]);
+          tinfo.labels.push_back(info.labels[i]);
+          tinfo.weights.push_back(info.GetWeight(i));
+        }
+      }
+      wsum += base_->Eval(tpred, tinfo);
+    }
+    return static_cast<float>(wsum / ngroup);
+  }
+
+ private:
+  IEvaluator *base_;
+  std::string name_;
+};
+
 /*! \brief AMS: also records best threshold */
 struct EvalAMS : public IEvaluator {
  public:
   explicit EvalAMS(const char *name) {
     name_ = name;
     // note: ams@0 will automatically select which ratio to go
-    utils::Check(sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
+    utils::Check(std::sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
   }
   virtual float Eval(const std::vector<float> &preds,
                      const MetaInfo &info) const {
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
+    using namespace std;
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
 
     utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
     std::vector< std::pair<float, unsigned> > rec(ndata);
 
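EvalCTest wraps any base metric and scores fold-held-out prediction blocks: with n labeled rows and g extra blocks, block k + 1 is evaluated only on the rows whose `fold_index` equals k, and the per-fold results are averaged. Metric names compose as `ct-` plus the base metric (wired into CreateEvaluator further down in this diff). A standalone sketch of the index arithmetic:

```cpp
#include <cstdio>
#include <vector>

int main() {
  const unsigned n = 4, g = 2;  // 4 labeled rows, 2 folds
  std::vector<float> preds((g + 1) * n, 0.0f);  // base block + one block per fold
  const unsigned fold_index[] = {0, 1, 0, 1};
  for (unsigned k = 0; k < g; ++k) {
    for (unsigned i = 0; i < n; ++i) {
      if (fold_index[i] == k) {
        // exactly the element EvalCTest::Eval pushes into tpred
        std::printf("fold %u scores preds[%u]\n", k, i + (k + 1) * n);
      }
    }
  }
  return 0;
}
```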
@@ -140,7 +184,7 @@ struct EvalAMS : public IEvaluator {
       }
     }
     if (ntop == ndata) {
-      fprintf(stderr, "\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
+      utils::Printf("\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
       return static_cast<float>(tams);
     } else {
       return static_cast<float>(sqrt(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp)));
@@ -159,6 +203,7 @@ struct EvalAMS : public IEvaluator {
 struct EvalPrecisionRatio : public IEvaluator{
  public:
   explicit EvalPrecisionRatio(const char *name) : name_(name) {
+    using namespace std;
     if (sscanf(name, "apratio@%f", &ratio_) == 1) {
       use_ap = 1;
     } else {
@@ -168,9 +213,11 @@ struct EvalPrecisionRatio : public IEvaluator{
   }
   virtual float Eval(const std::vector<float> &preds,
                      const MetaInfo &info) const {
-    utils::Assert(preds.size() == info.labels.size(), "label size predict size not match");
+    utils::Check(info.labels.size() != 0, "label set cannot be empty");
+    utils::Assert(preds.size() % info.labels.size() == 0,
+                  "label size predict size not match");
     std::vector< std::pair<float, unsigned> > rec;
-    for (size_t j = 0; j < preds.size(); ++j) {
+    for (size_t j = 0; j < info.labels.size(); ++j) {
       rec.push_back(std::make_pair(preds[j], static_cast<unsigned>(j)));
     }
     std::sort(rec.begin(), rec.end(), CmpFirst);
@@ -206,10 +253,14 @@ struct EvalPrecisionRatio : public IEvaluator{
 struct EvalAuc : public IEvaluator {
   virtual float Eval(const std::vector<float> &preds,
                      const MetaInfo &info) const {
-    utils::Check(preds.size() == info.labels.size(), "label size predict size not match");
-    std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(preds.size());
+    utils::Check(info.labels.size() != 0, "label set cannot be empty");
+    utils::Check(preds.size() % info.labels.size() == 0,
+                 "label size predict size not match");
+    std::vector<unsigned> tgptr(2, 0);
+    tgptr[1] = static_cast<unsigned>(info.labels.size());
 
     const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
-    utils::Check(gptr.back() == preds.size(),
+    utils::Check(gptr.back() == info.labels.size(),
                  "EvalAuc: group structure must match number of prediction");
     const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
     // sum statictis
@@ -293,6 +344,7 @@ struct EvalRankList : public IEvaluator {
 
  protected:
   explicit EvalRankList(const char *name) {
+    using namespace std;
     name_ = name;
     minus_ = false;
     if (sscanf(name, "%*[^@]@%u[-]?", &topn_) != 1) {
@@ -339,7 +391,7 @@ struct EvalNDCG : public EvalRankList{
     for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) {
       const unsigned rel = rec[i].second;
       if (rel != 0) {
-        sumdcg += ((1 << rel) - 1) / log(i + 2.0);
+        sumdcg += ((1 << rel) - 1) / std::log(i + 2.0);
       }
     }
     return static_cast<float>(sumdcg);
@@ -36,6 +36,7 @@ struct IEvaluator{
 namespace xgboost {
 namespace learner {
 inline IEvaluator* CreateEvaluator(const char *name) {
+  using namespace std;
   if (!strcmp(name, "rmse")) return new EvalRMSE();
   if (!strcmp(name, "error")) return new EvalError();
   if (!strcmp(name, "merror")) return new EvalMatchError();
@@ -45,7 +46,9 @@ inline IEvaluator* CreateEvaluator(const char *name) {
   if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
   if (!strncmp(name, "pratio@", 7)) return new EvalPrecisionRatio(name);
   if (!strncmp(name, "map", 3)) return new EvalMAP(name);
-  if (!strncmp(name, "ndcg", 3)) return new EvalNDCG(name);
+  if (!strncmp(name, "ndcg", 4)) return new EvalNDCG(name);
+  if (!strncmp(name, "ct-", 3)) return new EvalCTest(CreateEvaluator(name+3), name);
 
   utils::Error("unknown evaluation metric type: %s", name);
   return NULL;
 }
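The `strncmp` length fix is subtle but real: with a length of 3, any metric name beginning with "ndc" dispatched to EvalNDCG; length 4 requires the full "ndcg" prefix. A quick demonstration:

```cpp
#include <cstdio>
#include <cstring>

int main() {
  std::printf("%d\n", std::strncmp("ndcg@10", "ndcg", 4) == 0);  // 1: still matches
  std::printf("%d\n", std::strncmp("ndc-x", "ndcg", 3) == 0);    // 1: the old false match
  std::printf("%d\n", std::strncmp("ndc-x", "ndcg", 4) == 0);    // 0: rejected now
  return 0;
}
```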
@ -54,6 +57,7 @@ inline IEvaluator* CreateEvaluator(const char *name) {
|
|||||||
class EvalSet{
|
class EvalSet{
|
||||||
public:
|
public:
|
||||||
inline void AddEval(const char *name) {
|
inline void AddEval(const char *name) {
|
||||||
|
using namespace std;
|
||||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||||
if (!strcmp(name, evals_[i]->Name())) return;
|
if (!strcmp(name, evals_[i]->Name())) return;
|
||||||
}
|
}
|
||||||
@ -71,11 +75,14 @@ class EvalSet{
|
|||||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||||
float res = evals_[i]->Eval(preds, info);
|
float res = evals_[i]->Eval(preds, info);
|
||||||
char tmp[1024];
|
char tmp[1024];
|
||||||
snprintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
|
utils::SPrintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
|
||||||
result += tmp;
|
result += tmp;
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
inline size_t Size(void) const {
|
||||||
|
return evals_.size();
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::vector<const IEvaluator*> evals_;
|
std::vector<const IEvaluator*> evals_;
|
||||||
|
|||||||
@ -7,6 +7,7 @@
|
|||||||
*/
|
*/
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <cmath>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace learner {
|
namespace learner {
|
||||||
|
|||||||
@ -21,7 +21,6 @@ namespace learner {
|
|||||||
* \brief learner that takes do gradient boosting on specific objective functions
|
* \brief learner that takes do gradient boosting on specific objective functions
|
||||||
* and do training and prediction
|
* and do training and prediction
|
||||||
*/
|
*/
|
||||||
template<typename FMatrix>
|
|
||||||
class BoostLearner {
|
class BoostLearner {
|
||||||
public:
|
public:
|
||||||
BoostLearner(void) {
|
BoostLearner(void) {
|
||||||
@ -44,7 +43,7 @@ class BoostLearner {
|
|||||||
* data matrices to continue training otherwise it will cause error
|
* data matrices to continue training otherwise it will cause error
|
||||||
   * \param mats array of pointers to matrix whose prediction result needs to be cached
   */
-  inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) {
+  inline void SetCacheData(const std::vector<DMatrix*>& mats) {
    // estimate feature bound
    unsigned num_feature = 0;
    // assign buffer index
@@ -64,13 +63,14 @@ class BoostLearner {
    }
    char str_temp[25];
    if (num_feature > mparam.num_feature) {
-     snprintf(str_temp, sizeof(str_temp), "%u", num_feature);
+     utils::SPrintf(str_temp, sizeof(str_temp), "%u", num_feature);
      this->SetParam("bst:num_feature", str_temp);
    }
-   snprintf(str_temp, sizeof(str_temp), "%lu", buffer_size);
+   utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
+                  static_cast<unsigned long>(buffer_size));
    this->SetParam("num_pbuffer", str_temp);
    if (!silent) {
-     printf("buffer_size=%ld\n", buffer_size);
+     utils::Printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
    }
  }
  /*!
@@ -79,6 +79,7 @@ class BoostLearner {
   * \param val value of the parameter
   */
  inline void SetParam(const char *name, const char *val) {
+   using namespace std;
    // in this version, bst: prefix is no longer required
    if (strncmp(name, "bst:", 4) != 0) {
      std::string n = "bst:"; n += name;
@@ -158,18 +159,18 @@ class BoostLearner {
   * if not, initialize it
   * \param p_train pointer to the matrix used by training
   */
-  inline void CheckInit(DMatrix<FMatrix> *p_train) {
-    p_train->fmat.InitColAccess(prob_buffer_row);
+  inline void CheckInit(DMatrix *p_train) {
+    p_train->fmat()->InitColAccess(prob_buffer_row);
  }
  /*!
   * \brief update the model for one iteration
   * \param iter current iteration number
   * \param p_train pointer to the data matrix
   */
-  inline void UpdateOneIter(int iter, const DMatrix<FMatrix> &train) {
+  inline void UpdateOneIter(int iter, const DMatrix &train) {
    this->PredictRaw(train, &preds_);
    obj_->GetGradient(preds_, train.info, iter, &gpair_);
-   gbm_->DoBoost(train.fmat, train.info.info, &gpair_);
+   gbm_->DoBoost(train.fmat(), train.info.info, &gpair_);
  }
  /*!
   * \brief evaluate the model for specific iteration
@@ -179,11 +180,11 @@ class BoostLearner {
   * \return a string corresponding to the evaluation result
   */
  inline std::string EvalOneIter(int iter,
-                                const std::vector<const DMatrix<FMatrix>*> &evals,
+                                const std::vector<const DMatrix*> &evals,
                                 const std::vector<std::string> &evname) {
    std::string res;
    char tmp[256];
-   snprintf(tmp, sizeof(tmp), "[%d]", iter);
+   utils::SPrintf(tmp, sizeof(tmp), "[%d]", iter);
    res = tmp;
    for (size_t i = 0; i < evals.size(); ++i) {
      this->PredictRaw(*evals[i], &preds_);
@@ -198,7 +199,7 @@ class BoostLearner {
   * \param metric name of metric
   * \return a pair of <evaluation name, result>
   */
-  std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) {
+  std::pair<std::string, float> Evaluate(const DMatrix &data, std::string metric) {
    if (metric == "auto") metric = obj_->DefaultEvalMetric();
    IEvaluator *ev = CreateEvaluator(metric.c_str());
    this->PredictRaw(data, &preds_);
@@ -212,11 +213,14 @@ class BoostLearner {
   * \param data input data
   * \param output_margin whether to only predict margin value instead of transformed prediction
   * \param out_preds output vector that stores the prediction
+  * \param ntree_limit limit number of trees used for boosted tree
+  *   predictor, when it equals 0, this means we are using all the trees
   */
-  inline void Predict(const DMatrix<FMatrix> &data,
+  inline void Predict(const DMatrix &data,
                      bool output_margin,
-                     std::vector<float> *out_preds) const {
-   this->PredictRaw(data, out_preds);
+                     std::vector<float> *out_preds,
+                     unsigned ntree_limit = 0) const {
+   this->PredictRaw(data, out_preds, ntree_limit);
    if (!output_margin) {
      obj_->PredTransform(out_preds);
    }
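
A minimal usage sketch of the new `ntree_limit` argument (hypothetical caller code; the `xgboost::learner` names follow this header, and the xgboost includes are assumed to be on the path):

```cpp
#include <vector>

// Sketch: compare full-ensemble predictions with early-truncated ones.
// Passing ntree_limit = 0 (the default) keeps the old behaviour: use all trees.
void ComparePredictions(const xgboost::learner::BoostLearner &booster,
                        const xgboost::learner::DMatrix &dtest) {
  std::vector<float> full_preds, short_preds;
  booster.Predict(dtest, false, &full_preds);       // all trees
  booster.Predict(dtest, false, &short_preds, 10);  // first 10 trees only
}
```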
@@ -235,22 +239,27 @@ class BoostLearner {
    if (obj_ != NULL) return;
    utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
    obj_ = CreateObjFunction(name_obj_.c_str());
-   gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str());
+   gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
    for (size_t i = 0; i < cfg_.size(); ++i) {
      obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
      gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
    }
+   if (evaluator_.Size() == 0) {
      evaluator_.AddEval(obj_->DefaultEvalMetric());
    }
+ }
  /*!
   * \brief get un-transformed prediction
   * \param data training data matrix
   * \param out_preds output vector that stores the prediction
+  * \param ntree_limit limit number of trees used for boosted tree
+  *   predictor, when it equals 0, this means we are using all the trees
   */
-  inline void PredictRaw(const DMatrix<FMatrix> &data,
-                         std::vector<float> *out_preds) const {
-   gbm_->Predict(data.fmat, this->FindBufferOffset(data),
-                 data.info.info, out_preds);
+  inline void PredictRaw(const DMatrix &data,
+                         std::vector<float> *out_preds,
+                         unsigned ntree_limit = 0) const {
+   gbm_->Predict(data.fmat(), this->FindBufferOffset(data),
+                 data.info.info, out_preds, ntree_limit);
    // add base margin
    std::vector<float> &preds = *out_preds;
    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
@@ -284,7 +293,7 @@ class BoostLearner {
      base_score = 0.5f;
      num_feature = 0;
      num_class = 0;
-     memset(reserved, 0, sizeof(reserved));
+     std::memset(reserved, 0, sizeof(reserved));
    }
    /*!
     * \brief set parameters from outside
@@ -292,6 +301,7 @@ class BoostLearner {
     * \param val value of the parameter
     */
    inline void SetParam(const char *name, const char *val) {
+     using namespace std;
      if (!strcmp("base_score", name)) base_score = static_cast<float>(atof(val));
      if (!strcmp("num_class", name)) num_class = atoi(val);
      if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
@@ -307,7 +317,7 @@ class BoostLearner {
  // model parameter
  ModelParam mparam;
  // gbm model that backs everything
-  gbm::IGradBooster<FMatrix> *gbm_;
+  gbm::IGradBooster *gbm_;
  // name of gbm model used for training
  std::string name_gbm_;
  // objective function
@@ -324,14 +334,14 @@ class BoostLearner {
 private:
  // cache entry object that helps handle feature caching
  struct CacheEntry {
-   const DMatrix<FMatrix> *mat_;
+   const DMatrix *mat_;
    size_t buffer_offset_;
    size_t num_row_;
-   CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row)
+   CacheEntry(const DMatrix *mat, size_t buffer_offset, size_t num_row)
        :mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
  };
  // find internal buffer offset for a certain matrix; if it does not exist, return -1
-  inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const {
+  inline int64_t FindBufferOffset(const DMatrix &mat) const {
    for (size_t i = 0; i < cache_.size(); ++i) {
      if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
        if (cache_[i].num_row_ == mat.info.num_row()) {
@@ -6,9 +6,9 @@
 * \author Tianqi Chen, Kailong Chen
 */
#include <vector>
-#include <cmath>
#include <algorithm>
#include <utility>
+#include <cmath>
#include <functional>
#include "../data.h"
#include "./objective.h"
@@ -37,7 +37,7 @@ struct LossType {
      case kLogisticRaw:
      case kLinearSquare: return x;
      case kLogisticClassify:
-     case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
+     case kLogisticNeglik: return 1.0f / (1.0f + std::exp(-x));
      default: utils::Error("unknown loss_type"); return 0.0f;
    }
  }
@@ -50,7 +50,7 @@ struct LossType {
  inline float FirstOrderGradient(float predt, float label) const {
    switch (loss_type) {
      case kLinearSquare: return predt - label;
-     case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
+     case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt));
      case kLogisticClassify:
      case kLogisticNeglik: return predt - label;
      default: utils::Error("unknown loss_type"); return 0.0f;
@@ -65,7 +65,7 @@ struct LossType {
  inline float SecondOrderGradient(float predt, float label) const {
    switch (loss_type) {
      case kLinearSquare: return 1.0f;
-     case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
+     case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt));
      case kLogisticClassify:
      case kLogisticNeglik: return predt * (1 - predt);
      default: utils::Error("unknown loss_type"); return 0.0f;
@@ -80,7 +80,7 @@ struct LossType {
        loss_type == kLogisticNeglik ) {
      utils::Check(base_score > 0.0f && base_score < 1.0f,
                   "base_score must be in (0,1) for logistic loss");
-     base_score = -logf(1.0f / base_score - 1.0f);
+     base_score = -std::log(1.0f / base_score - 1.0f);
    }
    return base_score;
  }
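
For reference, the formulas these switch statements implement for the logistic losses, with margin $x$, label $y$, and $p$ the transformed prediction; the `ProbToMargin` branch above is the inverse (logit) transform applied to `base_score`:

```latex
p = \sigma(x) = \frac{1}{1 + e^{-x}}, \qquad
\frac{\partial \ell}{\partial x} = p - y, \qquad
\frac{\partial^2 \ell}{\partial x^2} = p\,(1 - p), \qquad
x_0 = -\ln\!\left(\frac{1}{p_0} - 1\right) = \ln\frac{p_0}{1 - p_0}
```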
@@ -101,6 +101,7 @@ class RegLossObj : public IObjFunction{
  }
  virtual ~RegLossObj(void) {}
  virtual void SetParam(const char *name, const char *val) {
+   using namespace std;
    if (!strcmp("scale_pos_weight", name)) {
      scale_pos_weight = static_cast<float>(atof(val));
    }
@@ -123,7 +124,7 @@ class RegLossObj : public IObjFunction{
      float p = loss.PredTransform(preds[i]);
      float w = info.GetWeight(j);
      if (info.labels[j] == 1.0f) w *= scale_pos_weight;
-     gpair[j] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
+     gpair[i] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
                           loss.SecondOrderGradient(p, info.labels[j]) * w);
    }
  }
@@ -156,6 +157,7 @@ class SoftmaxMultiClassObj : public IObjFunction {
  }
  virtual ~SoftmaxMultiClassObj(void) {}
  virtual void SetParam(const char *name, const char *val) {
+   using namespace std;
    if (!strcmp( "num_class", name )) nclass = atoi(val);
  }
  virtual void GetGradient(const std::vector<float> &preds,
@@ -247,6 +249,7 @@ class LambdaRankObj : public IObjFunction {
  }
  virtual ~LambdaRankObj(void) {}
  virtual void SetParam(const char *name, const char *val) {
+   using namespace std;
    if (!strcmp( "loss_type", name )) loss.loss_type = atoi(val);
    if (!strcmp( "fix_list_weight", name)) fix_list_weight = static_cast<float>(atof(val));
    if (!strcmp( "num_pairsample", name)) num_pairsample = atoi(val);
@@ -419,8 +422,8 @@ class LambdaRankObjNDCG : public LambdaRankObj {
    for (size_t i = 0; i < pairs.size(); ++i) {
      unsigned pos_idx = pairs[i].pos_index;
      unsigned neg_idx = pairs[i].neg_index;
-     float pos_loginv = 1.0f / logf(pos_idx + 2.0f);
+     float pos_loginv = 1.0f / std::log(pos_idx + 2.0f);
-     float neg_loginv = 1.0f / logf(neg_idx + 2.0f);
+     float neg_loginv = 1.0f / std::log(neg_idx + 2.0f);
      int pos_label = static_cast<int>(sorted_list[pos_idx].label);
      int neg_label = static_cast<int>(sorted_list[neg_idx].label);
      float original =
@@ -438,7 +441,7 @@ class LambdaRankObjNDCG : public LambdaRankObj {
    for (size_t i = 0; i < labels.size(); ++i) {
      const unsigned rel = static_cast<unsigned>(labels[i]);
      if (rel != 0) {
-       sumdcg += ((1 << rel) - 1) / logf(static_cast<float>(i + 2));
+       sumdcg += ((1 << rel) - 1) / std::log(static_cast<float>(i + 2));
      }
    }
    return static_cast<float>(sumdcg);
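
The DCG loop above evaluates, in the code's own conventions (0-based position $i$, natural logarithm, integer relevance $rel_i$):

```latex
\mathrm{DCG} = \sum_{i \,:\, rel_i \neq 0} \frac{2^{rel_i} - 1}{\ln(i + 2)}
```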
@@ -67,6 +67,7 @@ namespace xgboost {
namespace learner {
/*! \brief factory function to create objective function by name */
inline IObjFunction* CreateObjFunction(const char *name) {
+ using namespace std;
  if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare);
  if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik);
  if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify);
@@ -53,7 +53,7 @@ class TreeModel {
  Param(void) {
    max_depth = 0;
    size_leaf_vector = 0;
-   memset(reserved, 0, sizeof(reserved));
+   std::memset(reserved, 0, sizeof(reserved));
  }
  /*!
   * \brief set parameters from outside
@@ -61,6 +61,7 @@ class TreeModel {
   * \param val value of the parameter
   */
  inline void SetParam(const char *name, const char *val) {
+   using namespace std;
    if (!strcmp("num_roots", name)) num_roots = atoi(val);
    if (!strcmp("num_feature", name)) num_feature = atoi(val);
    if (!strcmp("size_leaf_vector", name)) size_leaf_vector = atoi(val);
@@ -272,6 +273,7 @@ class TreeModel {
    param.num_nodes = param.num_roots;
    nodes.resize(param.num_nodes);
    stats.resize(param.num_nodes);
+   leaf_vector.resize(param.num_nodes * param.size_leaf_vector, 0.0f);
    for (int i = 0; i < param.num_nodes; i ++) {
      nodes[i].set_leaf(0.0f);
      nodes[i].set_parent(-1);
@@ -289,6 +291,9 @@ class TreeModel {
                 "TreeModel: wrong format");
    utils::Check(fi.Read(&stats[0], sizeof(NodeStat) * stats.size()) > 0,
                 "TreeModel: wrong format");
+   if (param.size_leaf_vector != 0) {
+     utils::Check(fi.Read(&leaf_vector), "TreeModel: wrong format");
+   }
    // chg deleted nodes
    deleted_nodes.resize(0);
    for (int i = param.num_roots; i < param.num_nodes; i ++) {
@@ -309,6 +314,7 @@ class TreeModel {
    fo.Write(&param, sizeof(Param));
    fo.Write(&nodes[0], sizeof(Node) * nodes.size());
    fo.Write(&stats[0], sizeof(NodeStat) * nodes.size());
+   if (param.size_leaf_vector != 0) fo.Write(leaf_vector);
  }
  /*!
   * \brief add child nodes to node
@@ -486,15 +492,15 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
    std::fill(data.begin(), data.end(), e);
  }
  /*! \brief fill the vector with sparse vector */
-  inline void Fill(const SparseBatch::Inst &inst) {
+  inline void Fill(const RowBatch::Inst &inst) {
    for (bst_uint i = 0; i < inst.length; ++i) {
-     data[inst[i].findex].fvalue = inst[i].fvalue;
+     data[inst[i].index].fvalue = inst[i].fvalue;
    }
  }
  /*! \brief drop the trace after fill, must be called after fill */
-  inline void Drop(const SparseBatch::Inst &inst) {
+  inline void Drop(const RowBatch::Inst &inst) {
    for (bst_uint i = 0; i < inst.length; ++i) {
-     data[inst[i].findex].flag = -1;
+     data[inst[i].index].flag = -1;
    }
  }
  /*! \brief get ith value */
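
The renamed `RowBatch::Inst` type feeds the `Fill`/`Drop` scratch-buffer protocol used during prediction. A minimal sketch of the intended call pattern; the `fvalue`/`is_missing` accessors are assumed from the "get ith value" helper that follows and should be treated as illustrative:

```cpp
// Sketch: scatter one sparse row into the dense buffer, read a feature while
// traversing the tree, then clear exactly the entries that were filled.
float ReadFeature(xgboost::tree::RegTree::FVec *p_feats,
                  const xgboost::RowBatch::Inst &inst,
                  unsigned split_index) {
  p_feats->Fill(inst);   // data[inst[i].index].fvalue = inst[i].fvalue
  float v = p_feats->is_missing(split_index) ? 0.0f        // assumed accessors
          : p_feats->fvalue(split_index);
  p_feats->Drop(inst);   // must mirror the Fill, resets the flags
  return v;
}
```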
src/tree/param.h (149 lines changed)
@@ -22,10 +22,10 @@ struct TrainParam{
  //----- the rest parameters are less important ----
  // minimum amount of hessian(weight) allowed in a child
  float min_child_weight;
-  // weight decay parameter used to control leaf fitting
+  // L2 regularization factor
  float reg_lambda;
-  // reg method
-  int reg_method;
+  // L1 regularization factor
+  float reg_alpha;
  // default direction choice
  int default_direction;
  // whether we want to do subsample
@@ -36,6 +36,8 @@ struct TrainParam{
  float colsample_bytree;
  // speed optimization for dense column
  float opt_dense_col;
+  // leaf vector size
+  int size_leaf_vector;
  // number of threads to be used for tree construction,
  // if OpenMP is enabled, if equals 0, use system default
  int nthread;
@@ -45,13 +47,14 @@ struct TrainParam{
    min_child_weight = 1.0f;
    max_depth = 6;
    reg_lambda = 1.0f;
-   reg_method = 2;
+   reg_alpha = 0.0f;
    default_direction = 0;
    subsample = 1.0f;
    colsample_bytree = 1.0f;
    colsample_bylevel = 1.0f;
    opt_dense_col = 1.0f;
    nthread = 0;
+   size_leaf_vector = 0;
  }
  /*!
   * \brief set parameters from outside
@@ -59,19 +62,22 @@ struct TrainParam{
   * \param val value of the parameter
   */
  inline void SetParam(const char *name, const char *val) {
+   using namespace std;
    // sync-names
    if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val));
    if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val));
    if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val));
+   if (!strcmp(name, "alpha")) reg_alpha = static_cast<float>(atof(val));
    if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val));
    if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
    if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
    if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
-   if (!strcmp(name, "reg_method")) reg_method = atoi(val);
+   if (!strcmp(name, "reg_alpha")) reg_alpha = static_cast<float>(atof(val));
    if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
    if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
    if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast<float>(atof(val));
    if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
+   if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
    if (!strcmp(name, "max_depth")) max_depth = atoi(val);
    if (!strcmp(name, "nthread")) nthread = atoi(val);
    if (!strcmp(name, "default_direction")) {
@@ -82,31 +88,31 @@ struct TrainParam{
  }
  // calculate the cost of loss function
  inline double CalcGain(double sum_grad, double sum_hess) const {
-   if (sum_hess < min_child_weight) {
-     return 0.0;
-   }
-   switch (reg_method) {
-     case 1 : return Sqr(ThresholdL1(sum_grad, reg_lambda)) / sum_hess;
-     case 2 : return Sqr(sum_grad) / (sum_hess + reg_lambda);
-     case 3 : return
-         Sqr(ThresholdL1(sum_grad, 0.5 * reg_lambda)) /
-         (sum_hess + 0.5 * reg_lambda);
-     default: return Sqr(sum_grad) / sum_hess;
-   }
+   if (sum_hess < min_child_weight) return 0.0;
+   if (reg_alpha == 0.0f) {
+     return Sqr(sum_grad) / (sum_hess + reg_lambda);
+   } else {
+     return Sqr(ThresholdL1(sum_grad, reg_alpha)) / (sum_hess + reg_lambda);
+   }
  }
+  // calculate cost of loss function with four statistics
+  inline double CalcGain(double sum_grad, double sum_hess,
+                         double test_grad, double test_hess) const {
+   double w = CalcWeight(sum_grad, sum_hess);
+   double ret = test_grad * w + 0.5 * (test_hess + reg_lambda) * Sqr(w);
+   if (reg_alpha == 0.0f) {
+     return - 2.0 * ret;
+   } else {
+     return - 2.0 * (ret + reg_alpha * std::abs(w));
+   }
  }
  // calculate weight given the statistics
  inline double CalcWeight(double sum_grad, double sum_hess) const {
-   if (sum_hess < min_child_weight) {
-     return 0.0;
-   } else {
-     switch (reg_method) {
-       case 1: return - ThresholdL1(sum_grad, reg_lambda) / sum_hess;
-       case 2: return - sum_grad / (sum_hess + reg_lambda);
-       case 3: return
-           - ThresholdL1(sum_grad, 0.5 * reg_lambda) /
-           (sum_hess + 0.5 * reg_lambda);
-       default: return - sum_grad / sum_hess;
-     }
-   }
+   if (sum_hess < min_child_weight) return 0.0;
+   if (reg_alpha == 0.0f) {
+     return -sum_grad / (sum_hess + reg_lambda);
+   } else {
+     return -ThresholdL1(sum_grad, reg_alpha) / (sum_hess + reg_lambda);
+   }
  }
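
In formulas, with $G$ = `sum_grad`, $H$ = `sum_hess`, $\alpha$ = `reg_alpha`, $\lambda$ = `reg_lambda`, and assuming `ThresholdL1` is the usual soft-thresholding operator, the rewritten methods compute

```latex
T_\alpha(G) = \operatorname{sign}(G)\,\max(|G| - \alpha,\; 0), \qquad
w^\ast = -\frac{T_\alpha(G)}{H + \lambda}, \qquad
\mathrm{gain} = \frac{T_\alpha(G)^2}{H + \lambda}
```

so that $\alpha = 0$ recovers the plain L2 forms $-G/(H+\lambda)$ and $G^2/(H+\lambda)$, which is exactly the fast path guarded by `reg_alpha == 0.0f`.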
  /*! \brief whether need forward small to big search: default right */
@@ -153,6 +159,9 @@ struct GradStats {
  inline void Clear(void) {
    sum_grad = sum_hess = 0.0f;
  }
+  /*! \brief check if necessary information is ready */
+  inline static void CheckInfo(const BoosterInfo &info) {
+  }
  /*!
   * \brief accumulate statistics,
   * \param gpair the vector storing the gradient statistics
@@ -189,13 +198,87 @@ struct GradStats {
  /*! \brief set leaf vector value based on statistics */
  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
  }
-protected:
+  // constructor to allow inheritance
+  GradStats(void) {}
  /*! \brief add statistics to the data */
  inline void Add(double grad, double hess) {
    sum_grad += grad; sum_hess += hess;
  }
};
+
+/*! \brief vectorized cv statistics */
+template<unsigned vsize>
+struct CVGradStats : public GradStats {
+  // additional statistics
+  GradStats train[vsize], valid[vsize];
+  // constructor
+  explicit CVGradStats(const TrainParam &param) {
+    utils::Check(param.size_leaf_vector == vsize,
+                 "CVGradStats: vsize must match size_leaf_vector");
+    this->Clear();
+  }
+  /*! \brief check if necessary information is ready */
+  inline static void CheckInfo(const BoosterInfo &info) {
+    utils::Check(info.fold_index.size() != 0,
+                 "CVGradStats: require fold_index");
+  }
+  /*! \brief clear the statistics */
+  inline void Clear(void) {
+    GradStats::Clear();
+    for (unsigned i = 0; i < vsize; ++i) {
+      train[i].Clear(); valid[i].Clear();
+    }
+  }
+  inline void Add(const std::vector<bst_gpair> &gpair,
+                  const BoosterInfo &info,
+                  bst_uint ridx) {
+    GradStats::Add(gpair[ridx].grad, gpair[ridx].hess);
+    const size_t step = info.fold_index.size();
+    for (unsigned i = 0; i < vsize; ++i) {
+      const bst_gpair &b = gpair[(i + 1) * step + ridx];
+      if (info.fold_index[ridx] == i) {
+        valid[i].Add(b.grad, b.hess);
+      } else {
+        train[i].Add(b.grad, b.hess);
+      }
+    }
+  }
+  /*! \brief calculate gain of the solution */
+  inline double CalcGain(const TrainParam &param) const {
+    double ret = 0.0;
+    for (unsigned i = 0; i < vsize; ++i) {
+      ret += param.CalcGain(train[i].sum_grad,
+                            train[i].sum_hess,
+                            vsize * valid[i].sum_grad,
+                            vsize * valid[i].sum_hess);
+    }
+    return ret / vsize;
+  }
+  /*! \brief add statistics to the data */
+  inline void Add(const CVGradStats &b) {
+    GradStats::Add(b);
+    for (unsigned i = 0; i < vsize; ++i) {
+      train[i].Add(b.train[i]);
+      valid[i].Add(b.valid[i]);
+    }
+  }
+  /*! \brief set current value to a - b */
+  inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
+    GradStats::SetSubstract(a, b);
+    for (int i = 0; i < vsize; ++i) {
+      train[i].SetSubstract(a.train[i], b.train[i]);
+      valid[i].SetSubstract(a.valid[i], b.valid[i]);
+    }
+  }
+  /*! \brief set leaf vector value based on statistics */
+  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
+    for (int i = 0; i < vsize; ++i) {
+      vec[i] = param.learning_rate *
+               param.CalcWeight(train[i].sum_grad, train[i].sum_hess);
+    }
+  }
+};
+
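
One reading of `CVGradStats::CalcGain`: with $V$ = `vsize` folds, per-fold training statistics $(G_i, H_i)$ and held-out statistics $(g_i, h_i)$, it returns the averaged four-statistics gain with the held-out side scaled by $V$ (plausibly to compensate for each fold holding roughly $1/V$ of the rows; the code itself only multiplies by `vsize`):

```latex
\mathrm{gain}_{\mathrm{cv}} = \frac{1}{V} \sum_{i=1}^{V}
\mathrm{CalcGain}\!\left(G_i,\; H_i,\; V g_i,\; V h_i\right)
```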
/*!
 * \brief statistics that is helpful to store
 * and represent a split solution for the tree
@@ -216,11 +299,11 @@ struct SplitEntry{
   * \param loss_chg the loss reduction obtained through the split
   * \param split_index the feature index where the split is on
   */
-  inline bool NeedReplace(bst_float loss_chg, unsigned split_index) const {
+  inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
    if (this->split_index() <= split_index) {
-     return loss_chg > this->loss_chg;
+     return new_loss_chg > this->loss_chg;
    } else {
-     return !(this->loss_chg > loss_chg);
+     return !(this->loss_chg > new_loss_chg);
    }
  }
  /*!
@@ -246,13 +329,13 @@ struct SplitEntry{
   * \param default_left whether the missing value goes to left
   * \return whether the proposed split is better and can replace current split
   */
-  inline bool Update(bst_float loss_chg, unsigned split_index,
-                     float split_value, bool default_left) {
-   if (this->NeedReplace(loss_chg, split_index)) {
-     this->loss_chg = loss_chg;
+  inline bool Update(bst_float new_loss_chg, unsigned split_index,
+                     float new_split_value, bool default_left) {
+   if (this->NeedReplace(new_loss_chg, split_index)) {
+     this->loss_chg = new_loss_chg;
      if (default_left) split_index |= (1U << 31);
      this->sindex = split_index;
-     this->split_value = split_value;
+     this->split_value = new_split_value;
      return true;
    } else {
      return false;

src/tree/updater.cpp (new file, 21 lines)
@@ -0,0 +1,21 @@
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#include <cstring>
+#include "./updater.h"
+#include "./updater_prune-inl.hpp"
+#include "./updater_refresh-inl.hpp"
+#include "./updater_colmaker-inl.hpp"
+
+namespace xgboost {
+namespace tree {
+IUpdater* CreateUpdater(const char *name) {
+  using namespace std;
+  if (!strcmp(name, "prune")) return new TreePruner();
+  if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
+  if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
+  utils::Error("unknown updater:%s", name);
+  return NULL;
+}
+
+} // namespace tree
+} // namespace xgboost
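
With the factory now compiled in `updater.cpp` rather than instantiated per `FMatrix` in the header, callers obtain updaters by name. A sketch of plausible caller code; the grow-then-prune sequence is illustrative, not taken from this diff:

```cpp
#include <vector>

// Sketch: build an updater pipeline by name and run it on one tree.
void RunUpdaters(const std::vector<xgboost::bst_gpair> &gpair,
                 xgboost::IFMatrix *p_fmat,
                 const xgboost::BoosterInfo &info,
                 xgboost::tree::RegTree *p_tree) {
  using namespace xgboost::tree;
  const char *names[] = {"grow_colmaker", "prune"};
  std::vector<RegTree*> trees(1, p_tree);
  for (int i = 0; i < 2; ++i) {
    IUpdater *up = CreateUpdater(names[i]);  // factory from updater.cpp
    up->Update(gpair, p_fmat, info, trees);  // new IFMatrix* signature
    delete up;
  }
}
```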
@@ -14,9 +14,7 @@ namespace xgboost {
namespace tree {
/*!
 * \brief interface of tree update module, that performs update of a tree
- * \tparam FMatrix the data type updater taking
 */
-template<typename FMatrix>
class IUpdater {
 public:
  /*!
@@ -28,7 +26,7 @@ class IUpdater {
  /*!
   * \brief perform update to the tree models
   * \param gpair the gradient pair statistics of the data
-  * \param fmat feature matrix that provides access to features
+  * \param p_fmat feature matrix that provides access to features
   * \param info extra side information that may be needed, such as root index
   * \param trees pointer to the trees to be updated; the updater will change the content of the trees
   * note: all the trees in the vector are updated, with the same statistics,
@@ -36,36 +34,18 @@ class IUpdater {
   * there can be multiple trees when we train random forest style model
   */
  virtual void Update(const std::vector<bst_gpair> &gpair,
-                     const FMatrix &fmat,
+                     IFMatrix *p_fmat,
                      const BoosterInfo &info,
                      const std::vector<RegTree*> &trees) = 0;
  // destructor
  virtual ~IUpdater(void) {}
};
-
-} // namespace tree
-} // namespace xgboost
-
-#include "./updater_prune-inl.hpp"
-#include "./updater_refresh-inl.hpp"
-#include "./updater_colmaker-inl.hpp"
-
-namespace xgboost {
-namespace tree {
/*!
 * \brief create a updater based on name
 * \param name name of updater
 * \return return the updater instance
 */
-template<typename FMatrix>
-inline IUpdater<FMatrix>* CreateUpdater(const char *name) {
-  if (!strcmp(name, "prune")) return new TreePruner<FMatrix>();
-  if (!strcmp(name, "refresh")) return new TreeRefresher<FMatrix, GradStats>();
-  if (!strcmp(name, "grow_colmaker")) return new ColMaker<FMatrix, GradStats>();
-  utils::Error("unknown updater:%s", name);
-  return NULL;
-}
-
+IUpdater* CreateUpdater(const char *name);
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_H_
@@ -15,8 +15,8 @@
namespace xgboost {
namespace tree {
/*! \brief column-wise update to construct a tree */
-template<typename FMatrix, typename TStats>
-class ColMaker: public IUpdater<FMatrix> {
+template<typename TStats>
+class ColMaker: public IUpdater {
 public:
  virtual ~ColMaker(void) {}
  // set training parameter
@@ -24,16 +24,17 @@ class ColMaker: public IUpdater<FMatrix> {
    param.SetParam(name, val);
  }
  virtual void Update(const std::vector<bst_gpair> &gpair,
-                     const FMatrix &fmat,
+                     IFMatrix *p_fmat,
                      const BoosterInfo &info,
                      const std::vector<RegTree*> &trees) {
+   TStats::CheckInfo(info);
    // rescale learning rate according to size of trees
    float lr = param.learning_rate;
    param.learning_rate = lr / trees.size();
    // build tree
    for (size_t i = 0; i < trees.size(); ++i) {
      Builder builder(param);
-     builder.Update(gpair, fmat, info, trees[i]);
+     builder.Update(gpair, p_fmat, info, trees[i]);
    }
    param.learning_rate = lr;
  }
@@ -76,23 +77,22 @@ class ColMaker: public IUpdater<FMatrix> {
    explicit Builder(const TrainParam &param) : param(param) {}
    // update one tree, growing
    virtual void Update(const std::vector<bst_gpair> &gpair,
-                       const FMatrix &fmat,
+                       IFMatrix *p_fmat,
                        const BoosterInfo &info,
                        RegTree *p_tree) {
-     this->InitData(gpair, fmat, info.root_index, *p_tree);
-     this->InitNewNode(qexpand, gpair, fmat, info, *p_tree);
+     this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
+     this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
      for (int depth = 0; depth < param.max_depth; ++depth) {
-       this->FindSplit(depth, this->qexpand, gpair, fmat, info, p_tree);
-       this->ResetPosition(this->qexpand, fmat, *p_tree);
-       this->UpdateQueueExpand(*p_tree, &this->qexpand);
-       this->InitNewNode(qexpand, gpair, fmat, info, *p_tree);
+       this->FindSplit(depth, qexpand_, gpair, p_fmat, info, p_tree);
+       this->ResetPosition(qexpand_, p_fmat, *p_tree);
+       this->UpdateQueueExpand(*p_tree, &qexpand_);
+       this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
        // if nothing left to be expanded, break
-       if (qexpand.size() == 0) break;
+       if (qexpand_.size() == 0) break;
      }
      // set all the rest expanding nodes to leaf
-     for (size_t i = 0; i < qexpand.size(); ++i) {
-       const int nid = qexpand[i];
+     for (size_t i = 0; i < qexpand_.size(); ++i) {
+       const int nid = qexpand_[i];
        (*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
      }
      // remember auxiliary statistics in the tree node
@@ -107,7 +107,7 @@ class ColMaker: public IUpdater<FMatrix> {
   private:
    // initialize temp data structure
    inline void InitData(const std::vector<bst_gpair> &gpair,
-                        const FMatrix &fmat,
+                        const IFMatrix &fmat,
                         const std::vector<unsigned> &root_index, const RegTree &tree) {
      utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree");
      const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
@@ -138,7 +138,6 @@ class ColMaker: public IUpdater<FMatrix> {
        }
      }
    }
-
    {
      // initialize feature index
      unsigned ncol = static_cast<unsigned>(fmat.NumCol());
@@ -166,16 +165,16 @@ class ColMaker: public IUpdater<FMatrix> {
      snode.reserve(256);
    }
    {// expand query
-     qexpand.reserve(256); qexpand.clear();
+     qexpand_.reserve(256); qexpand_.clear();
      for (int i = 0; i < tree.param.num_roots; ++i) {
-       qexpand.push_back(i);
+       qexpand_.push_back(i);
      }
    }
  }
  /*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */
  inline void InitNewNode(const std::vector<int> &qexpand,
                          const std::vector<bst_gpair> &gpair,
-                         const FMatrix &fmat,
+                         const IFMatrix &fmat,
                          const BoosterInfo &info,
                          const RegTree &tree) {
    {// setup statistics space for each tree node
@@ -222,24 +221,26 @@ class ColMaker: public IUpdater<FMatrix> {
      qexpand = newnodes;
    }
    // enumerate the split values of specific feature
-   template<typename Iter>
-   inline void EnumerateSplit(Iter it, unsigned fid,
+   inline void EnumerateSplit(const ColBatch::Entry *begin,
+                              const ColBatch::Entry *end,
+                              int d_step,
+                              bst_uint fid,
                               const std::vector<bst_gpair> &gpair,
                               const BoosterInfo &info,
-                              std::vector<ThreadEntry> &temp,
-                              bool is_forward_search) {
+                              std::vector<ThreadEntry> &temp) {
+     const std::vector<int> &qexpand = qexpand_;
      // clear all the temp statistics
      for (size_t j = 0; j < qexpand.size(); ++j) {
        temp[qexpand[j]].stats.Clear();
      }
      // left statistics
      TStats c(param);
-     while (it.Next()) {
-       const bst_uint ridx = it.rindex();
+     for (const ColBatch::Entry *it = begin; it != end; it += d_step) {
+       const bst_uint ridx = it->index;
        const int nid = position[ridx];
        if (nid < 0) continue;
        // start working
-       const float fvalue = it.fvalue();
+       const float fvalue = it->fvalue;
        // get the statistics of nid
        ThreadEntry &e = temp[nid];
        // test if first hit, this is fine, because we set 0 during init
@@ -248,11 +249,11 @@ class ColMaker: public IUpdater<FMatrix> {
          e.last_fvalue = fvalue;
        } else {
          // try to find a split
-         if (fabsf(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
+         if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
            c.SetSubstract(snode[nid].stats, e.stats);
            if (c.sum_hess >= param.min_child_weight) {
              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
-             e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search);
+             e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
            }
          }
          // update the statistics
@@ -267,38 +268,46 @@ class ColMaker: public IUpdater<FMatrix> {
        c.SetSubstract(snode[nid].stats, e.stats);
        if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
          bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
-         const float delta = is_forward_search ? rt_eps : -rt_eps;
-         e.best.Update(loss_chg, fid, e.last_fvalue + delta, !is_forward_search);
+         const float delta = d_step == +1 ? rt_eps : -rt_eps;
+         e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
        }
      }
    }
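
The `loss_chg` computed in both branches is the standard split gain. With the parent statistics $(G, H)$ cached in `snode[nid]` (its gain as `root_gain`), one side accumulated into `e.stats` as $(G_L, H_L)$, and the complement `c` obtained by subtraction:

```latex
\Delta = \mathrm{Gain}(G_L, H_L) + \mathrm{Gain}(G - G_L,\; H - H_L) - \mathrm{Gain}(G, H)
```

where $\mathrm{Gain}$ is `TrainParam::CalcGain`; `d_step == +1` records the candidate with default direction right, `d_step == -1` with default direction left.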
    // find splits at current level, do split per level
-   inline void FindSplit(int depth, const std::vector<int> &qexpand,
+   inline void FindSplit(int depth,
+                         const std::vector<int> &qexpand,
                          const std::vector<bst_gpair> &gpair,
-                         const FMatrix &fmat,
+                         IFMatrix *p_fmat,
                          const BoosterInfo &info,
                          RegTree *p_tree) {
-     std::vector<unsigned> feat_set = feat_index;
+     std::vector<bst_uint> feat_set = feat_index;
      if (param.colsample_bylevel != 1.0f) {
        random::Shuffle(feat_set);
        unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
        utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
        feat_set.resize(n);
      }
+     utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
+     while (iter->Next()) {
+       const ColBatch &batch = iter->Value();
        // start enumeration
-       const bst_omp_uint nsize = static_cast<bst_omp_uint>(feat_set.size());
+       const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#if defined(_OPENMP)
        const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
#endif
#pragma omp parallel for schedule(dynamic, batch_size)
        for (bst_omp_uint i = 0; i < nsize; ++i) {
-         const unsigned fid = feat_set[i];
+         const bst_uint fid = batch.col_index[i];
          const int tid = omp_get_thread_num();
-         if (param.need_forward_search(fmat.GetColDensity(fid))) {
-           this->EnumerateSplit(fmat.GetSortedCol(fid), fid, gpair, info, stemp[tid], true);
-         }
-         if (param.need_backward_search(fmat.GetColDensity(fid))) {
-           this->EnumerateSplit(fmat.GetReverseSortedCol(fid), fid, gpair, info, stemp[tid], false);
+         const ColBatch::Inst c = batch[i];
+         if (param.need_forward_search(p_fmat->GetColDensity(fid))) {
+           this->EnumerateSplit(c.data, c.data + c.length, +1,
+                                fid, gpair, info, stemp[tid]);
+         }
+         if (param.need_backward_search(p_fmat->GetColDensity(fid))) {
+           this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
+                                fid, gpair, info, stemp[tid]);
          }
        }
      }
      // after this each thread's stemp will get the best candidates, aggregate results
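
A schematic of the aggregation step this comment refers to, written as a sketch under the assumption that `SplitEntry` also provides an `Update(const SplitEntry &)` overload keeping the larger `loss_chg` (only the four-argument overload appears in this diff):

```cpp
#include <vector>

// Schematic reduction of per-thread split candidates to one best split per
// expanded node (types simplified; NodeEntry/ThreadEntry both carry a .best).
template<typename NodeEntry, typename ThreadEntry>
void AggregateBest(const std::vector<int> &qexpand,
                   const std::vector< std::vector<ThreadEntry> > &stemp,
                   std::vector<NodeEntry> *p_snode) {
  for (size_t j = 0; j < qexpand.size(); ++j) {
    const int nid = qexpand[j];
    for (size_t tid = 0; tid < stemp.size(); ++tid) {
      (*p_snode)[nid].best.Update(stemp[tid][nid].best);  // keep better candidate
    }
  }
}
```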
@ -318,8 +327,8 @@ class ColMaker: public IUpdater<FMatrix> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// reset position of each data points after split is created in the tree
|
// reset position of each data points after split is created in the tree
|
||||||
inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) {
|
inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
|
||||||
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
|
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
|
||||||
// step 1, set default direct nodes to default, and leaf nodes to -1
|
// step 1, set default direct nodes to default, and leaf nodes to -1
|
||||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
|
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
@ -343,19 +352,24 @@ class ColMaker: public IUpdater<FMatrix> {
|
|||||||
}
|
}
|
||||||
std::sort(fsplits.begin(), fsplits.end());
|
std::sort(fsplits.begin(), fsplits.end());
|
||||||
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
|
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
|
||||||
// start put things into right place
|
|
||||||
const bst_omp_uint nfeats = static_cast<bst_omp_uint>(fsplits.size());
|
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
|
||||||
#pragma omp parallel for schedule(dynamic, 1)
|
while (iter->Next()) {
|
||||||
for (bst_omp_uint i = 0; i < nfeats; ++i) {
|
const ColBatch &batch = iter->Value();
|
||||||
const unsigned fid = fsplits[i];
|
for (size_t i = 0; i < batch.size; ++i) {
|
||||||
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
|
ColBatch::Inst col = batch[i];
|
||||||
const bst_uint ridx = it.rindex();
|
const bst_uint fid = batch.col_index[i];
|
||||||
|
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
|
||||||
|
#pragma omp parallel for schedule(static)
|
||||||
|
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
||||||
|
const bst_uint ridx = col[j].index;
|
||||||
|
const float fvalue = col[j].fvalue;
|
||||||
int nid = position[ridx];
|
int nid = position[ridx];
|
||||||
if (nid == -1) continue;
|
if (nid == -1) continue;
|
||||||
// go back to parent, correct those who are not default
|
// go back to parent, correct those who are not default
|
||||||
nid = tree[nid].parent();
|
nid = tree[nid].parent();
|
||||||
if (tree[nid].split_index() == fid) {
|
if (tree[nid].split_index() == fid) {
|
||||||
if (it.fvalue() < tree[nid].split_cond()) {
|
if (fvalue < tree[nid].split_cond()) {
|
||||||
position[ridx] = tree[nid].cleft();
|
position[ridx] = tree[nid].cleft();
|
||||||
} else {
|
} else {
|
||||||
position[ridx] = tree[nid].cright();
|
position[ridx] = tree[nid].cright();
|
||||||
@ -364,12 +378,13 @@ class ColMaker: public IUpdater<FMatrix> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
//--data fields--
|
//--data fields--
|
||||||
const TrainParam ¶m;
|
const TrainParam ¶m;
|
||||||
// number of omp thread used during training
|
// number of omp thread used during training
|
||||||
int nthread;
|
int nthread;
|
||||||
// Per feature: shuffle index of each feature index
|
// Per feature: shuffle index of each feature index
|
||||||
std::vector<unsigned> feat_index;
|
std::vector<bst_uint> feat_index;
|
||||||
// Instance Data: current node position in the tree of each instance
|
// Instance Data: current node position in the tree of each instance
|
||||||
std::vector<int> position;
|
std::vector<int> position;
|
||||||
// PerThread x PerTreeNode: statistics for per thread construction
|
// PerThread x PerTreeNode: statistics for per thread construction
|
||||||
@ -377,7 +392,7 @@ class ColMaker: public IUpdater<FMatrix> {
|
|||||||
/*! \brief TreeNode Data: statistics for each constructed node */
|
/*! \brief TreeNode Data: statistics for each constructed node */
|
||||||
std::vector<NodeEntry> snode;
|
std::vector<NodeEntry> snode;
|
||||||
/*! \brief queue of nodes to be expanded */
|
/*! \brief queue of nodes to be expanded */
|
||||||
std::vector<int> qexpand;
|
std::vector<int> qexpand_;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -12,18 +12,18 @@
namespace xgboost {
namespace tree {
/*! \brief pruner that prunes a tree after growing finishes */
-template<typename FMatrix>
-class TreePruner: public IUpdater<FMatrix> {
+class TreePruner: public IUpdater {
 public:
  virtual ~TreePruner(void) {}
  // set training parameter
  virtual void SetParam(const char *name, const char *val) {
+    using namespace std;
    param.SetParam(name, val);
    if (!strcmp(name, "silent")) silent = atoi(val);
  }
  // update the tree, do pruning
  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      const FMatrix &fmat,
+                      IFMatrix *p_fmat,
                      const BoosterInfo &info,
                      const std::vector<RegTree*> &trees) {
    // rescale learning rate according to size of trees

@@ -64,7 +64,7 @@ class TreePruner: public IUpdater<FMatrix> {
      }
    }
    if (silent == 0) {
-      printf("tree pruning end, %d roots, %d extra nodes, %d pruned nodes, max_depth=%d\n",
+      utils::Printf("tree pruning end, %d roots, %d extra nodes, %d pruned nodes, max_depth=%d\n",
                    tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth());
    }
  }

@@ -75,7 +75,6 @@ class TreePruner: public IUpdater<FMatrix> {
  // training parameter
  TrainParam param;
};

} // namespace tree
} // namespace xgboost
#endif  // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_

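For intuition, TreePruner walks the grown tree bottom-up and collapses splits that do not pay for themselves. The sketch below is a generic recursive leaf-collapse pass under an assumed min_gain threshold; the Node fields and the pruning rule are illustrative stand-ins, not the real TrainParam logic from this file.

```cpp
#include <vector>

// Illustrative node: a split with a recorded gain, or a leaf.
struct Node {
  bool is_leaf;
  double gain;        // loss reduction recorded when the split was made
  int cleft, cright;  // child ids, unused for leaves
};

// Recursively prune: if both children are leaves and the split's gain is
// below the threshold, turn this node back into a leaf.
// Returns the number of splits collapsed in this subtree.
int TryPrune(std::vector<Node> &tree, int nid, double min_gain) {
  if (tree[nid].is_leaf) return 0;
  int npruned = TryPrune(tree, tree[nid].cleft, min_gain)
              + TryPrune(tree, tree[nid].cright, min_gain);
  if (tree[tree[nid].cleft].is_leaf && tree[tree[nid].cright].is_leaf &&
      tree[nid].gain < min_gain) {
    tree[nid].is_leaf = true;  // collapse the weak split
    return npruned + 1;
  }
  return npruned;
}
```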
@@ -9,12 +9,13 @@
#include <limits>
#include "./param.h"
#include "./updater.h"
+#include "../utils/omp.h"

namespace xgboost {
namespace tree {
/*! \brief pruner that prunes a tree after growing finishes */
-template<typename FMatrix, typename TStats>
-class TreeRefresher: public IUpdater<FMatrix> {
+template<typename TStats>
+class TreeRefresher: public IUpdater {
 public:
  virtual ~TreeRefresher(void) {}
  // set training parameter

@@ -23,16 +24,16 @@ class TreeRefresher: public IUpdater<FMatrix> {
  }
  // update the tree, do pruning
  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      const FMatrix &fmat,
+                      IFMatrix *p_fmat,
                      const BoosterInfo &info,
                      const std::vector<RegTree*> &trees) {
    if (trees.size() == 0) return;
    // number of threads
-    int nthread;
    // thread temporal space
    std::vector< std::vector<TStats> > stemp;
    std::vector<RegTree::FVec> fvec_temp;
    // setup temp space for each thread
+    int nthread;
    #pragma omp parallel
    {
      nthread = omp_get_num_threads();

@@ -50,16 +51,16 @@ class TreeRefresher: public IUpdater<FMatrix> {
      fvec_temp[tid].Init(trees[0]->param.num_feature);
    }
    // start accumulating statistics
-    utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
    iter->BeforeFirst();
    while (iter->Next()) {
-      const SparseBatch &batch = iter->Value();
+      const RowBatch &batch = iter->Value();
      utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
                   "too large batch size ");
      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nbatch; ++i) {
-        SparseBatch::Inst inst = batch[i];
+        RowBatch::Inst inst = batch[i];
        const int tid = omp_get_thread_num();
        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
        RegTree::FVec &feats = fvec_temp[tid];

@@ -126,8 +127,6 @@ class TreeRefresher: public IUpdater<FMatrix> {
      this->Refresh(gstats, tree[nid].cright(), p_tree);
    }
  }
-  // number of thread in the data
-  int nthread;
  // training parameter
  TrainParam param;
};

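TreeRefresher's Update accumulates node statistics in stemp, one slice per OpenMP thread, so the parallel row scan never contends on shared counters; the slices are merged afterwards. A minimal sketch of that pattern, with a hypothetical GradStats in place of the real TStats (compile with an OpenMP-enabled compiler, e.g. -fopenmp):

```cpp
#include <omp.h>
#include <vector>

// Hypothetical per-node statistic; xgboost's TStats is richer.
struct GradStats { double sum_grad = 0.0, sum_hess = 0.0; };

// Accumulate statistics into one slice per thread, then reduce into slice 0.
// Mirrors the stemp[tid] pattern used by the refresher above.
std::vector<GradStats> AccumulateByNode(const std::vector<int> &node_of_row,
                                        const std::vector<GradStats> &gpair,
                                        int num_nodes) {
  const int nthread = omp_get_max_threads();
  std::vector< std::vector<GradStats> > stemp(
      nthread, std::vector<GradStats>(num_nodes));
  #pragma omp parallel for schedule(static)
  for (long i = 0; i < static_cast<long>(node_of_row.size()); ++i) {
    GradStats &s = stemp[omp_get_thread_num()][node_of_row[i]];
    s.sum_grad += gpair[i].sum_grad;   // contention-free: private slice
    s.sum_hess += gpair[i].sum_hess;
  }
  // serial reduction across the per-thread slices
  for (int t = 1; t < nthread; ++t) {
    for (int nid = 0; nid < num_nodes; ++nid) {
      stemp[0][nid].sum_grad += stemp[t][nid].sum_grad;
      stemp[0][nid].sum_hess += stemp[t][nid].sum_hess;
    }
  }
  return stemp[0];
}
```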
@@ -24,15 +24,15 @@ class FeatMap {
  // function definitions
  /*! \brief load feature map from text format */
  inline void LoadText(const char *fname) {
-    FILE *fi = utils::FopenCheck(fname, "r");
+    std::FILE *fi = utils::FopenCheck(fname, "r");
    this->LoadText(fi);
-    fclose(fi);
+    std::fclose(fi);
  }
  /*! \brief load feature map from text format */
-  inline void LoadText(FILE *fi) {
+  inline void LoadText(std::FILE *fi) {
    int fid;
    char fname[1256], ftype[1256];
-    while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) {
+    while (std::fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) {
      this->PushBack(fid, fname, ftype);
    }
  }

@@ -62,6 +62,7 @@ class FeatMap {

 private:
  inline static Type GetType(const char *tname) {
+    using namespace std;
    if (!strcmp("i", tname)) return kIndicator;
    if (!strcmp("q", tname)) return kQuantitive;
    if (!strcmp("int", tname)) return kInteger;

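As the fscanf format string implies, LoadText expects one feature per line with tab-separated fields: an integer id, a feature name, and a type tag (`i` for indicator, `q` for quantitative, `int` for integer), matching the featmap.txt removed later in this diff. For example (fields tab-separated, shown here with spaces):

```
0   cap-shape=bell      i
1   cap-shape=conical   i
2   cap-shape=convex    i
```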
@@ -91,22 +91,21 @@ class IStream {
/*! \brief implementation of file i/o stream */
class FileStream : public IStream {
 private:
-  FILE *fp;
+  std::FILE *fp;
 public:
-  explicit FileStream(FILE *fp) {
-    this->fp = fp;
+  explicit FileStream(std::FILE *fp) : fp(fp) {
  }
  virtual size_t Read(void *ptr, size_t size) {
-    return fread(ptr, size, 1, fp);
+    return std::fread(ptr, size, 1, fp);
  }
  virtual void Write(const void *ptr, size_t size) {
-    fwrite(ptr, size, 1, fp);
+    std::fwrite(ptr, size, 1, fp);
  }
  inline void Seek(size_t pos) {
-    fseek(fp, 0, SEEK_SET);
+    std::fseek(fp, 0, SEEK_SET);
  }
  inline void Close(void) {
-    fclose(fp);
+    std::fclose(fp);
  }
};

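FileStream simply forwards IStream calls to the C stdio functions, so serialization code written against IStream can be retargeted at other backends. Note that Seek ignores its pos argument and always rewinds to the start, in both the old and new versions. A self-contained sketch of the abstraction; the names mirror the diff, but this is an illustration, not the real header:

```cpp
#include <cstdio>
#include <cstddef>

// Minimal stream interface: a serializer only talks to IStream, so a file,
// memory buffer, or socket backend can be swapped in behind it.
class IStream {
 public:
  virtual size_t Read(void *ptr, size_t size) = 0;
  virtual void Write(const void *ptr, size_t size) = 0;
  virtual ~IStream(void) {}
};

class FileStream : public IStream {
 private:
  std::FILE *fp;
 public:
  explicit FileStream(std::FILE *fp) : fp(fp) {}
  virtual size_t Read(void *ptr, size_t size) {
    return std::fread(ptr, size, 1, fp);
  }
  virtual void Write(const void *ptr, size_t size) {
    std::fwrite(ptr, size, 1, fp);
  }
  inline void Close(void) { std::fclose(fp); }
};

int main() {
  std::FILE *fp = std::fopen("tmp.bin", "wb");
  if (fp == NULL) return 1;
  FileStream fs(fp);
  int version = 3;
  fs.Write(&version, sizeof(version));  // std::fwrite under the hood
  fs.Close();
  return 0;
}
```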
@@ -9,13 +9,8 @@
#include <omp.h>
#else
#ifndef DISABLE_OPENMP
-#ifndef _MSC_VER
-#warning "OpenMP is not available, compile to single thread code."\
-"You may want to ungrade your compiler to enable OpenMP support,"\
-"to get benefit of multi-threading."
-#else
-// TODO add warning for msvc
-#endif
+// use pragma message instead of warning
+#pragma message ("Warning: OpenMP is not available, xgboost will be compiled into single-thread code. Use OpenMP-enabled compiler to get benefit of multi-threading")
#endif
inline int omp_get_thread_num() { return 0; }
inline int omp_get_num_threads() { return 1; }

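The point of this header is that callers can use the OpenMP runtime API unconditionally; when OpenMP is absent, the inline stubs make the same code serial. A minimal sketch of the pattern, assuming only that `_OPENMP` is defined by OpenMP-enabled compilers:

```cpp
// Fallback pattern from the header above: code may call the OpenMP runtime
// unconditionally; without OpenMP these stubs make it single-threaded.
#if defined(_OPENMP)
#include <omp.h>
#else
inline int omp_get_thread_num() { return 0; }
inline int omp_get_num_threads() { return 1; }
#endif

#include <cstdio>

int main() {
  // With OpenMP this prints once per thread; without it, exactly once.
  #pragma omp parallel
  std::printf("thread %d of %d\n",
              omp_get_thread_num(), omp_get_num_threads());
  return 0;
}
```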
@@ -16,30 +16,21 @@
/*! namespace of PRNG */
namespace xgboost {
namespace random {
+#ifndef XGBOOST_CUSTOMIZE_PRNG_
/*! \brief seed the PRNG */
-inline void Seed(uint32_t seed) {
+inline void Seed(unsigned seed) {
  srand(seed);
}
-/*! \brief return a real number uniform in [0,1) */
-inline double NextDouble(void) {
+/*! \brief basic function, uniform */
+inline double Uniform(void) {
  return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX)+1.0);
}
/*! \brief return a real number uniform in (0,1) */
inline double NextDouble2(void) {
  return (static_cast<double>(rand()) + 1.0) / (static_cast<double>(RAND_MAX)+2.0);
}

-/*! \brief return a random number */
-inline uint32_t NextUInt32(void) {
-  return (uint32_t)rand();
-}
-/*! \brief return a random number in n */
-inline uint32_t NextUInt32(uint32_t n) {
-  return (uint32_t)floor(NextDouble() * n);
-}
/*! \brief return x~N(0,1) */
-inline double SampleNormal() {
+inline double Normal(void) {
  double x, y, s;
  do {
    x = 2 * NextDouble2() - 1.0;

@@ -49,22 +40,24 @@ inline double SampleNormal() {

  return x * sqrt(-2.0 * log(s) / s);
}
+#else
+// include declarations, to be implemented
+void Seed(unsigned seed);
+double Uniform(void);
+double Normal(void);
+#endif

-/*! \brief return iid x,y ~N(0,1) */
-inline void SampleNormal2D(double &xx, double &yy) {
-  double x, y, s;
-  do {
-    x = 2 * NextDouble2() - 1.0;
-    y = 2 * NextDouble2() - 1.0;
-    s = x*x + y*y;
-  } while (s >= 1.0 || s == 0.0);
-  double t = sqrt(-2.0 * log(s) / s);
-  xx = x * t;
-  yy = y * t;
+/*! \brief return a real number uniform in [0,1) */
+inline double NextDouble(void) {
+  return Uniform();
}
+/*! \brief return a random number in n */
+inline uint32_t NextUInt32(uint32_t n) {
+  return (uint32_t)std::floor(NextDouble() * n);
+}
/*! \brief return x~N(mu,sigma^2) */
inline double SampleNormal(double mu, double sigma) {
-  return SampleNormal() * sigma + mu;
+  return Normal() * sigma + mu;
}
/*! \brief return 1 with probability p, coin flip */
inline int SampleBinary(double p) {

@@ -90,7 +83,7 @@ struct Random{
  inline void Seed(unsigned sd) {
    this->rseed = sd;
    #if defined(_MSC_VER)||defined(_WIN32)
-    srand(rseed);
+    ::xgboost::random::Seed(sd);
    #endif
  }
  /*! \brief return a real number uniform in [0,1) */

@@ -98,8 +91,8 @@ struct Random{
    // use rand instead of rand_r in windows, for MSVC it is fine since rand is threadsafe
    // For cygwin and mingw, this can slow down parallelism, but rand_r is only used in objective-inl.hpp, won't affect speed in general
    // todo, replace with another PRNG
-    #if defined(_MSC_VER)||defined(_WIN32)
+    #if defined(_MSC_VER)||defined(_WIN32)||defined(XGBOOST_STRICT_CXX98_)
-    return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX) + 1.0);
+    return Uniform();
    #else
    return static_cast<double>(rand_r(&rseed)) / (static_cast<double>(RAND_MAX) + 1.0);
    #endif

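`Normal()` above draws a standard normal deviate with the Marsaglia polar method: sample (x, y) uniformly in the unit disk, then transform the squared radius. A self-contained sketch of the same method (my own minimal version, not the library header):

```cpp
#include <cstdlib>
#include <cmath>
#include <cstdio>

// Uniform in (0,1), never exactly 0 or 1, so log(s) below is always finite.
double NextDouble2(void) {
  return (static_cast<double>(std::rand()) + 1.0) /
         (static_cast<double>(RAND_MAX) + 2.0);
}

// Marsaglia polar method: rejection-sample (x, y) inside the unit disk,
// then x * sqrt(-2 ln(s) / s) with s = x^2 + y^2 is N(0,1).
double Normal(void) {
  double x, y, s;
  do {
    x = 2.0 * NextDouble2() - 1.0;
    y = 2.0 * NextDouble2() - 1.0;
    s = x * x + y * y;
  } while (s >= 1.0 || s == 0.0);
  return x * std::sqrt(-2.0 * std::log(s) / s);
}

int main() {
  std::srand(42);
  // x ~ N(mu, sigma^2) is then just Normal() * sigma + mu, as in SampleNormal.
  std::printf("%f\n", Normal() * 2.0 + 10.0);
  return 0;
}
```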
@@ -7,10 +7,18 @@
*/
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
-#include <cstdarg>
+#include <string>
#include <cstdlib>
+#include <vector>

+#ifndef XGBOOST_STRICT_CXX98_
+#include <cstdarg>
+#endif

+#if !defined(__GNUC__)
+#define fopen64 std::fopen
+#endif
#ifdef _MSC_VER
-#define fopen64 fopen
// NOTE: sprintf_s is not equivalent to snprintf,
// they are equivalent when success, which is sufficient for our case
#define snprintf sprintf_s

@@ -18,19 +26,18 @@
#else
#ifdef _FILE_OFFSET_BITS
#if _FILE_OFFSET_BITS == 32
-#warning "FILE OFFSET BITS defined to be 32 bit"
+#pragma message ("Warning: FILE OFFSET BITS defined to be 32 bit")
#endif
#endif

#ifdef __APPLE__
#define off64_t off_t
-#define fopen64 fopen
+#define fopen64 std::fopen
#endif

-#define _FILE_OFFSET_BITS 64
extern "C" {
#include <sys/types.h>
-};
+}
#endif

#ifdef _MSC_VER

@@ -46,10 +53,11 @@ typedef long int64_t;
namespace xgboost {
/*! \brief namespace for helper utils of the project */
namespace utils {
-/*! \brief error message buffer length */
-const int kErrorBuffer = 1 << 12;

-#ifndef XGBOOST_CUSTOMIZE_ERROR_
+/*! \brief error message buffer length */
+const int kPrintBuffer = 1 << 12;

+#ifndef XGBOOST_CUSTOMIZE_MSG_
/*!
 * \brief handling of Assert error, caused by inappropriate input
 * \param msg error message

@@ -66,19 +74,50 @@ inline void HandleCheckError(const char *msg) {
  fprintf(stderr, "%s\n", msg);
  exit(-1);
}
+inline void HandlePrint(const char *msg) {
+  printf("%s", msg);
+}
#else
+#ifndef XGBOOST_STRICT_CXX98_
// include declarations, someone must implement this
void HandleAssertError(const char *msg);
void HandleCheckError(const char *msg);
+void HandlePrint(const char *msg);
#endif
+#endif
+#ifdef XGBOOST_STRICT_CXX98_
+// these function pointers are to be assigned
+extern "C" void (*Printf)(const char *fmt, ...);
+extern "C" int (*SPrintf)(char *buf, size_t size, const char *fmt, ...);
+extern "C" void (*Assert)(int exp, const char *fmt, ...);
+extern "C" void (*Check)(int exp, const char *fmt, ...);
+extern "C" void (*Error)(const char *fmt, ...);
+#else
+/*! \brief printf, print message to the console */
+inline void Printf(const char *fmt, ...) {
+  std::string msg(kPrintBuffer, '\0');
+  va_list args;
+  va_start(args, fmt);
+  vsnprintf(&msg[0], kPrintBuffer, fmt, args);
+  va_end(args);
+  HandlePrint(msg.c_str());
+}
+/*! \brief portable version of snprintf */
+inline int SPrintf(char *buf, size_t size, const char *fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+  int ret = vsnprintf(buf, size, fmt, args);
+  va_end(args);
+  return ret;
+}

/*! \brief assert a condition is true, use this to handle debug information */
inline void Assert(bool exp, const char *fmt, ...) {
  if (!exp) {
-    std::string msg(kErrorBuffer, '\0');
+    std::string msg(kPrintBuffer, '\0');
    va_list args;
    va_start(args, fmt);
-    vsnprintf(&msg[0], kErrorBuffer, fmt, args);
+    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
    va_end(args);
    HandleAssertError(msg.c_str());
  }

@@ -87,10 +126,10 @@ inline void Assert(bool exp, const char *fmt, ...) {
/*!\brief same as assert, but this is intended to be used as message for user*/
inline void Check(bool exp, const char *fmt, ...) {
  if (!exp) {
-    std::string msg(kErrorBuffer, '\0');
+    std::string msg(kPrintBuffer, '\0');
    va_list args;
    va_start(args, fmt);
-    vsnprintf(&msg[0], kErrorBuffer, fmt, args);
+    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
    va_end(args);
    HandleCheckError(msg.c_str());
  }

@@ -99,22 +138,41 @@ inline void Check(bool exp, const char *fmt, ...) {
/*! \brief report error message, same as check */
inline void Error(const char *fmt, ...) {
  {
-    std::string msg(kErrorBuffer, '\0');
+    std::string msg(kPrintBuffer, '\0');
    va_list args;
    va_start(args, fmt);
-    vsnprintf(&msg[0], kErrorBuffer, fmt, args);
+    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
    va_end(args);
    HandleCheckError(msg.c_str());
  }
}
+#endif

/*! \brief replace fopen, report error when the file open fails */
-inline FILE *FopenCheck(const char *fname, const char *flag) {
-  FILE *fp = fopen64(fname, flag);
+inline std::FILE *FopenCheck(const char *fname, const char *flag) {
+  std::FILE *fp = fopen64(fname, flag);
  Check(fp != NULL, "can not open file \"%s\"\n", fname);
  return fp;
}

} // namespace utils
+// easy utils that can be directly accessed in xgboost
+/*! \brief get the beginning address of a vector */
+template<typename T>
+inline T *BeginPtr(std::vector<T> &vec) {
+  if (vec.size() == 0) {
+    return NULL;
+  } else {
+    return &vec[0];
+  }
+}
+/*! \brief get the beginning address of a vector */
+template<typename T>
+inline const T *BeginPtr(const std::vector<T> &vec) {
+  if (vec.size() == 0) {
+    return NULL;
+  } else {
+    return &vec[0];
+  }
+}
} // namespace xgboost
#endif  // XGBOOST_UTILS_UTILS_H_

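Printf, SPrintf, Assert, Check and Error all share one idiom: format the varargs into a kPrintBuffer-sized std::string, then hand the C string to a single handler, so an embedder (for example the R wrapper, via XGBOOST_CUSTOMIZE_MSG_) can redirect every message. A self-contained sketch of that idiom:

```cpp
#include <cstdarg>
#include <cstdio>
#include <string>

const int kPrintBuffer = 1 << 12;

// Single choke point for output; an embedder would swap this implementation.
inline void HandlePrint(const char *msg) { std::printf("%s", msg); }

// Format varargs into a fixed-size std::string buffer, then forward the
// resulting C string to the handler -- the same shape as utils::Printf above.
inline void Printf(const char *fmt, ...) {
  std::string msg(kPrintBuffer, '\0');
  va_list args;
  va_start(args, fmt);
  std::vsnprintf(&msg[0], kPrintBuffer, fmt, args);
  va_end(args);
  HandlePrint(msg.c_str());
}

int main() {
  Printf("trained %d trees in %.1f sec\n", 100, 2.5);
  return 0;
}
```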
@@ -50,6 +50,7 @@ class BoostLearnTask{
    if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
    if (!strcmp("num_round", name)) num_round = atoi(val);
    if (!strcmp("pred_margin", name)) pred_margin = atoi(val);
+    if (!strcmp("ntree_limit", name)) ntree_limit = atoi(val);
    if (!strcmp("save_period", name)) save_period = atoi(val);
    if (!strcmp("eval_train", name)) eval_train = atoi(val);
    if (!strcmp("task", name)) task = val;

@@ -79,6 +80,7 @@ class BoostLearnTask{
    save_period = 0;
    eval_train = 0;
    pred_margin = 0;
+    ntree_limit = 0;
    dump_model_stats = 0;
    task = "train";
    model_in = "NULL";

@@ -186,7 +188,7 @@ class BoostLearnTask{
  inline void TaskPred(void) {
    std::vector<float> preds;
    if (!silent) printf("start prediction...\n");
-    learner.Predict(*data, pred_margin != 0, &preds);
+    learner.Predict(*data, pred_margin != 0, &preds, ntree_limit);
    if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
    FILE *fo = utils::FopenCheck(name_pred.c_str(), "w");
    for (size_t i = 0; i < preds.size(); i++) {

@@ -217,6 +219,8 @@ class BoostLearnTask{
  std::string task;
  /*! \brief name of predict file */
  std::string name_pred;
+  /*!\brief limit number of trees in prediction */
+  int ntree_limit;
  /*!\brief whether to directly output margin value */
  int pred_margin;
  /*! \brief whether dump statistics along with model */

@@ -234,7 +238,7 @@ class BoostLearnTask{
  std::vector<io::DataMatrix*> deval;
  std::vector<const io::DataMatrix*> devalall;
  utils::FeatMap fmap;
-  learner::BoostLearner<FMatrixS> learner;
+  learner::BoostLearner learner;
};
}

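With the new ntree_limit parameter wired into SetParam, initialized to 0, and passed through to Predict, prediction can be truncated to the first trees of a saved model. Assuming the CLI's usual name=value override style, an invocation might look like this (the config and model file names are hypothetical):

```
# predict with only the first 10 trees of the saved model
./xgboost mushroom.conf task=pred model_in=0002.model ntree_limit=10
```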
@@ -1,4 +1,4 @@
-The solution has been created with Visual Studio Express 2013.
+The solution has been created with Visual Studio Express 2010.
Make sure to compile the Release version, unless you need to debug the code
(and in the latter case modify the path in xgboost.py from release to test).
Note that you have two projects in one solution and they need to be compiled to use the standalone executable from the command line

@@ -1,11 +1,9 @@

-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Express 2013 for Windows Desktop
-VisualStudioVersion = 12.0.30723.0
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost.vcxproj", "{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}"
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost.vcxproj", "{19766C3F-7508-49D0-BAAC-0988FCC9970C}"
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_wrapper\xgboost_wrapper.vcxproj", "{2E1AF937-28BB-4832-B916-309C9A0F6C4F}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_wrapper\xgboost_wrapper.vcxproj", "{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution

@@ -15,22 +13,21 @@ Global
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
-{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|Win32.ActiveCfg = Debug|Win32
-{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|Win32.Build.0 = Debug|Win32
-{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|x64.ActiveCfg = Debug|x64
-{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|x64.Build.0 = Debug|x64
-{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|Win32.ActiveCfg = Release|Win32
-{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|Win32.Build.0 = Release|Win32
-{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|x64.ActiveCfg = Release|x64
-{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|x64.Build.0 = Release|x64
-{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|Win32.ActiveCfg = Debug|Win32
-{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|Win32.Build.0 = Debug|Win32
-{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|x64.ActiveCfg = Debug|x64
-{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|x64.Build.0 = Debug|x64
-{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|Win32.ActiveCfg = Release|Win32
-{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|Win32.Build.0 = Release|Win32
-{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|x64.ActiveCfg = Release|x64
-{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|x64.Build.0 = Release|x64
+{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.ActiveCfg = Debug|Win32
+{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.Build.0 = Debug|Win32
+{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.ActiveCfg = Release|x64
+{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.Build.0 = Release|x64
+{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.ActiveCfg = Release|Win32
+{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.Build.0 = Release|Win32
+{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.ActiveCfg = Release|x64
+{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.Build.0 = Release|x64
+{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.ActiveCfg = Debug|Win32
+{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.Build.0 = Debug|Win32
+{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.ActiveCfg = Debug|Win32
+{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.ActiveCfg = Release|Win32
+{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.Build.0 = Release|Win32
+{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.ActiveCfg = Release|x64
+{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
      <Configuration>Debug</Configuration>

@@ -18,8 +18,14 @@
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\src\gbm\gbm.cpp" />
+    <ClCompile Include="..\..\src\io\io.cpp" />
+    <ClCompile Include="..\..\src\tree\updater.cpp" />
+    <ClCompile Include="..\..\src\xgboost_main.cpp" />
+  </ItemGroup>
  <PropertyGroup Label="Globals">
-    <ProjectGuid>{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}</ProjectGuid>
+    <ProjectGuid>{19766C3F-7508-49D0-BAAC-0988FCC9970C}</ProjectGuid>
    <RootNamespace>xgboost</RootNamespace>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />

@@ -27,27 +33,23 @@
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v120</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v120</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v120</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v120</PlatformToolset>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">

@@ -111,10 +113,6 @@
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
  </ItemDefinitionGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\src\io\io.cpp" />
-    <ClCompile Include="..\..\src\xgboost_main.cpp" />
-  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>

@@ -30,17 +30,17 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>

@@ -1,126 +0,0 @@
-# include xgboost library, must set chdir=TRURE
-source("../xgboost.R", chdir=TRUE)
-
-# helper function to read libsvm format
-# this is very badly written, load in dense, and convert to sparse
-# use this only for demo purpose
-# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
-read.libsvm <- function(fname, maxcol) {
-  content <- readLines(fname)
-  nline <- length(content)
-  label <- numeric(nline)
-  mat <- matrix(0, nline, maxcol+1)
-  for (i in 1:nline) {
-    arr <- as.vector(strsplit(content[i], " ")[[1]])
-    label[i] <- as.numeric(arr[[1]])
-    for (j in 2:length(arr)) {
-      kv <- strsplit(arr[j], ":")[[1]]
-      # to avoid 0 index
-      findex <- as.integer(kv[1]) + 1
-      fvalue <- as.numeric(kv[2])
-      mat[i,findex] <- fvalue
-    }
-  }
-  mat <- as(mat, "sparseMatrix")
-  return(list(label=label, data=mat))
-}
-
-# test code here
-dtrain <- xgb.DMatrix("agaricus.txt.train")
-dtest <- xgb.DMatrix("agaricus.txt.test")
-param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
-watchlist <- list("eval"=dtest,"train"=dtrain)
-# training xgboost model
-bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
-# make prediction
-preds <- xgb.predict(bst, dtest)
-labels <- xgb.getinfo(dtest, "label")
-err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
-# print error rate
-print(paste("error=",err))
-
-# dump model
-xgb.dump(bst, "dump.raw.txt")
-# dump model with feature map
-xgb.dump(bst, "dump.nice.txt", "featmap.txt")
-
-# save dmatrix into binary buffer
-succ <- xgb.save(dtest, "dtest.buffer")
-# save model into file
-succ <- xgb.save(bst, "xgb.model")
-# load model and data in
-bst2 <- xgb.Booster(modelfile="xgb.model")
-dtest2 <- xgb.DMatrix("dtest.buffer")
-preds2 <- xgb.predict(bst2, dtest2)
-# assert they are the same
-stopifnot(sum(abs(preds2-preds)) == 0)
-
-###
-# build dmatrix from sparseMatrix
-###
-print ('start running example of build DMatrix from R.sparseMatrix')
-csc <- read.libsvm("agaricus.txt.train", 126)
-label <- csc$label
-data <- csc$data
-dtrain <- xgb.DMatrix(data, info=list(label=label) )
-watchlist <- list("eval"=dtest,"train"=dtrain)
-bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
-
-###
-# build dmatrix from dense matrix
-###
-print ('start running example of build DMatrix from R.Matrix')
-mat = as.matrix(data)
-dtrain <- xgb.DMatrix(mat, info=list(label=label) )
-watchlist <- list("eval"=dtest,"train"=dtrain)
-bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
-
-###
-# advanced: cutomsized loss function
-#
-print("start running example to used cutomized objective function")
-# note: for customized objective function, we leave objective as default
-# note: what we are getting is margin value in prediction
-# you must know what you are doing
-param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1)
-# user define objective function, given prediction, return gradient and second order gradient
-# this is loglikelihood loss
-logregobj <- function(preds, dtrain) {
-  labels <- xgb.getinfo(dtrain, "label")
-  preds <- 1.0 / (1.0 + exp(-preds))
-  grad <- preds - labels
-  hess <- preds * (1.0-preds)
-  return(list(grad=grad, hess=hess))
-}
-# user defined evaluation function, return a list(metric="metric-name", value="metric-value")
-# NOTE: when you do customized loss function, the default prediction value is margin
-# this may make buildin evalution metric not function properly
-# for example, we are doing logistic loss, the prediction is score before logistic transformation
-# the buildin evaluation error assumes input is after logistic transformation
-# Take this in mind when you use the customization, and maybe you need write customized evaluation function
-evalerror <- function(preds, dtrain) {
-  labels <- xgb.getinfo(dtrain, "label")
-  err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
-  return(list(metric="error", value=err))
-}
-
-# training with customized objective, we can also do step by step training
-# simply look at xgboost.py"s implementation of train
-bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
-
-###
-# advanced: start from a initial base prediction
-#
-print ("start running example to start from a initial prediction")
-# specify parameters via map, definition are same as c++ version
-param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
-# train xgboost for 1 round
-bst <- xgb.train( param, dtrain, 1, watchlist )
-# Note: we need the margin value instead of transformed prediction in set_base_margin
-# do predict with output_margin=True, will always give you margin values before logistic transformation
-ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
-ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
-succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
-succ <- xgb.setinfo(dtest, "base_margin", ptest)
-print ("this is result of running from initial prediction")
-bst <- xgb.train( param, dtrain, 1, watchlist )

@@ -2,14 +2,11 @@ Wrapper of XGBoost
=====
This folder provides wrapper of xgboost to other languages

Python
=====
* To make the python module, type ```make``` in the root directory of project
-* Refer to the walk through example in [python-example/demo.py](python-example/demo.py)
+* Refer also to the walk through example in [demo folder](../demo/guide-python)

R
=====
-* To make the R wrapper, type ```make R``` in the root directory of project
-* R module need Rinternals.h, find the path in your system and add it to CPLUS_INCLUDE_PATH in Makefile
-* Refer to the walk through example in [R-example/demo.R](R-example/demo.R)
+* See [R-package](../R-package)

@@ -1,3 +0,0 @@
-example to use python xgboost, the data is generated from demo/binary_classification, in libsvm format
-
-for usage: see demo.py and comments in demo.py

File diff suppressed because it is too large
File diff suppressed because it is too large

@@ -1,121 +0,0 @@
-#!/usr/bin/python
-import sys
-import numpy as np
-import scipy.sparse
-# append the path to xgboost, you may need to change the following line
-# alternatively, you can add the path to PYTHONPATH environment variable
-sys.path.append('../')
-import xgboost as xgb
-
-### simple example
-# load file from text file, also binary buffer generated by xgboost
-dtrain = xgb.DMatrix('agaricus.txt.train')
-dtest = xgb.DMatrix('agaricus.txt.test')
-
-# specify parameters via map, definition are same as c++ version
-param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
-
-# specify validations set to watch performance
-evallist = [(dtest,'eval'), (dtrain,'train')]
-num_round = 2
-bst = xgb.train(param, dtrain, num_round, evallist)
-
-# this is prediction
-preds = bst.predict(dtest)
-labels = dtest.get_label()
-print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
-bst.save_model('0001.model')
-# dump model
-bst.dump_model('dump.raw.txt')
-# dump model with feature map
-bst.dump_model('dump.nice.txt','featmap.txt')
-
-# save dmatrix into binary buffer
-dtest.save_binary('dtest.buffer')
-bst.save_model('xgb.model')
-# load model and data in
-bst2 = xgb.Booster(model_file='xgb.model')
-dtest2 = xgb.DMatrix('dtest.buffer')
-preds2 = bst2.predict(dtest2)
-# assert they are the same
-assert np.sum(np.abs(preds2-preds)) == 0
-
-###
-# build dmatrix from scipy.sparse
-print ('start running example of build DMatrix from scipy.sparse')
-labels = []
-row = []; col = []; dat = []
-i = 0
-for l in open('agaricus.txt.train'):
-    arr = l.split()
-    labels.append( int(arr[0]))
-    for it in arr[1:]:
-        k,v = it.split(':')
-        row.append(i); col.append(int(k)); dat.append(float(v))
-    i += 1
-csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
-dtrain = xgb.DMatrix( csr )
-dtrain.set_label(labels)
-evallist = [(dtest,'eval'), (dtrain,'train')]
-bst = xgb.train( param, dtrain, num_round, evallist )
-
-print ('start running example of build DMatrix from numpy array')
-# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix
-npymat = csr.todense()
-dtrain = xgb.DMatrix( npymat)
-dtrain.set_label(labels)
-evallist = [(dtest,'eval'), (dtrain,'train')]
-bst = xgb.train( param, dtrain, num_round, evallist )
-
-###
-# advanced: cutomsized loss function
-#
-print ('start running example to used cutomized objective function')
-
-# note: for customized objective function, we leave objective as default
-# note: what we are getting is margin value in prediction
-# you must know what you are doing
-param = {'max_depth':2, 'eta':1, 'silent':1 }
-
-# user define objective function, given prediction, return gradient and second order gradient
-# this is loglikelihood loss
-def logregobj(preds, dtrain):
-    labels = dtrain.get_label()
-    preds = 1.0 / (1.0 + np.exp(-preds))
-    grad = preds - labels
-    hess = preds * (1.0-preds)
-    return grad, hess
-
-# user defined evaluation function, return a pair metric_name, result
-# NOTE: when you do customized loss function, the default prediction value is margin
-# this may make buildin evalution metric not function properly
-# for example, we are doing logistic loss, the prediction is score before logistic transformation
-# the buildin evaluation error assumes input is after logistic transformation
-# Take this in mind when you use the customization, and maybe you need write customized evaluation function
-def evalerror(preds, dtrain):
-    labels = dtrain.get_label()
-    # return a pair metric_name, result
-    # since preds are margin(before logistic transformation, cutoff at 0)
-    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
-
-# training with customized objective, we can also do step by step training
-# simply look at xgboost.py's implementation of train
-bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror)
-
-###
-# advanced: start from a initial base prediction
-#
-print ('start running example to start from a initial prediction')
-# specify parameters via map, definition are same as c++ version
-param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
-# train xgboost for 1 round
-bst = xgb.train( param, dtrain, 1, evallist )
-# Note: we need the margin value instead of transformed prediction in set_base_margin
-# do predict with output_margin=True, will always give you margin values before logistic transformation
-ptrain = bst.predict(dtrain, output_margin=True)
-ptest = bst.predict(dtest, output_margin=True)
-dtrain.set_base_margin(ptrain)
-dtest.set_base_margin(ptest)
-
-print ('this is result of running from initial prediction')
-bst = xgb.train( param, dtrain, 1, evallist )

@@ -1,126 +0,0 @@
-0 cap-shape=bell i
-1 cap-shape=conical i
-2 cap-shape=convex i
-3 cap-shape=flat i
-4 cap-shape=knobbed i
-5 cap-shape=sunken i
-6 cap-surface=fibrous i
-7 cap-surface=grooves i
-8 cap-surface=scaly i
-9 cap-surface=smooth i
-10 cap-color=brown i
-11 cap-color=buff i
-12 cap-color=cinnamon i
-13 cap-color=gray i
-14 cap-color=green i
-15 cap-color=pink i
-16 cap-color=purple i
-17 cap-color=red i
-18 cap-color=white i
-19 cap-color=yellow i
-20 bruises?=bruises i
-21 bruises?=no i
-22 odor=almond i
-23 odor=anise i
-24 odor=creosote i
-25 odor=fishy i
-26 odor=foul i
-27 odor=musty i
-28 odor=none i
-29 odor=pungent i
-30 odor=spicy i
-31 gill-attachment=attached i
-32 gill-attachment=descending i
-33 gill-attachment=free i
-34 gill-attachment=notched i
-35 gill-spacing=close i
-36 gill-spacing=crowded i
-37 gill-spacing=distant i
-38 gill-size=broad i
-39 gill-size=narrow i
-40 gill-color=black i
-41 gill-color=brown i
-42 gill-color=buff i
-43 gill-color=chocolate i
-44 gill-color=gray i
-45 gill-color=green i
-46 gill-color=orange i
-47 gill-color=pink i
-48 gill-color=purple i
-49 gill-color=red i
-50 gill-color=white i
-51 gill-color=yellow i
-52 stalk-shape=enlarging i
-53 stalk-shape=tapering i
-54 stalk-root=bulbous i
-55 stalk-root=club i
-56 stalk-root=cup i
-57 stalk-root=equal i
-58 stalk-root=rhizomorphs i
-59 stalk-root=rooted i
-60 stalk-root=missing i
-61 stalk-surface-above-ring=fibrous i
-62 stalk-surface-above-ring=scaly i
-63 stalk-surface-above-ring=silky i
-64 stalk-surface-above-ring=smooth i
-65 stalk-surface-below-ring=fibrous i
-66 stalk-surface-below-ring=scaly i
-67 stalk-surface-below-ring=silky i
-68 stalk-surface-below-ring=smooth i
-69 stalk-color-above-ring=brown i
-70 stalk-color-above-ring=buff i
-71 stalk-color-above-ring=cinnamon i
-72 stalk-color-above-ring=gray i
-73 stalk-color-above-ring=orange i
-74 stalk-color-above-ring=pink i
-75 stalk-color-above-ring=red i
-76 stalk-color-above-ring=white i
-77 stalk-color-above-ring=yellow i
-78 stalk-color-below-ring=brown i
-79 stalk-color-below-ring=buff i
-80 stalk-color-below-ring=cinnamon i
-81 stalk-color-below-ring=gray i
-82 stalk-color-below-ring=orange i
-83 stalk-color-below-ring=pink i
-84 stalk-color-below-ring=red i
-85 stalk-color-below-ring=white i
-86 stalk-color-below-ring=yellow i
-87 veil-type=partial i
-88 veil-type=universal i
-89 veil-color=brown i
-90 veil-color=orange i
-91 veil-color=white i
-92 veil-color=yellow i
-93 ring-number=none i
-94 ring-number=one i
-95 ring-number=two i
-96 ring-type=cobwebby i
-97 ring-type=evanescent i
-98 ring-type=flaring i
-99 ring-type=large i
-100 ring-type=none i
-101 ring-type=pendant i
-102 ring-type=sheathing i
-103 ring-type=zone i
-104 spore-print-color=black i
-105 spore-print-color=brown i
-106 spore-print-color=buff i
-107 spore-print-color=chocolate i
-108 spore-print-color=green i
-109 spore-print-color=orange i
-110 spore-print-color=purple i
-111 spore-print-color=white i
-112 spore-print-color=yellow i
-113 population=abundant i
-114 population=clustered i
-115 population=numerous i
-116 population=scattered i
-117 population=several i
-118 population=solitary i
-119 habitat=grasses i
-120 habitat=leaves i
-121 habitat=meadows i
-122 habitat=paths i
-123 habitat=urban i
-124 habitat=waste i
-125 habitat=woods i

@ -1,222 +0,0 @@
|
|||||||
# depends on matrix
|
|
||||||
succ <- require("Matrix")
|
|
||||||
if (!succ) {
|
|
||||||
stop("xgboost depends on Matrix library")
|
|
||||||
}
|
|
||||||
# load in library
|
|
||||||
dyn.load("./libxgboostR.so")
|
|
||||||
|
|
||||||
# constructing DMatrix
|
|
||||||
xgb.DMatrix <- function(data, info=list(), missing=0.0) {
|
|
||||||
if (typeof(data) == "character") {
|
|
||||||
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE))
|
|
||||||
} else if(is.matrix(data)) {
|
|
||||||
handle <- .Call("XGDMatrixCreateFromMat_R", data, missing)
|
|
||||||
} else if(class(data) == "dgCMatrix") {
|
|
||||||
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x)
|
|
||||||
} else {
|
|
||||||
stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data)))
|
|
||||||
}
|
|
||||||
dmat <- structure(handle, class="xgb.DMatrix")
|
|
||||||
if (length(info) != 0) {
|
|
||||||
for (i in 1:length(info)) {
|
|
||||||
p <- info[i]
|
|
||||||
xgb.setinfo(dmat, names(p), p[[1]])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return(dmat)
|
|
||||||
}
|
|
||||||
# get information from dmatrix
|
|
||||||
xgb.getinfo <- function(dmat, name) {
|
|
||||||
if (typeof(name) != "character") {
|
|
||||||
stop("xgb.getinfo: name must be character")
|
|
||||||
}
|
|
||||||
if (class(dmat) != "xgb.DMatrix") {
|
|
||||||
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
|
|
||||||
}
|
|
||||||
if (name != "label" &&
|
|
||||||
name != "weight" &&
|
|
||||||
name != "base_margin" ) {
|
|
||||||
stop(paste("xgb.getinfo: unknown info name", name))
|
|
||||||
}
|
|
||||||
ret <- .Call("XGDMatrixGetInfo_R", dmat, name)
|
|
||||||
return(ret)
|
|
||||||
}
# set information into a DMatrix; this mutates the DMatrix
xgb.setinfo <- function(dmat, name, info) {
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.setinfo: first argument dmat must be xgb.DMatrix")
  }
  # label, weight, and base_margin are numeric vectors; group sizes are integers
  if (name == "label" || name == "weight" || name == "base_margin") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
    return(TRUE)
  }
  if (name == "group") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info))
    return(TRUE)
  }
  stop(paste("xgb.setinfo: unknown info name", name))
}
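For example, attaching uniform instance weights and reading them back (a sketch, reusing `dtrain` from above):

```
n <- length(xgb.getinfo(dtrain, "label"))
xgb.setinfo(dtrain, "weight", rep(1.0, n))
head(xgb.getinfo(dtrain, "weight"))
```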
# construct a Booster from a cachelist of DMatrix
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
  if (typeof(cachelist) != "list") {
    stop("xgb.Booster: only accepts list of DMatrix as cachelist")
  }
  for (dm in cachelist) {
    if (class(dm) != "xgb.DMatrix") {
      stop("xgb.Booster: only accepts list of DMatrix as cachelist")
    }
  }
  handle <- .Call("XGBoosterCreate_R", cachelist)
  .Call("XGBoosterSetParam_R", handle, "seed", "0")
  if (length(params) != 0) {
    for (i in 1:length(params)) {
      p <- params[i]
      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
    }
  }
  if (!is.null(modelfile)) {
    if (typeof(modelfile) != "character") {
      stop("xgb.Booster: modelfile must be character")
    }
    .Call("XGBoosterLoadModel_R", handle, modelfile)
  }
  return(structure(handle, class = "xgb.Booster"))
}
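Direct construction is rarely needed, since `xgb.train` below builds the Booster itself, but as a sketch (the `bst:` parameter prefix follows the convention used elsewhere in this codebase; the specific parameters are illustrative):

```
bst <- xgb.Booster(params = list("bst:max_depth" = 2, "bst:eta" = 1),
                   cachelist = list(dtrain))
```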
# train a model using given parameters
xgb.train <- function(params, dtrain, nrounds = 10, watchlist = list(), obj = NULL, feval = NULL) {
  if (typeof(params) != "list") {
    stop("xgb.train: first argument params must be list")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.train: second argument dtrain must be xgb.DMatrix")
  }
  bst <- xgb.Booster(params, append(watchlist, dtrain))
  for (i in 1:nrounds) {
    if (is.null(obj)) {
      succ <- xgb.iter.update(bst, dtrain, i - 1)
    } else {
      # customized objective: compute gradient statistics in R, then boost
      pred <- xgb.predict(bst, dtrain)
      gpair <- obj(pred, dtrain)
      succ <- xgb.iter.boost(bst, dtrain, gpair)
    }
    if (length(watchlist) != 0) {
      if (is.null(feval)) {
        msg <- xgb.iter.eval(bst, watchlist, i - 1)
        cat(msg); cat("\n")
      } else {
        # customized evaluation metric
        cat("["); cat(i); cat("]")
        for (j in 1:length(watchlist)) {
          w <- watchlist[j]
          if (length(names(w)) == 0) {
            stop("xgb.train: a name tag must be present for every element in watchlist")
          }
          ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
          cat("\t"); cat(names(w)); cat("-"); cat(ret$metric)
          cat(":"); cat(ret$value)
        }
        cat("\n")
      }
    }
  }
  return(bst)
}
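An end-to-end sketch with a customized logistic objective (the gradient/hessian below are the standard log-loss formulas; with a custom `obj` the booster has no built-in objective, so `xgb.predict` returns raw margins, hence the sigmoid; `dtest` is assumed to exist):

```
logregobj <- function(preds, dtrain) {
  labels <- xgb.getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))   # margin -> probability
  grad <- preds - labels           # first-order gradient of log loss
  hess <- preds * (1 - preds)      # second-order gradient of log loss
  return(list(grad = grad, hess = hess))
}
param <- list("bst:eta" = 1, "bst:max_depth" = 2, "silent" = 1)
bst <- xgb.train(param, dtrain, nrounds = 2,
                 watchlist = list(train = dtrain, test = dtest),
                 obj = logregobj)
```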
# save a Booster model or a DMatrix to file
xgb.save <- function(handle, fname) {
  if (typeof(fname) != "character") {
    stop("xgb.save: fname must be character")
  }
  if (class(handle) == "xgb.Booster") {
    .Call("XGBoosterSaveModel_R", handle, fname)
    return(TRUE)
  }
  if (class(handle) == "xgb.DMatrix") {
    .Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE))
    return(TRUE)
  }
  stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
}
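For example (filenames are illustrative):

```
xgb.save(bst, "xgb.model")         # Booster -> model file
xgb.save(dtrain, "dtrain.buffer")  # DMatrix -> binary buffer
```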
# predict on a DMatrix
xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
  if (class(booster) != "xgb.Booster") {
    stop("xgb.predict: first argument must be type xgb.Booster")
  }
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.predict: second argument must be type xgb.DMatrix")
  }
  ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin))
  return(ret)
}
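A sketch of evaluating error on held-out data (thresholding at 0.5 assumes the model outputs probabilities for a binary objective):

```
pred <- xgb.predict(bst, dtest)
err <- mean(as.numeric(pred > 0.5) != xgb.getinfo(dtest, "label"))
cat("test error:", err, "\n")
```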
# dump model
xgb.dump <- function(booster, fname, fmap = "") {
  if (class(booster) != "xgb.Booster") {
    stop("xgb.dump: first argument must be type xgb.Booster")
  }
  if (typeof(fname) != "character") {
    stop("xgb.dump: second argument must be type character")
  }
  .Call("XGBoosterDumpModel_R", booster, fname, fmap)
  return(TRUE)
}
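For example, dumping with and without a feature map (the `featmap.txt` name is illustrative; its format is the one shown earlier in this diff):

```
xgb.dump(bst, "dump.raw.txt")                         # splits on feature indices
xgb.dump(bst, "dump.nice.txt", fmap = "featmap.txt")  # splits on feature names
```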
##--------------------------------------
# The following are low-level functions that run one boosting
# round at a time; you do not need them unless you want
# fine-grained control over the training loop.
#---------------------------------------
# update the booster with one iteration on dtrain
xgb.iter.update <- function(booster, dtrain, iter) {
  if (class(booster) != "xgb.Booster") {
    stop("xgb.iter.update: first argument must be type xgb.Booster")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.iter.update: second argument must be type xgb.DMatrix")
  }
  .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain)
  return(TRUE)
}
# boost the booster one iteration with customized gradient statistics
xgb.iter.boost <- function(booster, dtrain, gpair) {
  if (class(booster) != "xgb.Booster") {
    stop("xgb.iter.boost: first argument must be type xgb.Booster")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("xgb.iter.boost: second argument must be type xgb.DMatrix")
  }
  .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess)
  return(TRUE)
}
# evaluate one iteration on the watchlist
xgb.iter.eval <- function(booster, watchlist, iter) {
  if (class(booster) != "xgb.Booster") {
    stop("xgb.iter.eval: first argument must be type xgb.Booster")
  }
  if (typeof(watchlist) != "list") {
    stop("xgb.iter.eval: only accepts list of DMatrix as watchlist")
  }
  for (w in watchlist) {
    if (class(w) != "xgb.DMatrix") {
      stop("xgb.iter.eval: watchlist can only contain xgb.DMatrix")
    }
  }
  evnames <- list()
  if (length(watchlist) != 0) {
    for (i in 1:length(watchlist)) {
      w <- watchlist[i]
      if (length(names(w)) == 0) {
        stop("xgb.iter.eval: a name tag must be present for every element in watchlist")
      }
      evnames <- append(evnames, names(w))
    }
  }
  msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames)
  return(msg)
}
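Together these reproduce the core of `xgb.train` by hand, e.g. (a sketch, reusing `param`, `dtrain`, and `dtest` from above):

```
bst <- xgb.Booster(param, list(dtrain, dtest))
for (i in 1:10) {
  xgb.iter.update(bst, dtrain, i - 1)
  cat(xgb.iter.eval(bst, list(train = dtrain, test = dtest), i - 1), "\n")
}
```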