Merge pull request #51 from tqchen/unity

merge unity into master, R package ready
Tianqi Chen 2014-08-27 21:13:38 -07:00
commit 582e4e3d8c
46 changed files with 865 additions and 1408 deletions

View File

@@ -1,32 +1,32 @@
 export CC = gcc
 export CXX = g++
 export LDFLAGS= -pthread -lm
+# note for R module
+# add include path to Rinternals.h here
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC
 ifeq ($(no_omp),1)
-	export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -DDISABLE_OPENMP
+	CFLAGS += -DDISABLE_OPENMP
 else
-	export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
+	CFLAGS += -fopenmp
 endif
+# expose these flags to R CMD SHLIB
+export PKG_CPPFLAGS = $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_
 # specify tensor path
 BIN = xgboost
-OBJ =
+OBJ = updater.o gbm.o io.o
 SLIB = wrapper/libxgboostwrapper.so
-RLIB = wrapper/libxgboostR.so
-.PHONY: clean all R
-all: $(BIN) wrapper/libxgboostwrapper.so
-R: wrapper/libxgboostR.so
-xgboost: src/xgboost_main.cpp src/io/io.cpp src/data.h src/tree/*.h src/tree/*.hpp src/gbm/*.h src/gbm/*.hpp src/utils/*.h src/learner/*.h src/learner/*.hpp
+.PHONY: clean all python
+all: $(BIN) $(OBJ) $(SLIB)
+python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/io/io.cpp src/*.h src/*/*.hpp src/*/*.h
-wrapper/libxgboostR.so: wrapper/xgboost_wrapper.cpp wrapper/xgboost_R.cpp src/io/io.cpp src/*.h src/*/*.hpp src/*/*.h
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
+updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h
+gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
+io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
+xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
 $(BIN) :
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
@@ -34,9 +34,6 @@ $(BIN) :
 $(SLIB) :
 	$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
-$(RLIB) :
-	R CMD SHLIB -c -o $@ $(filter %.cpp %.o %.c, $^)
 $(OBJ) :
 	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
@@ -44,4 +41,4 @@ install:
 	cp -f -r $(BIN) $(INSTALL_PATH)
 clean:
-	$(RM) $(OBJ) $(BIN) $(SLIB) $(RLIB) *~ */*~ */*/*~
+	$(RM) $(OBJ) $(BIN) $(SLIB) *.o *~ */*~ */*/*~

View File

@@ -8,4 +8,5 @@ export(xgb.train)
 export(xgb.save)
 export(xgb.load)
 export(xgb.dump)
+export(xgb.Booster)
 export(xgb.DMatrix.save)

View File

@@ -1,5 +1,4 @@
 # Main function for xgboost-package
 xgboost <- function(data = NULL, label = NULL, params = list(), nrounds = 10,
                     verbose = 1, ...) {
   inClass <- class(data)

View File

@@ -93,20 +93,22 @@ print(paste("error=", err))
 ############################ Save and load model to hard disk
 # save model to binary local file
-xgb.save(bst, "model.save")
+xgb.save(bst, "xgboost.model")
 # load binary model to R
-bst <- xgb.load("model.save")
+bst <- xgb.load("xgboost.model")
 pred <- predict(bst, test.x)
 # save model to text file
-xgb.dump(bst, "model.dump")
+xgb.dump(bst, "dump.raw.txt")
+# save model to text file, with feature map
+xgb.dump(bst, "dump.nice.txt", "featmap.txt")
 # save a DMatrix object to hard disk
-xgb.DMatrix.save(dtrain, "dtrain.save")
+xgb.DMatrix.save(dtrain, "dtrain.buffer")
 # load a DMatrix object to R
-dtrain <- xgb.DMatrix("dtrain.save")
+dtrain <- xgb.DMatrix("dtrain.buffer")
 ############################ More flexible training function xgb.train

View File

@@ -10,7 +10,7 @@ ifeq ($(no_omp),1)
 	PKG_CPPFLAGS += -DDISABLE_OPENMP
 endif
-CXXOBJ= xgboost_wrapper.o xgboost_io.o
+CXXOBJ= xgboost_wrapper.o xgboost_io.o xgboost_gbm.o xgboost_updater.o
 OBJECTS= xgboost_R.o $(CXXOBJ)
 .PHONY: all clean
@@ -18,7 +18,9 @@ all: $(SHLIB)
 $(SHLIB): $(OBJECTS)
 xgboost_wrapper.o: ../../wrapper/xgboost_wrapper.cpp
 xgboost_io.o: ../../src/io/io.cpp
+xgboost_gbm.o: ../../src/gbm/gbm.cpp
+xgboost_updater.o: ../../src/tree/updater.cpp
 $(CXXOBJ) :
 	$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

View File

@@ -15,7 +15,7 @@ ifeq ($(no_omp),1)
 	PKG_CPPFLAGS += -DDISABLE_OPENMP
 endif
-CXXOBJ= xgboost_wrapper.o xgboost_io.o
+CXXOBJ= xgboost_wrapper.o xgboost_io.o xgboost_gbm.o xgboost_updater.o
 OBJECTS= xgboost_R.o $(CXXOBJ)
 .PHONY: all clean
@@ -24,6 +24,8 @@ $(SHLIB): $(OBJECTS)
 xgboost_wrapper.o: ../../wrapper/xgboost_wrapper.cpp
 xgboost_io.o: ../../src/io/io.cpp
+xgboost_gbm.o: ../../src/gbm/gbm.cpp
+xgboost_updater.o: ../../src/tree/updater.cpp
 $(CXXOBJ) :
 	$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

View File

@@ -1,5 +1,6 @@
-# include xgboost library, must set chdir=TRURE
-source("../../wrapper/xgboost.R", chdir=TRUE)
+# install xgboost package, see R-package in root folder
+require(xgboost)
+require(methods)
 modelfile <- "higgs.model"
 outfile <- "higgs.pred.csv"

View File

@@ -1,5 +1,7 @@
-# include xgboost library, must set chdir=TRURE
-source("../../wrapper/xgboost.R", chdir=TRUE)
+# install xgboost package, see R-package in root folder
+require(xgboost)
+require(methods)
 testsize <- 550000
 dtrain <- read.csv("data/training.csv", header=TRUE)
@@ -12,7 +14,7 @@ sumwpos <- sum(weight * (label==1.0))
 sumwneg <- sum(weight * (label==0.0))
 print(paste("weight statistics: wpos=", sumwpos, "wneg=", sumwneg, "ratio=", sumwneg / sumwpos))
-xgmat <- xgb.DMatrix(data, info = list(label=label, weight=weight), missing = -999.0)
+xgmat <- xgb.DMatrix(data, label = label, weight = weight, missing = -999.0)
 param <- list("objective" = "binary:logitraw",
               "scale_pos_weight" = sumwneg / sumwpos,
               "bst:eta" = 0.1,

View File

@@ -13,10 +13,10 @@ Project Logical Layout
 File Naming Convention
 =======
-* The project is templatized, to make it easy to adjust input data structure.
 * .h files are data structures and interface, which are needed to use functions in that layer.
 * -inl.hpp files are implementations of interface, like cpp file in most project.
   - You only need to understand the interface file to understand the usage of that layer
+* In each folder, there can be a .cpp file, that compiles the module of that layer
 How to Hack the Code
 ======

View File

@@ -7,16 +7,8 @@
  */
 #include <cstdio>
 #include <vector>
-#include <limits>
-#include <climits>
-#include <cstring>
-#include <algorithm>
-#include "utils/io.h"
-#include "utils/omp.h"
 #include "utils/utils.h"
 #include "utils/iterator.h"
-#include "utils/random.h"
-#include "utils/matrix_csr.h"
 namespace xgboost {
 /*!
@@ -70,12 +62,12 @@ struct SparseBatch {
   /*! \brief an entry of sparse vector */
   struct Entry {
     /*! \brief feature index */
-    bst_uint findex;
+    bst_uint index;
     /*! \brief feature value */
     bst_float fvalue;
     // default constructor
     Entry(void) {}
-    Entry(bst_uint findex, bst_float fvalue) : findex(findex), fvalue(fvalue) {}
+    Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {}
     /*! \brief reversely compare feature values */
     inline static bool CmpValue(const Entry &a, const Entry &b) {
       return a.fvalue < b.fvalue;
@@ -86,7 +78,7 @@ struct SparseBatch {
     /*! \brief pointer to the elements*/
     const Entry *data;
     /*! \brief length of the instance */
-    const bst_uint length;
+    bst_uint length;
     /*! \brief constructor */
     Inst(const Entry *data, bst_uint length) : data(data), length(length) {}
     /*! \brief get i-th pair in the sparse vector*/
@@ -96,298 +88,72 @@ struct SparseBatch {
   };
   /*! \brief batch size */
   size_t size;
+};
+/*! \brief read-only row batch, used to access row continuously */
+struct RowBatch : public SparseBatch {
   /*! \brief the offset of rowid of this batch */
   size_t base_rowid;
   /*! \brief array[size+1], row pointer of each of the elements */
-  const size_t *row_ptr;
-  /*! \brief array[row_ptr.back()], content of the sparse element */
+  const size_t *ind_ptr;
+  /*! \brief array[ind_ptr.back()], content of the sparse element */
   const Entry *data_ptr;
   /*! \brief get i-th row from the batch */
   inline Inst operator[](size_t i) const {
-    return Inst(data_ptr + row_ptr[i], static_cast<bst_uint>(row_ptr[i+1] - row_ptr[i]));
+    return Inst(data_ptr + ind_ptr[i], static_cast<bst_uint>(ind_ptr[i+1] - ind_ptr[i]));
   }
 };
-/**
- * \brief This is a interface convention via template, defining the way to access features,
- *  column access rule is defined by template, for efficiency purpose,
- *  row access is defined by iterator of sparse batches
- * \tparam Derived type of actual implementation
- */
-template<typename Derived>
-class FMatrixInterface {
- public:
-  /*! \brief example iterator over one column */
-  struct ColIter{
-    /*!
-     * \brief move to next position
-     * \return whether there is element in next position
-     */
-    inline bool Next(void);
-    /*! \return row index of current position */
-    inline bst_uint rindex(void) const;
-    /*! \return feature value in current position */
-    inline bst_float fvalue(void) const;
-  };
-  /*! \brief backward iterator over column */
-  struct ColBackIter : public ColIter {};
- public:
-  // column access is needed by some of tree construction algorithms
-  /*!
-   * \brief get column iterator, the columns must be sorted by feature value
-   * \param cidx column index
-   * \return column iterator
-   */
-  inline ColIter GetSortedCol(size_t cidx) const;
-  /*!
-   * \brief get column backward iterator, starts from biggest fvalue, and iterator back
-   * \param cidx column index
-   * \return reverse column iterator
-   */
-  inline ColBackIter GetReverseSortedCol(size_t cidx) const;
-  /*!
-   * \brief get number of columns
-   * \return number of columns
-   */
-  inline size_t NumCol(void) const;
-  /*!
-   * \brief check if column access is supported, if not, initialize column access
-   * \param max_rows maximum number of rows allowed in constructor
-   */
-  inline void InitColAccess(void);
-  /*! \return whether column access is enabled */
-  inline bool HaveColAccess(void) const;
-  /*! \breif return #entries-in-col */
-  inline size_t GetColSize(size_t cidx) const;
-  /*!
-   * \breif return #entries-in-col / #rows
-   * \param cidx column index
-   *  this function is used to help speedup,
-   *  doese not necessarily implement it if not sure, return 0.0;
-   * \return column density
-   */
-  inline float GetColDensity(size_t cidx) const;
-  /*! \brief get the row iterator associated with FMatrix */
-  inline utils::IIterator<SparseBatch>* RowIterator(void) const;
-};
-/*!
- * \brief sparse matrix that support column access, CSC
- */
-class FMatrixS : public FMatrixInterface<FMatrixS>{
- public:
-  typedef SparseBatch::Entry Entry;
-  /*! \brief row iterator */
-  struct ColIter{
-    const Entry *dptr_, *end_;
-    ColIter(const Entry* begin, const Entry* end)
-        :dptr_(begin), end_(end) {}
-    inline bool Next(void) {
-      if (dptr_ == end_) {
-        return false;
-      } else {
-        ++dptr_; return true;
-      }
-    }
-    inline bst_uint rindex(void) const {
-      return dptr_->findex;
-    }
-    inline bst_float fvalue(void) const {
-      return dptr_->fvalue;
-    }
-  };
-  /*! \brief reverse column iterator */
-  struct ColBackIter : public ColIter {
-    ColBackIter(const Entry* dptr, const Entry* end) : ColIter(dptr, end) {}
-    // shadows ColIter::Next
-    inline bool Next(void) {
-      if (dptr_ == end_) {
-        return false;
-      } else {
-        --dptr_; return true;
-      }
-    }
-  };
-  /*! \brief constructor */
-  FMatrixS(void) {
-    iter_ = NULL;
-  }
-  // destructor
-  ~FMatrixS(void) {
-    if (iter_ != NULL) delete iter_;
-  }
-  /*! \return whether column access is enabled */
-  inline bool HaveColAccess(void) const {
-    return col_ptr_.size() != 0;
-  }
-  /*! \brief get number of colmuns */
-  inline size_t NumCol(void) const {
-    utils::Check(this->HaveColAccess(), "NumCol:need column access");
-    return col_ptr_.size() - 1;
-  }
-  /*! \brief get number of buffered rows */
-  inline const std::vector<bst_uint> buffered_rowset(void) const {
-    return buffered_rowset_;
-  }
-  /*! \brief get col sorted iterator */
-  inline ColIter GetSortedCol(size_t cidx) const {
-    utils::Assert(cidx < this->NumCol(), "col id exceed bound");
-    return ColIter(&col_data_[0] + col_ptr_[cidx] - 1,
-                   &col_data_[0] + col_ptr_[cidx + 1] - 1);
-  }
-  /*!
-   * \brief get reversed col iterator,
-   *  this function will be deprecated at some point
-   */
-  inline ColBackIter GetReverseSortedCol(size_t cidx) const {
-    utils::Assert(cidx < this->NumCol(), "col id exceed bound");
-    return ColBackIter(&col_data_[0] + col_ptr_[cidx + 1],
-                       &col_data_[0] + col_ptr_[cidx]);
-  }
-  /*! \brief get col size */
-  inline size_t GetColSize(size_t cidx) const {
-    return col_ptr_[cidx+1] - col_ptr_[cidx];
-  }
-  /*! \brief get column density */
-  inline float GetColDensity(size_t cidx) const {
-    size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
-    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
-  }
-  inline void InitColAccess(float pkeep = 1.0f) {
-    if (this->HaveColAccess()) return;
-    this->InitColData(pkeep);
-  }
-  /*!
-   * \brief get the row iterator associated with FMatrix
-   *  this function is not threadsafe, returns iterator stored in FMatrixS
-   */
-  inline utils::IIterator<SparseBatch>* RowIterator(void) const {
-    iter_->BeforeFirst();
-    return iter_;
-  }
-  /*! \brief set iterator */
-  inline void set_iter(utils::IIterator<SparseBatch> *iter) {
-    this->iter_ = iter;
-  }
-  /*!
-   * \brief save column access data into stream
-   * \param fo output stream to save to
-   */
-  inline void SaveColAccess(utils::IStream &fo) const {
-    fo.Write(buffered_rowset_);
-    if (buffered_rowset_.size() != 0) {
-      SaveBinary(fo, col_ptr_, col_data_);
-    }
-  }
-  /*!
-   * \brief load column access data from stream
-   * \param fo output stream to load from
-   */
-  inline void LoadColAccess(utils::IStream &fi) {
-    utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
-    if (buffered_rowset_.size() != 0) {
-      LoadBinary(fi, &col_ptr_, &col_data_);
-    }
-  }
-  /*!
-   * \brief save data to binary stream
-   * \param fo output stream
-   * \param ptr pointer data
-   * \param data data content
-   */
-  inline static void SaveBinary(utils::IStream &fo,
-                                const std::vector<size_t> &ptr,
-                                const std::vector<SparseBatch::Entry> &data) {
-    size_t nrow = ptr.size() - 1;
-    fo.Write(&nrow, sizeof(size_t));
-    fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
-    if (data.size() != 0) {
-      fo.Write(&data[0], data.size() * sizeof(SparseBatch::Entry));
-    }
-  }
-  /*!
-   * \brief load data from binary stream
-   * \param fi input stream
-   * \param out_ptr pointer data
-   * \param out_data data content
-   */
-  inline static void LoadBinary(utils::IStream &fi,
-                                std::vector<size_t> *out_ptr,
-                                std::vector<SparseBatch::Entry> *out_data) {
-    size_t nrow;
-    utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
-    out_ptr->resize(nrow + 1);
-    utils::Check(fi.Read(&(*out_ptr)[0], out_ptr->size() * sizeof(size_t)) != 0,
-                 "invalid input file format");
-    out_data->resize(out_ptr->back());
-    if (out_data->size() != 0) {
-      utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(SparseBatch::Entry)) != 0,
-                    "invalid input file format");
-    }
-  }
- protected:
-  /*!
-   * \brief intialize column data
-   * \param pkeep probability to keep a row
-   */
-  inline void InitColData(float pkeep) {
-    buffered_rowset_.clear();
-    // note: this part of code is serial, todo, parallelize this transformer
-    utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_);
-    builder.InitBudget(0);
-    // start working
-    iter_->BeforeFirst();
-    while (iter_->Next()) {
-      const SparseBatch &batch = iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
-          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
-          SparseBatch::Inst inst = batch[i];
-          for (bst_uint j = 0; j < inst.length; ++j) {
-            builder.AddBudget(inst[j].findex);
-          }
-        }
-      }
-    }
-    builder.InitStorage();
-    iter_->BeforeFirst();
-    size_t ktop = 0;
-    while (iter_->Next()) {
-      const SparseBatch &batch = iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        if (ktop < buffered_rowset_.size() &&
-            buffered_rowset_[ktop] == batch.base_rowid+i) {
-          ++ktop;
-          SparseBatch::Inst inst = batch[i];
-          for (bst_uint j = 0; j < inst.length; ++j) {
-            builder.PushElem(inst[j].findex,
-                             Entry((bst_uint)(batch.base_rowid+i),
-                                   inst[j].fvalue));
-          }
-        }
-      }
-    }
-    // sort columns
-    bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ncol; ++i) {
-      std::sort(&col_data_[0] + col_ptr_[i],
-                &col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
-    }
-  }
- private:
-  // --- data structure used to support InitColAccess --
-  utils::IIterator<SparseBatch> *iter_;
-  /*! \brief list of row index that are buffered */
-  std::vector<bst_uint> buffered_rowset_;
-  /*! \brief column pointer of CSC format */
-  std::vector<size_t> col_ptr_;
-  /*! \brief column datas in CSC format */
-  std::vector<SparseBatch::Entry> col_data_;
-};
+/*!
+ * \brief read-only column batch, used to access columns,
+ *  the columns are not required to be continuous
+ */
+struct ColBatch : public SparseBatch {
+  /*! \brief column index of each columns in the data */
+  const bst_uint *col_index;
+  /*! \brief pointer to the column data */
+  const Inst *col_data;
+  /*! \brief get i-th row from the batch */
+  inline Inst operator[](size_t i) const {
+    return col_data[i];
+  }
+};
+/**
+ * \brief interface of feature matrix, needed for tree construction
+ *  this interface defines two way to access features,
+ *  row access is defined by iterator of RowBatch
+ *  col access is optional, checked by HaveColAccess, and defined by iterator of ColBatch
+ */
+class IFMatrix {
+ public:
+  // the interface only need to ganrantee row iter
+  // column iter is active, when ColIterator is called, row_iter can be disabled
+  /*! \brief get the row iterator associated with FMatrix */
+  virtual utils::IIterator<RowBatch> *RowIterator(void) = 0;
+  /*!\brief get column iterator */
+  virtual utils::IIterator<ColBatch> *ColIterator(void) = 0;
+  /*!
+   * \brief get the column iterator associated with FMatrix with subset of column features
+   * \param fset is the list of column index set that must be contained in the returning Column iterator
+   * \return the column iterator, initialized so that it reads the elements in fset
+   */
+  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) = 0;
+  /*!
+   * \brief check if column access is supported, if not, initialize column access
+   * \param subsample subsample ratio when generating column access
+   */
+  virtual void InitColAccess(float subsample) = 0;
+  // the following are column meta data, should be able to answer them fast
+  /*! \return whether column access is enabled */
+  virtual bool HaveColAccess(void) const = 0;
+  /*! \return number of columns in the FMatrix */
+  virtual size_t NumCol(void) const = 0;
+  /*! \brief get number of non-missing entries in column */
+  virtual size_t GetColSize(size_t cidx) const = 0;
+  /*! \brief get column density */
+  virtual float GetColDensity(size_t cidx) const = 0;
+  /*! \brief reference of buffered rowset */
+  virtual const std::vector<bst_uint> &buffered_rowset(void) const = 0;
+  // virtual destructor
+  virtual ~IFMatrix(void){}
+};
 } // namespace xgboost
 #endif // XGBOOST_DATA_H
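This change is the heart of the unity refactor: the templated FMatrixInterface gives way to the virtual IFMatrix, with row access through RowBatch and optional column access through ColBatch. A minimal sketch of a consumer written against only what this diff shows (the function SumValues itself is hypothetical, not part of the commit):

    #include "src/data.h"   // repository-relative path, an assumption

    double SumValues(xgboost::IFMatrix *fmat) {
      double total = 0.0;
      // the iterator is owned by the matrix, not by the caller
      xgboost::utils::IIterator<xgboost::RowBatch> *iter = fmat->RowIterator();
      iter->BeforeFirst();
      while (iter->Next()) {
        const xgboost::RowBatch &batch = iter->Value();
        for (size_t i = 0; i < batch.size; ++i) {
          xgboost::RowBatch::Inst inst = batch[i];   // sparse view of one row
          for (xgboost::bst_uint j = 0; j < inst.length; ++j) {
            total += inst[j].fvalue;                 // Entry::index / Entry::fvalue
          }
        }
      }
      return total;
    }

The same loop shape works for ColBatch, which is why the tree and linear updaters below no longer need a template parameter.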

View File

@@ -18,8 +18,7 @@ namespace gbm {
  * \brief gradient boosted linear model
  * \tparam FMatrix the data type updater taking
  */
-template<typename FMatrix>
-class GBLinear : public IGradBooster<FMatrix> {
+class GBLinear : public IGradBooster {
  public:
   virtual ~GBLinear(void) {
   }
@@ -41,13 +40,12 @@ class GBLinear : public IGradBooster<FMatrix> {
   virtual void InitModel(void) {
     model.InitModel();
   }
-  virtual void DoBoost(const FMatrix &fmat,
+  virtual void DoBoost(IFMatrix *p_fmat,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) {
-    this->InitFeatIndex(fmat);
     std::vector<bst_gpair> &gpair = *in_gpair;
     const int ngroup = model.param.num_output_group;
-    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
     // for all the output group
     for (int gid = 0; gid < ngroup; ++gid) {
       double sum_grad = 0.0, sum_hess = 0.0;
@@ -72,45 +70,49 @@ class GBLinear : public IGradBooster<FMatrix> {
       }
     }
   }
-    // number of features
-    const bst_omp_uint nfeat = static_cast<bst_omp_uint>(feat_index.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < nfeat; ++i) {
-      const bst_uint fid = feat_index[i];
-      for (int gid = 0; gid < ngroup; ++gid) {
-        double sum_grad = 0.0, sum_hess = 0.0;
-        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
-          const float v = it.fvalue();
-          bst_gpair &p = gpair[it.rindex() * ngroup + gid];
-          if (p.hess < 0.0f) continue;
-          sum_grad += p.grad * v;
-          sum_hess += p.hess * v * v;
-        }
-        float &w = model[fid][gid];
-        bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
-        w += dw;
-        // update grad value
-        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
-          bst_gpair &p = gpair[it.rindex() * ngroup + gid];
-          if (p.hess < 0.0f) continue;
-          p.grad += p.hess * it.fvalue() * dw;
-        }
-      }
-    }
-  }
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    while (iter->Next()) {
+      // number of features
+      const ColBatch &batch = iter->Value();
+      const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nfeat; ++i) {
+        const bst_uint fid = batch.col_index[i];
+        ColBatch::Inst col = batch[i];
+        for (int gid = 0; gid < ngroup; ++gid) {
+          double sum_grad = 0.0, sum_hess = 0.0;
+          for (bst_uint j = 0; j < col.length; ++j) {
+            const float v = col[j].fvalue;
+            bst_gpair &p = gpair[col[j].index * ngroup + gid];
+            if (p.hess < 0.0f) continue;
+            sum_grad += p.grad * v;
+            sum_hess += p.hess * v * v;
+          }
+          float &w = model[fid][gid];
+          bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
+          w += dw;
+          // update grad value
+          for (bst_uint j = 0; j < col.length; ++j) {
+            bst_gpair &p = gpair[col[j].index * ngroup + gid];
+            if (p.hess < 0.0f) continue;
+            p.grad += p.hess * col[j].fvalue * dw;
+          }
+        }
+      }
+    }
+  }
-  virtual void Predict(const FMatrix &fmat,
+  virtual void Predict(IFMatrix *p_fmat,
                        int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<float> *out_preds) {
     std::vector<float> &preds = *out_preds;
     preds.resize(0);
     // start collecting the prediction
-    utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+    iter->BeforeFirst();
     const int ngroup = model.param.num_output_group;
     while (iter->Next()) {
-      const SparseBatch &batch = iter->Value();
+      const RowBatch &batch = iter->Value();
       utils::Assert(batch.base_rowid * ngroup == preds.size(),
                     "base_rowid is not set correctly");
       // output convention: nrow * k, where nrow is number of rows
@@ -134,23 +136,11 @@ class GBLinear : public IGradBooster<FMatrix> {
   }
  protected:
-  inline void InitFeatIndex(const FMatrix &fmat) {
-    if (feat_index.size() != 0) return;
-    // initialize feature index
-    unsigned ncol = static_cast<unsigned>(fmat.NumCol());
-    feat_index.reserve(ncol);
-    for (unsigned i = 0; i < ncol; ++i) {
-      if (fmat.GetColSize(i) != 0) {
-        feat_index.push_back(i);
-      }
-    }
-    random::Shuffle(feat_index);
-  }
-  inline void Pred(const SparseBatch::Inst &inst, float *preds) {
+  inline void Pred(const RowBatch::Inst &inst, float *preds) {
     for (int gid = 0; gid < model.param.num_output_group; ++gid) {
       float psum = model.bias()[gid];
       for (bst_uint i = 0; i < inst.length; ++i) {
-        psum += inst[i].fvalue * model[inst[i].findex][gid];
+        psum += inst[i].fvalue * model[inst[i].index][gid];
       }
       preds[gid] = psum;
     }
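The rewritten DoBoost is a coordinate-descent step per feature: accumulate first- and second-order gradient sums along a column, move the weight by param.CalcDelta, then patch the stored gradients so later features see the updated model. CalcDelta is not shown in this diff, so the sketch below substitutes an assumed plain second-order step with an L2 term `reg_lambda` just to make the shape of the update concrete:

    #include <vector>

    struct GradPair { float grad, hess; };

    // One coordinate update over a single sparse column; returns the new weight.
    float CoordinateStep(const std::vector<float> &values,    // column fvalues
                         const std::vector<unsigned> &rows,   // matching row ids
                         std::vector<GradPair> *gpair,
                         float w, float eta, float reg_lambda) {
      double sum_grad = 0.0, sum_hess = 0.0;
      for (size_t j = 0; j < values.size(); ++j) {
        const GradPair &p = (*gpair)[rows[j]];
        if (p.hess < 0.0f) continue;           // skip deleted entries, as above
        sum_grad += p.grad * values[j];
        sum_hess += p.hess * values[j] * values[j];
      }
      // assumed form of CalcDelta: Newton step with L2 regularization
      float dw = static_cast<float>(eta * (-sum_grad / (sum_hess + reg_lambda)));
      // keep stored gradients consistent with the moved weight
      for (size_t j = 0; j < values.size(); ++j) {
        GradPair &p = (*gpair)[rows[j]];
        if (p.hess < 0.0f) continue;
        p.grad += p.hess * values[j] * dw;
      }
      return w + dw;
    }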

src/gbm/gbm.cpp Normal file
View File

@ -0,0 +1,18 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <cstring>
#include "./gbm.h"
#include "./gbtree-inl.hpp"
#include "./gblinear-inl.hpp"
namespace xgboost {
namespace gbm {
IGradBooster* CreateGradBooster(const char *name) {
if (!strcmp("gbtree", name)) return new GBTree();
if (!strcmp("gblinear", name)) return new GBLinear();
utils::Error("unknown booster type: %s", name);
return NULL;
}
} // namespace gbm
} // namespace xgboost
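With gbm.cpp compiled as its own object file, callers now obtain boosters through this factory instead of instantiating a template. A hedged usage sketch (the surrounding function and the include path are illustrative; InitModel and the virtual DoBoost/Predict are the interface members shown in this commit):

    #include "src/gbm/gbm.h"   // repository-relative path, an assumption

    void MakeBooster() {
      xgboost::gbm::IGradBooster *bst = xgboost::gbm::CreateGradBooster("gbtree");
      bst->InitModel();
      // ... DoBoost with an IFMatrix*, then Predict ...
      delete bst;   // the factory transfers ownership to the caller
    }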

View File

@@ -7,6 +7,7 @@
  */
 #include <vector>
 #include "../data.h"
+#include "../utils/io.h"
 #include "../utils/fmap.h"
 namespace xgboost {
@@ -14,9 +15,7 @@ namespace xgboost {
 namespace gbm {
 /*!
  * \brief interface of gradient boosting model
- * \tparam FMatrix the data type updater taking
  */
-template<typename FMatrix>
 class IGradBooster {
  public:
   /*!
@@ -41,17 +40,17 @@ class IGradBooster {
   virtual void InitModel(void) = 0;
   /*!
    * \brief peform update to the model(boosting)
-   * \param fmat feature matrix that provide access to features
+   * \param p_fmat feature matrix that provide access to features
    * \param info meta information about training
    * \param in_gpair address of the gradient pair statistics of the data
    *        the booster may change content of gpair
    */
-  virtual void DoBoost(const FMatrix &fmat,
+  virtual void DoBoost(IFMatrix *p_fmat,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) = 0;
   /*!
    * \brief generate predictions for given feature matrix
-   * \param fmat feature matrix
+   * \param p_fmat feature matrix
    * \param buffer_offset buffer index offset of these instances, if equals -1
    *        this means we do not have buffer index allocated to the gbm
    *        a buffer index is assigned to each instance that requires repeative prediction
@@ -59,7 +58,7 @@ class IGradBooster {
    * \param info extra side information that may be needed for prediction
    * \param out_preds output vector to hold the predictions
    */
-  virtual void Predict(const FMatrix &fmat,
+  virtual void Predict(IFMatrix *p_fmat,
                        int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<float> *out_preds) = 0;
@@ -73,21 +72,11 @@ class IGradBooster {
   // destrcutor
   virtual ~IGradBooster(void){}
 };
-} // namespace gbm
-} // namespace xgboost
-
-#include "gbtree-inl.hpp"
-#include "gblinear-inl.hpp"
-
-namespace xgboost {
-namespace gbm {
-template<typename FMatrix>
-inline IGradBooster<FMatrix>* CreateGradBooster(const char *name) {
-  if (!strcmp("gbtree", name)) return new GBTree<FMatrix>();
-  if (!strcmp("gblinear", name)) return new GBLinear<FMatrix>();
-  utils::Error("unknown booster type: %s", name);
-  return NULL;
-}
+/*!
+ * \breif create a gradient booster from given name
+ * \param name name of gradient booster
+ */
+IGradBooster* CreateGradBooster(const char *name);
 } // namespace gbm
 } // namespace xgboost
 #endif // XGBOOST_GBM_GBM_H_

View File

@@ -9,16 +9,15 @@
 #include <utility>
 #include <string>
 #include "./gbm.h"
+#include "../utils/omp.h"
 #include "../tree/updater.h"
 namespace xgboost {
 namespace gbm {
 /*!
  * \brief gradient boosted tree
- * \tparam FMatrix the data type updater taking
  */
-template<typename FMatrix>
-class GBTree : public IGradBooster<FMatrix> {
+class GBTree : public IGradBooster {
  public:
   virtual ~GBTree(void) {
     this->Clear();
@@ -82,12 +81,12 @@ class GBTree : public IGradBooster<FMatrix> {
     utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
     utils::Assert(trees.size() == 0, "GBTree: model already initialized");
   }
-  virtual void DoBoost(const FMatrix &fmat,
+  virtual void DoBoost(IFMatrix *p_fmat,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) {
     const std::vector<bst_gpair> &gpair = *in_gpair;
     if (mparam.num_output_group == 1) {
-      this->BoostNewTrees(gpair, fmat, info, 0);
+      this->BoostNewTrees(gpair, p_fmat, info, 0);
     } else {
       const int ngroup = mparam.num_output_group;
       utils::Check(gpair.size() % ngroup == 0,
@@ -99,11 +98,11 @@ class GBTree : public IGradBooster<FMatrix> {
       for (bst_omp_uint i = 0; i < nsize; ++i) {
         tmp[i] = gpair[i * ngroup + gid];
       }
-      this->BoostNewTrees(tmp, fmat, info, gid);
+      this->BoostNewTrees(tmp, p_fmat, info, gid);
       }
     }
   }
-  virtual void Predict(const FMatrix &fmat,
+  virtual void Predict(IFMatrix *p_fmat,
                        int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<float> *out_preds) {
@@ -118,17 +117,13 @@ class GBTree : public IGradBooster<FMatrix> {
     }
     std::vector<float> &preds = *out_preds;
-    preds.resize(0);
+    const size_t stride = info.num_row * mparam.num_output_group;
+    preds.resize(stride * (mparam.size_leaf_vector+1));
     // start collecting the prediction
-    utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
-      const SparseBatch &batch = iter->Value();
-      utils::Assert(batch.base_rowid * mparam.num_output_group == preds.size(),
-                    "base_rowid is not set correctly");
-      // output convention: nrow * k, where nrow is number of rows
-      // k is number of group
-      preds.resize(preds.size() + batch.size * mparam.num_output_group);
+      const RowBatch &batch = iter->Value();
       // parallel over local batch
       const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
 #pragma omp parallel for schedule(static)
@@ -136,13 +131,13 @@ class GBTree : public IGradBooster<FMatrix> {
       const int tid = omp_get_thread_num();
       tree::RegTree::FVec &feats = thread_temp[tid];
       int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
-      const unsigned root_idx = info.GetRoot(ridx);
+      utils::Assert(static_cast<size_t>(ridx) < info.num_row, "data row index exceed bound");
       // loop over output groups
       for (int gid = 0; gid < mparam.num_output_group; ++gid) {
-        preds[ridx * mparam.num_output_group + gid] =
-            this->Pred(batch[i],
-                       buffer_offset < 0 ? -1 : buffer_offset+ridx,
-                       gid, root_idx, &feats);
+        this->Pred(batch[i],
+                   buffer_offset < 0 ? -1 : buffer_offset + ridx,
+                   gid, info.GetRoot(ridx), &feats,
+                   &preds[ridx * mparam.num_output_group + gid], stride);
       }
     }
   }
@@ -176,7 +171,7 @@ class GBTree : public IGradBooster<FMatrix> {
     char *pstr;
     pstr = strtok(&tval[0], ",");
     while (pstr != NULL) {
-      updaters.push_back(tree::CreateUpdater<FMatrix>(pstr));
+      updaters.push_back(tree::CreateUpdater(pstr));
       for (size_t j = 0; j < cfg.size(); ++j) {
         // set parameters
         updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
@@ -187,7 +182,7 @@ class GBTree : public IGradBooster<FMatrix> {
   }
   // do group specific group
   inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
-                            const FMatrix &fmat,
+                            IFMatrix *p_fmat,
                             const BoosterInfo &info,
                             int bst_group) {
     this->InitUpdater();
@@ -202,7 +197,7 @@ class GBTree : public IGradBooster<FMatrix> {
     }
     // update the trees
     for (size_t i = 0; i < updaters.size(); ++i) {
-      updaters[i]->Update(gpair, fmat, info, new_trees);
+      updaters[i]->Update(gpair, p_fmat, info, new_trees);
     }
     // push back to model
     for (size_t i = 0; i < new_trees.size(); ++i) {
@@ -212,24 +207,34 @@ class GBTree : public IGradBooster<FMatrix> {
     mparam.num_trees += tparam.num_parallel_tree;
   }
   // make a prediction for a single instance
-  inline float Pred(const SparseBatch::Inst &inst,
+  inline void Pred(const RowBatch::Inst &inst,
                    int64_t buffer_index,
                    int bst_group,
                    unsigned root_index,
-                   tree::RegTree::FVec *p_feats) {
+                   tree::RegTree::FVec *p_feats,
+                   float *out_pred, size_t stride) {
     size_t itop = 0;
     float psum = 0.0f;
+    // sum of leaf vector
+    std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
    const int64_t bid = mparam.BufferOffset(buffer_index, bst_group);
     // load buffered results if any
     if (bid >= 0) {
       itop = pred_counter[bid];
       psum = pred_buffer[bid];
+      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+        vec_psum[i] = pred_buffer[bid + i + 1];
+      }
     }
     if (itop != trees.size()) {
       p_feats->Fill(inst);
       for (size_t i = itop; i < trees.size(); ++i) {
         if (tree_info[i] == bst_group) {
-          psum += trees[i]->Predict(*p_feats, root_index);
+          int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
+          psum += (*trees[i])[tid].leaf_value();
+          for (int j = 0; j < mparam.size_leaf_vector; ++j) {
+            vec_psum[j] += trees[i]->leafvec(tid)[j];
+          }
         }
       }
       p_feats->Drop(inst);
@@ -238,8 +243,14 @@ class GBTree : public IGradBooster<FMatrix> {
     if (bid >= 0) {
       pred_counter[bid] = static_cast<unsigned>(trees.size());
       pred_buffer[bid] = psum;
+      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+        pred_buffer[bid + i + 1] = vec_psum[i];
+      }
     }
-    return psum;
+    out_pred[0] = psum;
+    for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+      out_pred[stride * (i + 1)] = vec_psum[i];
+    }
   }
   // --- data structure ---
   /*! \brief training parameters */
@@ -292,14 +303,17 @@ class GBTree : public IGradBooster<FMatrix> {
    * suppose we have n instance and k group, output will be k*n
    */
   int num_output_group;
+  /*! \brief size of leaf vector needed in tree */
+  int size_leaf_vector;
   /*! \brief reserved parameters */
-  int reserved[32];
+  int reserved[31];
   /*! \brief constructor */
   ModelParam(void) {
     num_trees = 0;
     num_roots = num_feature = 0;
     num_pbuffer = 0;
     num_output_group = 1;
+    size_leaf_vector = 0;
     memset(reserved, 0, sizeof(reserved));
   }
   /*!
@@ -312,10 +326,11 @@ class GBTree : public IGradBooster<FMatrix> {
   if (!strcmp("num_output_group", name)) num_output_group = atol(val);
   if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
   if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
+  if (!strcmp("bst:size_leaf_vector", name)) size_leaf_vector = atoi(val);
   }
   /*! \return size of prediction buffer actually needed */
   inline size_t PredBufferSize(void) const {
-    return num_output_group * num_pbuffer;
+    return num_output_group * num_pbuffer * (size_leaf_vector + 1);
   }
   /*!
    * \brief get the buffer offset given a buffer index and group id
@@ -324,7 +339,7 @@ class GBTree : public IGradBooster<FMatrix> {
   inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const {
     if (buffer_index < 0) return -1;
     utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
-    return buffer_index + num_pbuffer * bst_group;
+    return (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
   }
 };
 // training parameter
@@ -345,7 +360,7 @@ class GBTree : public IGradBooster<FMatrix> {
   // temporal storage for per thread
   std::vector<tree::RegTree::FVec> thread_temp;
   // the updaters that can be applied to each of tree
-  std::vector< tree::IUpdater<FMatrix>* > updaters;
+  std::vector<tree::IUpdater*> updaters;
 };
 } // namespace gbm
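The new size_leaf_vector parameter widens every prediction-buffer slot from one float to (size_leaf_vector + 1) floats, which is why PredBufferSize and BufferOffset gain the same factor. A worked check of that arithmetic with made-up sizes (it mirrors the formulas in the diff above; the numbers themselves are illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      const int64_t num_pbuffer = 100;   // buffered instances
      const int num_output_group = 3;    // k output groups
      const int size_leaf_vector = 2;    // extra floats per prediction

      // total floats: groups * instances * (1 + leaf vector)
      const int64_t buf_size =
          num_output_group * num_pbuffer * (size_leaf_vector + 1);
      assert(buf_size == 900);

      // slot of instance 5 in group 1: strided by group, then widened
      const int64_t buffer_index = 5, bst_group = 1;
      const int64_t bid =
          (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
      assert(bid == 315);   // psum at [bid], leaf vector at [bid+1], [bid+2]
      return 0;
    }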

View File

@@ -13,7 +13,7 @@ namespace xgboost {
 /*! \brief namespace related to data format */
 namespace io {
 /*! \brief DMatrix object that I/O module support save/load */
-typedef learner::DMatrix<FMatrixS> DataMatrix;
+typedef learner::DMatrix DataMatrix;
 /*!
  * \brief load DataMatrix from stream
  * \param fname file name to be loaded

View File

@@ -16,6 +16,7 @@
 #include "../utils/utils.h"
 #include "../learner/dmatrix.h"
 #include "./io.h"
+#include "./simple_fmatrix-inl.hpp"
 namespace xgboost {
 namespace io {
@@ -24,11 +25,16 @@ class DMatrixSimple : public DataMatrix {
  public:
   // constructor
   DMatrixSimple(void) : DataMatrix(kMagic) {
-    this->fmat.set_iter(new OneBatchIter(this));
+    fmat_ = new FMatrixS(new OneBatchIter(this));
     this->Clear();
   }
   // virtual destructor
-  virtual ~DMatrixSimple(void) {}
+  virtual ~DMatrixSimple(void) {
+    delete fmat_;
+  }
+  virtual IFMatrix *fmat(void) const {
+    return fmat_;
+  }
   /*! \brief clear the storage */
   inline void Clear(void) {
     row_ptr_.clear();
@@ -41,15 +47,15 @@ class DMatrixSimple : public DataMatrix {
     this->info = src.info;
     this->Clear();
     // clone data content in thos matrix
-    utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
-      const SparseBatch &batch = iter->Value();
+      const RowBatch &batch = iter->Value();
       for (size_t i = 0; i < batch.size; ++i) {
-        SparseBatch::Inst inst = batch[i];
+        RowBatch::Inst inst = batch[i];
         row_data_.resize(row_data_.size() + inst.length);
         memcpy(&row_data_[row_ptr_.back()], inst.data,
-               sizeof(SparseBatch::Entry) * inst.length);
+               sizeof(RowBatch::Entry) * inst.length);
         row_ptr_.push_back(row_ptr_.back() + inst.length);
       }
     }
@@ -59,10 +65,10 @@ class DMatrixSimple : public DataMatrix {
    * \param feats features
    * \return the index of added row
    */
-  inline size_t AddRow(const std::vector<SparseBatch::Entry> &feats) {
+  inline size_t AddRow(const std::vector<RowBatch::Entry> &feats) {
     for (size_t i = 0; i < feats.size(); ++i) {
       row_data_.push_back(feats[i]);
-      info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].findex+1));
+      info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].index+1));
     }
     row_ptr_.push_back(row_ptr_.back() + feats.size());
     info.info.num_row += 1;
@@ -78,10 +84,10 @@ class DMatrixSimple : public DataMatrix {
     FILE* file = utils::FopenCheck(fname, "r");
     float label; bool init = true;
     char tmp[1024];
-    std::vector<SparseBatch::Entry> feats;
+    std::vector<RowBatch::Entry> feats;
     while (fscanf(file, "%s", tmp) == 1) {
-      SparseBatch::Entry e;
-      if (sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) {
+      RowBatch::Entry e;
+      if (sscanf(tmp, "%u:%f", &e.index, &e.fvalue) == 2) {
         feats.push_back(e);
       } else {
         if (!init) {
@@ -145,7 +151,7 @@ class DMatrixSimple : public DataMatrix {
     info.LoadBinary(fs);
     FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);
-    fmat.LoadColAccess(fs);
+    fmat_->LoadColAccess(fs);
     if (!silent) {
       printf("%lux%lu matrix with %lu entries is loaded",
@@ -172,7 +178,7 @@ class DMatrixSimple : public DataMatrix {
     info.SaveBinary(fs);
     FMatrixS::SaveBinary(fs, row_ptr_, row_data_);
-    fmat.SaveColAccess(fs);
+    fmat_->SaveColAccess(fs);
     fs.Close();
     if (!silent) {
@@ -211,13 +217,15 @@ class DMatrixSimple : public DataMatrix {
   /*! \brief row pointer of CSR sparse storage */
   std::vector<size_t> row_ptr_;
   /*! \brief data in the row */
-  std::vector<SparseBatch::Entry> row_data_;
+  std::vector<RowBatch::Entry> row_data_;
+  /*! \brief the real fmatrix */
+  FMatrixS *fmat_;
   /*! \brief magic number used to identify DMatrix */
   static const int kMagic = 0xffffab01;
  protected:
   // one batch iterator that return content in the matrix
-  struct OneBatchIter: utils::IIterator<SparseBatch> {
+  struct OneBatchIter: utils::IIterator<RowBatch> {
     explicit OneBatchIter(DMatrixSimple *parent)
         : at_first_(true), parent_(parent) {}
     virtual ~OneBatchIter(void) {}
@@ -229,11 +237,11 @@ class DMatrixSimple : public DataMatrix {
       at_first_ = false;
       batch_.size = parent_->row_ptr_.size() - 1;
       batch_.base_rowid = 0;
-      batch_.row_ptr = &parent_->row_ptr_[0];
+      batch_.ind_ptr = &parent_->row_ptr_[0];
       batch_.data_ptr = &parent_->row_data_[0];
       return true;
     }
-    virtual const SparseBatch &Value(void) const {
+    virtual const RowBatch &Value(void) const {
       return batch_;
     }
@@ -243,8 +251,8 @@ class DMatrixSimple : public DataMatrix {
     // pointer to parient
     DMatrixSimple *parent_;
     // temporal space for batch
-    SparseBatch batch_;
+    RowBatch batch_;
   };
 };
 } // namespace io
 } // namespace xgboost
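DMatrixSimple keeps rows in CSR form: row_ptr_ always has one more entry than the number of rows, and AddRow appends the entries then pushes the new end offset. A standalone sketch of that invariant (Entry here is a stand-in for RowBatch::Entry, and the rows are made up):

    #include <cstdio>
    #include <vector>

    struct Entry { unsigned index; float fvalue; };

    int main() {
      std::vector<size_t> row_ptr(1, 0);   // starts as {0}, like DMatrixSimple
      std::vector<Entry> data;
      const Entry r0[] = {{0, 1.0f}, {3, 2.5f}};
      const Entry r1[] = {{1, 0.5f}};
      // emulate AddRow for two rows: append entries, then the end offset
      data.insert(data.end(), r0, r0 + 2); row_ptr.push_back(row_ptr.back() + 2);
      data.insert(data.end(), r1, r1 + 1); row_ptr.push_back(row_ptr.back() + 1);
      // row i's entries live in data[row_ptr[i] .. row_ptr[i+1])
      for (size_t i = 0; i + 1 < row_ptr.size(); ++i) {
        for (size_t k = row_ptr[i]; k < row_ptr[i + 1]; ++k) {
          std::printf("row %zu: %u:%g\n", i, data[k].index, data[k].fvalue);
        }
      }
      return 0;
    }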

View File

@ -0,0 +1,242 @@
#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
/*!
* \file simple_fmatrix-inl.hpp
* \brief the input data structure for gradient boosting
* \author Tianqi Chen
*/
#include "../data.h"
#include "../utils/utils.h"
#include "../utils/random.h"
#include "../utils/omp.h"
#include "../utils/matrix_csr.h"
namespace xgboost {
namespace io {
/*!
* \brief sparse matrix that support column access, CSC
*/
class FMatrixS : public IFMatrix{
public:
typedef SparseBatch::Entry Entry;
/*! \brief constructor */
FMatrixS(utils::IIterator<RowBatch> *iter) {
this->iter_ = iter;
}
// destructor
virtual ~FMatrixS(void) {
if (iter_ != NULL) delete iter_;
}
/*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const {
return col_ptr_.size() != 0;
}
/*! \brief get number of colmuns */
virtual size_t NumCol(void) const {
utils::Check(this->HaveColAccess(), "NumCol:need column access");
return col_ptr_.size() - 1;
}
/*! \brief get number of buffered rows */
virtual const std::vector<bst_uint> &buffered_rowset(void) const {
return buffered_rowset_;
}
/*! \brief get column size */
virtual size_t GetColSize(size_t cidx) const {
return col_ptr_[cidx+1] - col_ptr_[cidx];
}
/*! \brief get column density */
virtual float GetColDensity(size_t cidx) const {
size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
}
virtual void InitColAccess(float pkeep = 1.0f) {
if (this->HaveColAccess()) return;
this->InitColData(pkeep);
}
/*!
* \brief get the row iterator associated with FMatrix
*/
virtual utils::IIterator<RowBatch>* RowIterator(void) {
iter_->BeforeFirst();
return iter_;
}
/*!
* \brief get the column based iterator
*/
virtual utils::IIterator<ColBatch>* ColIterator(void) {
size_t ncol = this->NumCol();
col_iter_.col_index_.resize(ncol);
for (size_t i = 0; i < ncol; ++i) {
col_iter_.col_index_[i] = static_cast<bst_uint>(i);
}
col_iter_.SetBatch(col_ptr_, col_data_);
return &col_iter_;
}
/*!
* \brief colmun based iterator
*/
virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
col_iter_.col_index_ = fset;
col_iter_.SetBatch(col_ptr_, col_data_);
return &col_iter_;
}
/*!
* \brief save column access data into stream
* \param fo output stream to save to
*/
inline void SaveColAccess(utils::IStream &fo) const {
fo.Write(buffered_rowset_);
if (buffered_rowset_.size() != 0) {
SaveBinary(fo, col_ptr_, col_data_);
}
}
/*!
* \brief load column access data from stream
* \param fo output stream to load from
*/
inline void LoadColAccess(utils::IStream &fi) {
utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
if (buffered_rowset_.size() != 0) {
LoadBinary(fi, &col_ptr_, &col_data_);
}
}
/*!
* \brief save data to binary stream
* \param fo output stream
* \param ptr pointer data
* \param data data content
*/
inline static void SaveBinary(utils::IStream &fo,
const std::vector<size_t> &ptr,
const std::vector<RowBatch::Entry> &data) {
size_t nrow = ptr.size() - 1;
fo.Write(&nrow, sizeof(size_t));
fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
if (data.size() != 0) {
fo.Write(&data[0], data.size() * sizeof(RowBatch::Entry));
}
}
/*!
* \brief load data from binary stream
* \param fi input stream
* \param out_ptr pointer data
* \param out_data data content
*/
inline static void LoadBinary(utils::IStream &fi,
std::vector<size_t> *out_ptr,
std::vector<RowBatch::Entry> *out_data) {
size_t nrow;
utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
out_ptr->resize(nrow + 1);
utils::Check(fi.Read(&(*out_ptr)[0], out_ptr->size() * sizeof(size_t)) != 0,
"invalid input file format");
out_data->resize(out_ptr->back());
if (out_data->size() != 0) {
utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(RowBatch::Entry)) != 0,
"invalid input file format");
}
}
protected:
/*!
* \brief intialize column data
* \param pkeep probability to keep a row
*/
inline void InitColData(float pkeep) {
buffered_rowset_.clear();
// note: this part of code is serial, todo, parallelize this transformer
utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
builder.InitBudget(0);
// start working
iter_->BeforeFirst();
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
builder.AddBudget(inst[j].index);
}
}
}
}
builder.InitStorage();
iter_->BeforeFirst();
size_t ktop = 0;
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
if (ktop < buffered_rowset_.size() &&
buffered_rowset_[ktop] == batch.base_rowid+i) {
++ktop;
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
builder.PushElem(inst[j].index,
Entry((bst_uint)(batch.base_rowid+i),
inst[j].fvalue));
}
}
}
}
// sort columns
bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ncol; ++i) {
std::sort(&col_data_[0] + col_ptr_[i],
&col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
}
}
private:
// one batch iterator that return content in the matrix
struct OneBatchIter: utils::IIterator<ColBatch> {
OneBatchIter(void) : at_first_(true){}
virtual ~OneBatchIter(void) {}
virtual void BeforeFirst(void) {
at_first_ = true;
}
virtual bool Next(void) {
if (!at_first_) return false;
at_first_ = false;
return true;
}
virtual const ColBatch &Value(void) const {
return batch_;
}
inline void SetBatch(const std::vector<size_t> &ptr,
const std::vector<ColBatch::Entry> &data) {
batch_.size = col_index_.size();
col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL,0));
for (size_t i = 0; i < col_data_.size(); ++i) {
const bst_uint ridx = col_index_[i];
col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx],
static_cast<bst_uint>(ptr[ridx+1] - ptr[ridx]));
}
batch_.col_index = &col_index_[0];
batch_.col_data = &col_data_[0];
this->BeforeFirst();
}
// data content
std::vector<bst_uint> col_index_;
std::vector<ColBatch::Inst> col_data_;
// whether is at first
bool at_first_;
// temporal space for batch
ColBatch batch_;
};
// --- data structure used to support InitColAccess --
// column iterator
OneBatchIter col_iter_;
// row iterator
utils::IIterator<RowBatch> *iter_;
/*! \brief list of row index that are buffered */
std::vector<bst_uint> buffered_rowset_;
/*! \brief column pointer of CSC format */
std::vector<size_t> col_ptr_;
/*! \brief column datas in CSC format */
std::vector<ColBatch::Entry> col_data_;
};
} // namespace io
} // namespace xgboost
#endif // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
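InitColData above builds the CSC copy in two passes over the row iterator: a budget pass that counts entries per column (AddBudget), then a fill pass that places each (row id, value) pair (PushElem), followed by a per-column sort by feature value. The same idea without the SparseCSRMBuilder helper, as a self-contained sketch under the assumption of a single in-memory CSR input:

    #include <vector>

    struct Entry { unsigned index; float fvalue; };

    // CSR -> CSC transpose in two explicit passes (sorting omitted).
    void Transpose(const std::vector<size_t> &row_ptr,
                   const std::vector<Entry> &row_data, size_t ncol,
                   std::vector<size_t> *col_ptr, std::vector<Entry> *col_data) {
      col_ptr->assign(ncol + 1, 0);
      // pass 1: budget, count entries per column
      for (size_t k = 0; k < row_data.size(); ++k) {
        ++(*col_ptr)[row_data[k].index + 1];
      }
      for (size_t c = 0; c < ncol; ++c) (*col_ptr)[c + 1] += (*col_ptr)[c];
      // pass 2: fill, storing (row id, value) per column
      col_data->resize(row_data.size());
      std::vector<size_t> top(col_ptr->begin(), col_ptr->end() - 1);
      for (size_t i = 0; i + 1 < row_ptr.size(); ++i) {
        for (size_t k = row_ptr[i]; k < row_ptr[i + 1]; ++k) {
          Entry e; e.index = static_cast<unsigned>(i); e.fvalue = row_data[k].fvalue;
          (*col_data)[top[row_data[k].index]++] = e;
        }
      }
      // FMatrixS additionally sorts each column range by fvalue (CmpValue)
    }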

View File

@ -7,8 +7,9 @@
* \author Tianqi Chen * \author Tianqi Chen
*/ */
#include <vector> #include <vector>
#include <cstring>
#include "../data.h" #include "../data.h"
#include "../utils/io.h"
namespace xgboost { namespace xgboost {
namespace learner { namespace learner {
/*! /*!
@ -142,7 +143,6 @@ struct MetaInfo {
* \brief data object used for learning, * \brief data object used for learning,
* \tparam FMatrix type of feature data source * \tparam FMatrix type of feature data source
*/ */
template<typename FMatrix>
struct DMatrix { struct DMatrix {
/*! /*!
* \brief magic number associated with this object * \brief magic number associated with this object
@ -151,8 +151,6 @@ struct DMatrix {
const int magic; const int magic;
/*! \brief meta information about the dataset */ /*! \brief meta information about the dataset */
MetaInfo info; MetaInfo info;
/*! \brief feature matrix about data content */
FMatrix fmat;
/*! /*!
* \brief cache pointer to verify if the data structure is cached in some learner * \brief cache pointer to verify if the data structure is cached in some learner
* used to verify if DMatrix is cached * used to verify if DMatrix is cached
@ -160,6 +158,8 @@ struct DMatrix {
void *cache_learner_ptr_; void *cache_learner_ptr_;
/*! \brief default constructor */ /*! \brief default constructor */
explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {} explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
/*! \brief get feature matrix about data content */
virtual IFMatrix *fmat(void) const = 0;
// virtual destructor // virtual destructor
virtual ~DMatrix(void){} virtual ~DMatrix(void){}
}; };
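The hunks above replace the compile-time FMatrix template parameter with a runtime IFMatrix accessor, so the learner is compiled once and works with any storage backend. A minimal sketch of the resulting pattern; the concrete subclass here is illustrative, not one of the io::DMatrix types of this PR:

// DMatrix now only promises that *some* feature matrix can be produced at
// runtime; a loader-specific subclass owns the concrete storage
class IFMatrix;  // abstract row/column access interface

struct IllustrativeDMatrix : public DMatrix {
  IllustrativeDMatrix(int magic, IFMatrix *fmat)
      : DMatrix(magic), fmat_(fmat) {}
  virtual IFMatrix *fmat(void) const { return fmat_; }  // virtual dispatch
  virtual ~IllustrativeDMatrix(void) {}
 private:
  IFMatrix *fmat_;  // concrete matrix created by the data loader
};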

View File

@ -24,9 +24,12 @@ template<typename Derived>
struct EvalEWiseBase : public IEvaluator { struct EvalEWiseBase : public IEvaluator {
virtual float Eval(const std::vector<float> &preds, virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const { const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(), utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Check(preds.size() % info.labels.size() == 0,
"label and prediction size not match"); "label and prediction size not match");
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
float sum = 0.0, wsum = 0.0; float sum = 0.0, wsum = 0.0;
#pragma omp parallel for reduction(+: sum, wsum) schedule(static) #pragma omp parallel for reduction(+: sum, wsum) schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) { for (bst_omp_uint i = 0; i < ndata; ++i) {
@ -99,6 +102,45 @@ struct EvalMatchError : public EvalEWiseBase<EvalMatchError> {
} }
}; };
/*! \brief ct-test: cross-validation wrapper that applies a base metric per held-out fold */
struct EvalCTest: public IEvaluator {
EvalCTest(IEvaluator *base, const char *name)
: base_(base), name_(name) {}
virtual ~EvalCTest(void) {
delete base_;
}
virtual const char *Name(void) const {
return name_.c_str();
}
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
utils::Check(preds.size() % info.labels.size() == 0,
"label and prediction size not match");
size_t ngroup = preds.size() / info.labels.size() - 1;
const unsigned ndata = static_cast<unsigned>(info.labels.size());
utils::Check(ngroup > 1, "pred size does not meet requirement");
utils::Check(ndata == info.info.fold_index.size(), "need fold index");
double wsum = 0.0;
for (size_t k = 0; k < ngroup; ++k) {
std::vector<float> tpred;
MetaInfo tinfo;
for (unsigned i = 0; i < ndata; ++i) {
if (info.info.fold_index[i] == k) {
tpred.push_back(preds[i + (k + 1) * ndata]);
tinfo.labels.push_back(info.labels[i]);
tinfo.weights.push_back(info.GetWeight(i));
}
}
wsum += base_->Eval(tpred, tinfo);
}
return static_cast<float>(wsum / ngroup);
}
private:
IEvaluator *base_;
std::string name_;
};
/*! \brief AMS: also records best threshold */ /*! \brief AMS: also records best threshold */
struct EvalAMS : public IEvaluator { struct EvalAMS : public IEvaluator {
public: public:
@ -109,7 +151,8 @@ struct EvalAMS : public IEvaluator {
} }
virtual float Eval(const std::vector<float> &preds, virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const { const MetaInfo &info) const {
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size()); const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams"); utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
std::vector< std::pair<float, unsigned> > rec(ndata); std::vector< std::pair<float, unsigned> > rec(ndata);
@ -168,9 +211,11 @@ struct EvalPrecisionRatio : public IEvaluator{
} }
virtual float Eval(const std::vector<float> &preds, virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const { const MetaInfo &info) const {
utils::Assert(preds.size() == info.labels.size(), "label size predict size not match"); utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Assert(preds.size() % info.labels.size() == 0,
"label size predict size not match");
std::vector< std::pair<float, unsigned> > rec; std::vector< std::pair<float, unsigned> > rec;
for (size_t j = 0; j < preds.size(); ++j) { for (size_t j = 0; j < info.labels.size(); ++j) {
rec.push_back(std::make_pair(preds[j], static_cast<unsigned>(j))); rec.push_back(std::make_pair(preds[j], static_cast<unsigned>(j)));
} }
std::sort(rec.begin(), rec.end(), CmpFirst); std::sort(rec.begin(), rec.end(), CmpFirst);
@ -206,10 +251,14 @@ struct EvalPrecisionRatio : public IEvaluator{
struct EvalAuc : public IEvaluator { struct EvalAuc : public IEvaluator {
virtual float Eval(const std::vector<float> &preds, virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const { const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(), "label size predict size not match"); utils::Check(info.labels.size() != 0, "label set cannot be empty");
std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(preds.size()); utils::Check(preds.size() % info.labels.size() == 0,
"label size predict size not match");
std::vector<unsigned> tgptr(2, 0);
tgptr[1] = static_cast<unsigned>(info.labels.size());
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr; const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
utils::Check(gptr.back() == preds.size(), utils::Check(gptr.back() == info.labels.size(),
"EvalAuc: group structure must match number of prediction"); "EvalAuc: group structure must match number of prediction");
const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1); const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
// sum statistics // sum statistics
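For readers of EvalCTest above: the prediction vector is laid out as ngroup + 1 consecutive blocks of ndata values, where block 0 holds the full-data predictions and block k + 1 holds the k-th cross-validation model's predictions; fold_index[i] names the fold on which row i is held out. A standalone sketch of the per-fold evaluation, with 0-1 error standing in for the wrapped base metric:

#include <vector>

float EvalPerFold(const std::vector<float> &preds,
                  const std::vector<float> &labels,
                  const std::vector<unsigned> &fold_index,
                  size_t ngroup) {
  const size_t ndata = labels.size();
  double sum = 0.0;
  for (size_t k = 0; k < ngroup; ++k) {
    double err = 0.0;
    size_t cnt = 0;
    for (size_t i = 0; i < ndata; ++i) {
      if (fold_index[i] != k) continue;            // held-out rows only
      const float p = preds[(k + 1) * ndata + i];  // fold-k model's prediction
      err += ((p > 0.5f) != (labels[i] > 0.5f));   // 0-1 error as the base metric
      ++cnt;
    }
    if (cnt != 0) sum += err / cnt;
  }
  return static_cast<float>(sum / ngroup);         // average across folds
}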

View File

@ -45,7 +45,9 @@ inline IEvaluator* CreateEvaluator(const char *name) {
if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name); if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
if (!strncmp(name, "pratio@", 7)) return new EvalPrecisionRatio(name); if (!strncmp(name, "pratio@", 7)) return new EvalPrecisionRatio(name);
if (!strncmp(name, "map", 3)) return new EvalMAP(name); if (!strncmp(name, "map", 3)) return new EvalMAP(name);
if (!strncmp(name, "ndcg", 3)) return new EvalNDCG(name); if (!strncmp(name, "ndcg", 4)) return new EvalNDCG(name);
if (!strncmp(name, "ct-", 3)) return new EvalCTest(CreateEvaluator(name+3), name);
utils::Error("unknown evaluation metric type: %s", name); utils::Error("unknown evaluation metric type: %s", name);
return NULL; return NULL;
} }
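The dispatch above is pure prefix matching: strncmp(name, prefix, n) == 0 with n = strlen(prefix) means "name starts with prefix". The one-character fix changes the ndcg length from 3 to 4; with 3 the test only compared "ndc", so unrelated names could be captured before their own rule fired. A toy demonstration (the metric name is made up):

#include <cstdio>
#include <cstring>

int main(void) {
  const char *name = "ndc-something";  // hypothetical metric name
  std::printf("%d\n", std::strncmp(name, "ndcg", 3) == 0);  // 1: false match
  std::printf("%d\n", std::strncmp(name, "ndcg", 4) == 0);  // 0: correct
  return 0;
}

Note also that the new "ct-" rule recurses: CreateEvaluator("ct-error") wraps the evaluator built from the remainder of the name in an EvalCTest.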

View File

@ -21,7 +21,6 @@ namespace learner {
* \brief learner that takes do gradient boosting on specific objective functions * \brief learner that takes do gradient boosting on specific objective functions
* and do training and prediction * and do training and prediction
*/ */
template<typename FMatrix>
class BoostLearner { class BoostLearner {
public: public:
BoostLearner(void) { BoostLearner(void) {
@ -44,7 +43,7 @@ class BoostLearner {
* data matrices to continue training otherwise it will cause error * data matrices to continue training otherwise it will cause error
* \param mats array of pointers to matrix whose prediction result need to be cached * \param mats array of pointers to matrix whose prediction result need to be cached
*/ */
inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) { inline void SetCacheData(const std::vector<DMatrix*>& mats) {
// estimate feature bound // estimate feature bound
unsigned num_feature = 0; unsigned num_feature = 0;
// assign buffer index // assign buffer index
@ -158,18 +157,18 @@ class BoostLearner {
* if not, initialize it * if not, initialize it
* \param p_train pointer to the matrix used by training * \param p_train pointer to the matrix used by training
*/ */
inline void CheckInit(DMatrix<FMatrix> *p_train) { inline void CheckInit(DMatrix *p_train) {
p_train->fmat.InitColAccess(prob_buffer_row); p_train->fmat()->InitColAccess(prob_buffer_row);
} }
/*! /*!
* \brief update the model for one iteration * \brief update the model for one iteration
* \param iter current iteration number * \param iter current iteration number
* \param p_train pointer to the data matrix * \param p_train pointer to the data matrix
*/ */
inline void UpdateOneIter(int iter, const DMatrix<FMatrix> &train) { inline void UpdateOneIter(int iter, const DMatrix &train) {
this->PredictRaw(train, &preds_); this->PredictRaw(train, &preds_);
obj_->GetGradient(preds_, train.info, iter, &gpair_); obj_->GetGradient(preds_, train.info, iter, &gpair_);
gbm_->DoBoost(train.fmat, train.info.info, &gpair_); gbm_->DoBoost(train.fmat(), train.info.info, &gpair_);
} }
/*! /*!
* \brief evaluate the model for specific iteration * \brief evaluate the model for specific iteration
@ -179,7 +178,7 @@ class BoostLearner {
* \return a string corresponding to the evaluation result * \return a string corresponding to the evaluation result
*/ */
inline std::string EvalOneIter(int iter, inline std::string EvalOneIter(int iter,
const std::vector<const DMatrix<FMatrix>*> &evals, const std::vector<const DMatrix*> &evals,
const std::vector<std::string> &evname) { const std::vector<std::string> &evname) {
std::string res; std::string res;
char tmp[256]; char tmp[256];
@ -198,7 +197,7 @@ class BoostLearner {
* \param metric name of metric * \param metric name of metric
* \return a pair of <evaluation name, result> * \return a pair of <evaluation name, result>
*/ */
std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) { std::pair<std::string, float> Evaluate(const DMatrix &data, std::string metric) {
if (metric == "auto") metric = obj_->DefaultEvalMetric(); if (metric == "auto") metric = obj_->DefaultEvalMetric();
IEvaluator *ev = CreateEvaluator(metric.c_str()); IEvaluator *ev = CreateEvaluator(metric.c_str());
this->PredictRaw(data, &preds_); this->PredictRaw(data, &preds_);
@ -213,7 +212,7 @@ class BoostLearner {
* \param output_margin whether to only predict margin value instead of transformed prediction * \param output_margin whether to only predict margin value instead of transformed prediction
* \param out_preds output vector that stores the prediction * \param out_preds output vector that stores the prediction
*/ */
inline void Predict(const DMatrix<FMatrix> &data, inline void Predict(const DMatrix &data,
bool output_margin, bool output_margin,
std::vector<float> *out_preds) const { std::vector<float> *out_preds) const {
this->PredictRaw(data, out_preds); this->PredictRaw(data, out_preds);
@ -235,7 +234,7 @@ class BoostLearner {
if (obj_ != NULL) return; if (obj_ != NULL) return;
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL"); utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
obj_ = CreateObjFunction(name_obj_.c_str()); obj_ = CreateObjFunction(name_obj_.c_str());
gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str()); gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
for (size_t i = 0; i < cfg_.size(); ++i) { for (size_t i = 0; i < cfg_.size(); ++i) {
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str()); obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str()); gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
@ -247,9 +246,9 @@ class BoostLearner {
* \param data training data matrix * \param data training data matrix
* \param out_preds output vector that stores the prediction * \param out_preds output vector that stores the prediction
*/ */
inline void PredictRaw(const DMatrix<FMatrix> &data, inline void PredictRaw(const DMatrix &data,
std::vector<float> *out_preds) const { std::vector<float> *out_preds) const {
gbm_->Predict(data.fmat, this->FindBufferOffset(data), gbm_->Predict(data.fmat(), this->FindBufferOffset(data),
data.info.info, out_preds); data.info.info, out_preds);
// add base margin // add base margin
std::vector<float> &preds = *out_preds; std::vector<float> &preds = *out_preds;
@ -307,7 +306,7 @@ class BoostLearner {
// model parameter // model parameter
ModelParam mparam; ModelParam mparam;
// gbm model that back everything // gbm model that back everything
gbm::IGradBooster<FMatrix> *gbm_; gbm::IGradBooster *gbm_;
// name of gbm model used for training // name of gbm model used for training
std::string name_gbm_; std::string name_gbm_;
// objective function // objective function
@ -324,14 +323,14 @@ class BoostLearner {
private: private:
// cache entry object that helps handle feature caching // cache entry object that helps handle feature caching
struct CacheEntry { struct CacheEntry {
const DMatrix<FMatrix> *mat_; const DMatrix *mat_;
size_t buffer_offset_; size_t buffer_offset_;
size_t num_row_; size_t num_row_;
CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row) CacheEntry(const DMatrix *mat, size_t buffer_offset, size_t num_row)
:mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {} :mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
}; };
// find internal buffer offset for a certain matrix; returns -1 if it does not exist // find internal buffer offset for a certain matrix; returns -1 if it does not exist
inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const { inline int64_t FindBufferOffset(const DMatrix &mat) const {
for (size_t i = 0; i < cache_.size(); ++i) { for (size_t i = 0; i < cache_.size(); ++i) {
if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) { if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
if (cache_[i].num_row_ == mat.info.num_row()) { if (cache_[i].num_row_ == mat.info.num_row()) {
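Context for FindBufferOffset above: every matrix passed to SetCacheData is assigned a slice of one shared prediction buffer, and gbm_->Predict can resume from buffered partial sums only when the slice is provably still valid. A trimmed-down sketch of the lookup, with plain pointers in place of the real types:

#include <stdint.h>
#include <cstddef>
#include <vector>

struct CacheEntry {
  const void *mat_;       // identity of the cached matrix
  size_t buffer_offset_;  // start of its slice in the shared buffer
  size_t num_row_;        // row count at registration time
};

// returns the buffer offset for mat, or -1 to force prediction from scratch;
// the owner check mirrors mat.cache_learner_ptr_ == this in the code above
int64_t FindBufferOffset(const std::vector<CacheEntry> &cache,
                         const void *mat, const void *mat_owner,
                         const void *learner, size_t num_row) {
  for (size_t i = 0; i < cache.size(); ++i) {
    if (cache[i].mat_ == mat && mat_owner == learner &&
        cache[i].num_row_ == num_row) {
      return static_cast<int64_t>(cache[i].buffer_offset_);
    }
  }
  return -1;
}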

View File

@ -123,7 +123,7 @@ class RegLossObj : public IObjFunction{
float p = loss.PredTransform(preds[i]); float p = loss.PredTransform(preds[i]);
float w = info.GetWeight(j); float w = info.GetWeight(j);
if (info.labels[j] == 1.0f) w *= scale_pos_weight; if (info.labels[j] == 1.0f) w *= scale_pos_weight;
gpair[j] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w, gpair[i] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
loss.SecondOrderGradient(p, info.labels[j]) * w); loss.SecondOrderGradient(p, info.labels[j]) * w);
} }
} }
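The one-line fix above writes the gradient pair at the prediction index i rather than the label index j (the two differ once preds carries several groups per label). For reference, the logistic-loss members invoked here reduce to the standard sigmoid gradients; a sketch, with the epsilon guard that keeps the hessian positive:

#include <algorithm>
#include <cmath>

inline float PredTransform(float margin) {             // sigmoid
  return 1.0f / (1.0f + std::exp(-margin));
}
inline float FirstOrderGradient(float p, float y) {    // dL/dmargin = p - y
  return p - y;
}
inline float SecondOrderGradient(float p) {            // d2L/dmargin2 = p(1-p)
  return std::max(p * (1.0f - p), 1e-16f);             // keep hessian > 0
}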

View File

@ -272,6 +272,7 @@ class TreeModel {
param.num_nodes = param.num_roots; param.num_nodes = param.num_roots;
nodes.resize(param.num_nodes); nodes.resize(param.num_nodes);
stats.resize(param.num_nodes); stats.resize(param.num_nodes);
leaf_vector.resize(param.num_nodes * param.size_leaf_vector, 0.0f);
for (int i = 0; i < param.num_nodes; i ++) { for (int i = 0; i < param.num_nodes; i ++) {
nodes[i].set_leaf(0.0f); nodes[i].set_leaf(0.0f);
nodes[i].set_parent(-1); nodes[i].set_parent(-1);
@ -289,6 +290,9 @@ class TreeModel {
"TreeModel: wrong format"); "TreeModel: wrong format");
utils::Check(fi.Read(&stats[0], sizeof(NodeStat) * stats.size()) > 0, utils::Check(fi.Read(&stats[0], sizeof(NodeStat) * stats.size()) > 0,
"TreeModel: wrong format"); "TreeModel: wrong format");
if (param.size_leaf_vector != 0) {
utils::Check(fi.Read(&leaf_vector), "TreeModel: wrong format");
}
// chg deleted nodes // chg deleted nodes
deleted_nodes.resize(0); deleted_nodes.resize(0);
for (int i = param.num_roots; i < param.num_nodes; i ++) { for (int i = param.num_roots; i < param.num_nodes; i ++) {
@ -309,6 +313,7 @@ class TreeModel {
fo.Write(&param, sizeof(Param)); fo.Write(&param, sizeof(Param));
fo.Write(&nodes[0], sizeof(Node) * nodes.size()); fo.Write(&nodes[0], sizeof(Node) * nodes.size());
fo.Write(&stats[0], sizeof(NodeStat) * nodes.size()); fo.Write(&stats[0], sizeof(NodeStat) * nodes.size());
if (param.size_leaf_vector != 0) fo.Write(leaf_vector);
} }
/*! /*!
* \brief add child nodes to node * \brief add child nodes to node
@ -486,15 +491,15 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
std::fill(data.begin(), data.end(), e); std::fill(data.begin(), data.end(), e);
} }
/*! \brief fill the vector with sparse vector */ /*! \brief fill the vector with sparse vector */
inline void Fill(const SparseBatch::Inst &inst) { inline void Fill(const RowBatch::Inst &inst) {
for (bst_uint i = 0; i < inst.length; ++i) { for (bst_uint i = 0; i < inst.length; ++i) {
data[inst[i].findex].fvalue = inst[i].fvalue; data[inst[i].index].fvalue = inst[i].fvalue;
} }
} }
/*! \brief drop the trace after fill, must be called after fill */ /*! \brief drop the trace after fill, must be called after fill */
inline void Drop(const SparseBatch::Inst &inst) { inline void Drop(const RowBatch::Inst &inst) {
for (bst_uint i = 0; i < inst.length; ++i) { for (bst_uint i = 0; i < inst.length; ++i) {
data[inst[i].findex].flag = -1; data[inst[i].index].flag = -1;
} }
} }
/*! \brief get ith value */ /*! \brief get ith value */
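The two serialization hunks above are symmetric: leaf_vector holds num_nodes * size_leaf_vector floats and is written and read only when size_leaf_vector != 0, with Load sizing the buffer from the freshly read param. A generic sketch of that round-trip using stdio in place of utils::IStream (whose vector overloads carry their own length prefix):

#include <cstdio>
#include <vector>

void SaveLeafVec(std::FILE *fo, const std::vector<float> &leaf_vector) {
  const size_t n = leaf_vector.size();  // num_nodes * size_leaf_vector
  std::fwrite(&n, sizeof(n), 1, fo);
  if (n != 0) std::fwrite(&leaf_vector[0], sizeof(float), n, fo);
}

bool LoadLeafVec(std::FILE *fi, std::vector<float> *leaf_vector) {
  size_t n = 0;
  if (std::fread(&n, sizeof(n), 1, fi) != 1) return false;  // "wrong format"
  leaf_vector->resize(n);
  return n == 0 || std::fread(&(*leaf_vector)[0], sizeof(float), n, fi) == n;
}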

View File

@ -22,10 +22,10 @@ struct TrainParam{
//----- the rest parameters are less important ---- //----- the rest parameters are less important ----
// minimum amount of hessian(weight) allowed in a child // minimum amount of hessian(weight) allowed in a child
float min_child_weight; float min_child_weight;
// weight decay parameter used to control leaf fitting // L2 regularization factor
float reg_lambda; float reg_lambda;
// reg method // L1 regularization factor
int reg_method; float reg_alpha;
// default direction choice // default direction choice
int default_direction; int default_direction;
// whether we want to do subsample // whether we want to do subsample
@ -36,6 +36,8 @@ struct TrainParam{
float colsample_bytree; float colsample_bytree;
// speed optimization for dense column // speed optimization for dense column
float opt_dense_col; float opt_dense_col;
// leaf vector size
int size_leaf_vector;
// number of threads to be used for tree construction, // number of threads to be used for tree construction,
// if OpenMP is enabled, if equals 0, use system default // if OpenMP is enabled, if equals 0, use system default
int nthread; int nthread;
@ -45,13 +47,14 @@ struct TrainParam{
min_child_weight = 1.0f; min_child_weight = 1.0f;
max_depth = 6; max_depth = 6;
reg_lambda = 1.0f; reg_lambda = 1.0f;
reg_method = 2; reg_alpha = 0.0f;
default_direction = 0; default_direction = 0;
subsample = 1.0f; subsample = 1.0f;
colsample_bytree = 1.0f; colsample_bytree = 1.0f;
colsample_bylevel = 1.0f; colsample_bylevel = 1.0f;
opt_dense_col = 1.0f; opt_dense_col = 1.0f;
nthread = 0; nthread = 0;
size_leaf_vector = 0;
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
@ -63,15 +66,17 @@ struct TrainParam{
if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val)); if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val));
if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val)); if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val));
if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val)); if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val));
if (!strcmp(name, "alpha")) reg_alpha = static_cast<float>(atof(val));
if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val)); if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val));
if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val)); if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val)); if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val)); if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
if (!strcmp(name, "reg_method")) reg_method = atoi(val); if (!strcmp(name, "reg_alpha")) reg_alpha = static_cast<float>(atof(val));
if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val)); if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val)); if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast<float>(atof(val)); if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast<float>(atof(val));
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val)); if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
if (!strcmp(name, "max_depth")) max_depth = atoi(val); if (!strcmp(name, "max_depth")) max_depth = atoi(val);
if (!strcmp(name, "nthread")) nthread = atoi(val); if (!strcmp(name, "nthread")) nthread = atoi(val);
if (!strcmp(name, "default_direction")) { if (!strcmp(name, "default_direction")) {
@ -82,31 +87,31 @@ struct TrainParam{
} }
// calculate the cost of loss function // calculate the cost of loss function
inline double CalcGain(double sum_grad, double sum_hess) const { inline double CalcGain(double sum_grad, double sum_hess) const {
if (sum_hess < min_child_weight) { if (sum_hess < min_child_weight) return 0.0;
return 0.0; if (reg_alpha == 0.0f) {
return Sqr(sum_grad) / (sum_hess + reg_lambda);
} else {
return Sqr(ThresholdL1(sum_grad, reg_alpha)) / (sum_hess + reg_lambda);
} }
switch (reg_method) { }
case 1 : return Sqr(ThresholdL1(sum_grad, reg_lambda)) / sum_hess; // calculate cost of loss function with four statistics
case 2 : return Sqr(sum_grad) / (sum_hess + reg_lambda); inline double CalcGain(double sum_grad, double sum_hess,
case 3 : return double test_grad, double test_hess) const {
Sqr(ThresholdL1(sum_grad, 0.5 * reg_lambda)) / double w = CalcWeight(sum_grad, sum_hess);
(sum_hess + 0.5 * reg_lambda); double ret = test_grad * w + 0.5 * (test_hess + reg_lambda) * Sqr(w);
default: return Sqr(sum_grad) / sum_hess; if (reg_alpha == 0.0f) {
return - 2.0 * ret;
} else {
return - 2.0 * (ret + reg_alpha * std::abs(w));
} }
} }
// calculate weight given the statistics // calculate weight given the statistics
inline double CalcWeight(double sum_grad, double sum_hess) const { inline double CalcWeight(double sum_grad, double sum_hess) const {
if (sum_hess < min_child_weight) { if (sum_hess < min_child_weight) return 0.0;
return 0.0; if (reg_alpha == 0.0f) {
return -sum_grad / (sum_hess + reg_lambda);
} else { } else {
switch (reg_method) { return -ThresholdL1(sum_grad, reg_alpha) / (sum_hess + reg_lambda);
case 1: return - ThresholdL1(sum_grad, reg_lambda) / sum_hess;
case 2: return - sum_grad / (sum_hess + reg_lambda);
case 3: return
- ThresholdL1(sum_grad, 0.5 * reg_lambda) /
(sum_hess + 0.5 * reg_lambda);
default: return - sum_grad / sum_hess;
}
} }
} }
/*! \brief whether need forward small to big search: default right */ /*! \brief whether need forward small to big search: default right */
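The rewritten CalcGain/CalcWeight above replace the old reg_method switch with the elastic-net closed forms: the leaf weight minimizes G*w + (1/2)(H + lambda)*w^2 + alpha*|w|, whose solution soft-thresholds G at alpha. A standalone restatement:

// closed forms behind TrainParam::CalcWeight and TrainParam::CalcGain;
// G and H are the gradient and hessian sums over the rows of a leaf
inline double ThresholdL1(double g, double alpha) {  // soft-thresholding
  if (g > +alpha) return g - alpha;
  if (g < -alpha) return g + alpha;
  return 0.0;
}
inline double CalcWeight(double G, double H, double lambda, double alpha) {
  return -ThresholdL1(G, alpha) / (H + lambda);
}
inline double CalcGain(double G, double H, double lambda, double alpha) {
  const double t = ThresholdL1(G, alpha);
  return t * t / (H + lambda);  // reduces to G*G/(H + lambda) when alpha == 0
}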
@ -153,6 +158,9 @@ struct GradStats {
inline void Clear(void) { inline void Clear(void) {
sum_grad = sum_hess = 0.0f; sum_grad = sum_hess = 0.0f;
} }
/*! \brief check if necessary information is ready */
inline static void CheckInfo(const BoosterInfo &info) {
}
/*! /*!
* \brief accumulate statistics, * \brief accumulate statistics,
* \param gpair the vector storing the gradient statistics * \param gpair the vector storing the gradient statistics
@ -188,14 +196,88 @@ struct GradStats {
} }
/*! \brief set leaf vector value based on statistics */ /*! \brief set leaf vector value based on statistics */
inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{ inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
} }
protected: // constructor to allow inheritance
GradStats(void) {}
/*! \brief add statistics to the data */ /*! \brief add statistics to the data */
inline void Add(double grad, double hess) { inline void Add(double grad, double hess) {
sum_grad += grad; sum_hess += hess; sum_grad += grad; sum_hess += hess;
} }
}; };
/*! \brief vectorized cv statistics */
template<unsigned vsize>
struct CVGradStats : public GradStats {
// additional statistics
GradStats train[vsize], valid[vsize];
// constructor
explicit CVGradStats(const TrainParam &param) {
utils::Check(param.size_leaf_vector == vsize,
"CVGradStats: vsize must match size_leaf_vector");
this->Clear();
}
/*! \brief check if necessary information is ready */
inline static void CheckInfo(const BoosterInfo &info) {
utils::Check(info.fold_index.size() != 0,
"CVGradStats: require fold_index");
}
/*! \brief clear the statistics */
inline void Clear(void) {
GradStats::Clear();
for (unsigned i = 0; i < vsize; ++i) {
train[i].Clear(); valid[i].Clear();
}
}
inline void Add(const std::vector<bst_gpair> &gpair,
const BoosterInfo &info,
bst_uint ridx) {
GradStats::Add(gpair[ridx].grad, gpair[ridx].hess);
const size_t step = info.fold_index.size();
for (unsigned i = 0; i < vsize; ++i) {
const bst_gpair &b = gpair[(i + 1) * step + ridx];
if (info.fold_index[ridx] == i) {
valid[i].Add(b.grad, b.hess);
} else {
train[i].Add(b.grad, b.hess);
}
}
}
/*! \brief calculate gain of the solution */
inline double CalcGain(const TrainParam &param) const {
double ret = 0.0;
for (unsigned i = 0; i < vsize; ++i) {
ret += param.CalcGain(train[i].sum_grad,
train[i].sum_hess,
vsize * valid[i].sum_grad,
vsize * valid[i].sum_hess);
}
return ret / vsize;
}
/*! \brief add statistics to the data */
inline void Add(const CVGradStats &b) {
GradStats::Add(b);
for (unsigned i = 0; i < vsize; ++i) {
train[i].Add(b.train[i]);
valid[i].Add(b.valid[i]);
}
}
/*! \brief set current value to a - b */
inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
GradStats::SetSubstract(a, b);
for (unsigned i = 0; i < vsize; ++i) {
train[i].SetSubstract(a.train[i], b.train[i]);
valid[i].SetSubstract(a.valid[i], b.valid[i]);
}
}
/*! \brief set leaf vector value based on statistics */
inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
for (unsigned i = 0; i < vsize; ++i) {
vec[i] = param.learning_rate *
param.CalcWeight(train[i].sum_grad, train[i].sum_hess);
}
}
};
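CVGradStats above keeps, next to the pooled sums, one (train, valid) pair per fold; its CalcGain fits the leaf weight on the training sums and scores it on the held-out gradients through the four-statistic TrainParam::CalcGain, then averages across folds. The alpha == 0 case in isolation:

// gain of fitting w on the training folds and scoring it on the validation
// gradients; -2 * loss flips the sign so larger is better, matching the
// four-statistic CalcGain in the hunk further up
inline double CVGain(double G_train, double H_train,
                     double G_valid, double H_valid, double lambda) {
  const double w = -G_train / (H_train + lambda);  // train-fold leaf weight
  const double loss = G_valid * w + 0.5 * (H_valid + lambda) * w * w;
  return -2.0 * loss;
}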
/*! /*!
* \brief statistics that is helpful to store * \brief statistics that is helpful to store
* and represent a split solution for the tree * and represent a split solution for the tree

src/tree/updater.cpp Normal file
View File

@ -0,0 +1,20 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <cstring>
#include "./updater.h"
#include "./updater_prune-inl.hpp"
#include "./updater_refresh-inl.hpp"
#include "./updater_colmaker-inl.hpp"
namespace xgboost {
namespace tree {
IUpdater* CreateUpdater(const char *name) {
if (!strcmp(name, "prune")) return new TreePruner();
if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
utils::Error("unknown updater:%s", name);
return NULL;
}
} // namespace tree
} // namespace xgboost
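Moving CreateUpdater out of the header and behind this .cpp boundary makes the IUpdater interface a true compile firewall: callers link against the factory and never see the concrete updaters. Consumers compose a comma-separated sequence of updaters by name (growing, then pruning); a sketch of that composition, with the sequence string assumed from the gbm configuration rather than quoted:

#include <sstream>
#include <string>
#include <vector>

namespace xgboost { namespace tree {
class IUpdater;                          // as declared in updater.h
IUpdater* CreateUpdater(const char *name);
}}

// e.g. MakeUpdaterSeq("grow_colmaker,prune") builds a grower followed by a
// pruner; ownership of the returned pointers stays with the caller
std::vector<xgboost::tree::IUpdater*> MakeUpdaterSeq(const std::string &seq) {
  std::vector<xgboost::tree::IUpdater*> ups;
  std::stringstream ss(seq);
  std::string name;
  while (std::getline(ss, name, ',')) {
    ups.push_back(xgboost::tree::CreateUpdater(name.c_str()));
  }
  return ups;
}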

View File

@ -14,9 +14,7 @@ namespace xgboost {
namespace tree { namespace tree {
/*! /*!
* \brief interface of tree update module, that performs update of a tree * \brief interface of tree update module, that performs update of a tree
* \tparam FMatrix the data type updater taking
*/ */
template<typename FMatrix>
class IUpdater { class IUpdater {
public: public:
/*! /*!
@ -28,7 +26,7 @@ class IUpdater {
/*! /*!
* \brief peform update to the tree models * \brief peform update to the tree models
* \param gpair the gradient pair statistics of the data * \param gpair the gradient pair statistics of the data
* \param fmat feature matrix that provides access to features * \param p_fmat feature matrix that provides access to features
* \param info extra side information that may be needed, such as root index * \param info extra side information that may be needed, such as root index
* \param trees pointer to the trees to be updated; the updater will change the content of the trees
*   note: all the trees in the vector are updated with the same statistics,
*   so there can be multiple trees when we train a random forest style model */ */
@ -36,36 +34,18 @@ class IUpdater {
* there can be multiple trees when we train random forest style model * there can be multiple trees when we train random forest style model
*/ */
virtual void Update(const std::vector<bst_gpair> &gpair, virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
const std::vector<RegTree*> &trees) = 0; const std::vector<RegTree*> &trees) = 0;
// destructor // destructor
virtual ~IUpdater(void) {} virtual ~IUpdater(void) {}
}; };
} // namespace tree
} // namespace xgboost
#include "./updater_prune-inl.hpp"
#include "./updater_refresh-inl.hpp"
#include "./updater_colmaker-inl.hpp"
namespace xgboost {
namespace tree {
/*! /*!
* \brief create a updater based on name * \brief create a updater based on name
* \param name name of updater * \param name name of updater
* \return return the updater instance * \return return the updater instance
*/ */
template<typename FMatrix> IUpdater* CreateUpdater(const char *name);
inline IUpdater<FMatrix>* CreateUpdater(const char *name) {
if (!strcmp(name, "prune")) return new TreePruner<FMatrix>();
if (!strcmp(name, "refresh")) return new TreeRefresher<FMatrix, GradStats>();
if (!strcmp(name, "grow_colmaker")) return new ColMaker<FMatrix, GradStats>();
utils::Error("unknown updater:%s", name);
return NULL;
}
} // namespace tree } // namespace tree
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_H_ #endif // XGBOOST_TREE_UPDATER_H_

View File

@ -15,8 +15,8 @@
namespace xgboost { namespace xgboost {
namespace tree { namespace tree {
/*! \brief column-wise tree grower that enumerates splits over presorted feature columns */ /*! \brief column-wise tree grower that enumerates splits over presorted feature columns */
template<typename FMatrix, typename TStats> template<typename TStats>
class ColMaker: public IUpdater<FMatrix> { class ColMaker: public IUpdater {
public: public:
virtual ~ColMaker(void) {} virtual ~ColMaker(void) {}
// set training parameter // set training parameter
@ -24,16 +24,17 @@ class ColMaker: public IUpdater<FMatrix> {
param.SetParam(name, val); param.SetParam(name, val);
} }
virtual void Update(const std::vector<bst_gpair> &gpair, virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
const std::vector<RegTree*> &trees) { const std::vector<RegTree*> &trees) {
TStats::CheckInfo(info);
// rescale learning rate according to size of trees // rescale learning rate according to size of trees
float lr = param.learning_rate; float lr = param.learning_rate;
param.learning_rate = lr / trees.size(); param.learning_rate = lr / trees.size();
// build tree // build tree
for (size_t i = 0; i < trees.size(); ++i) { for (size_t i = 0; i < trees.size(); ++i) {
Builder builder(param); Builder builder(param);
builder.Update(gpair, fmat, info, trees[i]); builder.Update(gpair, p_fmat, info, trees[i]);
} }
param.learning_rate = lr; param.learning_rate = lr;
} }
@ -76,17 +77,16 @@ class ColMaker: public IUpdater<FMatrix> {
explicit Builder(const TrainParam &param) : param(param) {} explicit Builder(const TrainParam &param) : param(param) {}
// update one tree, growing // update one tree, growing
virtual void Update(const std::vector<bst_gpair> &gpair, virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
RegTree *p_tree) { RegTree *p_tree) {
this->InitData(gpair, fmat, info.root_index, *p_tree); this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
this->InitNewNode(qexpand, gpair, fmat, info, *p_tree); this->InitNewNode(qexpand, gpair, *p_fmat, info, *p_tree);
for (int depth = 0; depth < param.max_depth; ++depth) { for (int depth = 0; depth < param.max_depth; ++depth) {
this->FindSplit(depth, this->qexpand, gpair, fmat, info, p_tree); this->FindSplit(depth, this->qexpand, gpair, p_fmat, info, p_tree);
this->ResetPosition(this->qexpand, fmat, *p_tree); this->ResetPosition(this->qexpand, p_fmat, *p_tree);
this->UpdateQueueExpand(*p_tree, &this->qexpand); this->UpdateQueueExpand(*p_tree, &this->qexpand);
this->InitNewNode(qexpand, gpair, fmat, info, *p_tree); this->InitNewNode(qexpand, gpair, *p_fmat, info, *p_tree);
// if nothing left to be expand, break // if nothing left to be expand, break
if (qexpand.size() == 0) break; if (qexpand.size() == 0) break;
} }
@ -107,7 +107,7 @@ class ColMaker: public IUpdater<FMatrix> {
private: private:
// initialize temp data structure // initialize temp data structure
inline void InitData(const std::vector<bst_gpair> &gpair, inline void InitData(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, const IFMatrix &fmat,
const std::vector<unsigned> &root_index, const RegTree &tree) { const std::vector<unsigned> &root_index, const RegTree &tree) {
utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree"); utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree");
const std::vector<bst_uint> &rowset = fmat.buffered_rowset(); const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
@ -137,8 +137,7 @@ class ColMaker: public IUpdater<FMatrix> {
if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1; if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1;
} }
} }
} }
{ {
// initialize feature index // initialize feature index
unsigned ncol = static_cast<unsigned>(fmat.NumCol()); unsigned ncol = static_cast<unsigned>(fmat.NumCol());
@ -175,7 +174,7 @@ class ColMaker: public IUpdater<FMatrix> {
/*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */ /*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */
inline void InitNewNode(const std::vector<int> &qexpand, inline void InitNewNode(const std::vector<int> &qexpand,
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, const IFMatrix &fmat,
const BoosterInfo &info, const BoosterInfo &info,
const RegTree &tree) { const RegTree &tree) {
{// setup statistics space for each tree node {// setup statistics space for each tree node
@ -222,24 +221,25 @@ class ColMaker: public IUpdater<FMatrix> {
qexpand = newnodes; qexpand = newnodes;
} }
// enumerate the split values of specific feature // enumerate the split values of specific feature
template<typename Iter> inline void EnumerateSplit(const ColBatch::Entry *begin,
inline void EnumerateSplit(Iter it, unsigned fid, const ColBatch::Entry *end,
int d_step,
bst_uint fid,
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
const BoosterInfo &info, const BoosterInfo &info,
std::vector<ThreadEntry> &temp, std::vector<ThreadEntry> &temp) {
bool is_forward_search) {
// clear all the temp statistics // clear all the temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) { for (size_t j = 0; j < qexpand.size(); ++j) {
temp[qexpand[j]].stats.Clear(); temp[qexpand[j]].stats.Clear();
} }
// left statistics // left statistics
TStats c(param); TStats c(param);
while (it.Next()) { for (const ColBatch::Entry *it = begin; it != end; it += d_step) {
const bst_uint ridx = it.rindex(); const bst_uint ridx = it->index;
const int nid = position[ridx]; const int nid = position[ridx];
if (nid < 0) continue; if (nid < 0) continue;
// start working // start working
const float fvalue = it.fvalue(); const float fvalue = it->fvalue;
// get the statistics of nid // get the statistics of nid
ThreadEntry &e = temp[nid]; ThreadEntry &e = temp[nid];
// test if first hit, this is fine, because we set 0 during init // test if first hit, this is fine, because we set 0 during init
@ -252,7 +252,7 @@ class ColMaker: public IUpdater<FMatrix> {
c.SetSubstract(snode[nid].stats, e.stats); c.SetSubstract(snode[nid].stats, e.stats);
if (c.sum_hess >= param.min_child_weight) { if (c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search); e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
} }
} }
// update the statistics // update the statistics
@ -267,38 +267,46 @@ class ColMaker: public IUpdater<FMatrix> {
c.SetSubstract(snode[nid].stats, e.stats); c.SetSubstract(snode[nid].stats, e.stats);
if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) { if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
const float delta = is_forward_search ? rt_eps : -rt_eps; const float delta = d_step == +1 ? rt_eps : -rt_eps;
e.best.Update(loss_chg, fid, e.last_fvalue + delta, !is_forward_search); e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
} }
} }
} }
// find splits at current level, do split per level // find splits at current level, do split per level
inline void FindSplit(int depth, const std::vector<int> &qexpand, inline void FindSplit(int depth,
const std::vector<int> &qexpand,
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
RegTree *p_tree) { RegTree *p_tree) {
std::vector<unsigned> feat_set = feat_index; std::vector<bst_uint> feat_set = feat_index;
if (param.colsample_bylevel != 1.0f) { if (param.colsample_bylevel != 1.0f) {
random::Shuffle(feat_set); random::Shuffle(feat_set);
unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size()); unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
utils::Check(n > 0, "colsample_bylevel is too small: no feature can be included"); utils::Check(n > 0, "colsample_bylevel is too small: no feature can be included");
feat_set.resize(n); feat_set.resize(n);
} }
// start enumeration utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
const bst_omp_uint nsize = static_cast<bst_omp_uint>(feat_set.size()); while (iter->Next()) {
#if defined(_OPENMP) const ColBatch &batch = iter->Value();
const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1); // start enumeration
#endif const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(dynamic, batch_size) #if defined(_OPENMP)
for (bst_omp_uint i = 0; i < nsize; ++i) { const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
const unsigned fid = feat_set[i]; #endif
const int tid = omp_get_thread_num(); #pragma omp parallel for schedule(dynamic, batch_size)
if (param.need_forward_search(fmat.GetColDensity(fid))) { for (bst_omp_uint i = 0; i < nsize; ++i) {
this->EnumerateSplit(fmat.GetSortedCol(fid), fid, gpair, info, stemp[tid], true); const bst_uint fid = batch.col_index[i];
} const int tid = omp_get_thread_num();
if (param.need_backward_search(fmat.GetColDensity(fid))) { const ColBatch::Inst c = batch[i];
this->EnumerateSplit(fmat.GetReverseSortedCol(fid), fid, gpair, info, stemp[tid], false); if (param.need_forward_search(p_fmat->GetColDensity(fid))) {
this->EnumerateSplit(c.data, c.data + c.length, +1,
fid, gpair, info, stemp[tid]);
}
if (param.need_backward_search(p_fmat->GetColDensity(fid))) {
this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
fid, gpair, info, stemp[tid]);
}
} }
} }
// after this each thread's stemp will get the best candidates, aggregate results // after this each thread's stemp will get the best candidates, aggregate results
@ -318,8 +326,8 @@ class ColMaker: public IUpdater<FMatrix> {
} }
} }
// reset position of each data points after split is created in the tree // reset position of each data points after split is created in the tree
inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) { inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
const std::vector<bst_uint> &rowset = fmat.buffered_rowset(); const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
// step 1, set default direct nodes to default, and leaf nodes to -1 // step 1, set default direct nodes to default, and leaf nodes to -1
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size()); const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
@ -343,22 +351,28 @@ class ColMaker: public IUpdater<FMatrix> {
} }
std::sort(fsplits.begin(), fsplits.end()); std::sort(fsplits.begin(), fsplits.end());
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin()); fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
// start put things into right place
const bst_omp_uint nfeats = static_cast<bst_omp_uint>(fsplits.size()); utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
#pragma omp parallel for schedule(dynamic, 1) while (iter->Next()) {
for (bst_omp_uint i = 0; i < nfeats; ++i) { const ColBatch &batch = iter->Value();
const unsigned fid = fsplits[i]; for (size_t i = 0; i < batch.size; ++i) {
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) { ColBatch::Inst col = batch[i];
const bst_uint ridx = it.rindex(); const bst_uint fid = batch.col_index[i];
int nid = position[ridx]; const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
if (nid == -1) continue; #pragma omp parallel for schedule(static)
// go back to parent, correct those who are not default for (bst_omp_uint j = 0; j < ndata; ++j) {
nid = tree[nid].parent(); const bst_uint ridx = col[j].index;
if (tree[nid].split_index() == fid) { const float fvalue = col[j].fvalue;
if (it.fvalue() < tree[nid].split_cond()) { int nid = position[ridx];
position[ridx] = tree[nid].cleft(); if (nid == -1) continue;
} else { // go back to parent, correct those who are not default
position[ridx] = tree[nid].cright(); nid = tree[nid].parent();
if (tree[nid].split_index() == fid) {
if (fvalue < tree[nid].split_cond()) {
position[ridx] = tree[nid].cleft();
} else {
position[ridx] = tree[nid].cright();
}
} }
} }
} }
@ -369,7 +383,7 @@ class ColMaker: public IUpdater<FMatrix> {
// number of omp thread used during training // number of omp thread used during training
int nthread; int nthread;
// Per feature: shuffle index of each feature index // Per feature: shuffle index of each feature index
std::vector<unsigned> feat_index; std::vector<bst_uint> feat_index;
// Instance Data: current node position in the tree of each instance // Instance Data: current node position in the tree of each instance
std::vector<int> position; std::vector<int> position;
// PerThread x PerTreeNode: statistics for per thread construction // PerThread x PerTreeNode: statistics for per thread construction

View File

@ -12,8 +12,7 @@
namespace xgboost { namespace xgboost {
namespace tree { namespace tree {
/*! \brief pruner that prunes a tree after growing finishes */ /*! \brief pruner that prunes a tree after growing finishes */
template<typename FMatrix> class TreePruner: public IUpdater {
class TreePruner: public IUpdater<FMatrix> {
public: public:
virtual ~TreePruner(void) {} virtual ~TreePruner(void) {}
// set training parameter // set training parameter
@ -23,7 +22,7 @@ class TreePruner: public IUpdater<FMatrix> {
} }
// update the tree, do pruning // update the tree, do pruning
virtual void Update(const std::vector<bst_gpair> &gpair, virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
const std::vector<RegTree*> &trees) { const std::vector<RegTree*> &trees) {
// rescale learning rate according to size of trees // rescale learning rate according to size of trees
@ -75,7 +74,6 @@ class TreePruner: public IUpdater<FMatrix> {
// training parameter // training parameter
TrainParam param; TrainParam param;
}; };
} // namespace tree } // namespace tree
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_ #endif // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_

View File

@ -9,12 +9,13 @@
#include <limits> #include <limits>
#include "./param.h" #include "./param.h"
#include "./updater.h" #include "./updater.h"
#include "../utils/omp.h"
namespace xgboost { namespace xgboost {
namespace tree { namespace tree {
/*! \brief updater that refreshes the statistics and leaf values of a tree on current data, keeping the tree structure fixed */ /*! \brief updater that refreshes the statistics and leaf values of a tree on current data, keeping the tree structure fixed */
template<typename FMatrix, typename TStats> template<typename TStats>
class TreeRefresher: public IUpdater<FMatrix> { class TreeRefresher: public IUpdater {
public: public:
virtual ~TreeRefresher(void) {} virtual ~TreeRefresher(void) {}
// set training parameter // set training parameter
@ -23,7 +24,7 @@ class TreeRefresher: public IUpdater<FMatrix> {
} }
// update the tree, do pruning // update the tree, do pruning
virtual void Update(const std::vector<bst_gpair> &gpair, virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat, IFMatrix *p_fmat,
const BoosterInfo &info, const BoosterInfo &info,
const std::vector<RegTree*> &trees) { const std::vector<RegTree*> &trees) {
if (trees.size() == 0) return; if (trees.size() == 0) return;
@ -50,16 +51,16 @@ class TreeRefresher: public IUpdater<FMatrix> {
fvec_temp[tid].Init(trees[0]->param.num_feature); fvec_temp[tid].Init(trees[0]->param.num_feature);
} }
// start accumulating statistics // start accumulating statistics
utils::IIterator<SparseBatch> *iter = fmat.RowIterator(); utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
while (iter->Next()) { while (iter->Next()) {
const SparseBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
utils::Check(batch.size < std::numeric_limits<unsigned>::max(), utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
"too large batch size "); "too large batch size ");
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size); const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nbatch; ++i) { for (bst_omp_uint i = 0; i < nbatch; ++i) {
SparseBatch::Inst inst = batch[i]; RowBatch::Inst inst = batch[i];
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i); const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
RegTree::FVec &feats = fvec_temp[tid]; RegTree::FVec &feats = fvec_temp[tid];

View File

@ -8,6 +8,7 @@
#define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS
#include <cstdio> #include <cstdio>
#include <cstdarg> #include <cstdarg>
#include <string>
#include <cstdlib> #include <cstdlib>
#ifdef _MSC_VER #ifdef _MSC_VER
#define fopen64 fopen #define fopen64 fopen

View File

@ -234,7 +234,7 @@ class BoostLearnTask{
std::vector<io::DataMatrix*> deval; std::vector<io::DataMatrix*> deval;
std::vector<const io::DataMatrix*> devalall; std::vector<const io::DataMatrix*> devalall;
utils::FeatMap fmap; utils::FeatMap fmap;
learner::BoostLearner<FMatrixS> learner; learner::BoostLearner learner;
}; };
} }

View File

@ -1,4 +1,4 @@
The solution has been created with Visual Studio Express 2013. The solution has been created with Visual Studio Express 2010.
Make sure to compile the Release version, unless you need to debug the code Make sure to compile the Release version, unless you need to debug the code
(and in the latter case modify the path in xgboost.py from release to test). (and in the latter case modify the path in xgboost.py from release to test).
Note that you have two projects in one solution and they need to be compiled to use the standalone executable from the command line Note that you have two projects in one solution and they need to be compiled to use the standalone executable from the command line

View File

@ -1,11 +1,9 @@
 
Microsoft Visual Studio Solution File, Format Version 12.00 Microsoft Visual Studio Solution File, Format Version 11.00
# Visual Studio Express 2013 for Windows Desktop # Visual Studio 2010
VisualStudioVersion = 12.0.30723.0 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost.vcxproj", "{19766C3F-7508-49D0-BAAC-0988FCC9970C}"
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost.vcxproj", "{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}"
EndProject EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_wrapper\xgboost_wrapper.vcxproj", "{2E1AF937-28BB-4832-B916-309C9A0F6C4F}" Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_wrapper\xgboost_wrapper.vcxproj", "{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}"
EndProject EndProject
Global Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution GlobalSection(SolutionConfigurationPlatforms) = preSolution
@ -15,22 +13,21 @@ Global
Release|x64 = Release|x64 Release|x64 = Release|x64
EndGlobalSection EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution GlobalSection(ProjectConfigurationPlatforms) = postSolution
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|Win32.ActiveCfg = Debug|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.ActiveCfg = Debug|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|Win32.Build.0 = Debug|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.Build.0 = Debug|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|x64.ActiveCfg = Debug|x64 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.ActiveCfg = Release|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|x64.Build.0 = Debug|x64 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.Build.0 = Release|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|Win32.ActiveCfg = Release|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.ActiveCfg = Release|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|Win32.Build.0 = Release|Win32 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.Build.0 = Release|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|x64.ActiveCfg = Release|x64 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.ActiveCfg = Release|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|x64.Build.0 = Release|x64 {19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.Build.0 = Release|x64
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|Win32.ActiveCfg = Debug|Win32 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.ActiveCfg = Debug|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|Win32.Build.0 = Debug|Win32 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.Build.0 = Debug|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|x64.ActiveCfg = Debug|x64 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.ActiveCfg = Debug|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Debug|x64.Build.0 = Debug|x64 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.ActiveCfg = Release|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|Win32.ActiveCfg = Release|Win32 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.Build.0 = Release|Win32
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|Win32.Build.0 = Release|Win32 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.ActiveCfg = Release|x64
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|x64.ActiveCfg = Release|x64 {B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.Build.0 = Release|x64
{2E1AF937-28BB-4832-B916-309C9A0F6C4F}.Release|x64.Build.0 = Release|x64
EndGlobalSection EndGlobalSection
GlobalSection(SolutionProperties) = preSolution GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE HideSolutionNode = FALSE

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations"> <ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32"> <ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration> <Configuration>Debug</Configuration>
@ -18,8 +18,14 @@
<Platform>x64</Platform> <Platform>x64</Platform>
</ProjectConfiguration> </ProjectConfiguration>
</ItemGroup> </ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\gbm\gbm.cpp" />
<ClCompile Include="..\..\src\io\io.cpp" />
<ClCompile Include="..\..\src\tree\updater.cpp" />
<ClCompile Include="..\..\src\xgboost_main.cpp" />
</ItemGroup>
<PropertyGroup Label="Globals"> <PropertyGroup Label="Globals">
<ProjectGuid>{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}</ProjectGuid> <ProjectGuid>{19766C3F-7508-49D0-BAAC-0988FCC9970C}</ProjectGuid>
<RootNamespace>xgboost</RootNamespace> <RootNamespace>xgboost</RootNamespace>
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
@ -27,27 +33,23 @@
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries> <UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization> <WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries> <UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization> <WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
@ -111,10 +113,6 @@
<OptimizeReferences>true</OptimizeReferences> <OptimizeReferences>true</OptimizeReferences>
</Link> </Link>
</ItemDefinitionGroup> </ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\..\src\io\io.cpp" />
<ClCompile Include="..\..\src\xgboost_main.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
</ImportGroup> </ImportGroup>

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations"> <ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32"> <ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration> <Configuration>Debug</Configuration>
@ -18,40 +18,38 @@
<Platform>x64</Platform> <Platform>x64</Platform>
</ProjectConfiguration> </ProjectConfiguration>
</ItemGroup> </ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\gbm\gbm.cpp" />
<ClCompile Include="..\..\src\io\io.cpp" />
<ClCompile Include="..\..\src\tree\updater.cpp" />
<ClCompile Include="..\..\wrapper\xgboost_wrapper.cpp" />
</ItemGroup>
<PropertyGroup Label="Globals"> <PropertyGroup Label="Globals">
<ProjectGuid>{2E1AF937-28BB-4832-B916-309C9A0F6C4F}</ProjectGuid> <ProjectGuid>{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}</ProjectGuid>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<Keyword>ManagedCProj</Keyword>
<RootNamespace>xgboost_wrapper</RootNamespace> <RootNamespace>xgboost_wrapper</RootNamespace>
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset> <CharacterSet>MultiByte</CharacterSet>
<CLRSupport>true</CLRSupport>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset> <CharacterSet>MultiByte</CharacterSet>
<CLRSupport>true</CLRSupport>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries> <UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset> <WholeProgramOptimization>true</WholeProgramOptimization>
<CLRSupport>true</CLRSupport> <CharacterSet>MultiByte</CharacterSet>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType> <ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries> <UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset> <WholeProgramOptimization>true</WholeProgramOptimization>
<CLRSupport>true</CLRSupport> <CharacterSet>MultiByte</CharacterSet>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
@ -69,85 +67,53 @@
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup> </ImportGroup>
<PropertyGroup Label="UserMacros" /> <PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <PropertyGroup />
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile> <ClCompile>
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile> </ClCompile>
<Link> <Link>
<GenerateDebugInformation>true</GenerateDebugInformation> <GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies />
</Link> </Link>
</ItemDefinitionGroup> </ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile> <ClCompile>
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile> </ClCompile>
<Link> <Link>
<GenerateDebugInformation>true</GenerateDebugInformation> <GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>
</AdditionalDependencies>
</Link> </Link>
</ItemDefinitionGroup> </ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile> <ClCompile>
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions> <Optimization>MaxSpeed</Optimization>
<PrecompiledHeader>Use</PrecompiledHeader> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile> </ClCompile>
<Link> <Link>
<GenerateDebugInformation>true</GenerateDebugInformation> <GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies /> <EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link> </Link>
</ItemDefinitionGroup> </ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile> <ClCompile>
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions> <Optimization>MaxSpeed</Optimization>
<PrecompiledHeader>NotUsing</PrecompiledHeader> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<OpenMPSupport>true</OpenMPSupport> <OpenMPSupport>true</OpenMPSupport>
</ClCompile> </ClCompile>
<Link> <Link>
<GenerateDebugInformation>true</GenerateDebugInformation> <GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies> <EnableCOMDATFolding>true</EnableCOMDATFolding>
</AdditionalDependencies> <OptimizeReferences>true</OptimizeReferences>
</Link> </Link>
</ItemDefinitionGroup> </ItemDefinitionGroup>
<ItemGroup>
<Reference Include="System" />
<Reference Include="System.Data" />
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\src\io\io.h" />
<ClInclude Include="..\..\src\io\simple_dmatrix-inl.hpp" />
<ClInclude Include="..\..\wrapper\xgboost_wrapper.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\io\io.cpp" />
<ClCompile Include="..\..\wrapper\xgboost_wrapper.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="..\..\wrapper\xgboost.py" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
</ImportGroup> </ImportGroup>

View File

@ -1,126 +0,0 @@
# include xgboost library, must set chdir=TRUE
source("../xgboost.R", chdir=TRUE)
# helper function to read libsvm format
# this is inefficiently written: it loads the data densely, then converts to sparse
# use this only for demo purposes
# adapted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
content <- readLines(fname)
nline <- length(content)
label <- numeric(nline)
mat <- matrix(0, nline, maxcol+1)
for (i in 1:nline) {
arr <- as.vector(strsplit(content[i], " ")[[1]])
label[i] <- as.numeric(arr[[1]])
for (j in 2:length(arr)) {
kv <- strsplit(arr[j], ":")[[1]]
# to avoid 0 index
findex <- as.integer(kv[1]) + 1
fvalue <- as.numeric(kv[2])
mat[i,findex] <- fvalue
}
}
mat <- as(mat, "sparseMatrix")
return(list(label=label, data=mat))
}
# test code here
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
watchlist <- list("eval"=dtest,"train"=dtrain)
# training xgboost model
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
# make prediction
preds <- xgb.predict(bst, dtest)
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
# print error rate
print(paste("error=",err))
# dump model
xgb.dump(bst, "dump.raw.txt")
# dump model with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")
# save dmatrix into binary buffer
succ <- xgb.save(dtest, "dtest.buffer")
# save model into file
succ <- xgb.save(bst, "xgb.model")
# load model and data in
bst2 <- xgb.Booster(modelfile="xgb.model")
dtest2 <- xgb.DMatrix("dtest.buffer")
preds2 <- xgb.predict(bst2, dtest2)
# assert they are the same
stopifnot(sum(abs(preds2-preds)) == 0)
###
# build dmatrix from sparseMatrix
###
print('start running example of building DMatrix from R sparseMatrix')
csc <- read.libsvm("agaricus.txt.train", 126)
label <- csc$label
data <- csc$data
dtrain <- xgb.DMatrix(data, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
###
# build dmatrix from dense matrix
###
print('start running example of building DMatrix from a dense R matrix')
mat <- as.matrix(data)
dtrain <- xgb.DMatrix(mat, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
###
# advanced: customized loss function
#
print("start running example of using a customized objective function")
# note: for a customized objective function, we leave objective as default
# note: what we then get in prediction is the margin value
# you must know what you are doing
param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1)
# user-defined objective function: given predictions, return gradient and second-order gradient
# this is log-likelihood loss
logregobj <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
preds <- 1.0 / (1.0 + exp(-preds))
grad <- preds - labels
hess <- preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
# user-defined evaluation function, returns list(metric="metric-name", value="metric-value")
# NOTE: with a customized loss function, the default prediction value is the margin,
# which may keep the built-in evaluation metrics from working properly
# for example, with logistic loss the prediction is the score before the logistic transformation,
# while the built-in evaluation error assumes input after the logistic transformation
# keep this in mind when customizing; you may also need a customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}
# training with a customized objective; we can also do step-by-step training
# simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
###
# advanced: start from an initial base prediction
#
print ("start running example to start from a initial prediction")
# specify parameters via map, definition are same as c++ version
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
# train xgboost for 1 round
bst <- xgb.train( param, dtrain, 1, watchlist )
# note: base_margin needs the margin value rather than the transformed prediction
# predicting with outputmargin=TRUE always gives margin values before the logistic transformation
ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
succ <- xgb.setinfo(dtest, "base_margin", ptest)
print ("this is result of running from initial prediction")
bst <- xgb.train( param, dtrain, 1, watchlist )

View File

@ -10,6 +10,4 @@ Python
R R
===== =====
* To make the R wrapper, type ```make R``` in the root directory of project * See ../R-package
* The R module needs Rinternals.h; find its path on your system and add it to CPLUS_INCLUDE_PATH in the Makefile
* Refer to the walk-through example in [R-example/demo.R](R-example/demo.R)
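For orientation, the relocated package installs like any other R source package (equivalently, `R CMD INSTALL R-package` from the shell); a minimal sketch, assuming the sources live under R-package as this PR lays them out:

# minimal sketch, assuming the package sources sit in ./R-package (per this PR)
install.packages("./R-package", repos = NULL, type = "source")
library(xgboost)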

View File

@ -1,222 +0,0 @@
# depends on the Matrix package
succ <- require("Matrix")
if (!succ) {
stop("xgboost depends on Matrix library")
}
# load in library
dyn.load("./libxgboostR.so")
# constructing DMatrix
xgb.DMatrix <- function(data, info=list(), missing=0.0) {
if (typeof(data) == "character") {
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE))
} else if(is.matrix(data)) {
handle <- .Call("XGDMatrixCreateFromMat_R", data, missing)
} else if(class(data) == "dgCMatrix") {
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x)
} else {
stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data)))
}
dmat <- structure(handle, class="xgb.DMatrix")
if (length(info) != 0) {
for (i in 1:length(info)) {
p <- info[i]
xgb.setinfo(dmat, names(p), p[[1]])
}
}
return(dmat)
}
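As a quick illustration of the three dispatch branches above (file path, dense matrix, dgCMatrix), a minimal sketch; the file name is illustrative and the data made up:

# from a file on disk (illustrative path)
dtrain <- xgb.DMatrix("agaricus.txt.train")
# from a dense matrix, attaching labels through the info list
ddense <- xgb.DMatrix(matrix(runif(8), 4, 2), info = list(label = c(0, 1, 0, 1)))
# from a sparse dgCMatrix, as produced by the Matrix package
spm <- as(matrix(c(1, 0, 0, 2, 0, 3, 0, 0), 4, 2), "dgCMatrix")
dsparse <- xgb.DMatrix(spm, info = list(label = c(0, 1, 0, 1)))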
# get information from dmatrix
xgb.getinfo <- function(dmat, name) {
if (typeof(name) != "character") {
stop("xgb.getinfo: name must be character")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
}
if (name != "label" &&
name != "weight" &&
name != "base_margin" ) {
stop(paste("xgb.getinfo: unknown info name", name))
}
ret <- .Call("XGDMatrixGetInfo_R", dmat, name)
return(ret)
}
# set information into dmatrix; this mutates the dmatrix
xgb.setinfo <- function(dmat, name, info) {
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument must be xgb.DMatrix")
}
# label, weight and base_margin are all stored as float vectors
if (name == "label" || name == "weight" || name == "base_margin") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
return(TRUE)
}
# group sizes are stored as an integer vector
if (name == "group") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info))
return(TRUE)
}
stop(paste("xgb.setinfo: unknown info name", name))
}
# construct a Booster from cachelist
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (typeof(cachelist) != "list") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
for (dm in cachelist) {
if (class(dm) != "xgb.DMatrix") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
}
handle <- .Call("XGBoosterCreate_R", cachelist)
.Call("XGBoosterSetParam_R", handle, "seed", "0")
if (length(params) != 0) {
for (i in 1:length(params)) {
p <- params[i]
.Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
}
}
if (!is.null(modelfile)) {
if (typeof(modelfile) != "character"){
stop("xgb.Booster: modelfile must be character");
}
.Call("XGBoosterLoadModel_R", handle, modelfile)
}
return(structure(handle, class="xgb.Booster"))
}
# train a model using given parameters
xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
if (typeof(params) != "list") {
stop("xgb.train: first argument params must be list");
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.train: second argument dtrain must be xgb.DMatrix");
}
bst <- xgb.Booster(params, append(watchlist,dtrain))
for (i in 1:nrounds) {
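# each round either uses the built-in objective (xgb.iter.update) or a custom one (xgb.iter.boost)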
if (is.null(obj)) {
succ <- xgb.iter.update(bst, dtrain, i-1)
} else {
pred <- xgb.predict(bst, dtrain)
gpair <- obj(pred, dtrain)
succ <- xgb.iter.boost(bst, dtrain, gpair)
}
if (length(watchlist) != 0) {
if (is.null(feval)) {
msg <- xgb.iter.eval(bst, watchlist, i-1)
cat(msg); cat("\n")
} else {
cat("["); cat(i); cat("]");
for (j in 1:length(watchlist)) {
w <- watchlist[j]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
cat(":"); cat(ret$value)
}
cat("\n")
}
}
}
return(bst)
}
# save model or DMatrix to file
xgb.save <- function(handle, fname) {
if (typeof(fname) != "character") {
stop("xgb.save: fname must be character");
}
if (class(handle) == "xgb.Booster") {
.Call("XGBoosterSaveModel_R", handle, fname);
return(TRUE)
}
if (class(handle) == "xgb.DMatrix") {
.Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE))
return(TRUE)
}
stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
return(FALSE)
}
# predict
xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
if (class(booster) != "xgb.Booster") {
stop("xgb.predict: first argument must be type xgb.Booster")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.predict: second argument must be type xgb.DMatrix")
}
ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin))
return(ret)
}
# dump model
xgb.dump <- function(booster, fname, fmap = "") {
if (class(booster) != "xgb.Booster") {
stop("xgb.dump: first argument must be type xgb.Booster")
}
if (typeof(fname) != "character"){
stop("xgb.dump: second argument must be type character")
}
.Call("XGBoosterDumpModel_R", booster, fname, fmap)
return(TRUE)
}
##--------------------------------------
# the following are low-level iterative functions; they are not needed
# unless you want step-by-step control over training
# (a usage sketch follows after xgb.iter.eval below)
#---------------------------------------
# iteratively update booster with dtrain
xgb.iter.update <- function(booster, dtrain, iter) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain)
return(TRUE)
}
# iteratively update booster with customized gradient statistics (grad/hess)
xgb.iter.boost <- function(booster, dtrain, gpair) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess)
return(TRUE)
}
# iteratively evaluate one iteration
xgb.iter.eval <- function(booster, watchlist, iter) {
if (class(booster) != "xgb.Booster") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
if (typeof(watchlist) != "list") {
stop("xgb.eval: only accepts list of DMatrix as watchlist")
}
for (w in watchlist) {
if (class(w) != "xgb.DMatrix") {
stop("xgb.eval: watch list can only contain xgb.DMatrix")
}
}
evnames <- list()
if (length(watchlist) != 0) {
for (i in 1:length(watchlist)) {
w <- watchlist[i]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
evnames <- append(evnames, names(w))
}
}
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames)
return(msg)
}
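To make these low-level calls concrete, here is a minimal sketch of a hand-rolled training loop, equivalent to what xgb.train does for the default objective; param and dtrain are assumed to exist as in the demo above:

bst <- xgb.Booster(param, list(dtrain))
for (i in 1:2) {
# one boosting round on dtrain
succ <- xgb.iter.update(bst, dtrain, i - 1)
# evaluate and print metrics for the named watchlist
cat(xgb.iter.eval(bst, list(train = dtrain), i - 1), "\n")
}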

View File

@ -12,7 +12,7 @@ import scipy.sparse as scp
if os.name == 'nt': if os.name == 'nt':
XGBOOST_PATH = os.path.dirname(__file__)+'/../windows/x64/Release/xgboost_wrapper.dll' XGBOOST_PATH = os.path.dirname(__file__)+'/../windows/x64/Release/xgboost_wrapper.dll'
else: else:
XGBOOST_PATH = os.path.dirname(__file__)+'/../libxgboostwrapper.so' XGBOOST_PATH = os.path.dirname(__file__)+'/libxgboostwrapper.so'
# load in xgboost library # load in xgboost library
xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH) xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)

View File

@ -1,221 +0,0 @@
#include <vector>
#include <string>
#include <utility>
#include <cstring>
#include "xgboost_R.h"
#include "xgboost_wrapper.h"
#include "../src/utils/utils.h"
#include "../src/utils/omp.h"
#include "../src/utils/matrix_csr.h"
using namespace xgboost;
// implements error handling
namespace xgboost {
namespace utils {
void HandleAssertError(const char *msg) {
error("%s", msg);
}
void HandleCheckError(const char *msg) {
error("%s", msg);
}
} // namespace utils
} // namespace xgboost
extern "C" {
void _DMatrixFinalizer(SEXP ext) {
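// called by R's garbage collector when the external pointer is reclaimed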
if (R_ExternalPtrAddr(ext) == NULL) return;
XGDMatrixFree(R_ExternalPtrAddr(ext));
R_ClearExternalPtr(ext);
}
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent));
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP missing) {
SEXP dim = getAttrib(mat, R_DimSymbol);
int nrow = INTEGER(dim)[0];
int ncol = INTEGER(dim)[1];
double *din = REAL(mat);
std::vector<float> data(nrow * ncol);
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
for (int j = 0; j < ncol; ++j) {
data[i * ncol +j] = din[i + nrow * j];
}
}
void *handle = XGDMatrixCreateFromMat(&data[0], nrow, ncol, asReal(missing));
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data) {
const int *col_ptr = INTEGER(indptr);
const int *row_index = INTEGER(indices);
const double *col_data = REAL(data);
int ncol = length(indptr) - 1;
int ndata = length(data);
// transform into CSR format
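// two passes: first count entries per row (the budget), then scatter the (col, value) pairs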
std::vector<bst_ulong> row_ptr;
std::vector< std::pair<unsigned, float> > csr_data;
utils::SparseCSRMBuilder<std::pair<unsigned,float>, false, bst_ulong> builder(row_ptr, csr_data);
builder.InitBudget();
for (int i = 0; i < ncol; ++i) {
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.AddBudget(row_index[j]);
}
}
builder.InitStorage();
for (int i = 0; i < ncol; ++i) {
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.PushElem(row_index[j], std::make_pair(i, col_data[j]));
}
}
utils::Assert(csr_data.size() == static_cast<size_t>(ndata), "BUG CreateFromCSC");
std::vector<float> row_data(ndata);
std::vector<unsigned> col_index(ndata);
#pragma omp parallel for schedule(static)
for (int i = 0; i < ndata; ++i) {
col_index[i] = csr_data[i].first;
row_data[i] = csr_data[i].second;
}
void *handle = XGDMatrixCreateFromCSR(&row_ptr[0], &col_index[0], &row_data[0], row_ptr.size(), ndata );
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
CHAR(asChar(fname)), asInteger(silent));
}
void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
int len = length(array);
const char *name = CHAR(asChar(field));
if (!strcmp("group", name)) {
std::vector<unsigned> vec(len);
#pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) {
vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
}
XGDMatrixSetGroup(R_ExternalPtrAddr(handle), &vec[0], len);
return;
}
{
std::vector<float> vec(len);
#pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) {
vec[i] = REAL(array)[i];
}
XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)),
&vec[0], len);
}
}
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
bst_ulong olen;
const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)), &olen);
SEXP ret = PROTECT(allocVector(REALSXP, olen));
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
return ret;
}
// functions related to booster
void _BoosterFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
XGBoosterFree(R_ExternalPtrAddr(ext));
R_ClearExternalPtr(ext);
}
SEXP XGBoosterCreate_R(SEXP dmats) {
int len = length(dmats);
std::vector<void*> dvec;
for (int i = 0; i < len; ++i){
dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
}
void *handle = XGBoosterCreate(&dvec[0], dvec.size());
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
XGBoosterSetParam(R_ExternalPtrAddr(handle),
CHAR(asChar(name)),
CHAR(asChar(val)));
}
void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
R_ExternalPtrAddr(dtrain));
}
void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
utils::Check(length(grad) == length(hess), "gradient and hess must have same length");
int len = length(grad);
std::vector<float> tgrad(len), thess(len);
#pragma omp parallel for schedule(static)
for (int j = 0; j < len; ++j) {
tgrad[j] = REAL(grad)[j];
thess[j] = REAL(hess)[j];
}
XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dtrain),
&tgrad[0], &thess[0], len);
}
SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
utils::Check(length(dmats) == length(evnames), "dmats and evnames must have same length");
int len = length(dmats);
std::vector<void*> vec_dmats;
std::vector<std::string> vec_names;
std::vector<const char*> vec_sptr;
for (int i = 0; i < len; ++i) {
vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
vec_names.push_back(std::string(CHAR(asChar(VECTOR_ELT(evnames, i)))));
}
for (int i = 0; i < len; ++i) {
vec_sptr.push_back(vec_names[i].c_str());
}
return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
&vec_dmats[0], &vec_sptr[0], len));
}
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin) {
bst_ulong olen;
const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dmat),
asInteger(output_margin),
&olen);
SEXP ret = PROTECT(allocVector(REALSXP, olen));
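// copy the float predictions into a freshly allocated R numeric (double) vector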
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
return ret;
}
void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
}
void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
}
void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
bst_ulong olen;
const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
CHAR(asChar(fmap)),
&olen);
FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
for (size_t i = 0; i < olen; ++i) {
fprintf(fo, "booster[%u]:\n", static_cast<unsigned>(i));
fprintf(fo, "%s", res[i]);
}
fclose(fo);
}
}

View File

@ -1,124 +0,0 @@
#ifndef XGBOOST_WRAPPER_R_H_
#define XGBOOST_WRAPPER_R_H_
/*!
* \file xgboost_wrapper_R.h
* \author Tianqi Chen
* \brief R wrapper of xgboost
*/
extern "C" {
#include <Rinternals.h>
}
extern "C" {
/*!
* \brief load a data matrix
* \param fname name of the file to load
* \param silent whether print messages
* \return a loaded data matrix
*/
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
/*!
* \brief create matrix content from dense matrix
* This assumes the matrix is stored in column major format
* \param data R Matrix object
* \param missing which value to represent missing value
* \return created dmatrix
*/
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP missing);
/*!
* \brief create a matrix content from CSC format
* \param indptr pointer to column headers
* \param indices row indices
* \param data content of the data
* \return created dmatrix
*/
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data);
/*!
* \brief save a data matrix into a binary file
* \param handle an instance of data matrix
* \param fname file name
* \param silent whether to suppress printed statistics when saving
*/
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
/*!
* \brief set information to dmatrix
* \param handle an instance of data matrix
* \param field field name, can be label, weight
* \param array pointer to float vector
*/
void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);
/*!
* \brief get info vector from matrix
* \param handle an instance of data matrix
* \param field field name
* \return info vector
*/
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
/*!
* \brief create xgboost learner
* \param dmats a list of dmatrix handles that will be cached
*/
SEXP XGBoosterCreate_R(SEXP dmats);
/*!
* \brief set parameters
* \param handle handle
* \param name parameter name
* \param val value of parameter
*/
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val);
/*!
* \brief update the model in one round using dtrain
* \param handle handle
* \param iter current iteration rounds
* \param dtrain training data
*/
void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain);
/*!
* \brief update the model, by directly specify gradient and second order gradient,
* this can be used to replace UpdateOneIter, to support customized loss function
* \param handle handle
* \param dtrain training data
* \param grad gradient statistics
* \param hess second order gradient statistics
*/
void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess);
/*!
* \brief get evaluation statistics for xgboost
* \param handle handle
* \param iter current iteration rounds
* \param dmats list of handles to dmatrices
* \param evnames names of the evaluation sets
* \return the string containing evaluation statistics
*/
SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames);
/*!
* \brief make prediction based on dmat
* \param handle handle
* \param dmat data matrix
* \param output_margin whether to output only the raw margin value
*/
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin);
/*!
* \brief load model from existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterLoadModel_R(SEXP handle, SEXP fname);
/*!
* \brief save model into existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
/*!
* \brief dump model into text file
* \param handle handle
* \param fname file name to dump the model into
* \param fmap path to the feature map, can be an empty string
*/
void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap);
};
#endif // XGBOOST_WRAPPER_R_H_

View File

@ -1,4 +1,6 @@
// implementations in ctypes // implementations in ctypes
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <cstdio> #include <cstdio>
#include <vector> #include <vector>
#include <string> #include <string>
@ -16,7 +18,7 @@ using namespace xgboost::io;
namespace xgboost { namespace xgboost {
namespace wrapper { namespace wrapper {
// booster wrapper class // booster wrapper class
class Booster: public learner::BoostLearner<FMatrixS> { class Booster: public learner::BoostLearner {
public: public:
explicit Booster(const std::vector<DataMatrix*>& mats) { explicit Booster(const std::vector<DataMatrix*>& mats) {
this->silent = 1; this->silent = 1;
@ -25,8 +27,8 @@ class Booster: public learner::BoostLearner<FMatrixS> {
} }
const float *Pred(const DataMatrix &dmat, int output_margin, bst_ulong *len) { const float *Pred(const DataMatrix &dmat, int output_margin, bst_ulong *len) {
this->CheckInitModel(); this->CheckInitModel();
this->Predict(dmat, output_margin, &this->preds_); this->Predict(dmat, output_margin != 0, &this->preds_);
*len = this->preds_.size(); *len = static_cast<bst_ulong>(this->preds_.size());
return &this->preds_[0]; return &this->preds_[0];
} }
inline void BoostOneIter(const DataMatrix &train, inline void BoostOneIter(const DataMatrix &train,
@ -37,7 +39,7 @@ class Booster: public learner::BoostLearner<FMatrixS> {
for (bst_omp_uint j = 0; j < ndata; ++j) { for (bst_omp_uint j = 0; j < ndata; ++j) {
gpair_[j] = bst_gpair(grad[j], hess[j]); gpair_[j] = bst_gpair(grad[j], hess[j]);
} }
gbm_->DoBoost(train.fmat, train.info.info, &gpair_); gbm_->DoBoost(train.fmat(), train.info.info, &gpair_);
} }
inline void CheckInitModel(void) { inline void CheckInitModel(void) {
if (!init_model) { if (!init_model) {
@ -45,7 +47,7 @@ class Booster: public learner::BoostLearner<FMatrixS> {
} }
} }
inline void LoadModel(const char *fname) { inline void LoadModel(const char *fname) {
learner::BoostLearner<FMatrixS>::LoadModel(fname); learner::BoostLearner::LoadModel(fname);
this->init_model = true; this->init_model = true;
} }
inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, bst_ulong *len) { inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, bst_ulong *len) {
@ -54,7 +56,7 @@ class Booster: public learner::BoostLearner<FMatrixS> {
for (size_t i = 0; i < model_dump.size(); ++i) { for (size_t i = 0; i < model_dump.size(); ++i) {
model_dump_cptr[i] = model_dump[i].c_str(); model_dump_cptr[i] = model_dump[i].c_str();
} }
*len = model_dump.size(); *len = static_cast<bst_ulong>(model_dump.size());
return &model_dump_cptr[0]; return &model_dump_cptr[0];
} }
// temporal fields // temporal fields
@ -74,7 +76,7 @@ using namespace xgboost::wrapper;
extern "C"{ extern "C"{
void* XGDMatrixCreateFromFile(const char *fname, int silent) { void* XGDMatrixCreateFromFile(const char *fname, int silent) {
return LoadDataMatrix(fname, silent, false); return LoadDataMatrix(fname, silent != 0, false);
} }
void* XGDMatrixCreateFromCSR(const bst_ulong *indptr, void* XGDMatrixCreateFromCSR(const bst_ulong *indptr,
const unsigned *indices, const unsigned *indices,
@ -89,7 +91,7 @@ extern "C"{
} }
mat.row_data_.resize(nelem); mat.row_data_.resize(nelem);
for (bst_ulong i = 0; i < nelem; ++i) { for (bst_ulong i = 0; i < nelem; ++i) {
mat.row_data_[i] = SparseBatch::Entry(indices[i], data[i]); mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
mat.info.info.num_col = std::max(mat.info.info.num_col, mat.info.info.num_col = std::max(mat.info.info.num_col,
static_cast<size_t>(indices[i]+1)); static_cast<size_t>(indices[i]+1));
} }
@ -108,7 +110,7 @@ extern "C"{
bst_ulong nelem = 0; bst_ulong nelem = 0;
for (bst_ulong j = 0; j < ncol; ++j) { for (bst_ulong j = 0; j < ncol; ++j) {
if (data[j] != missing) { if (data[j] != missing) {
mat.row_data_.push_back(SparseBatch::Entry(j, data[j])); mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
++nelem; ++nelem;
} }
} }
@ -135,17 +137,17 @@ extern "C"{
ret.info.info.num_row = len; ret.info.info.num_row = len;
ret.info.info.num_col = src.info.num_col(); ret.info.info.num_col = src.info.num_col();
utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator(); utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
utils::Assert(iter->Next(), "slice"); utils::Assert(iter->Next(), "slice");
const SparseBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
for (bst_ulong i = 0; i < len; ++i) { for (bst_ulong i = 0; i < len; ++i) {
const int ridx = idxset[i]; const int ridx = idxset[i];
SparseBatch::Inst inst = batch[ridx]; RowBatch::Inst inst = batch[ridx];
utils::Check(static_cast<bst_ulong>(ridx) < batch.size, "slice index exceed number of rows"); utils::Check(static_cast<bst_ulong>(ridx) < batch.size, "slice index exceed number of rows");
ret.row_data_.resize(ret.row_data_.size() + inst.length); ret.row_data_.resize(ret.row_data_.size() + inst.length);
memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data, memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
sizeof(SparseBatch::Entry) * inst.length); sizeof(RowBatch::Entry) * inst.length);
ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length); ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length);
if (src.info.labels.size() != 0) { if (src.info.labels.size() != 0) {
ret.info.labels.push_back(src.info.labels[ridx]); ret.info.labels.push_back(src.info.labels[ridx]);
@ -156,6 +158,9 @@ extern "C"{
if (src.info.info.root_index.size() != 0) { if (src.info.info.root_index.size() != 0) {
ret.info.info.root_index.push_back(src.info.info.root_index[ridx]); ret.info.info.root_index.push_back(src.info.info.root_index[ridx]);
} }
if (src.info.info.fold_index.size() != 0) {
ret.info.info.fold_index.push_back(src.info.info.fold_index[ridx]);
}
} }
return p_ret; return p_ret;
} }
@ -163,7 +168,7 @@ extern "C"{
delete static_cast<DataMatrix*>(handle); delete static_cast<DataMatrix*>(handle);
} }
void XGDMatrixSaveBinary(void *handle, const char *fname, int silent) { void XGDMatrixSaveBinary(void *handle, const char *fname, int silent) {
SaveDataMatrix(*static_cast<DataMatrix*>(handle), fname, silent); SaveDataMatrix(*static_cast<DataMatrix*>(handle), fname, silent != 0);
} }
void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *info, bst_ulong len) { void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *info, bst_ulong len) {
std::vector<float> &vec = std::vector<float> &vec =
@ -181,24 +186,24 @@ extern "C"{
DataMatrix *pmat = static_cast<DataMatrix*>(handle); DataMatrix *pmat = static_cast<DataMatrix*>(handle);
pmat->info.group_ptr.resize(len + 1); pmat->info.group_ptr.resize(len + 1);
pmat->info.group_ptr[0] = 0; pmat->info.group_ptr[0] = 0;
for (bst_ulong i = 0; i < len; ++i) { for (uint64_t i = 0; i < len; ++i) {
pmat->info.group_ptr[i+1] = pmat->info.group_ptr[i]+group[i]; pmat->info.group_ptr[i+1] = pmat->info.group_ptr[i]+group[i];
} }
} }
const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, bst_ulong* len) { const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, bst_ulong* len) {
const std::vector<float> &vec = const std::vector<float> &vec =
static_cast<const DataMatrix*>(handle)->info.GetFloatInfo(field); static_cast<const DataMatrix*>(handle)->info.GetFloatInfo(field);
*len = vec.size(); *len = static_cast<bst_ulong>(vec.size());
return &vec[0]; return &vec[0];
} }
const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* len) { const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* len) {
const std::vector<unsigned> &vec = const std::vector<unsigned> &vec =
static_cast<const DataMatrix*>(handle)->info.GetUIntInfo(field); static_cast<const DataMatrix*>(handle)->info.GetUIntInfo(field);
*len = vec.size(); *len = static_cast<bst_ulong>(vec.size());
return &vec[0]; return &vec[0];
} }
bst_ulong XGDMatrixNumRow(const void *handle) { bst_ulong XGDMatrixNumRow(const void *handle) {
return static_cast<const DataMatrix*>(handle)->info.num_row(); return static_cast<bst_ulong>(static_cast<const DataMatrix*>(handle)->info.num_row());
} }
// xgboost implementation // xgboost implementation

View File

@ -15,6 +15,7 @@
// manually define unsigned long // manually define unsigned long
typedef unsigned long bst_ulong; typedef unsigned long bst_ulong;
extern "C" { extern "C" {
/*! /*!
* \brief load a data matrix * \brief load a data matrix