Merge branch 'master' into unity

Conflicts:
	src/learner/evaluation-inl.hpp
	wrapper/xgboost_R.cpp
	wrapper/xgboost_wrapper.cpp
	wrapper/xgboost_wrapper.h
This commit is contained in:
tqchen
2014-08-26 20:32:07 -07:00
21 changed files with 169 additions and 146 deletions

View File

@@ -13,7 +13,7 @@ dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')
# specify parameters via map, definitions are the same as the C++ version
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# specify validations set to watch performance
evallist = [(dtest,'eval'), (dtrain,'train')]
@@ -75,7 +75,7 @@ print ('start running example to used cutomized objective function')
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1 }
param = {'max_depth':2, 'eta':1, 'silent':1 }
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
@@ -107,7 +107,7 @@ bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror)
#
print ('start running example to start from a initial prediction')
# specify parameters via map, definitions are the same as the C++ version
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# train xgboost for 1 round
bst = xgb.train( param, dtrain, 1, evallist )
# Note: we need the margin value instead of transformed prediction in set_base_margin

View File

@@ -62,9 +62,9 @@ extern "C" {
int ncol = length(indptr) - 1;
int ndata = length(data);
// transform into CSR format
std::vector<size_t> row_ptr;
std::vector<bst_ulong> row_ptr;
std::vector< std::pair<unsigned, float> > csr_data;
utils::SparseCSRMBuilder< std::pair<unsigned,float> > builder(row_ptr, csr_data);
utils::SparseCSRMBuilder<std::pair<unsigned,float>, false, bst_ulong> builder(row_ptr, csr_data);
builder.InitBudget();
for (int i = 0; i < ncol; ++i) {
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
@@ -119,7 +119,7 @@ extern "C" {
}
}
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
uint64_t olen;
bst_ulong olen;
const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)), &olen);
SEXP ret = PROTECT(allocVector(REALSXP, olen));
@@ -188,7 +188,7 @@ extern "C" {
&vec_dmats[0], &vec_sptr[0], len));
}
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin) {
uint64_t olen;
bst_ulong olen;
const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dmat),
asInteger(output_margin),
@@ -207,13 +207,13 @@ extern "C" {
XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
}
void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
uint64_t olen;
bst_ulong olen;
const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
CHAR(asChar(fmap)),
&olen);
FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
for (size_t i = 0; i < olen; ++i) {
fprintf(fo, "booster[%lu]:\n", i);
fprintf(fo, "booster[%u]:\n", static_cast<unsigned>(i));
fprintf(fo, "%s", res[i]);
}
fclose(fo);

View File

@@ -23,18 +23,18 @@ class Booster: public learner::BoostLearner<FMatrixS> {
this->init_model = false;
this->SetCacheData(mats);
}
const float *Pred(const DataMatrix &dmat, int output_margin, uint64_t *len) {
const float *Pred(const DataMatrix &dmat, int output_margin, bst_ulong *len) {
this->CheckInitModel();
this->Predict(dmat, output_margin, &this->preds_);
*len = this->preds_.size();
return &this->preds_[0];
}
inline void BoostOneIter(const DataMatrix &train,
float *grad, float *hess, uint64_t len) {
float *grad, float *hess, bst_ulong len) {
this->gpair_.resize(len);
const unsigned ndata = static_cast<unsigned>(len);
const bst_omp_uint ndata = static_cast<bst_omp_uint>(len);
#pragma omp parallel for schedule(static)
for (unsigned j = 0; j < ndata; ++j) {
for (bst_omp_uint j = 0; j < ndata; ++j) {
gpair_[j] = bst_gpair(grad[j], hess[j]);
}
gbm_->DoBoost(train.fmat, train.info.info, &gpair_);
@@ -48,7 +48,7 @@ class Booster: public learner::BoostLearner<FMatrixS> {
learner::BoostLearner<FMatrixS>::LoadModel(fname);
this->init_model = true;
}
inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, uint64_t *len) {
inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, bst_ulong *len) {
model_dump = this->DumpModel(fmap, with_stats);
model_dump_cptr.resize(model_dump.size());
for (size_t i = 0; i < model_dump.size(); ++i) {
@@ -76,19 +76,19 @@ extern "C"{
void* XGDMatrixCreateFromFile(const char *fname, int silent) {
return LoadDataMatrix(fname, silent, false);
}
void* XGDMatrixCreateFromCSR(const uint64_t *indptr,
void* XGDMatrixCreateFromCSR(const bst_ulong *indptr,
const unsigned *indices,
const float *data,
uint64_t nindptr,
uint64_t nelem) {
bst_ulong nindptr,
bst_ulong nelem) {
DMatrixSimple *p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
mat.row_ptr_.resize(nindptr);
for (uint64_t i = 0; i < nindptr; ++ i) {
for (bst_ulong i = 0; i < nindptr; ++i) {
mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
}
mat.row_data_.resize(nelem);
for (uint64_t i = 0; i < nelem; ++i) {
for (bst_ulong i = 0; i < nelem; ++i) {
mat.row_data_[i] = SparseBatch::Entry(indices[i], data[i]);
mat.info.info.num_col = std::max(mat.info.info.num_col,
static_cast<uint64_t>(indices[i]+1));
@@ -97,16 +97,16 @@ extern "C"{
return p_mat;
}
void* XGDMatrixCreateFromMat(const float *data,
uint64_t nrow,
uint64_t ncol,
bst_ulong nrow,
bst_ulong ncol,
float missing) {
DMatrixSimple *p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
mat.info.info.num_row = nrow;
mat.info.info.num_col = ncol;
for (uint64_t i = 0; i < nrow; ++i, data += ncol) {
uint64_t nelem = 0;
for (uint64_t j = 0; j < ncol; ++j) {
for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
bst_ulong nelem = 0;
for (bst_ulong j = 0; j < ncol; ++j) {
if (data[j] != missing) {
mat.row_data_.push_back(SparseBatch::Entry(j, data[j]));
++nelem;
@@ -118,7 +118,7 @@ extern "C"{
}
void* XGDMatrixSliceDMatrix(void *handle,
const int *idxset,
uint64_t len) {
bst_ulong len) {
DMatrixSimple tmp;
DataMatrix &dsrc = *static_cast<DataMatrix*>(handle);
if (dsrc.magic != DMatrixSimple::kMagic) {
@@ -139,10 +139,10 @@ extern "C"{
iter->BeforeFirst();
utils::Assert(iter->Next(), "slice");
const SparseBatch &batch = iter->Value();
for (uint64_t i = 0; i < len; ++i) {
for (bst_ulong i = 0; i < len; ++i) {
const int ridx = idxset[i];
SparseBatch::Inst inst = batch[ridx];
utils::Check(static_cast<uint64_t>(ridx) < batch.size, "slice index exceed number of rows");
utils::Check(static_cast<bst_ulong>(ridx) < batch.size, "slice index exceed number of rows");
ret.row_data_.resize(ret.row_data_.size() + inst.length);
memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
sizeof(SparseBatch::Entry) * inst.length);
@@ -168,13 +168,13 @@ extern "C"{
void XGDMatrixSaveBinary(void *handle, const char *fname, int silent) {
SaveDataMatrix(*static_cast<DataMatrix*>(handle), fname, silent);
}
void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *info, uint64_t len) {
void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *info, bst_ulong len) {
std::vector<float> &vec =
static_cast<DataMatrix*>(handle)->info.GetFloatInfo(field);
vec.resize(len);
memcpy(&vec[0], info, sizeof(float) * len);
}
void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *info, uint64_t len) {
void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *info, bst_ulong len) {
std::vector<unsigned> &vec =
static_cast<DataMatrix*>(handle)->info.GetUIntInfo(field);
vec.resize(len);
@@ -194,20 +194,20 @@ extern "C"{
*len = vec.size();
return &vec[0];
}
const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, uint64_t* len) {
const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* len) {
const std::vector<unsigned> &vec =
static_cast<const DataMatrix*>(handle)->info.GetUIntInfo(field);
*len = vec.size();
return &vec[0];
}
uint64_t XGDMatrixNumRow(const void *handle) {
bst_ulong XGDMatrixNumRow(const void *handle) {
return static_cast<const DataMatrix*>(handle)->info.num_row();
}
// xgboost implementation
void *XGBoosterCreate(void *dmats[], uint64_t len) {
void *XGBoosterCreate(void *dmats[], bst_ulong len) {
std::vector<DataMatrix*> mats;
for (uint64_t i = 0; i < len; ++i) {
for (bst_ulong i = 0; i < len; ++i) {
DataMatrix *dtr = static_cast<DataMatrix*>(dmats[i]);
mats.push_back(dtr);
}
@@ -227,7 +227,7 @@ extern "C"{
bst->UpdateOneIter(iter, *dtr);
}
void XGBoosterBoostOneIter(void *handle, void *dtrain,
float *grad, float *hess, uint64_t len) {
float *grad, float *hess, bst_ulong len) {
Booster *bst = static_cast<Booster*>(handle);
DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
bst->CheckInitModel();
@@ -235,11 +235,11 @@ extern "C"{
bst->BoostOneIter(*dtr, grad, hess, len);
}
const char* XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
const char *evnames[], uint64_t len) {
const char *evnames[], bst_ulong len) {
Booster *bst = static_cast<Booster*>(handle);
std::vector<std::string> names;
std::vector<const DataMatrix*> mats;
for (uint64_t i = 0; i < len; ++i) {
for (bst_ulong i = 0; i < len; ++i) {
mats.push_back(static_cast<DataMatrix*>(dmats[i]));
names.push_back(std::string(evnames[i]));
}
@@ -247,7 +247,7 @@ extern "C"{
bst->eval_str = bst->EvalOneIter(iter, mats, names);
return bst->eval_str.c_str();
}
const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, uint64_t *len) {
const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, bst_ulong *len) {
return static_cast<Booster*>(handle)->Pred(*static_cast<DataMatrix*>(dmat), output_margin, len);
}
void XGBoosterLoadModel(void *handle, const char *fname) {
@@ -256,7 +256,7 @@ extern "C"{
void XGBoosterSaveModel(const void *handle, const char *fname) {
static_cast<const Booster*>(handle)->SaveModel(fname);
}
const char** XGBoosterDumpModel(void *handle, const char *fmap, uint64_t *len){
const char** XGBoosterDumpModel(void *handle, const char *fmap, bst_ulong *len){
utils::FeatMap featmap;
if (strlen(fmap) != 0) {
featmap.LoadText(fmap);

View File

@@ -7,15 +7,17 @@
* can be used to create wrapper of other languages
*/
#include <cstdio>
// define uint64_t
typedef unsigned long uint64_t;
#define XGB_DLL
// manually define unsigned long
typedef unsigned long bst_ulong;
extern "C" {
/*!
* \brief load a data matrix
* \return a loaded data matrix
*/
void* XGDMatrixCreateFromFile(const char *fname, int silent);
XGB_DLL void* XGDMatrixCreateFromFile(const char *fname, int silent);
/*!
* \brief create a matrix content from csr format
* \param indptr pointer to row headers
@@ -25,11 +27,11 @@ extern "C" {
* \param nelem number of nonzero elements in the matrix
* \return created dmatrix
*/
void* XGDMatrixCreateFromCSR(const uint64_t *indptr,
const unsigned *indices,
const float *data,
uint64_t nindptr,
uint64_t nelem);
XGB_DLL void* XGDMatrixCreateFromCSR(const bst_ulong *indptr,
const unsigned *indices,
const float *data,
bst_ulong nindptr,
bst_ulong nelem);
/*!
* \brief create matrix content from dense matrix
* \param data pointer to the data space
@@ -38,10 +40,10 @@ extern "C" {
* \param missing which value to represent missing value
* \return created dmatrix
*/
void* XGDMatrixCreateFromMat(const float *data,
uint64_t nrow,
uint64_t ncol,
float missing);
XGB_DLL void* XGDMatrixCreateFromMat(const float *data,
bst_ulong nrow,
bst_ulong ncol,
float missing);
/*!
* \brief create a new dmatrix from sliced content of existing matrix
* \param handle instance of data matrix to be sliced
@@ -49,20 +51,20 @@ extern "C" {
* \param len length of index set
* \return a sliced new matrix
*/
void* XGDMatrixSliceDMatrix(void *handle,
const int *idxset,
uint64_t len);
XGB_DLL void* XGDMatrixSliceDMatrix(void *handle,
const int *idxset,
bst_ulong len);
/*!
* \brief free space in data matrix
*/
void XGDMatrixFree(void *handle);
XGB_DLL void XGDMatrixFree(void *handle);
/*!
* \brief load a data matrix into binary file
* \param handle a instance of data matrix
* \param fname file name
* \param silent print statistics when saving
*/
void XGDMatrixSaveBinary(void *handle, const char *fname, int silent);
XGB_DLL void XGDMatrixSaveBinary(void *handle, const char *fname, int silent);
/*!
* \brief set float vector to a content in info
* \param handle a instance of data matrix
@@ -70,7 +72,7 @@ extern "C" {
* \param array pointer to float vector
* \param len length of array
*/
void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *array, uint64_t len);
XGB_DLL void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *array, bst_ulong len);
/*!
* \brief set uint32 vector to a content in info
* \param handle a instance of data matrix
@@ -78,14 +80,14 @@ extern "C" {
* \param array pointer to float vector
* \param len length of array
*/
void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *array, uint64_t len);
XGB_DLL void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *array, bst_ulong len);
/*!
* \brief set label of the training matrix
* \param handle a instance of data matrix
* \param group pointer to group size
* \param len length of array
*/
void XGDMatrixSetGroup(void *handle, const unsigned *group, uint64_t len);
XGB_DLL void XGDMatrixSetGroup(void *handle, const unsigned *group, bst_ulong len);
/*!
* \brief get float info vector from matrix
* \param handle a instance of data matrix
@@ -93,7 +95,7 @@ extern "C" {
* \param out_len used to set result length
* \return pointer to the result
*/
const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, uint64_t* out_len);
XGB_DLL const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, bst_ulong* out_len);
/*!
* \brief get uint32 info vector from matrix
* \param handle a instance of data matrix
@@ -101,37 +103,37 @@ extern "C" {
* \param out_len used to set result length
* \return pointer to the result
*/
const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, uint64_t* out_len);
XGB_DLL const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* out_len);
/*!
* \brief return number of rows
*/
uint64_t XGDMatrixNumRow(const void *handle);
XGB_DLL bst_ulong XGDMatrixNumRow(const void *handle);
// --- start XGBoost class
/*!
* \brief create xgboost learner
* \param dmats matrices that are set to be cached
* \param len length of dmats
*/
void *XGBoosterCreate(void* dmats[], uint64_t len);
XGB_DLL void *XGBoosterCreate(void* dmats[], bst_ulong len);
/*!
* \brief free obj in handle
* \param handle handle to be freed
*/
void XGBoosterFree(void* handle);
XGB_DLL void XGBoosterFree(void* handle);
/*!
* \brief set parameters
* \param handle handle
* \param name parameter name
* \param val value of parameter
*/
void XGBoosterSetParam(void *handle, const char *name, const char *value);
XGB_DLL void XGBoosterSetParam(void *handle, const char *name, const char *value);
/*!
* \brief update the model in one round using dtrain
* \param handle handle
* \param iter current iteration rounds
* \param dtrain training data
*/
void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain);
XGB_DLL void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain);
/*!
* \brief update the model, by directly specify gradient and second order gradient,
* this can be used to replace UpdateOneIter, to support customized loss function
@@ -141,8 +143,8 @@ extern "C" {
* \param hess second order gradient statistics
* \param len length of grad/hess array
*/
void XGBoosterBoostOneIter(void *handle, void *dtrain,
float *grad, float *hess, uint64_t len);
XGB_DLL void XGBoosterBoostOneIter(void *handle, void *dtrain,
float *grad, float *hess, bst_ulong len);
/*!
* \brief get evaluation statistics for xgboost
* \param handle handle
@@ -152,8 +154,8 @@ extern "C" {
* \param len length of dmats
* \return the string containing evaluation stati
*/
const char *XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
const char *evnames[], uint64_t len);
XGB_DLL const char *XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
const char *evnames[], bst_ulong len);
/*!
* \brief make prediction based on dmat
* \param handle handle
@@ -161,19 +163,19 @@ extern "C" {
* \param output_margin whether only output raw margin value
* \param len used to store length of returning result
*/
const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, uint64_t *len);
XGB_DLL const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, bst_ulong *len);
/*!
* \brief load model from existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterLoadModel(void *handle, const char *fname);
XGB_DLL void XGBoosterLoadModel(void *handle, const char *fname);
/*!
* \brief save model into existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterSaveModel(const void *handle, const char *fname);
XGB_DLL void XGBoosterSaveModel(const void *handle, const char *fname);
/*!
* \brief dump model, return array of strings representing model dump
* \param handle handle
@@ -181,7 +183,7 @@ extern "C" {
* \param out_len length of output array
* \return char *data[], representing dump of each model
*/
const char **XGBoosterDumpModel(void *handle, const char *fmap,
uint64_t *out_len);
XGB_DLL const char **XGBoosterDumpModel(void *handle, const char *fmap,
bst_ulong *out_len);
};
#endif // XGBOOST_WRAPPER_H_