Merge branch 'unity'

Conflicts:
	.gitignore
	R-package/src/xgboost_R.cpp
	src/gbm/gblinear-inl.hpp
	tools/xgcombine_buffer.cpp
This commit is contained in:
tqchen
2015-01-18 20:09:21 -08:00
70 changed files with 6411 additions and 275 deletions

View File

@@ -34,7 +34,6 @@ xglib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float)
xglib.XGBoosterEvalOneIter.restype = ctypes.c_char_p
xglib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p)
def ctypes2numpy(cptr, length, dtype):
"""convert a ctypes pointer array to numpy array """
assert isinstance(cptr, ctypes.POINTER(ctypes.c_float))
@@ -304,6 +303,7 @@ class Booster:
(ctypes.c_float*len(grad))(*grad),
(ctypes.c_float*len(hess))(*hess),
len(grad))
def eval_set(self, evals, it = 0, feval = None):
"""evaluates by metric
Args:
@@ -332,24 +332,38 @@ class Booster:
return res
def eval(self, mat, name = 'eval', it = 0):
return self.eval_set( [(mat,name)], it)
def predict(self, data, output_margin=False, ntree_limit=0):
def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False):
"""
predict with data
Args:
data: DMatrix
the dmatrix storing the input
the dmatrix storing the input
output_margin: bool
whether output raw margin value that is untransformed
whether output raw margin value that is untransformed
ntree_limit: int
limit number of trees in prediction, default to 0, 0 means using all the trees
limit number of trees in prediction, default to 0, 0 means using all the trees
pred_leaf: bool
when this option is on, the output will be a matrix of shape (nsample, ntrees),
with each entry indicating the predicted leaf index of the corresponding sample in the corresponding tree.
Note that leaf indices are only unique within a tree, so you may find leaf 1 in both tree 1 and tree 0
Returns:
numpy array of prediction
"""
option_mask = 0
if output_margin:
option_mask += 1
if pred_leaf:
option_mask += 2
length = ctypes.c_ulong()
preds = xglib.XGBoosterPredict(self.handle, data.handle,
int(output_margin), ntree_limit, ctypes.byref(length))
return ctypes2numpy(preds, length.value, 'float32')
option_mask, ntree_limit, ctypes.byref(length))
preds = ctypes2numpy(preds, length.value, 'float32')
if pred_leaf:
preds = preds.astype('int32')
nrow = data.num_row()
if preds.size != nrow and preds.size % nrow == 0:
preds = preds.reshape(nrow, preds.size / nrow)
return preds
def save_model(self, fname):
""" save model to file
Args:
@@ -542,3 +556,4 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, metrics=[], \
sys.stderr.write(res+'\n')
results.append(res)
return results

View File

@@ -8,7 +8,9 @@
#include <algorithm>
// include all std functions
using namespace std;
#ifdef _MSC_VER
#define isnan(x) (_isnan(x) != 0)
#endif
#include "./xgboost_wrapper.h"
#include "../src/data.h"
#include "../src/learner/learner-inl.hpp"
@@ -30,9 +32,9 @@ class Booster: public learner::BoostLearner {
this->init_model = false;
this->SetCacheData(mats);
}
inline const float *Pred(const DataMatrix &dmat, int output_margin, unsigned ntree_limit, bst_ulong *len) {
inline const float *Pred(const DataMatrix &dmat, int option_mask, unsigned ntree_limit, bst_ulong *len) {
this->CheckInitModel();
this->Predict(dmat, output_margin != 0, &this->preds_, ntree_limit);
this->Predict(dmat, (option_mask&1) != 0, &this->preds_, ntree_limit, (option_mask&2) != 0);
*len = static_cast<bst_ulong>(this->preds_.size());
return BeginPtr(this->preds_);
}
@@ -44,7 +46,7 @@ class Booster: public learner::BoostLearner {
for (bst_omp_uint j = 0; j < ndata; ++j) {
gpair_[j] = bst_gpair(grad[j], hess[j]);
}
gbm_->DoBoost(train.fmat(), train.info.info, &gpair_);
gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
}
inline void CheckInitModel(void) {
if (!init_model) {
@@ -132,7 +134,7 @@ extern "C"{
bst_ulong nrow,
bst_ulong ncol,
float missing) {
bool nan_missing = std::isnan(missing);
bool nan_missing = isnan(missing);
DMatrixSimple *p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
mat.info.info.num_row = nrow;
@@ -140,7 +142,7 @@ extern "C"{
for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
bst_ulong nelem = 0;
for (bst_ulong j = 0; j < ncol; ++j) {
if (std::isnan(data[j])) {
if (isnan(data[j])) {
utils::Check(nan_missing, "There are NAN in the matrix, however, you did not set missing=NAN");
} else {
if (nan_missing || data[j] != missing) {
@@ -284,8 +286,8 @@ extern "C"{
bst->eval_str = bst->EvalOneIter(iter, mats, names);
return bst->eval_str.c_str();
}
const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, unsigned ntree_limit, bst_ulong *len) {
return static_cast<Booster*>(handle)->Pred(*static_cast<DataMatrix*>(dmat), output_margin, ntree_limit, len);
const float *XGBoosterPredict(void *handle, void *dmat, int option_mask, unsigned ntree_limit, bst_ulong *len) {
return static_cast<Booster*>(handle)->Pred(*static_cast<DataMatrix*>(dmat), option_mask, ntree_limit, len);
}
void XGBoosterLoadModel(void *handle, const char *fname) {
static_cast<Booster*>(handle)->LoadModel(fname);

View File

@@ -17,6 +17,28 @@ typedef unsigned long bst_ulong;
#ifdef __cplusplus
extern "C" {
#endif
/*!
* \brief initialize the sync module; this is needed when running in a distributed setting
*  normally, argv needs to contain master_uri and master_port;
*  if the job is started with the submit_job_tcp script, passing the args on to this function will do
* \param argc number of arguments
* \param argv the arguments to be passed to the sync module
*/
XGB_DLL void XGSyncInit(int argc, char *argv[]);
/*!
* \brief finalize the sync module; call this when everything is done
*/
XGB_DLL void XGSyncFinalize(void);
/*!
* \brief get the rank from the sync module
* \return the rank; NOTE(review): the original comment ended at "the rank of" --
*  presumably the rank of the calling process in the distributed job, confirm against the implementation
*/
XGB_DLL int XGSyncGetRank(void);
/*!
* \brief get the world size from the sync module
* \return the number of distributed jobs running in the group
*/
XGB_DLL int XGSyncGetWorldSize(void);
/*!
* \brief load a data matrix
* \return a loaded data matrix
@@ -41,7 +63,7 @@ extern "C" {
* \param col_ptr pointer to col headers
* \param indices findex
* \param data fvalue
* \param nindptr number of rows in the matrix + 1
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \return created dmatrix
*/
@@ -178,12 +200,18 @@ extern "C" {
* \brief make prediction based on dmat
* \param handle handle
* \param dmat data matrix
* \param output_margin whether only output raw margin value
* \param option_mask bit-mask of options taken in prediction, possible values
* 0:normal prediction
* 1:output margin instead of transformed value
* 2:output leaf index of trees instead of leaf value, note that leaf indices are only unique within a tree
* \param ntree_limit limit number of trees used for prediction, this is only valid for boosted trees
* when the parameter is set to 0, we will use all the trees
* \param len used to store length of returning result
*/
XGB_DLL const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, unsigned ntree_limit, bst_ulong *len);
XGB_DLL const float *XGBoosterPredict(void *handle, void *dmat,
int option_mask,
unsigned ntree_limit,
bst_ulong *len);
/*!
* \brief load model from existing file
* \param handle handle