Merge branch 'dev' of https://github.com/tqchen/xgboost into dev

commit 0794dd0f6f
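
The merged changes: the Makefile turns on -O3; the Python wrapper gains a ctypes2numpy helper that bulk-copies C buffers into numpy arrays and uses it in get_label and predict; the data matrix marks its column access as initialized after building it; a new EvalMatchError evaluator adds the merror and mabserror metrics; the softmax multi-class objective decodes negative labels, drops a redundant Softmax call before taking the argmax, and makes merror its default metric; and four misspelled "#pragma for" directives become "#pragma omp for".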

@@ -1,6 +1,6 @@
 export CC = gcc
 export CXX = g++
-export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fopenmp
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
 
 # specify tensor path
 SLIB = libxgboostpy.so

@@ -22,6 +22,13 @@ xglib.XGDMatrixGetLabel.restype = ctypes.POINTER( ctypes.c_float )
 xglib.XGDMatrixGetRow.restype = ctypes.POINTER( REntry )
 xglib.XGBoosterPredict.restype = ctypes.POINTER( ctypes.c_float )
 
+def ctypes2numpy( cptr, length ):
+    # convert a ctypes pointer array to numpy
+    assert isinstance( cptr, ctypes.POINTER( ctypes.c_float ) )
+    res = numpy.zeros( length, dtype='float32' )
+    assert ctypes.memmove( res.ctypes.data, cptr, length * res.strides[0] )
+    return res
+
 # data matrix used in xgboost
 class DMatrix:
     # constructor

@@ -73,7 +80,7 @@ class DMatrix:
     def get_label(self):
         length = ctypes.c_ulong()
         labels = xglib.XGDMatrixGetLabel(self.handle, ctypes.byref(length))
-        return numpy.array( [labels[i] for i in xrange(length.value)] )
+        return ctypes2numpy( labels, length.value )
     # clear everything
     def clear(self):
         xglib.XGDMatrixClear(self.handle)

@@ -138,7 +145,7 @@ class Booster:
     def predict(self, data, bst_group = -1):
         length = ctypes.c_ulong()
         preds = xglib.XGBoosterPredict( self.handle, data.handle, ctypes.byref(length), bst_group)
-        return numpy.array( [ preds[i] for i in xrange(length.value)])
+        return ctypes2numpy( preds, length.value )
     def save_model(self, fname):
         """ save model to file """
         xglib.XGBoosterSaveModel( self.handle, ctypes.c_char_p(fname) )
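
The three Python hunks above make one change: rather than building the result element by element with a Python list comprehension, get_label and predict now hand the raw C pointer to ctypes2numpy, which allocates a numpy array and fills it with a single bulk ctypes.memmove. Below is a runnable mock of the buffer-handoff protocol implied by the ctypes declarations; the names and signatures are illustrative, not the real C API:

// Mock of the pattern ctypes2numpy relies on: the "library" returns a
// pointer to an internal float buffer and reports its length through an
// out-parameter; the caller copies the data out in one memcpy, just as
// the Python side does with ctypes.memmove.
#include <cstdio>
#include <cstring>
#include <vector>

static float internal_buf[4] = {0.1f, 0.2f, 0.3f, 0.4f};

extern "C" const float *MockGetLabel(unsigned long *len) {
    *len = 4;             // report length through the out-parameter
    return internal_buf;  // the pointer stays owned by the library
}

int main(void) {
    unsigned long len = 0;
    const float *src = MockGetLabel(&len);
    std::vector<float> res(len);
    std::memcpy(res.data(), src, len * sizeof(float));  // one bulk copy
    for (unsigned long i = 0; i < len; ++i) std::printf("%g ", res[i]);
    std::printf("\n");
    return 0;
}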

@@ -75,6 +75,7 @@ namespace xgboost{
         inline void CheckInit(void){
             if(!init_col_){
                 this->data.InitData();
+                init_col_ = true;
             }
             utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix");
         }
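
The added line completes a lazy-initialization guard: without flipping the flag after the first call, CheckInit would rebuild the column data on every invocation. A minimal standalone sketch of the pattern (ColAccess is a stand-in type, not the real class):

// Lazy-init guard: InitData runs on the first CheckInit only.
#include <cstdio>

struct ColAccess {
    bool init_col_ = false;
    void InitData(void) { std::printf("building column access\n"); }
    inline void CheckInit(void) {
        if (!init_col_) {
            this->InitData();
            init_col_ = true;  // the line this commit adds
        }
    }
};

int main(void) {
    ColAccess c;
    c.CheckInit();  // builds once
    c.CheckInit();  // no-op afterwards
    return 0;
}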

@@ -283,6 +283,7 @@ namespace xgboost{
             #pragma omp parallel for schedule( static )
             for (unsigned j = 0; j < ndata; ++j){
                 preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j, data.info.GetRoot(j), bst_group );
+
             }
         }else
             #pragma omp parallel for schedule( static )

@@ -83,7 +83,7 @@ namespace xgboost{
             float sum = 0.0f, wsum = 0.0f;
             #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
             for (unsigned i = 0; i < ndata; ++i){
                 const float wt = info.GetWeight(i);
                 if (preds[i] > 0.5f){
                     if (info.labels[i] < 0.5f) sum += wt;
                 }

@@ -99,6 +99,39 @@ namespace xgboost{
             }
         };
 
+        /*! \brief match error, for multi-class classification */
+        struct EvalMatchError : public IEvaluator{
+        public:
+            EvalMatchError(const char *name){
+                name_ = name;
+                abs_ = 0;
+                if(!strcmp("mabserror", name)) abs_ = 1;
+            }
+            virtual float Eval(const std::vector<float> &preds,
+                               const DMatrix::Info &info) const {
+                const unsigned ndata = static_cast<unsigned>(preds.size());
+                float sum = 0.0f, wsum = 0.0f;
+                #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
+                for (unsigned i = 0; i < ndata; ++i){
+                    const float wt = info.GetWeight(i);
+                    int label = static_cast<int>(info.labels[i]);
+                    if( label < 0 && abs_ != 0 ) label = -label-1;
+                    if (static_cast<int>(preds[i]) != label ){
+                        sum += wt;
+                    }
+                    wsum += wt;
+                }
+                return sum / wsum;
+            }
+            virtual const char *Name(void) const{
+                return name_.c_str();
+            }
+            int abs_;
+            std::string name_;
+        };
+
 
         /*! \brief Area under curve, for both classification and rank */
         struct EvalAuc : public IEvaluator{
             virtual float Eval(const std::vector<float> &preds,

@@ -281,6 +314,8 @@ namespace xgboost{
             }
             if (!strcmp(name, "rmse")) evals_.push_back(new EvalRMSE());
             if (!strcmp(name, "error")) evals_.push_back(new EvalError());
+            if (!strcmp(name, "merror")) evals_.push_back(new EvalMatchError("merror"));
+            if (!strcmp(name, "mabserror")) evals_.push_back(new EvalMatchError("mabserror"));
             if (!strcmp(name, "logloss")) evals_.push_back(new EvalLogLoss());
             if (!strcmp(name, "auc")) evals_.push_back(new EvalAuc());
             if (!strncmp(name, "pre@", 4)) evals_.push_back(new EvalPrecision(name));
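
One evaluator backs both registered names: under merror the weighted fraction of mismatches between the integer-cast prediction and the label is reported as-is, while under mabserror a negative label l is first decoded as -l-1 before comparing (the same decoding the softmax objective applies below). A standalone toy check of the convention; MatchError here is a hypothetical serial re-implementation with unit weights, not library code:

// merror vs mabserror on a toy example: label -3 never matches under
// merror, but decodes to class 2 under mabserror.
#include <cstdio>
#include <vector>

static float MatchError(const std::vector<float> &preds,
                        const std::vector<float> &labels, bool abs_mode) {
    float sum = 0.0f, wsum = 0.0f;
    for (size_t i = 0; i < preds.size(); ++i) {
        int label = static_cast<int>(labels[i]);
        if (label < 0 && abs_mode) label = -label - 1;
        if (static_cast<int>(preds[i]) != label) sum += 1.0f;
        wsum += 1.0f;
    }
    return sum / wsum;
}

int main(void) {
    std::vector<float> preds  = {2.0f, 1.0f, 0.0f};
    std::vector<float> labels = {-3.0f, 1.0f, 2.0f};
    std::printf("merror=%g\n", MatchError(preds, labels, false));    // 2/3
    std::printf("mabserror=%g\n", MatchError(preds, labels, true));  // 1/3
    return 0;
}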

@@ -77,7 +77,7 @@ namespace xgboost{
             #pragma omp parallel
             {
                 std::vector< float > rec;
-                #pragma for schedule(static)
+                #pragma omp for schedule(static)
                 for (unsigned k = 0; k < ngroup; ++k){
                     rec.clear();
                     int nhit = 0;
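
This hunk and the three like it below fix the same bug: "#pragma for" is not an OpenMP directive, so the compiler drops it as an unknown pragma (and the -Wno-unknown-pragmas flag in the Makefile silences the warning that would have exposed it). Inside the surrounding omp parallel region the loop was therefore never work-shared: every thread ran all iterations, duplicating work and racing on shared state. Spelling it "#pragma omp for" restores the intended division of labor. A minimal standalone illustration, assuming a 4-thread OpenMP runtime:

// With the misspelled pragma, each of the 4 threads would execute the
// whole loop (32 increments, plus a data race on the shared counter);
// with "omp for" the 8 iterations are split across threads and the
// reduction totals them.
#include <cstdio>

int main(void) {
    int counted = 0;
    #pragma omp parallel num_threads(4)
    {
        #pragma omp for schedule(static) reduction(+:counted)
        for (int k = 0; k < 8; ++k) counted += 1;
    }
    std::printf("iterations executed: %d\n", counted);  // prints 8, not 32
    return 0;
}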

@@ -127,13 +127,16 @@ namespace xgboost{
             #pragma omp parallel
             {
                 std::vector<float> rec(nclass);
-                #pragma for schedule(static)
+                #pragma omp for schedule(static)
                 for (unsigned j = 0; j < ndata; ++j){
                     for( int k = 0; k < nclass; ++ k ){
                         rec[k] = preds[j + k * ndata];
                     }
                     Softmax( rec );
                     int label = static_cast<int>(info.labels[j]);
+                    if( label < 0 ){
+                        label = -label - 1;
+                    }
                     utils::Assert( label < nclass, "SoftmaxMultiClassObj: label exceed num_class" );
                     for( int k = 0; k < nclass; ++ k ){
                         float p = rec[ k ];

@@ -151,22 +154,22 @@ namespace xgboost{
             utils::Assert( nclass != 0, "must set num_class to use softmax" );
             utils::Assert( preds.size() % nclass == 0, "SoftmaxMultiClassObj: label size and pred size does not match" );
             const unsigned ndata = static_cast<unsigned>(preds.size()/nclass);
 
             #pragma omp parallel
             {
                 std::vector<float> rec(nclass);
-                #pragma for schedule(static)
+                #pragma omp for schedule(static)
                 for (unsigned j = 0; j < ndata; ++j){
                     for( int k = 0; k < nclass; ++ k ){
                         rec[k] = preds[j + k * ndata];
                     }
-                    Softmax( rec );
                     preds[j] = FindMaxIndex( rec );
                 }
             }
             preds.resize( ndata );
         }
         virtual const char* DefaultEvalMetric(void) {
-            return "error";
+            return "merror";
         }
     private:
         int nclass;
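
Two behavioral changes ride along with the pragma fix in this hunk. Dropping Softmax( rec ) before FindMaxIndex( rec ) is safe because softmax is strictly increasing, so it cannot change which class scores highest, and skipping the exponentials saves work when only the predicted class index is needed (the gradient hunk above still calls Softmax, since it needs the actual probabilities). And with the match-error metric now available, the objective's default evaluation switches from the binary "error" to "merror". A quick standalone check of the monotonicity argument, using throwaway code rather than the library's Softmax and FindMaxIndex:

// The index of the largest margin equals the index of the largest
// softmax probability, because exp() and the shared normalizer
// preserve ordering.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main(void) {
    std::vector<float> margin = {1.0f, 3.5f, 2.0f};
    std::vector<float> prob(margin.size());
    float norm = 0.0f;
    for (size_t k = 0; k < margin.size(); ++k) norm += (prob[k] = std::exp(margin[k]));
    for (size_t k = 0; k < prob.size(); ++k) prob[k] /= norm;
    size_t a = std::max_element(margin.begin(), margin.end()) - margin.begin();
    size_t b = std::max_element(prob.begin(), prob.end()) - prob.begin();
    std::printf("argmax(margin)=%zu argmax(softmax)=%zu\n", a, b);  // both 1
    return 0;
}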

@@ -203,7 +206,7 @@ namespace xgboost{
                 // each thread uses its own random number generator, seeded by thread id and current iteration
                 random::Random rnd; rnd.Seed( iter * 1111 + omp_get_thread_num() );
                 std::vector< std::pair<float,unsigned> > rec;
-                #pragma for schedule(static)
+                #pragma omp for schedule(static)
                 for (unsigned k = 0; k < ngroup; ++k){
                     rec.clear();
                     for(unsigned j = gptr[k]; j < gptr[k+1]; ++j ){

@@ -26,7 +26,9 @@ namespace xgboost{
         inline static int FindMaxIndex( std::vector<float>& rec ){
             size_t mxid = 0;
             for( size_t i = 1; i < rec.size(); ++ i ){
-                if( rec[i] > rec[mxid] ) mxid = i;
+                if( rec[i] > rec[mxid]+1e-6f ){
+                    mxid = i;
+                }
             }
             return (int)mxid;
         }
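
The added 1e-6f margin makes the argmax deterministic under floating-point noise: a later class must now beat the running best by more than 1e-6 to take over, so exact ties and near-ties always resolve to the smallest class index. A standalone illustration; FindMaxIndex is copied from the hunk above and the driver is throwaway test code:

// rec[1] exceeds rec[0] by only 1e-7, inside the tolerance, so the
// lower index wins; the old strict comparison would have returned 1.
#include <cstdio>
#include <vector>

inline static int FindMaxIndex( std::vector<float>& rec ){
    size_t mxid = 0;
    for( size_t i = 1; i < rec.size(); ++ i ){
        if( rec[i] > rec[mxid]+1e-6f ){
            mxid = i;
        }
    }
    return (int)mxid;
}

int main(void) {
    std::vector<float> rec = {0.5f, 0.5f + 1e-7f, 0.4f};
    std::printf("argmax = %d\n", FindMaxIndex(rec));  // prints 0
    return 0;
}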