start unity refactor

This commit is contained in:
tqchen
2014-08-15 20:15:58 -07:00
parent 5b215742c2
commit 2a92c82b92
49 changed files with 3659 additions and 5803 deletions

84
learner/dmatrix.h Normal file
View File

@@ -0,0 +1,84 @@
#ifndef XGBOOST_LEARNER_DMATRIX_H_
#define XGBOOST_LEARNER_DMATRIX_H_
/*!
* \file dmatrix.h
* \brief meta data and template data structure
* used for regression/classification/ranking
* \author Tianqi Chen
*/
#include "../data.h"
namespace xgboost {
namespace learner {
/*!
* \brief meta information needed in training, including label, weight
*/
struct MetaInfo {
/*! \brief label of each instance */
std::vector<float> labels;
/*!
* \brief the index of begin and end of a group
* needed when the learning task is ranking
*/
std::vector<bst_uint> group_ptr;
/*! \brief weights of each instance, optional */
std::vector<float> weights;
/*!
* \brief specified root index of each instance,
* can be used for multi task setting
*/
std::vector<unsigned> root_index;
/*! \brief get weight of each instances */
inline float GetWeight(size_t i) const {
if(weights.size() != 0) {
return weights[i];
} else {
return 1.0f;
}
}
/*! \brief get root index of i-th instance */
inline float GetRoot(size_t i) const {
if(root_index.size() != 0) {
return static_cast<float>(root_index[i]);
} else {
return 0;
}
}
inline void SaveBinary(utils::IStream &fo) {
fo.Write(labels);
fo.Write(group_ptr);
fo.Write(weights);
fo.Write(root_index);
}
inline void LoadBinary(utils::IStream &fi) {
utils::Check(fi.Read(&labels), "MetaInfo: invalid format");
utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format");
utils::Check(fi.Read(&weights), "MetaInfo: invalid format");
utils::Check(fi.Read(&root_index), "MetaInfo: invalid format");
}
};
/*!
* \brief data object used for learning,
* \tparam FMatrix type of feature data source
*/
template<typename FMatrix>
struct DMatrix {
/*! \brief meta information about the dataset */
MetaInfo info;
/*! \brief number of rows in the DMatrix */
size_t num_row;
/*! \brief feature matrix about data content */
FMatrix fmat;
/*!
* \brief cache pointer to verify if the data structure is cached in some learner
* used to verify if DMatrix is cached
*/
void *cache_learner_ptr_;
/*! \brief default constructor */
DMatrix(void) : cache_learner_ptr_(NULL) {}
};
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_DMATRIX_H_

346
learner/evaluation-inl.hpp Normal file
View File

@@ -0,0 +1,346 @@
#ifndef XGBOOST_LEARNER_EVALUATION_INL_HPP_
#define XGBOOST_LEARNER_EVALUATION_INL_HPP_
/*!
* \file xgboost_evaluation-inl.hpp
* \brief evaluation metrics for regression and classification and rank
* \author Kailong Chen, Tianqi Chen
*/
#include <vector>
#include <utility>
#include <string>
#include <climits>
#include <algorithm>
#include "./evaluation.h"
#include "./helper_utils.h"
namespace xgboost {
namespace learner {
/*!
* \brief base class of elementwise evaluation
* \tparam Derived the name of subclass
*/
template<typename Derived>
struct EvalEWiseBase : public IEvaluator {
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(),
"label and prediction size not match");
const unsigned ndata = static_cast<unsigned>(preds.size());
float sum = 0.0, wsum = 0.0;
#pragma omp parallel for reduction(+:sum, wsum) schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
const float wt = info.GetWeight(i);
sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
wsum += wt;
}
return Derived::GetFinal(sum, wsum);
}
/*!
* \brief to be implemented by subclass,
* get evaluation result from one row
* \param label label of current instance
* \param pred prediction value of current instance
* \param weight weight of current instance
*/
inline static float EvalRow(float label, float pred);
/*!
* \brief to be overide by subclas, final trasnformation
* \param esum the sum statistics returned by EvalRow
* \param wsum sum of weight
*/
inline static float GetFinal(float esum, float wsum) {
return esum / wsum;
}
};
/*! \brief RMSE */
struct EvalRMSE : public EvalEWiseBase<EvalRMSE> {
virtual const char *Name(void) const {
return "rmse";
}
inline static float EvalRow(float label, float pred) {
float diff = label - pred;
return diff * diff;
}
inline static float GetFinal(float esum, float wsum) {
return std::sqrt(esum / wsum);
}
};
/*! \brief logloss */
struct EvalLogLoss : public EvalEWiseBase<EvalLogLoss> {
virtual const char *Name(void) const {
return "logloss";
}
inline static float EvalRow(float y, float py) {
return - y * std::log(py) - (1.0f - y) * std::log(1 - py);
}
};
/*! \brief error */
struct EvalError : public EvalEWiseBase<EvalError> {
virtual const char *Name(void) const {
return "error";
}
inline static float EvalRow(float label, float pred) {
// assume label is in [0,1]
return pred > 0.5f ? 1.0f - label : label;
}
};
/*! \brief match error */
struct EvalMatchError : public EvalEWiseBase<EvalMatchError> {
virtual const char *Name(void) const {
return "merror";
}
inline static float EvalRow(float label, float pred) {
return static_cast<int>(pred) != static_cast<int>(label);
}
};
/*! \brief AMS: also records best threshold */
struct EvalAMS : public IEvaluator {
public:
explicit EvalAMS(const char *name) {
name_ = name;
// note: ams@0 will automatically select which ratio to go
utils::Check(sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
}
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
const unsigned ndata = static_cast<unsigned>(preds.size());
utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
std::vector< std::pair<float, unsigned> > rec(ndata);
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
rec[i] = std::make_pair(preds[i], i);
}
std::sort(rec.begin(), rec.end(), CmpFirst);
unsigned ntop = static_cast<unsigned>(ratio_ * ndata);
if (ntop == 0) ntop = ndata;
const double br = 10.0;
unsigned thresindex = 0;
double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
for (unsigned i = 0; i < ndata-1 && i < ntop; ++i) {
const unsigned ridx = rec[i].second;
const float wt = info.weights[ridx];
if (info.labels[ridx] > 0.5f) {
s_tp += wt;
} else {
b_fp += wt;
}
if (rec[i].first != rec[i+1].first) {
double ams = sqrtf(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
if (tams < ams) {
thresindex = i;
tams = ams;
}
}
}
if (ntop == ndata) {
fprintf(stderr, "\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
return tams;
} else {
return sqrtf(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
}
}
virtual const char *Name(void) const {
return name_.c_str();
}
private:
std::string name_;
float ratio_;
};
/*! \brief Area under curve, for both classification and rank */
struct EvalAuc : public IEvaluator {
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(), "label size predict size not match");
std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
utils::Check(gptr.back() == preds.size(),
"EvalAuc: group structure must match number of prediction");
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
// sum statictis
double sum_auc = 0.0f;
#pragma omp parallel reduction(+:sum_auc)
{
// each thread takes a local rec
std::vector< std::pair<float, unsigned> > rec;
#pragma omp for schedule(static)
for (unsigned k = 0; k < ngroup; ++k) {
rec.clear();
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
rec.push_back(std::make_pair(preds[j], j));
}
std::sort(rec.begin(), rec.end(), CmpFirst);
// calculate AUC
double sum_pospair = 0.0;
double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
for (size_t j = 0; j < rec.size(); ++j) {
const float wt = info.GetWeight(rec[j].second);
const float ctr = info.labels[rec[j].second];
// keep bucketing predictions in same bucket
if (j != 0 && rec[j].first != rec[j - 1].first) {
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
sum_npos += buf_pos; sum_nneg += buf_neg;
buf_neg = buf_pos = 0.0f;
}
buf_pos += ctr * wt; buf_neg += (1.0f - ctr) * wt;
}
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
sum_npos += buf_pos; sum_nneg += buf_neg;
// check weird conditions
utils::Check(sum_npos > 0.0 && sum_nneg > 0.0,
"AUC: the dataset only contains pos or neg samples");
// this is the AUC
sum_auc += sum_pospair / (sum_npos*sum_nneg);
}
}
// return average AUC over list
return static_cast<float>(sum_auc) / ngroup;
}
virtual const char *Name(void) const {
return "auc";
}
};
/*! \brief Evaluate rank list */
struct EvalRankList : public IEvaluator {
public:
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(),
"label size predict size not match");
const std::vector<unsigned> &gptr = info.group_ptr;
utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
utils::Assert(gptr.back() == preds.size(),
"EvalRanklist: group structure must match number of prediction");
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
// sum statistics
double sum_metric = 0.0f;
#pragma omp parallel reduction(+:sum_metric)
{
// each thread takes a local rec
std::vector< std::pair<float, unsigned> > rec;
#pragma omp for schedule(static)
for (unsigned k = 0; k < ngroup; ++k) {
rec.clear();
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
rec.push_back(std::make_pair(preds[j], static_cast<int>(info.labels[j])));
}
sum_metric += this->EvalMetric(rec);
}
}
return static_cast<float>(sum_metric) / ngroup;
}
virtual const char *Name(void) const {
return name_.c_str();
}
protected:
explicit EvalRankList(const char *name) {
name_ = name;
minus_ = false;
if (sscanf(name, "%*[^@]@%u[-]?", &topn_) != 1) {
topn_ = UINT_MAX;
}
if (name[strlen(name) - 1] == '-') {
minus_ = true;
}
}
/*! \return evaluation metric, given the pair_sort record, (pred,label) */
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &pair_sort) const = 0;
protected:
unsigned topn_;
std::string name_;
bool minus_;
};
/*! \brief Precison at N, for both classification and rank */
struct EvalPrecision : public EvalRankList{
public:
explicit EvalPrecision(const char *name) : EvalRankList(name) {}
protected:
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
// calculate Preicsion
std::sort(rec.begin(), rec.end(), CmpFirst);
unsigned nhit = 0;
for (size_t j = 0; j < rec.size() && j < this->topn_; ++j) {
nhit += (rec[j].second != 0);
}
return static_cast<float>(nhit) / topn_;
}
};
/*! \brief NDCG */
struct EvalNDCG : public EvalRankList{
public:
explicit EvalNDCG(const char *name) : EvalRankList(name) {}
protected:
inline float CalcDCG(const std::vector< std::pair<float, unsigned> > &rec) const {
double sumdcg = 0.0;
for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) {
const unsigned rel = rec[i].second;
if (rel != 0) {
sumdcg += ((1 << rel) - 1) / logf(i + 2);
}
}
return static_cast<float>(sumdcg);
}
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
std::stable_sort(rec.begin(), rec.end(), CmpFirst);
float dcg = this->CalcDCG(rec);
std::stable_sort(rec.begin(), rec.end(), CmpSecond);
float idcg = this->CalcDCG(rec);
if (idcg == 0.0f) {
if (minus_) {
return 0.0f;
} else {
return 1.0f;
}
}
return dcg/idcg;
}
};
/*! \brief Precison at N, for both classification and rank */
struct EvalMAP : public EvalRankList {
public:
explicit EvalMAP(const char *name) : EvalRankList(name) {}
protected:
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
std::sort(rec.begin(), rec.end(), CmpFirst);
unsigned nhits = 0;
double sumap = 0.0;
for (size_t i = 0; i < rec.size(); ++i) {
if (rec[i].second != 0) {
nhits += 1;
if (i < this->topn_) {
sumap += static_cast<float>(nhits) / (i+1);
}
}
}
if (nhits != 0) {
sumap /= nhits;
return static_cast<float>(sumap);
} else {
if (minus_) {
return 0.0f;
} else {
return 1.0f;
}
}
}
};
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_EVALUATION_INL_HPP_

82
learner/evaluation.h Normal file
View File

@@ -0,0 +1,82 @@
#ifndef XGBOOST_LEARNER_EVALUATION_H_
#define XGBOOST_LEARNER_EVALUATION_H_
/*!
* \file evaluation.h
* \brief interface of evaluation function supported in xgboost
* \author Tianqi Chen, Kailong Chen
*/
#include <string>
#include <vector>
#include "../utils/utils.h"
namespace xgboost {
namespace learner {
/*! \brief evaluator that evaluates the loss metrics */
struct IEvaluator{
/*!
* \brief evaluate a specific metric
* \param preds prediction
* \param info information, including label etc.
*/
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const = 0;
/*! \return name of metric */
virtual const char *Name(void) const = 0;
/*! \brief virtual destructor */
virtual ~IEvaluator(void) {}
};
} // namespace learner
} // namespace xgboost
// include implementations of evaluation functions
#include "evaluation-inl.hpp"
// factory function
namespace xgboost {
namespace learner {
inline IEvaluator* CreateEvaluator(const char *name) {
if (!strcmp(name, "rmse")) return new EvalRMSE();
if (!strcmp(name, "error")) return new EvalError();
if (!strcmp(name, "merror")) return new EvalMatchError();
if (!strcmp(name, "logloss")) return new EvalLogLoss();
if (!strcmp(name, "auc")) return new EvalAuc();
if (!strncmp(name, "ams@",4)) return new EvalAMS(name);
if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
if (!strncmp(name, "map", 3)) return new EvalMAP(name);
if (!strncmp(name, "ndcg", 3)) return new EvalNDCG(name);
utils::Error("unknown evaluation metric type: %s", name);
return NULL;
}
/*! \brief a set of evaluators */
class EvalSet{
public:
inline void AddEval(const char *name) {
for (size_t i = 0; i < evals_.size(); ++i) {
if (!strcmp(name, evals_[i]->Name())) return;
}
evals_.push_back(CreateEvaluator(name));
}
~EvalSet(void) {
for (size_t i = 0; i < evals_.size(); ++i) {
delete evals_[i];
}
}
inline std::string Eval(const char *evname,
const std::vector<float> &preds,
const MetaInfo &info) const {
std::string result = "";
for (size_t i = 0; i < evals_.size(); ++i) {
float res = evals_[i]->Eval(preds, info);
char tmp[1024];
snprintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
result += tmp;
}
return result;
}
private:
std::vector<const IEvaluator*> evals_;
};
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_EVALUATION_H_

50
learner/helper_utils.h Normal file
View File

@@ -0,0 +1,50 @@
#ifndef XGBOOST_LEARNER_HELPER_UTILS_H_
#define XGBOOST_LEARNER_HELPER_UTILS_H_
/*!
* \file helper_utils.h
* \brief useful helper functions
* \author Tianqi Chen, Kailong Chen
*/
#include <utility>
#include <vector>
#include <algorithm>
namespace xgboost {
namespace learner {
// simple helper function to do softmax
inline static void Softmax(std::vector<float>* p_rec) {
std::vector<float> &rec = *p_rec;
float wmax = rec[0];
for (size_t i = 1; i < rec.size(); ++i) {
wmax = std::max(rec[i], wmax);
}
double wsum = 0.0f;
for (size_t i = 0; i < rec.size(); ++i) {
rec[i] = std::exp(rec[i]-wmax);
wsum += rec[i];
}
for (size_t i = 0; i < rec.size(); ++i) {
rec[i] /= static_cast<float>(wsum);
}
}
// simple helper function to do softmax
inline static int FindMaxIndex(const std::vector<float>& rec) {
size_t mxid = 0;
for (size_t i = 1; i < rec.size(); ++i) {
if (rec[i] > rec[mxid] + 1e-6f) {
mxid = i;
}
}
return static_cast<int>(mxid);
}
inline static bool CmpFirst(const std::pair<float, unsigned> &a,
const std::pair<float, unsigned> &b) {
return a.first > b.first;
}
inline static bool CmpSecond(const std::pair<float, unsigned> &a,
const std::pair<float, unsigned> &b) {
return a.second > b.second;
}
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_HELPER_UTILS_H_

296
learner/learner-inl.hpp Normal file
View File

@@ -0,0 +1,296 @@
#ifndef XGBOOST_LEARNER_LEARNER_INL_HPP_
#define XGBOOST_LEARNER_LEARNER_INL_HPP_
/*!
* \file learner-inl.hpp
* \brief learning algorithm
* \author Tianqi Chen
*/
#include <algorithm>
#include <vector>
#include <utility>
#include <string>
#include "./objective.h"
#include "./evaluation.h"
#include "../gbm/gbm.h"
namespace xgboost {
/*! \brief namespace for learning algorithm */
namespace learner {
/*!
* \brief learner that takes do gradient boosting on specific objective functions
* and do training and prediction
*/
template<typename FMatrix>
class BoostLearner {
public:
BoostLearner(void) {
obj_ = NULL;
gbm_ = NULL;
name_obj_ = "reg:linear";
name_gbm_ = "gbtree";
}
~BoostLearner(void) {
if (obj_ != NULL) delete obj_;
if (gbm_ != NULL) delete gbm_;
}
/*!
* \brief add internal cache space for mat, this can speedup prediction for matrix,
* please cache prediction for training and eval data
* warning: if the model is loaded from file from some previous training history
* set cache data must be called with exactly SAME
* data matrices to continue training otherwise it will cause error
* \param mats array of pointers to matrix whose prediction result need to be cached
*/
inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) {
// estimate feature bound
unsigned num_feature = 0;
// assign buffer index
size_t buffer_size = 0;
utils::Assert(cache_.size() == 0, "can only call cache data once");
for (size_t i = 0; i < mats.size(); ++i) {
bool dupilicate = false;
for (size_t j = 0; j < i; ++j) {
if (mats[i] == mats[j]) dupilicate = true;
}
if (dupilicate) continue;
// set mats[i]'s cache learner pointer to this
mats[i]->cache_learner_ptr_ = this;
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->num_row));
buffer_size += mats[i]->num_row;
num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->num_col));
}
char str_temp[25];
if (num_feature > mparam.num_feature) {
snprintf(str_temp, sizeof(str_temp), "%u", num_feature);
this->SetParam("bst:num_feature", str_temp);
}
snprintf(str_temp, sizeof(str_temp), "%lu", buffer_size);
this->SetParam("num_pbuffer", str_temp);
if (!silent) {
printf("buffer_size=%ld\n", buffer_size);
}
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
if (!strcmp(name, "silent")) silent = atoi(val);
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
if (gbm_ == NULL) {
if (!strcmp(name, "objective")) name_obj_ = val;
if (!strcmp(name, "booster")) name_gbm_ = val;
mparam.SetParam(name, val);
}
cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
}
/*!
* \brief initialize the model
*/
inline void InitModel(void) {
this->InitObjGBM();
// adapt the base score
mparam.base_score = obj_->ProbToMargin(mparam.base_score);
gbm_->InitModel();
}
/*!
* \brief load model from stream
* \param fi input stream
*/
inline void LoadModel(utils::IStream &fi) {
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"BoostLearner: wrong model format");
utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
utils::Check(fi.Read(&name_gbm_), "BoostLearner: wrong model format");
// delete existing gbm if any
if (obj_ != NULL) delete obj_;
if (gbm_ != NULL) delete gbm_;
this->InitObjGBM();
gbm_->LoadModel(fi);
}
/*!
* \brief load model from file
* \param fname file name
*/
inline void LoadModel(const char *fname) {
utils::FileStream fi(utils::FopenCheck(fname, "rb"));
this->LoadModel(fi);
fi.Close();
}
inline void SaveModel(utils::IStream &fo) const {
fo.Write(&mparam, sizeof(ModelParam));
fo.Write(&name_obj_);
fo.Write(&name_gbm_);
gbm_->SaveModel(fo);
}
/*!
* \brief save model into file
* \param fname file name
*/
inline void SaveModel(const char *fname) const {
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
this->SaveModel(fo);
fo.Close();
}
/*!
* \brief update the model for one iteration
* \param iter current iteration number
* \param p_train pointer to the data matrix
*/
inline void UpdateOneIter(int iter, DMatrix<FMatrix> *p_train) {
this->PredictRaw(preds_, *p_train);
obj_->GetGradient(preds_, p_train->info, iter, &gpair_);
gbm_->DoBoost(gpair_, p_train->fmat, p_train->info.root_index);
}
/*!
* \brief evaluate the model for specific iteration
* \param iter iteration number
* \param evals datas i want to evaluate
* \param evname name of each dataset
* \return a string corresponding to the evaluation result
*/
inline std::string EvalOneIter(int iter,
const std::vector<const DMatrix<FMatrix>*> &evals,
const std::vector<std::string> &evname) {
std::string res;
char tmp[256];
snprintf(tmp, sizeof(tmp), "[%d]", iter);
res = tmp;
for (size_t i = 0; i < evals.size(); ++i) {
this->PredictRaw(*evals[i], &preds_);
obj_->EvalTransform(&preds_);
res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info);
}
return res;
}
/*!
* \brief simple evaluation function, with a specified metric
* \param data input data
* \param metric name of metric
* \return a pair of <evaluation name, result>
*/
std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) {
if (metric == "auto") metric = obj_->DefaultEvalMetric();
IEvaluator *ev = CreateEvaluator(metric.c_str());
this->PredictRaw(data, &preds_);
obj_->EvalTransform(&preds_);
float res = ev->Eval(preds_, data.info);
delete ev;
return std::make_pair(metric, res);
}
/*!
* \brief get prediction
* \param data input data
* \param out_preds output vector that stores the prediction
*/
inline void Predict(const DMatrix<FMatrix> &data,
std::vector<float> *out_preds) const {
this->PredictRaw(data, out_preds);
obj_->PredTransform(out_preds);
}
protected:
/*!
* \brief initialize the objective function and GBM,
* if not yet done
*/
inline void InitObjGBM(void) {
if (obj_ != NULL) return;
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
obj_ = CreateObjFunction(name_obj_.c_str());
gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str());
for (size_t i = 0; i < cfg_.size(); ++i) {
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
}
evaluator_.AddEval(obj_->DefaultEvalMetric());
}
/*!
* \brief get un-transformed prediction
* \param data training data matrix
* \param out_preds output vector that stores the prediction
*/
inline void PredictRaw(const DMatrix<FMatrix> &data,
std::vector<float> *out_preds) {
gbm_->Predict(data.fmat, this->FindBufferOffset(data),
data.info, out_preds);
}
/*! \brief training parameter for regression */
struct ModelParam{
/* \brief global bias */
float base_score;
/* \brief number of features */
unsigned num_feature;
/* \brief number of class, if it is multi-class classification */
int num_class;
/*! \brief reserved field */
int reserved[32];
/*! \brief constructor */
ModelParam(void) {
base_score = 0.5f;
num_feature = 0;
num_class = 0;
memset(reserved, 0, sizeof(reserved));
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
if (!strcmp("base_score", name)) base_score = static_cast<float>(atof(val));
if (!strcmp("num_class", name)) num_class = atoi(val);
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
}
};
// data fields
// silent during training
int silent;
// evaluation set
EvalSet evaluator_;
// model parameter
ModelParam mparam;
// gbm model that back everything
gbm::IGradBooster<FMatrix> *gbm_;
// name of gbm model used for training
std::string name_gbm_;
// objective fnction
IObjFunction *obj_;
// name of objective function
std::string name_obj_;
// configurations
std::vector< std::pair<std::string, std::string> > cfg_;
// temporal storages for prediciton
std::vector<float> preds_;
// gradient pairs
std::vector<bst_gpair> gpair_;
private:
// cache entry object that helps handle feature caching
struct CacheEntry {
const DMatrix<FMatrix> *mat_;
size_t buffer_offset_;
size_t num_row_;
CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row)
:mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
};
// find internal bufer offset for certain matrix, if not exist, return -1
inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const {
for (size_t i = 0; i < cache_.size(); ++i) {
if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
if (cache_[i].num_row_ == mat.num_row) {
return cache_[i].buffer_offset_;
}
}
}
return -1;
}
// data structure field
/*! \brief the entries indicates that we have internal prediction cache */
std::vector<CacheEntry> cache_;
};
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_LEARNER_INL_HPP_

137
learner/objective-inl.hpp Normal file
View File

@@ -0,0 +1,137 @@
#ifndef XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
#define XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
/*!
* \file objective-inl.hpp
* \brief objective function implementations
* \author Tianqi Chen, Kailong Chen
*/
#include <vector>
#include "./objective.h"
namespace xgboost {
namespace learner {
/*! \brief defines functions to calculate some commonly used functions */
struct LossType {
/*! \brief indicate which type we are using */
int loss_type;
// list of constants
static const int kLinearSquare = 0;
static const int kLogisticNeglik = 1;
static const int kLogisticClassify = 2;
static const int kLogisticRaw = 3;
/*!
* \brief transform the linear sum to prediction
* \param x linear sum of boosting ensemble
* \return transformed prediction
*/
inline float PredTransform(float x) const {
switch (loss_type) {
case kLogisticRaw:
case kLinearSquare: return x;
case kLogisticClassify:
case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate first order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return first order gradient
*/
inline float FirstOrderGradient(float predt, float label) const {
switch (loss_type) {
case kLinearSquare: return predt - label;
case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
case kLogisticClassify:
case kLogisticNeglik: return predt - label;
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate second order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return second order gradient
*/
inline float SecondOrderGradient(float predt, float label) const {
switch (loss_type) {
case kLinearSquare: return 1.0f;
case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
case kLogisticClassify:
case kLogisticNeglik: return predt * (1 - predt);
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief transform probability value back to margin
*/
inline float ProbToMargin(float base_score) const {
if (loss_type == kLogisticRaw ||
loss_type == kLogisticClassify ||
loss_type == kLogisticNeglik ) {
utils::Check(base_score > 0.0f && base_score < 1.0f,
"base_score must be in (0,1) for logistic loss");
base_score = -logf(1.0f / base_score - 1.0f);
}
return base_score;
}
/*! \brief get default evaluation metric for the objective */
inline const char *DefaultEvalMetric(void) const {
if (loss_type == kLogisticClassify) return "error";
if (loss_type == kLogisticRaw) return "auc";
return "rmse";
}
};
/*! \brief objective function that only need to */
class RegLossObj : public IObjFunction{
public:
explicit RegLossObj(int loss_type) {
loss.loss_type = loss_type;
scale_pos_weight = 1.0f;
}
virtual ~RegLossObj(void) {}
virtual void SetParam(const char *name, const char *val) {
if (!strcmp("scale_pos_weight", name)) {
scale_pos_weight = static_cast<float>(atof(val));
}
}
virtual void GetGradient(const std::vector<float>& preds,
const MetaInfo &info,
int iter,
std::vector<bst_gpair> *out_gpair) {
utils::Check(preds.size() == info.labels.size(),
"labels are not correctly provided");
std::vector<bst_gpair> &gpair = *out_gpair;
gpair.resize(preds.size());
// start calculating gradient
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule(static)
for (unsigned j = 0; j < ndata; ++j) {
float p = loss.PredTransform(preds[j]);
float w = info.GetWeight(j);
if (info.labels[j] == 1.0f) w *= scale_pos_weight;
gpair[j] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
loss.SecondOrderGradient(p, info.labels[j]) * w);
}
}
virtual const char* DefaultEvalMetric(void) {
return loss.DefaultEvalMetric();
}
virtual void PredTransform(std::vector<float> *io_preds) {
std::vector<float> &preds = *io_preds;
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule(static)
for (unsigned j = 0; j < ndata; ++j) {
preds[j] = loss.PredTransform(preds[j]);
}
}
protected:
float scale_pos_weight;
LossType loss;
};
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_OBJECTIVE_INL_HPP_

80
learner/objective.h Normal file
View File

@@ -0,0 +1,80 @@
#ifndef XGBOOST_LEARNER_OBJECTIVE_H_
#define XGBOOST_LEARNER_OBJECTIVE_H_
/*!
* \file objective.h
* \brief interface of objective function used for gradient boosting
* \author Tianqi Chen, Kailong Chen
*/
#include "dmatrix.h"
namespace xgboost {
namespace learner {
/*! \brief interface of objective function */
class IObjFunction{
public:
/*! \brief virtual destructor */
virtual ~IObjFunction(void){}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
virtual void SetParam(const char *name, const char *val) = 0;
/*!
* \brief get gradient over each of predictions, given existing information
* \param preds prediction of current round
* \param info information about labels, weights, groups in rank
* \param iter current iteration number
* \param out_gpair output of get gradient, saves gradient and second order gradient in
*/
virtual void GetGradient(const std::vector<float>& preds,
const MetaInfo &info,
int iter,
std::vector<bst_gpair> *out_gpair) = 0;
/*! \return the default evaluation metric for the objective */
virtual const char* DefaultEvalMetric(void) = 0;
// the following functions are optional, most of time default implementation is good enough
/*!
* \brief transform prediction values, this is only called when Prediction is called
* \param io_preds prediction values, saves to this vector as well
*/
virtual void PredTransform(std::vector<float> *io_preds){}
/*!
* \brief transform prediction values, this is only called when Eval is called,
* usually it redirect to PredTransform
* \param io_preds prediction values, saves to this vector as well
*/
virtual void EvalTransform(std::vector<float> *io_preds) {
this->PredTransform(io_preds);
}
/*!
* \brief transform probability value back to margin
* this is used to transform user-set base_score back to margin
* used by gradient boosting
* \return transformed value
*/
virtual float ProbToMargin(float base_score) {
return base_score;
}
};
} // namespace learner
} // namespace xgboost
// this are implementations of objective functions
#include "objective-inl.hpp"
// factory function
namespace xgboost {
namespace learner {
/*! \brief factory funciton to create objective function by name */
inline IObjFunction* CreateObjFunction(const char *name) {
if (!strcmp("reg:linear", name)) return new RegLossObj( LossType::kLinearSquare );
if (!strcmp("reg:logistic", name)) return new RegLossObj( LossType::kLogisticNeglik );
if (!strcmp("binary:logistic", name)) return new RegLossObj( LossType::kLogisticClassify );
if (!strcmp("binary:logitraw", name)) return new RegLossObj( LossType::kLogisticRaw );
utils::Error("unknown objective function type: %s", name);
return NULL;
}
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_OBJECTIVE_H_