start unity refactor
This commit is contained in:
84
learner/dmatrix.h
Normal file
84
learner/dmatrix.h
Normal file
@@ -0,0 +1,84 @@
|
||||
#ifndef XGBOOST_LEARNER_DMATRIX_H_
|
||||
#define XGBOOST_LEARNER_DMATRIX_H_
|
||||
/*!
|
||||
* \file dmatrix.h
|
||||
* \brief meta data and template data structure
|
||||
* used for regression/classification/ranking
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include "../data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*!
|
||||
* \brief meta information needed in training, including label, weight
|
||||
*/
|
||||
struct MetaInfo {
|
||||
/*! \brief label of each instance */
|
||||
std::vector<float> labels;
|
||||
/*!
|
||||
* \brief the index of begin and end of a group
|
||||
* needed when the learning task is ranking
|
||||
*/
|
||||
std::vector<bst_uint> group_ptr;
|
||||
/*! \brief weights of each instance, optional */
|
||||
std::vector<float> weights;
|
||||
/*!
|
||||
* \brief specified root index of each instance,
|
||||
* can be used for multi task setting
|
||||
*/
|
||||
std::vector<unsigned> root_index;
|
||||
/*! \brief get weight of each instances */
|
||||
inline float GetWeight(size_t i) const {
|
||||
if(weights.size() != 0) {
|
||||
return weights[i];
|
||||
} else {
|
||||
return 1.0f;
|
||||
}
|
||||
}
|
||||
/*! \brief get root index of i-th instance */
|
||||
inline float GetRoot(size_t i) const {
|
||||
if(root_index.size() != 0) {
|
||||
return static_cast<float>(root_index[i]);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
inline void SaveBinary(utils::IStream &fo) {
|
||||
fo.Write(labels);
|
||||
fo.Write(group_ptr);
|
||||
fo.Write(weights);
|
||||
fo.Write(root_index);
|
||||
}
|
||||
inline void LoadBinary(utils::IStream &fi) {
|
||||
utils::Check(fi.Read(&labels), "MetaInfo: invalid format");
|
||||
utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format");
|
||||
utils::Check(fi.Read(&weights), "MetaInfo: invalid format");
|
||||
utils::Check(fi.Read(&root_index), "MetaInfo: invalid format");
|
||||
}
|
||||
};
|
||||
|
||||
/*!
|
||||
* \brief data object used for learning,
|
||||
* \tparam FMatrix type of feature data source
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
struct DMatrix {
|
||||
/*! \brief meta information about the dataset */
|
||||
MetaInfo info;
|
||||
/*! \brief number of rows in the DMatrix */
|
||||
size_t num_row;
|
||||
/*! \brief feature matrix about data content */
|
||||
FMatrix fmat;
|
||||
/*!
|
||||
* \brief cache pointer to verify if the data structure is cached in some learner
|
||||
* used to verify if DMatrix is cached
|
||||
*/
|
||||
void *cache_learner_ptr_;
|
||||
/*! \brief default constructor */
|
||||
DMatrix(void) : cache_learner_ptr_(NULL) {}
|
||||
};
|
||||
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_DMATRIX_H_
|
||||
346
learner/evaluation-inl.hpp
Normal file
346
learner/evaluation-inl.hpp
Normal file
@@ -0,0 +1,346 @@
|
||||
#ifndef XGBOOST_LEARNER_EVALUATION_INL_HPP_
|
||||
#define XGBOOST_LEARNER_EVALUATION_INL_HPP_
|
||||
/*!
|
||||
* \file evaluation-inl.hpp
|
||||
* \brief evaluation metrics for regression and classification and rank
|
||||
* \author Kailong Chen, Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include <climits>
|
||||
#include <algorithm>
|
||||
#include "./evaluation.h"
|
||||
#include "./helper_utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*!
|
||||
* \brief base class of elementwise evaluation
|
||||
* \tparam Derived the name of subclass
|
||||
*/
|
||||
template<typename Derived>
|
||||
struct EvalEWiseBase : public IEvaluator {
|
||||
virtual float Eval(const std::vector<float> &preds,
|
||||
const MetaInfo &info) const {
|
||||
utils::Check(preds.size() == info.labels.size(),
|
||||
"label and prediction size not match");
|
||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||
float sum = 0.0, wsum = 0.0;
|
||||
#pragma omp parallel for reduction(+:sum, wsum) schedule(static)
|
||||
for (unsigned i = 0; i < ndata; ++i) {
|
||||
const float wt = info.GetWeight(i);
|
||||
sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
|
||||
wsum += wt;
|
||||
}
|
||||
return Derived::GetFinal(sum, wsum);
|
||||
}
|
||||
/*!
|
||||
* \brief to be implemented by subclass,
|
||||
* get evaluation result from one row
|
||||
* \param label label of current instance
|
||||
* \param pred prediction value of current instance
|
||||
* \param weight weight of current instance
|
||||
*/
|
||||
inline static float EvalRow(float label, float pred);
|
||||
/*!
|
||||
* \brief to be overide by subclas, final trasnformation
|
||||
* \param esum the sum statistics returned by EvalRow
|
||||
* \param wsum sum of weight
|
||||
*/
|
||||
inline static float GetFinal(float esum, float wsum) {
|
||||
return esum / wsum;
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief RMSE */
|
||||
struct EvalRMSE : public EvalEWiseBase<EvalRMSE> {
|
||||
virtual const char *Name(void) const {
|
||||
return "rmse";
|
||||
}
|
||||
inline static float EvalRow(float label, float pred) {
|
||||
float diff = label - pred;
|
||||
return diff * diff;
|
||||
}
|
||||
inline static float GetFinal(float esum, float wsum) {
|
||||
return std::sqrt(esum / wsum);
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief logloss */
|
||||
struct EvalLogLoss : public EvalEWiseBase<EvalLogLoss> {
|
||||
virtual const char *Name(void) const {
|
||||
return "logloss";
|
||||
}
|
||||
inline static float EvalRow(float y, float py) {
|
||||
return - y * std::log(py) - (1.0f - y) * std::log(1 - py);
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief error */
|
||||
struct EvalError : public EvalEWiseBase<EvalError> {
|
||||
virtual const char *Name(void) const {
|
||||
return "error";
|
||||
}
|
||||
inline static float EvalRow(float label, float pred) {
|
||||
// assume label is in [0,1]
|
||||
return pred > 0.5f ? 1.0f - label : label;
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief match error */
|
||||
struct EvalMatchError : public EvalEWiseBase<EvalMatchError> {
|
||||
virtual const char *Name(void) const {
|
||||
return "merror";
|
||||
}
|
||||
inline static float EvalRow(float label, float pred) {
|
||||
return static_cast<int>(pred) != static_cast<int>(label);
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief AMS: also records best threshold */
|
||||
struct EvalAMS : public IEvaluator {
|
||||
public:
|
||||
explicit EvalAMS(const char *name) {
|
||||
name_ = name;
|
||||
// note: ams@0 will automatically select which ratio to go
|
||||
utils::Check(sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
|
||||
}
|
||||
virtual float Eval(const std::vector<float> &preds,
|
||||
const MetaInfo &info) const {
|
||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||
utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
|
||||
std::vector< std::pair<float, unsigned> > rec(ndata);
|
||||
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (unsigned i = 0; i < ndata; ++i) {
|
||||
rec[i] = std::make_pair(preds[i], i);
|
||||
}
|
||||
std::sort(rec.begin(), rec.end(), CmpFirst);
|
||||
unsigned ntop = static_cast<unsigned>(ratio_ * ndata);
|
||||
if (ntop == 0) ntop = ndata;
|
||||
const double br = 10.0;
|
||||
unsigned thresindex = 0;
|
||||
double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
|
||||
for (unsigned i = 0; i < ndata-1 && i < ntop; ++i) {
|
||||
const unsigned ridx = rec[i].second;
|
||||
const float wt = info.weights[ridx];
|
||||
if (info.labels[ridx] > 0.5f) {
|
||||
s_tp += wt;
|
||||
} else {
|
||||
b_fp += wt;
|
||||
}
|
||||
if (rec[i].first != rec[i+1].first) {
|
||||
double ams = sqrtf(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
|
||||
if (tams < ams) {
|
||||
thresindex = i;
|
||||
tams = ams;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ntop == ndata) {
|
||||
fprintf(stderr, "\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
|
||||
return tams;
|
||||
} else {
|
||||
return sqrtf(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
|
||||
}
|
||||
}
|
||||
virtual const char *Name(void) const {
|
||||
return name_.c_str();
|
||||
}
|
||||
|
||||
private:
|
||||
std::string name_;
|
||||
float ratio_;
|
||||
};
|
||||
|
||||
/*! \brief Area under curve, for both classification and rank */
|
||||
struct EvalAuc : public IEvaluator {
|
||||
virtual float Eval(const std::vector<float> &preds,
|
||||
const MetaInfo &info) const {
|
||||
utils::Check(preds.size() == info.labels.size(), "label size predict size not match");
|
||||
std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
|
||||
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
|
||||
utils::Check(gptr.back() == preds.size(),
|
||||
"EvalAuc: group structure must match number of prediction");
|
||||
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
|
||||
// sum statictis
|
||||
double sum_auc = 0.0f;
|
||||
#pragma omp parallel reduction(+:sum_auc)
|
||||
{
|
||||
// each thread takes a local rec
|
||||
std::vector< std::pair<float, unsigned> > rec;
|
||||
#pragma omp for schedule(static)
|
||||
for (unsigned k = 0; k < ngroup; ++k) {
|
||||
rec.clear();
|
||||
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
|
||||
rec.push_back(std::make_pair(preds[j], j));
|
||||
}
|
||||
std::sort(rec.begin(), rec.end(), CmpFirst);
|
||||
// calculate AUC
|
||||
double sum_pospair = 0.0;
|
||||
double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
|
||||
for (size_t j = 0; j < rec.size(); ++j) {
|
||||
const float wt = info.GetWeight(rec[j].second);
|
||||
const float ctr = info.labels[rec[j].second];
|
||||
// keep bucketing predictions in same bucket
|
||||
if (j != 0 && rec[j].first != rec[j - 1].first) {
|
||||
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
|
||||
sum_npos += buf_pos; sum_nneg += buf_neg;
|
||||
buf_neg = buf_pos = 0.0f;
|
||||
}
|
||||
buf_pos += ctr * wt; buf_neg += (1.0f - ctr) * wt;
|
||||
}
|
||||
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
|
||||
sum_npos += buf_pos; sum_nneg += buf_neg;
|
||||
// check weird conditions
|
||||
utils::Check(sum_npos > 0.0 && sum_nneg > 0.0,
|
||||
"AUC: the dataset only contains pos or neg samples");
|
||||
// this is the AUC
|
||||
sum_auc += sum_pospair / (sum_npos*sum_nneg);
|
||||
}
|
||||
}
|
||||
// return average AUC over list
|
||||
return static_cast<float>(sum_auc) / ngroup;
|
||||
}
|
||||
virtual const char *Name(void) const {
|
||||
return "auc";
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Evaluate rank list */
|
||||
struct EvalRankList : public IEvaluator {
|
||||
public:
|
||||
virtual float Eval(const std::vector<float> &preds,
|
||||
const MetaInfo &info) const {
|
||||
utils::Check(preds.size() == info.labels.size(),
|
||||
"label size predict size not match");
|
||||
const std::vector<unsigned> &gptr = info.group_ptr;
|
||||
utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
|
||||
utils::Assert(gptr.back() == preds.size(),
|
||||
"EvalRanklist: group structure must match number of prediction");
|
||||
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
|
||||
// sum statistics
|
||||
double sum_metric = 0.0f;
|
||||
#pragma omp parallel reduction(+:sum_metric)
|
||||
{
|
||||
// each thread takes a local rec
|
||||
std::vector< std::pair<float, unsigned> > rec;
|
||||
#pragma omp for schedule(static)
|
||||
for (unsigned k = 0; k < ngroup; ++k) {
|
||||
rec.clear();
|
||||
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
|
||||
rec.push_back(std::make_pair(preds[j], static_cast<int>(info.labels[j])));
|
||||
}
|
||||
sum_metric += this->EvalMetric(rec);
|
||||
}
|
||||
}
|
||||
return static_cast<float>(sum_metric) / ngroup;
|
||||
}
|
||||
virtual const char *Name(void) const {
|
||||
return name_.c_str();
|
||||
}
|
||||
|
||||
protected:
|
||||
explicit EvalRankList(const char *name) {
|
||||
name_ = name;
|
||||
minus_ = false;
|
||||
if (sscanf(name, "%*[^@]@%u[-]?", &topn_) != 1) {
|
||||
topn_ = UINT_MAX;
|
||||
}
|
||||
if (name[strlen(name) - 1] == '-') {
|
||||
minus_ = true;
|
||||
}
|
||||
}
|
||||
/*! \return evaluation metric, given the pair_sort record, (pred,label) */
|
||||
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &pair_sort) const = 0;
|
||||
|
||||
protected:
|
||||
unsigned topn_;
|
||||
std::string name_;
|
||||
bool minus_;
|
||||
};
|
||||
|
||||
/*! \brief Precison at N, for both classification and rank */
|
||||
struct EvalPrecision : public EvalRankList{
|
||||
public:
|
||||
explicit EvalPrecision(const char *name) : EvalRankList(name) {}
|
||||
|
||||
protected:
|
||||
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
|
||||
// calculate Preicsion
|
||||
std::sort(rec.begin(), rec.end(), CmpFirst);
|
||||
unsigned nhit = 0;
|
||||
for (size_t j = 0; j < rec.size() && j < this->topn_; ++j) {
|
||||
nhit += (rec[j].second != 0);
|
||||
}
|
||||
return static_cast<float>(nhit) / topn_;
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief NDCG */
|
||||
struct EvalNDCG : public EvalRankList{
|
||||
public:
|
||||
explicit EvalNDCG(const char *name) : EvalRankList(name) {}
|
||||
|
||||
protected:
|
||||
inline float CalcDCG(const std::vector< std::pair<float, unsigned> > &rec) const {
|
||||
double sumdcg = 0.0;
|
||||
for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) {
|
||||
const unsigned rel = rec[i].second;
|
||||
if (rel != 0) {
|
||||
sumdcg += ((1 << rel) - 1) / logf(i + 2);
|
||||
}
|
||||
}
|
||||
return static_cast<float>(sumdcg);
|
||||
}
|
||||
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
|
||||
std::stable_sort(rec.begin(), rec.end(), CmpFirst);
|
||||
float dcg = this->CalcDCG(rec);
|
||||
std::stable_sort(rec.begin(), rec.end(), CmpSecond);
|
||||
float idcg = this->CalcDCG(rec);
|
||||
if (idcg == 0.0f) {
|
||||
if (minus_) {
|
||||
return 0.0f;
|
||||
} else {
|
||||
return 1.0f;
|
||||
}
|
||||
}
|
||||
return dcg/idcg;
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Precison at N, for both classification and rank */
|
||||
struct EvalMAP : public EvalRankList {
|
||||
public:
|
||||
explicit EvalMAP(const char *name) : EvalRankList(name) {}
|
||||
|
||||
protected:
|
||||
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
|
||||
std::sort(rec.begin(), rec.end(), CmpFirst);
|
||||
unsigned nhits = 0;
|
||||
double sumap = 0.0;
|
||||
for (size_t i = 0; i < rec.size(); ++i) {
|
||||
if (rec[i].second != 0) {
|
||||
nhits += 1;
|
||||
if (i < this->topn_) {
|
||||
sumap += static_cast<float>(nhits) / (i+1);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nhits != 0) {
|
||||
sumap /= nhits;
|
||||
return static_cast<float>(sumap);
|
||||
} else {
|
||||
if (minus_) {
|
||||
return 0.0f;
|
||||
} else {
|
||||
return 1.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_EVALUATION_INL_HPP_
|
||||
82
learner/evaluation.h
Normal file
82
learner/evaluation.h
Normal file
@@ -0,0 +1,82 @@
|
||||
#ifndef XGBOOST_LEARNER_EVALUATION_H_
|
||||
#define XGBOOST_LEARNER_EVALUATION_H_
|
||||
/*!
|
||||
* \file evaluation.h
|
||||
* \brief interface of evaluation function supported in xgboost
|
||||
* \author Tianqi Chen, Kailong Chen
|
||||
*/
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "../utils/utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*! \brief evaluator that evaluates the loss metrics */
|
||||
struct IEvaluator{
|
||||
/*!
|
||||
* \brief evaluate a specific metric
|
||||
* \param preds prediction
|
||||
* \param info information, including label etc.
|
||||
*/
|
||||
virtual float Eval(const std::vector<float> &preds,
|
||||
const MetaInfo &info) const = 0;
|
||||
/*! \return name of metric */
|
||||
virtual const char *Name(void) const = 0;
|
||||
/*! \brief virtual destructor */
|
||||
virtual ~IEvaluator(void) {}
|
||||
};
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
|
||||
// include implementations of evaluation functions
|
||||
#include "evaluation-inl.hpp"
|
||||
// factory function
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*!
 * \brief create an evaluator from its metric name
 * \param name metric name: "rmse", "error", "merror", "logloss" and "auc" are
 *  matched exactly; "ams@", "pre@", "map" and "ndcg" are matched as prefixes
 *  and the full name is handed to the evaluator for parsing
 * \return newly allocated evaluator, owned by the caller; an unknown name is
 *  reported through utils::Error
 */
inline IEvaluator* CreateEvaluator(const char *name) {
  if (!strcmp(name, "rmse")) return new EvalRMSE();
  if (!strcmp(name, "error")) return new EvalError();
  if (!strcmp(name, "merror")) return new EvalMatchError();
  if (!strcmp(name, "logloss")) return new EvalLogLoss();
  if (!strcmp(name, "auc")) return new EvalAuc();
  if (!strncmp(name, "ams@", 4)) return new EvalAMS(name);
  if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
  if (!strncmp(name, "map", 3)) return new EvalMAP(name);
  // compare the whole "ndcg" prefix; a length of 3 would also accept any
  // name that merely starts with "ndc"
  if (!strncmp(name, "ndcg", 4)) return new EvalNDCG(name);
  utils::Error("unknown evaluation metric type: %s", name);
  return NULL;
}
|
||||
|
||||
/*! \brief a set of evaluators */
|
||||
class EvalSet{
|
||||
public:
|
||||
inline void AddEval(const char *name) {
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
if (!strcmp(name, evals_[i]->Name())) return;
|
||||
}
|
||||
evals_.push_back(CreateEvaluator(name));
|
||||
}
|
||||
~EvalSet(void) {
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
delete evals_[i];
|
||||
}
|
||||
}
|
||||
inline std::string Eval(const char *evname,
|
||||
const std::vector<float> &preds,
|
||||
const MetaInfo &info) const {
|
||||
std::string result = "";
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
float res = evals_[i]->Eval(preds, info);
|
||||
char tmp[1024];
|
||||
snprintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
|
||||
result += tmp;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<const IEvaluator*> evals_;
|
||||
};
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_EVALUATION_H_
|
||||
50
learner/helper_utils.h
Normal file
50
learner/helper_utils.h
Normal file
@@ -0,0 +1,50 @@
|
||||
#ifndef XGBOOST_LEARNER_HELPER_UTILS_H_
|
||||
#define XGBOOST_LEARNER_HELPER_UTILS_H_
|
||||
/*!
|
||||
* \file helper_utils.h
|
||||
* \brief useful helper functions
|
||||
* \author Tianqi Chen, Kailong Chen
|
||||
*/
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*!
 * \brief in-place softmax
 * \param p_rec pointer to the values to transform; on return every entry is
 *  exp(x - max) divided by the sum of those exponentials.
 *  An empty vector is left untouched.
 */
inline static void Softmax(std::vector<float>* p_rec) {
  std::vector<float> &rec = *p_rec;
  // nothing to normalize, and rec[0] below must not be read on an empty vector
  if (rec.empty()) return;
  float wmax = rec[0];
  for (size_t i = 1; i < rec.size(); ++i) {
    wmax = std::max(rec[i], wmax);
  }
  // subtracting the maximum keeps every exponent at or below zero
  double wsum = 0.0f;
  for (size_t i = 0; i < rec.size(); ++i) {
    rec[i] = std::exp(rec[i]-wmax);
    wsum += rec[i];
  }
  for (size_t i = 0; i < rec.size(); ++i) {
    rec[i] /= static_cast<float>(wsum);
  }
}
|
||||
/*!
 * \brief index of the largest entry
 * \param rec values to scan
 * \return index of the maximum; a later entry only replaces the current best
 *  when it exceeds it by more than 1e-6, and 0 is returned for an empty vector
 */
inline static int FindMaxIndex(const std::vector<float>& rec) {
  size_t best = 0;
  const size_t nrec = rec.size();
  for (size_t cur = 1; cur < nrec; ++cur) {
    if (!(rec[cur] > rec[best] + 1e-6f)) continue;
    best = cur;
  }
  return static_cast<int>(best);
}
|
||||
|
||||
/*! \brief ordering predicate: true when a's first field is greater than b's (descending order) */
inline static bool CmpFirst(const std::pair<float, unsigned> &a,
                            const std::pair<float, unsigned> &b) {
  return b.first < a.first;
}
|
||||
/*! \brief ordering predicate: true when a's second field is greater than b's (descending order) */
inline static bool CmpSecond(const std::pair<float, unsigned> &a,
                             const std::pair<float, unsigned> &b) {
  return b.second < a.second;
}
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_HELPER_UTILS_H_
|
||||
296
learner/learner-inl.hpp
Normal file
296
learner/learner-inl.hpp
Normal file
@@ -0,0 +1,296 @@
|
||||
#ifndef XGBOOST_LEARNER_LEARNER_INL_HPP_
|
||||
#define XGBOOST_LEARNER_LEARNER_INL_HPP_
|
||||
/*!
|
||||
* \file learner-inl.hpp
|
||||
* \brief learning algorithm
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include "./objective.h"
|
||||
#include "./evaluation.h"
|
||||
#include "../gbm/gbm.h"
|
||||
|
||||
namespace xgboost {
|
||||
/*! \brief namespace for learning algorithm */
|
||||
namespace learner {
|
||||
/*!
|
||||
* \brief learner that takes do gradient boosting on specific objective functions
|
||||
* and do training and prediction
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
class BoostLearner {
|
||||
public:
|
||||
BoostLearner(void) {
|
||||
obj_ = NULL;
|
||||
gbm_ = NULL;
|
||||
name_obj_ = "reg:linear";
|
||||
name_gbm_ = "gbtree";
|
||||
}
|
||||
~BoostLearner(void) {
|
||||
if (obj_ != NULL) delete obj_;
|
||||
if (gbm_ != NULL) delete gbm_;
|
||||
}
|
||||
/*!
|
||||
* \brief add internal cache space for mat, this can speedup prediction for matrix,
|
||||
* please cache prediction for training and eval data
|
||||
* warning: if the model is loaded from file from some previous training history
|
||||
* set cache data must be called with exactly SAME
|
||||
* data matrices to continue training otherwise it will cause error
|
||||
* \param mats array of pointers to matrix whose prediction result need to be cached
|
||||
*/
|
||||
inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) {
|
||||
// estimate feature bound
|
||||
unsigned num_feature = 0;
|
||||
// assign buffer index
|
||||
size_t buffer_size = 0;
|
||||
utils::Assert(cache_.size() == 0, "can only call cache data once");
|
||||
for (size_t i = 0; i < mats.size(); ++i) {
|
||||
bool dupilicate = false;
|
||||
for (size_t j = 0; j < i; ++j) {
|
||||
if (mats[i] == mats[j]) dupilicate = true;
|
||||
}
|
||||
if (dupilicate) continue;
|
||||
// set mats[i]'s cache learner pointer to this
|
||||
mats[i]->cache_learner_ptr_ = this;
|
||||
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->num_row));
|
||||
buffer_size += mats[i]->num_row;
|
||||
num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->num_col));
|
||||
}
|
||||
char str_temp[25];
|
||||
if (num_feature > mparam.num_feature) {
|
||||
snprintf(str_temp, sizeof(str_temp), "%u", num_feature);
|
||||
this->SetParam("bst:num_feature", str_temp);
|
||||
}
|
||||
snprintf(str_temp, sizeof(str_temp), "%lu", buffer_size);
|
||||
this->SetParam("num_pbuffer", str_temp);
|
||||
if (!silent) {
|
||||
printf("buffer_size=%ld\n", buffer_size);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp(name, "silent")) silent = atoi(val);
|
||||
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
|
||||
if (gbm_ == NULL) {
|
||||
if (!strcmp(name, "objective")) name_obj_ = val;
|
||||
if (!strcmp(name, "booster")) name_gbm_ = val;
|
||||
mparam.SetParam(name, val);
|
||||
}
|
||||
cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
|
||||
}
|
||||
/*!
|
||||
* \brief initialize the model
|
||||
*/
|
||||
inline void InitModel(void) {
|
||||
this->InitObjGBM();
|
||||
// adapt the base score
|
||||
mparam.base_score = obj_->ProbToMargin(mparam.base_score);
|
||||
gbm_->InitModel();
|
||||
}
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadModel(utils::IStream &fi) {
|
||||
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
|
||||
"BoostLearner: wrong model format");
|
||||
utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
|
||||
utils::Check(fi.Read(&name_gbm_), "BoostLearner: wrong model format");
|
||||
// delete existing gbm if any
|
||||
if (obj_ != NULL) delete obj_;
|
||||
if (gbm_ != NULL) delete gbm_;
|
||||
this->InitObjGBM();
|
||||
gbm_->LoadModel(fi);
|
||||
}
|
||||
/*!
|
||||
* \brief load model from file
|
||||
* \param fname file name
|
||||
*/
|
||||
inline void LoadModel(const char *fname) {
|
||||
utils::FileStream fi(utils::FopenCheck(fname, "rb"));
|
||||
this->LoadModel(fi);
|
||||
fi.Close();
|
||||
}
|
||||
inline void SaveModel(utils::IStream &fo) const {
|
||||
fo.Write(&mparam, sizeof(ModelParam));
|
||||
fo.Write(&name_obj_);
|
||||
fo.Write(&name_gbm_);
|
||||
gbm_->SaveModel(fo);
|
||||
}
|
||||
/*!
|
||||
* \brief save model into file
|
||||
* \param fname file name
|
||||
*/
|
||||
inline void SaveModel(const char *fname) const {
|
||||
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
|
||||
this->SaveModel(fo);
|
||||
fo.Close();
|
||||
}
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iter current iteration number
|
||||
* \param p_train pointer to the data matrix
|
||||
*/
|
||||
inline void UpdateOneIter(int iter, DMatrix<FMatrix> *p_train) {
|
||||
this->PredictRaw(preds_, *p_train);
|
||||
obj_->GetGradient(preds_, p_train->info, iter, &gpair_);
|
||||
gbm_->DoBoost(gpair_, p_train->fmat, p_train->info.root_index);
|
||||
}
|
||||
/*!
|
||||
* \brief evaluate the model for specific iteration
|
||||
* \param iter iteration number
|
||||
* \param evals datas i want to evaluate
|
||||
* \param evname name of each dataset
|
||||
* \return a string corresponding to the evaluation result
|
||||
*/
|
||||
inline std::string EvalOneIter(int iter,
|
||||
const std::vector<const DMatrix<FMatrix>*> &evals,
|
||||
const std::vector<std::string> &evname) {
|
||||
std::string res;
|
||||
char tmp[256];
|
||||
snprintf(tmp, sizeof(tmp), "[%d]", iter);
|
||||
res = tmp;
|
||||
for (size_t i = 0; i < evals.size(); ++i) {
|
||||
this->PredictRaw(*evals[i], &preds_);
|
||||
obj_->EvalTransform(&preds_);
|
||||
res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
/*!
|
||||
* \brief simple evaluation function, with a specified metric
|
||||
* \param data input data
|
||||
* \param metric name of metric
|
||||
* \return a pair of <evaluation name, result>
|
||||
*/
|
||||
std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) {
|
||||
if (metric == "auto") metric = obj_->DefaultEvalMetric();
|
||||
IEvaluator *ev = CreateEvaluator(metric.c_str());
|
||||
this->PredictRaw(data, &preds_);
|
||||
obj_->EvalTransform(&preds_);
|
||||
float res = ev->Eval(preds_, data.info);
|
||||
delete ev;
|
||||
return std::make_pair(metric, res);
|
||||
}
|
||||
/*!
|
||||
* \brief get prediction
|
||||
* \param data input data
|
||||
* \param out_preds output vector that stores the prediction
|
||||
*/
|
||||
inline void Predict(const DMatrix<FMatrix> &data,
|
||||
std::vector<float> *out_preds) const {
|
||||
this->PredictRaw(data, out_preds);
|
||||
obj_->PredTransform(out_preds);
|
||||
}
|
||||
|
||||
protected:
|
||||
/*!
|
||||
* \brief initialize the objective function and GBM,
|
||||
* if not yet done
|
||||
*/
|
||||
inline void InitObjGBM(void) {
|
||||
if (obj_ != NULL) return;
|
||||
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
|
||||
obj_ = CreateObjFunction(name_obj_.c_str());
|
||||
gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str());
|
||||
for (size_t i = 0; i < cfg_.size(); ++i) {
|
||||
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
|
||||
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
|
||||
}
|
||||
evaluator_.AddEval(obj_->DefaultEvalMetric());
|
||||
}
|
||||
/*!
|
||||
* \brief get un-transformed prediction
|
||||
* \param data training data matrix
|
||||
* \param out_preds output vector that stores the prediction
|
||||
*/
|
||||
inline void PredictRaw(const DMatrix<FMatrix> &data,
|
||||
std::vector<float> *out_preds) {
|
||||
gbm_->Predict(data.fmat, this->FindBufferOffset(data),
|
||||
data.info, out_preds);
|
||||
}
|
||||
|
||||
/*! \brief training parameter for regression */
|
||||
struct ModelParam{
|
||||
/* \brief global bias */
|
||||
float base_score;
|
||||
/* \brief number of features */
|
||||
unsigned num_feature;
|
||||
/* \brief number of class, if it is multi-class classification */
|
||||
int num_class;
|
||||
/*! \brief reserved field */
|
||||
int reserved[32];
|
||||
/*! \brief constructor */
|
||||
ModelParam(void) {
|
||||
base_score = 0.5f;
|
||||
num_feature = 0;
|
||||
num_class = 0;
|
||||
memset(reserved, 0, sizeof(reserved));
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp("base_score", name)) base_score = static_cast<float>(atof(val));
|
||||
if (!strcmp("num_class", name)) num_class = atoi(val);
|
||||
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
|
||||
}
|
||||
};
|
||||
// data fields
|
||||
// silent during training
|
||||
int silent;
|
||||
// evaluation set
|
||||
EvalSet evaluator_;
|
||||
// model parameter
|
||||
ModelParam mparam;
|
||||
// gbm model that back everything
|
||||
gbm::IGradBooster<FMatrix> *gbm_;
|
||||
// name of gbm model used for training
|
||||
std::string name_gbm_;
|
||||
// objective fnction
|
||||
IObjFunction *obj_;
|
||||
// name of objective function
|
||||
std::string name_obj_;
|
||||
// configurations
|
||||
std::vector< std::pair<std::string, std::string> > cfg_;
|
||||
// temporal storages for prediciton
|
||||
std::vector<float> preds_;
|
||||
// gradient pairs
|
||||
std::vector<bst_gpair> gpair_;
|
||||
|
||||
private:
|
||||
// cache entry object that helps handle feature caching
|
||||
struct CacheEntry {
|
||||
const DMatrix<FMatrix> *mat_;
|
||||
size_t buffer_offset_;
|
||||
size_t num_row_;
|
||||
CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row)
|
||||
:mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
|
||||
};
|
||||
// find internal bufer offset for certain matrix, if not exist, return -1
|
||||
inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const {
|
||||
for (size_t i = 0; i < cache_.size(); ++i) {
|
||||
if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
|
||||
if (cache_[i].num_row_ == mat.num_row) {
|
||||
return cache_[i].buffer_offset_;
|
||||
}
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
// data structure field
|
||||
/*! \brief the entries indicates that we have internal prediction cache */
|
||||
std::vector<CacheEntry> cache_;
|
||||
};
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_LEARNER_INL_HPP_
|
||||
137
learner/objective-inl.hpp
Normal file
137
learner/objective-inl.hpp
Normal file
@@ -0,0 +1,137 @@
|
||||
#ifndef XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
|
||||
#define XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
|
||||
/*!
|
||||
* \file objective-inl.hpp
|
||||
* \brief objective function implementations
|
||||
* \author Tianqi Chen, Kailong Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include "./objective.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*! \brief defines functions to calculate some commonly used functions */
|
||||
struct LossType {
|
||||
/*! \brief indicate which type we are using */
|
||||
int loss_type;
|
||||
// list of constants
|
||||
static const int kLinearSquare = 0;
|
||||
static const int kLogisticNeglik = 1;
|
||||
static const int kLogisticClassify = 2;
|
||||
static const int kLogisticRaw = 3;
|
||||
/*!
|
||||
* \brief transform the linear sum to prediction
|
||||
* \param x linear sum of boosting ensemble
|
||||
* \return transformed prediction
|
||||
*/
|
||||
inline float PredTransform(float x) const {
|
||||
switch (loss_type) {
|
||||
case kLogisticRaw:
|
||||
case kLinearSquare: return x;
|
||||
case kLogisticClassify:
|
||||
case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
|
||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief calculate first order gradient of loss, given transformed prediction
|
||||
* \param predt transformed prediction
|
||||
* \param label true label
|
||||
* \return first order gradient
|
||||
*/
|
||||
inline float FirstOrderGradient(float predt, float label) const {
|
||||
switch (loss_type) {
|
||||
case kLinearSquare: return predt - label;
|
||||
case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
|
||||
case kLogisticClassify:
|
||||
case kLogisticNeglik: return predt - label;
|
||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief calculate second order gradient of loss, given transformed prediction
|
||||
* \param predt transformed prediction
|
||||
* \param label true label
|
||||
* \return second order gradient
|
||||
*/
|
||||
inline float SecondOrderGradient(float predt, float label) const {
|
||||
switch (loss_type) {
|
||||
case kLinearSquare: return 1.0f;
|
||||
case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
|
||||
case kLogisticClassify:
|
||||
case kLogisticNeglik: return predt * (1 - predt);
|
||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief transform probability value back to margin
|
||||
*/
|
||||
inline float ProbToMargin(float base_score) const {
|
||||
if (loss_type == kLogisticRaw ||
|
||||
loss_type == kLogisticClassify ||
|
||||
loss_type == kLogisticNeglik ) {
|
||||
utils::Check(base_score > 0.0f && base_score < 1.0f,
|
||||
"base_score must be in (0,1) for logistic loss");
|
||||
base_score = -logf(1.0f / base_score - 1.0f);
|
||||
}
|
||||
return base_score;
|
||||
}
|
||||
/*! \brief get default evaluation metric for the objective */
|
||||
inline const char *DefaultEvalMetric(void) const {
|
||||
if (loss_type == kLogisticClassify) return "error";
|
||||
if (loss_type == kLogisticRaw) return "auc";
|
||||
return "rmse";
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief objective function that only need to */
|
||||
class RegLossObj : public IObjFunction{
|
||||
public:
|
||||
explicit RegLossObj(int loss_type) {
|
||||
loss.loss_type = loss_type;
|
||||
scale_pos_weight = 1.0f;
|
||||
}
|
||||
virtual ~RegLossObj(void) {}
|
||||
virtual void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp("scale_pos_weight", name)) {
|
||||
scale_pos_weight = static_cast<float>(atof(val));
|
||||
}
|
||||
}
|
||||
virtual void GetGradient(const std::vector<float>& preds,
|
||||
const MetaInfo &info,
|
||||
int iter,
|
||||
std::vector<bst_gpair> *out_gpair) {
|
||||
utils::Check(preds.size() == info.labels.size(),
|
||||
"labels are not correctly provided");
|
||||
std::vector<bst_gpair> &gpair = *out_gpair;
|
||||
gpair.resize(preds.size());
|
||||
// start calculating gradient
|
||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (unsigned j = 0; j < ndata; ++j) {
|
||||
float p = loss.PredTransform(preds[j]);
|
||||
float w = info.GetWeight(j);
|
||||
if (info.labels[j] == 1.0f) w *= scale_pos_weight;
|
||||
gpair[j] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
|
||||
loss.SecondOrderGradient(p, info.labels[j]) * w);
|
||||
}
|
||||
}
|
||||
virtual const char* DefaultEvalMetric(void) {
|
||||
return loss.DefaultEvalMetric();
|
||||
}
|
||||
virtual void PredTransform(std::vector<float> *io_preds) {
|
||||
std::vector<float> &preds = *io_preds;
|
||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (unsigned j = 0; j < ndata; ++j) {
|
||||
preds[j] = loss.PredTransform(preds[j]);
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
float scale_pos_weight;
|
||||
LossType loss;
|
||||
};
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
|
||||
80
learner/objective.h
Normal file
80
learner/objective.h
Normal file
@@ -0,0 +1,80 @@
|
||||
#ifndef XGBOOST_LEARNER_OBJECTIVE_H_
|
||||
#define XGBOOST_LEARNER_OBJECTIVE_H_
|
||||
/*!
|
||||
* \file objective.h
|
||||
* \brief interface of objective function used for gradient boosting
|
||||
* \author Tianqi Chen, Kailong Chen
|
||||
*/
|
||||
#include "dmatrix.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*! \brief interface of objective function */
|
||||
class IObjFunction{
|
||||
public:
|
||||
/*! \brief virtual destructor */
|
||||
virtual ~IObjFunction(void){}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
virtual void SetParam(const char *name, const char *val) = 0;
|
||||
/*!
|
||||
* \brief get gradient over each of predictions, given existing information
|
||||
* \param preds prediction of current round
|
||||
* \param info information about labels, weights, groups in rank
|
||||
* \param iter current iteration number
|
||||
* \param out_gpair output of get gradient, saves gradient and second order gradient in
|
||||
*/
|
||||
virtual void GetGradient(const std::vector<float>& preds,
|
||||
const MetaInfo &info,
|
||||
int iter,
|
||||
std::vector<bst_gpair> *out_gpair) = 0;
|
||||
/*! \return the default evaluation metric for the objective */
|
||||
virtual const char* DefaultEvalMetric(void) = 0;
|
||||
// the following functions are optional, most of time default implementation is good enough
|
||||
/*!
|
||||
* \brief transform prediction values, this is only called when Prediction is called
|
||||
* \param io_preds prediction values, saves to this vector as well
|
||||
*/
|
||||
virtual void PredTransform(std::vector<float> *io_preds){}
|
||||
/*!
|
||||
* \brief transform prediction values, this is only called when Eval is called,
|
||||
* usually it redirect to PredTransform
|
||||
* \param io_preds prediction values, saves to this vector as well
|
||||
*/
|
||||
virtual void EvalTransform(std::vector<float> *io_preds) {
|
||||
this->PredTransform(io_preds);
|
||||
}
|
||||
/*!
|
||||
* \brief transform probability value back to margin
|
||||
* this is used to transform user-set base_score back to margin
|
||||
* used by gradient boosting
|
||||
* \return transformed value
|
||||
*/
|
||||
virtual float ProbToMargin(float base_score) {
|
||||
return base_score;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
|
||||
// these are the implementations of the objective functions
|
||||
#include "objective-inl.hpp"
|
||||
// factory function
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*! \brief factory funciton to create objective function by name */
|
||||
inline IObjFunction* CreateObjFunction(const char *name) {
|
||||
if (!strcmp("reg:linear", name)) return new RegLossObj( LossType::kLinearSquare );
|
||||
if (!strcmp("reg:logistic", name)) return new RegLossObj( LossType::kLogisticNeglik );
|
||||
if (!strcmp("binary:logistic", name)) return new RegLossObj( LossType::kLogisticClassify );
|
||||
if (!strcmp("binary:logitraw", name)) return new RegLossObj( LossType::kLogisticRaw );
|
||||
utils::Error("unknown objective function type: %s", name);
|
||||
return NULL;
|
||||
}
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_OBJECTIVE_H_
|
||||
Reference in New Issue
Block a user