lambda rank added

This commit is contained in:
kalenhaha
2014-04-10 22:09:19 +08:00
parent a10f594644
commit c8b2f46b89
18 changed files with 1792 additions and 76 deletions

View File

@@ -174,7 +174,7 @@ namespace xgboost{
inline void InitLearner(void){
cfg.BeforeFirst();
while (cfg.Next()){
learner_->SetParam(cfg.name(), cfg.val());
learner_->SetParam(cfg.name(), cfg.val());
}
if (model_in != "NULL"){
utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb"));

View File

@@ -10,8 +10,8 @@
namespace xgboost{
namespace base{
/*! \brief data matrix for regression,classification,rank content */
struct DMatrix{
/*! \brief data matrix for regression, classification, rank content */
struct DMatrix{
public:
/*! \brief maximum feature dimension */
unsigned num_feature;
@@ -74,7 +74,7 @@ namespace xgboost{
}
inline void LoadGroup(const char* fgroup, bool silent = false){
//if exists group data load it in
//if exists group data load it in
FILE *file_group = fopen64(fgroup, "r");
if (file_group != NULL){
@@ -117,6 +117,7 @@ namespace xgboost{
LoadGroupBinary(fgroup,silent);
return true;
}
/*!
* \brief save to binary file
* \param fname name of binary data
@@ -139,7 +140,7 @@ namespace xgboost{
}
inline void SaveGroupBinary(const char* fgroup, bool silent = false){
//save group data
//save group data
if (group_index.size() > 0){
utils::FileStream file_group(utils::FopenCheck(fgroup, "wb"));
int group_index_size = group_index.size();
@@ -151,7 +152,7 @@ namespace xgboost{
}
inline void LoadGroupBinary(const char* fgroup, bool silent = false){
//if group data exists load it in
//if group data exists load it in
FILE *file_group = fopen64(fgroup, "r");
if (file_group != NULL){
int group_index_size = 0;
@@ -168,8 +169,8 @@ namespace xgboost{
}else{
if(!silent){printf("The binary file of group info not exists");}
}
}
}
/*!
* \brief cache load data given a file name, if filename ends with .buffer, direct load binary
* otherwise the function will first check if fname + '.buffer' exists,
@@ -207,9 +208,6 @@ namespace xgboost{
}
}
};
}
};

View File

@@ -144,17 +144,24 @@ namespace xgboost {
this->GetGradient(preds_, train_->labels, train_->group_index, grad_, hess_);
std::vector<unsigned> root_index;
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
// printf("xgboost_learner.h:UpdateOneIter\n");
// const unsigned ndata = static_cast<unsigned>(train_->Size());
// #pragma omp parallel for schedule( static )
// for (unsigned j = 0; j < ndata; ++j) {
// printf("haha:%d %f\n",j,base_gbm.Predict(train_->data, j, j));
// }
}
/*! \brief get untransformed prediction, without buffering */
inline void Predict(std::vector<float> &preds, const DMatrix &data) {
preds.resize(data.Size());
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j) {
preds[j] = base_gbm.Predict(data.data, j, -1);
}
preds[j] = base_gbm.Predict(data.data, j, -1);
}
}
public:
@@ -194,7 +201,7 @@ namespace xgboost {
inline void InteractPredict(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset) {
preds.resize(data.Size());
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j) {
preds[j] = base_gbm.InteractPredict(data.data, j, buffer_offset + j);
}
@@ -202,7 +209,7 @@ namespace xgboost {
/*! \brief repredict trial */
inline void InteractRePredict(const xgboost::base::DMatrix &data, unsigned buffer_offset) {
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j) {
base_gbm.InteractRePredict(data.data, j, buffer_offset + j);
}
@@ -212,10 +219,11 @@ namespace xgboost {
virtual inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset) {
preds.resize(data.Size());
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j) {
preds[j] = base_gbm.Predict(data.data, j, buffer_offset + j);
}
}
}
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
@@ -248,7 +256,7 @@ namespace xgboost {
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
    // each key is tested on its own; names that match nothing are ignored
    // (the "loss_type" assignment used to be listed twice; once is enough)
    if (!strcmp("loss_type", name)) loss_type = atoi(val);
    if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
}

View File

@@ -7,7 +7,7 @@
*/
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <vector>
#include "xgboost_sample.h"
#include "xgboost_rank_eval.h"
#include "../base/xgboost_data_instance.h"
@@ -71,11 +71,139 @@ namespace xgboost {
fprintf(fo, "\n");
}
inline void SetParam(const char *name, const char *val){
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
virtual inline void SetParam(const char *name, const char *val){
BoostLearner::SetParam(name,val);
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
if (!strcmp(name, "rank:sampler")) sampler.AssignSampler(atoi(val));
}
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
private:
/*!
 * \brief collect the instances of one group as (prediction, label, index) triples and sort them
 * \param preds predictions, indexed by instance
 * \param labels labels, indexed by instance
 * \param group_index group boundaries; group g covers [group_index[g], group_index[g+1])
 * \param group the group whose instances are collected
 * \return the triples of the group, ordered by Triplef1Comparer
 */
inline std::vector< Triple<float,float,int> > GetSortedTuple(const std::vector<float> &preds,
                                                             const std::vector<float> &labels,
                                                             const std::vector<int> &group_index,
                                                             int group){
    const int begin = group_index[group];
    const int end = group_index[group + 1];
    std::vector< Triple<float,float,int> > sorted_triple;
    // the group size is known up front, so allocate once instead of growing
    if (end > begin) sorted_triple.reserve(static_cast<size_t>(end - begin));
    for (int j = begin; j < end; j++){
        sorted_triple.push_back(Triple<float,float,int>(preds[j], labels[j], j));
    }
    // NOTE(review): if Triplef1Comparer orders ascending by prediction, position 0 holds
    // the lowest-scored instance -- confirm this is the rank order the lambda weights expect
    std::sort(sorted_triple.begin(), sorted_triple.end(), Triplef1Comparer);
    return sorted_triple;
}
inline std::vector<int> GetIndexMap(std::vector< Triple<float,float,int> > sorted_triple,int start){
std::vector<int> index_remap;
index_remap.resize(sorted_triple.size());
for(int i = 0; i < sorted_triple.size(); i++){
index_remap[sorted_triple[i].f3_-start] = i;
}
return index_remap;
}
inline float GetLambdaMAP(const std::vector< Triple<float,float,int> > sorted_triple,
int index1,int index2,
std::vector< Quadruple<float,float,float,float> > map_acc){
if(index1 > index2) std::swap(index1,index2);
float original = map_acc[index2].f1_;
if(index1 != 0) original -= map_acc[index1 - 1].f1_;
float changed = 0;
if(sorted_triple[index1].f2_ < sorted_triple[index2].f2_){
changed += map_acc[index2 - 1].f3_ - map_acc[index1].f3_;
changed += (map_acc[index1].f4_ + 1.0f)/(index1 + 1);
}else{
changed += map_acc[index2 - 1].f2_ - map_acc[index1].f2_;
changed += map_acc[index2].f4_/(index2 + 1);
}
float ans = (changed - original)/(map_acc[map_acc.size() - 1].f4_);
if(ans < 0) ans = -ans;
return ans;
}
/*!
 * \brief weight of a pair under the NDCG loss
 *
 *  Evaluates the two affected terms 2^label / log(position + 2) before and after
 *  exchanging the labels at the two positions and returns |before - after| / IDCG.
 *
 * \param sorted_triple (prediction, label, index) triples of the group in sorted order
 * \param index1 position of one instance in sorted_triple
 * \param index2 position of the other instance in sorted_triple
 * \param IDCG normaliser for the group
 * \return the non-negative weight; 0 when IDCG is 0
 */
inline float GetLambdaNDCG(const std::vector< Triple<float,float,int> > &sorted_triple,
                           int index1,
                           int index2, float IDCG){
    // read-only access: a const reference avoids copying the whole group for every pair
    // a zero normaliser would turn the weight into NaN/inf, so report "no weight" instead
    if (IDCG == 0.0f) return 0.0f;

    const double gain1 = std::pow(2.0, static_cast<double>(sorted_triple[index1].f2_));
    const double gain2 = std::pow(2.0, static_cast<double>(sorted_triple[index2].f2_));
    const double discount1 = std::log(static_cast<double>(index1 + 2));
    const double discount2 = std::log(static_cast<double>(index2 + 2));

    const float original = gain1 / discount1 + gain2 / discount2;
    const float changed = gain2 / discount1 + gain1 / discount2;

    float ans = (original - changed) / IDCG;
    if (ans < 0) ans = -ans;
    return ans;
}
/*!
 * \brief DCG of the group's labels after sorting them in decreasing order
 * \param sorted_triple triples of one group; only the label field f2_ is read
 * \return EvalNDCG::DCG applied to the labels sorted from largest to smallest
 */
inline float GetIDCG(const std::vector< Triple<float,float,int> > &sorted_triple){
    // const reference instead of a by-value copy; the labels are copied out below anyway
    std::vector<float> labels;
    labels.reserve(sorted_triple.size());
    for (size_t i = 0; i < sorted_triple.size(); ++i){
        labels.push_back(sorted_triple[i].f2_);
    }
    std::sort(labels.begin(), labels.end(), std::greater<float>());
    return EvalNDCG::DCG(labels);
}
/*!
 * \brief running accumulators over the sorted list of one group
 *
 *  An instance whose label equals 1 counts as a hit. Entry i of the result holds, for
 *  positions 0..i:
 *   f1_ sum over hits of  hit_count      / (position + 1)
 *   f2_ sum over hits of (hit_count - 1) / (position + 1)
 *   f3_ sum over hits of (hit_count + 1) / (position + 1)
 *   f4_ number of hits so far
 *
 * \param sorted_triple (prediction, label, index) triples of the group in sorted order
 * \return one accumulator entry per position
 */
inline std::vector< Quadruple<float,float,float,float> > GetMAPAcc(const std::vector< Triple<float,float,int> > &sorted_triple){
    // const reference: the input is only read, so the by-value copy was unnecessary
    std::vector< Quadruple<float,float,float,float> > map_acc;
    map_acc.reserve(sorted_triple.size());
    float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0;
    for (size_t i = 0; i < sorted_triple.size(); ++i){
        if (sorted_triple[i].f2_ == 1) {
            hit++;
            acc1 += hit / (i + 1);
            acc2 += (hit - 1) / (i + 1);
            acc3 += (hit + 1) / (i + 1);
        }
        map_acc.push_back(Quadruple<float,float,float,float>(acc1, acc2, acc3, hit));
    }
    return map_acc;
}
inline void GetGroupGradient(const std::vector<float> &preds,
const std::vector<float> &labels,
const std::vector<int> &group_index,
std::vector<float> &grad,
std::vector<float> &hess,
const std::vector< Triple<float,float,int> > sorted_triple,
const std::vector<int> index_remap,
const sample::Pairs& pairs,
int group){
bool j_better;
float IDCG, pred_diff, pred_diff_exp, delta;
float first_order_gradient, second_order_gradient;
std::vector< Quadruple<float,float,float,float> > map_acc;
if(mparam.loss_type == NDCG){
IDCG = GetIDCG(sorted_triple);
}else if(mparam.loss_type == MAP){
map_acc = GetMAPAcc(sorted_triple);
}
for (int j = group_index[group]; j < group_index[group + 1]; j++){
std::vector<int> pair_instance = pairs.GetPairs(j);
for (int k = 0; k < pair_instance.size(); k++){
j_better = labels[j] > labels[pair_instance[k]];
if (j_better){
switch(mparam.loss_type){
case PAIRWISE: delta = 1.0;break;
case MAP: delta = GetLambdaMAP(sorted_triple,index_remap[j - group_index[group]],index_remap[pair_instance[k]-group_index[group]],map_acc);break;
case NDCG: delta = GetLambdaNDCG(sorted_triple,index_remap[j - group_index[group]],index_remap[pair_instance[k]-group_index[group]],IDCG);break;
default: utils::Error("Cannot find the specified loss type");
}
pred_diff = preds[preds[j] - pair_instance[k]];
pred_diff_exp = j_better ? expf(-pred_diff) : expf(pred_diff);
first_order_gradient = delta * FirstOrderGradient(pred_diff_exp);
second_order_gradient = 2 * delta * SecondOrderGradient(pred_diff_exp);
hess[j] += second_order_gradient;
grad[j] += first_order_gradient;
hess[pair_instance[k]] += second_order_gradient;
grad[pair_instance[k]] += -first_order_gradient;
}
}
}
}
public:
/*! \brief get the first order and second order gradient, given the
* untransformed predictions and labels */
inline void GetGradient(const std::vector<float> &preds,
const std::vector<float> &labels,
const std::vector<int> &group_index,
@@ -83,32 +211,44 @@ namespace xgboost {
std::vector<float> &hess) {
grad.resize(preds.size());
hess.resize(preds.size());
bool j_better;
float pred_diff, pred_diff_exp, first_order_gradient, second_order_gradient;
for (int i = 0; i < group_index.size() - 1; i++){
sample::Pairs pairs = sampler.GenPairs(preds, labels, group_index[i], group_index[i + 1]);
for (int j = group_index[i]; j < group_index[i + 1]; j++){
std::vector<int> pair_instance = pairs.GetPairs(j);
for (int k = 0; k < pair_instance.size(); k++){
j_better = labels[j] > labels[pair_instance[k]];
if (j_better){
pred_diff = preds[preds[j] - pair_instance[k]];
pred_diff_exp = j_better ? expf(-pred_diff) : expf(pred_diff);
first_order_gradient = FirstOrderGradient(pred_diff_exp);
second_order_gradient = 2 * SecondOrderGradient(pred_diff_exp);
hess[j] += second_order_gradient;
grad[j] += first_order_gradient;
hess[pair_instance[k]] += second_order_gradient;
grad[pair_instance[k]] += -first_order_gradient;
}
}
}
//pairs.GetPairs()
std::vector< Triple<float,float,int> > sorted_triple = GetSortedTuple(preds,labels,group_index,i);
std::vector<int> index_remap = GetIndexMap(sorted_triple,group_index[i]);
GetGroupGradient(preds,labels,group_index,
grad,hess,sorted_triple,index_remap,pairs,i);
}
}
/*!
 * \brief one interactive update step
 *
 *  Runs InteractPredict on the training data and on every evaluation set, then either
 *  deletes a booster (action "remove") or boosts once more and re-predicts all sets.
 *
 * \param action the requested action; "remove" deletes a booster and returns early
 */
inline void UpdateInteract(std::string action) {
    // training data sits at offset 0, each evaluation set directly behind the previous one
    this->InteractPredict(preds_, *train_, 0);
    int offset = static_cast<int>(train_->Size());
    for (size_t idx = 0; idx < evals_.size(); ++idx){
        this->InteractPredict(this->eval_preds_[idx], *evals_[idx], offset);
        offset += static_cast<int>(evals_[idx]->Size());
    }

    if (action == "remove"){
        base_gbm.DelteBooster();
        return;
    }

    // every other action: compute gradients from the fresh predictions and boost once
    this->GetGradient(preds_, train_->labels, train_->group_index, grad_, hess_);
    std::vector<unsigned> root_index;
    base_gbm.DoBoost(grad_, hess_, train_->data, root_index);

    // re-predict all sets, walking the same offsets as above
    this->InteractRePredict(*train_, 0);
    offset = static_cast<int>(train_->Size());
    for (size_t idx = 0; idx < evals_.size(); ++idx){
        this->InteractRePredict(*evals_[idx], offset);
        offset += static_cast<int>(evals_[idx]->Size());
    }
}
private:
enum LossType {
PAIRWISE = 0,

View File

@@ -34,27 +34,52 @@ namespace xgboost {
float key_;
float value_;
Pair(float key, float value){
key_ = key;
value_ = value_;
Pair(float key, float value):key_(key),value_(value){
}
};
bool PairKeyComparer(const Pair &a, const Pair &b){
return a.key_ < b.key_;
bool PairKeyComparer(const Pair &a, const Pair &b){
return a.key_ < b.key_;
}
/*!
 * \brief strict-weak ordering of Pair by value_, ascending
 *
 *  Declared inline so the definition may be included from several translation
 *  units without producing duplicate symbols at link time.
 */
inline bool PairValueComparer(const Pair &a, const Pair &b){
    return a.value_ < b.value_;
}
/*! \brief holder of three values with independently chosen types */
template<typename T1, typename T2, typename T3>
class Triple{
public:
    /*! \brief first component */
    T1 f1_;
    /*! \brief second component */
    T2 f2_;
    /*! \brief third component */
    T3 f3_;
    /*! \brief store the three given values in f1_, f2_ and f3_ */
    Triple(T1 f1, T2 f2, T3 f3)
        : f1_(f1), f2_(f2), f3_(f3) {}
};
/*! \brief holder of four values with independently chosen types */
template<typename T1, typename T2, typename T3, typename T4>
class Quadruple{
public:
    /*! \brief first component */
    T1 f1_;
    /*! \brief second component */
    T2 f2_;
    /*! \brief third component */
    T3 f3_;
    /*! \brief fourth component */
    T4 f4_;
    /*! \brief store the four given values in f1_, f2_, f3_ and f4_ */
    Quadruple(T1 f1, T2 f2, T3 f3, T4 f4)
        : f1_(f1), f2_(f2), f3_(f3), f4_(f4) {}
};
/*!
 * \brief strict-weak ordering of Triple<float,float,int> by the first field, ascending
 *
 *  Declared inline so the definition may be included from several translation
 *  units without producing duplicate symbols at link time.
 */
inline bool Triplef1Comparer(const Triple<float,float,int> &a, const Triple<float,float,int> &b){
    return a.f1_ < b.f1_;
}
/*! \brief Mean Average Precision */
class EvalMAP : public IRankEvaluator {
public:
float Eval(const std::vector<float> &preds,
const std::vector<float> &labels,
const std::vector<int> &group_index) const {
if (group_index.size() <= 1) return 0;
float acc = 0;
std::vector<Pair> pairs_sort;
for (int i = 0; i < group_index.size() - 1; i++){
@@ -66,12 +91,13 @@ namespace xgboost {
}
return acc / (group_index.size() - 1);
}
/*! \brief name of this evaluator; returns the literal string "MAP" */
virtual const char *Name(void) const {
return "MAP";
}
private:
float average_precision(std::vector<Pair> pairs_sort) const{
std::sort(pairs_sort.begin(), pairs_sort.end(), PairKeyComparer);
@@ -94,12 +120,31 @@ namespace xgboost {
float Eval(const std::vector<float> &preds,
const std::vector<float> &labels,
const std::vector<int> &group_index) const {
return 0;
}
if (group_index.size() <= 1) return 0;
float acc = 0;
for (int i = 0; i < group_index.size() - 1; i++){
acc += Count_Inversion(preds,labels,
group_index[i],group_index[i+1]);
}
return acc / (group_index.size() - 1);
}
/*! \brief name of this evaluator; returns the literal string "PAIR" */
const char *Name(void) const {
return "PAIR";
}
private:
/*!
 * \brief count pairs inside [begin, end) whose prediction order contradicts the label order
 *
 *  A pair (a, b) with a < b is counted when preds[a] > preds[b] and labels[a] < labels[b].
 *  NOTE(review): the mirrored case (preds[a] < preds[b] with labels[a] > labels[b]) is not
 *  counted -- confirm that this one-sided count is intended.
 *
 * \param preds predictions, indexed by instance
 * \param labels labels, indexed by instance
 * \param begin first index of the range
 * \param end one past the last index of the range
 * \return number of counted pairs, as float
 */
float Count_Inversion(const std::vector<float> &preds,
                      const std::vector<float> &labels, int begin, int end
                      ) const{
    float inversions = 0;
    for (int a = begin; a < end; ++a){
        for (int b = a + 1; b < end; ++b){
            // labels are inspected only for pairs the predictions rank a above b
            if (!(preds[a] > preds[b])) continue;
            if (labels[a] < labels[b]) inversions += 1;
        }
    }
    return inversions;
}
};
/*! \brief Normalized DCG */
@@ -120,7 +165,20 @@ namespace xgboost {
}
return acc / (group_index.size() - 1);
}
/*!
 * \brief discounted cumulative gain of a list of labels taken in the given order
 * \param labels labels, the element at position p is discounted by log(p + 2)
 * \return sum over all positions of (2^label - 1) / log(position + 2), natural logarithm
 */
static float DCG(const std::vector<float> &labels){
    float total = 0.0;
    std::vector<float>::size_type position = 0;
    for (std::vector<float>::const_iterator it = labels.begin(); it != labels.end(); ++it, ++position){
        // gain and discount are evaluated in double and folded into the float sum
        const double gain = std::pow(2.0, static_cast<double>(*it)) - 1;
        const double discount = std::log(static_cast<double>(position + 2));
        total += gain / discount;
    }
    return total;
}
/*! \brief name of this evaluator; returns the literal string "NDCG" */
virtual const char *Name(void) const {
return "NDCG";
}
private:
float NDCG(std::vector<Pair> pairs_sort) const{
std::sort(pairs_sort.begin(), pairs_sort.end(), PairKeyComparer);
float dcg = DCG(pairs_sort);
@@ -131,17 +189,14 @@ namespace xgboost {
}
float DCG(std::vector<Pair> pairs_sort) const{
float ans = 0.0;
ans += pairs_sort[0].value_;
for (int i = 1; i < pairs_sort.size(); i++){
ans += pairs_sort[i].value_ / log(i + 1);
}
return ans;
std::vector<float> labels;
for (int i = 1; i < pairs_sort.size(); i++){
labels.push_back(pairs_sort[i].value_);
}
return DCG(labels);
}
virtual const char *Name(void) const {
return "NDCG";
}
};
};

View File

@@ -13,7 +13,8 @@
#include "../regression/xgboost_reg.h"
#include "../regression/xgboost_reg_main.cpp"
#include "../base/xgboost_data_instance.h"
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
xgboost::random::Seed(0);
xgboost::base::BoostTask rank_tsk;
rank_tsk.SetLearner(new xgboost::rank::RankBoostLearner);

View File

@@ -19,7 +19,7 @@ namespace xgboost {
* \param start the begin index of the group
* \param end the end index of the group
*/
Pairs(int start, int end) :start_(start), end_(end_){
Pairs(int start, int end) :start_(start), end_(end){
for (int i = start; i < end; i++){
std::vector<int> v;
pairs_.push_back(v);
@@ -30,7 +30,7 @@ namespace xgboost {
* \param index, the index of retrieved instance
* \return the index of instances paired
*/
std::vector<int> GetPairs(int index) {
std::vector<int> GetPairs(int index) const{
utils::Assert(index >= start_ && index < end_, "The query index out of sampling bound");
return pairs_[index - start_];
}