Merge branch 'dev' of https://github.com/tqchen/xgboost into dev
Conflicts:
	demo/rank/mq2008.conf
	demo/rank/runexp.sh
	regrank/xgboost_regrank_obj.h

commit da482500c7
.gitignore (vendored): 5 changes
@@ -18,3 +18,8 @@
 *model
 xgboost
 *pyc
+*train
+*test
+*group
+*rar
+*vali
@@ -2,9 +2,10 @@ xgboost: eXtreme Gradient Boosting
 =======
 An optimized general purpose gradient boosting (tree) library.
 
-Authors:
+Contributors:
 * Tianqi Chen, project creator
 * Kailong Chen, contributes regression module
 * Bing Xu, contributes python interface, higgs example
 
+Tutorial and Documentation: https://github.com/tqchen/xgboost/wiki
 
@@ -2,7 +2,7 @@
 # choose the tree booster, 0: tree, 1: linear
 booster_type = 0
 # choose logistic regression loss function for binary classification
-loss_type = 2
+objective = binary:logistic
 
 # Tree Booster Parameters
 # step size shrinkage
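This hunk is part of the commit-wide move from numeric loss_type codes to named objective strings. A hedged sketch of the same change seen from the python side (the module path, data file, and train call follow the demos elsewhere in this diff):

    import sys
    sys.path.append('../../python/')   # path of the xgboost python module, as in the demos
    import xgboost as xgb

    # old style: numeric code, loss_type = 2 meant logistic loss for binary classification
    param_old = {'booster_type': 0, 'loss_type': 2}
    # new style: named objective, equivalent to the loss_type line above
    param_new = {'booster_type': 0, 'objective': 'binary:logistic'}

    dtrain = xgb.DMatrix('agaricus.txt.train')
    bst = xgb.train(param_new, dtrain, 10, [(dtrain, 'train')])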
@@ -17,4 +17,4 @@ make
 
 
+speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
 
@@ -31,8 +31,9 @@ xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
 
 # setup parameters for xgboost
 param = {}
-# use logistic regression loss
-param['loss_type'] = 3
+# use logistic regression loss, use raw prediction before logistic transformation
+# since we only need the rank
+param['objective'] = 'binary:logitraw'
 # scale weight of positive examples
 param['scale_pos_weight'] = sum_wneg/sum_wpos
 param['bst:eta'] = 0.1
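The switch to binary:logitraw is justified by monotonicity: the logistic transform preserves ordering, so for ranking the raw margin is enough. A minimal numpy sketch of that claim (illustrative values, not from the demo):

    import numpy as np
    margin = np.array([-1.3, 0.2, 2.5])        # raw scores, as binary:logitraw would return
    prob = 1.0 / (1.0 + np.exp(-margin))       # what binary:logistic would return instead
    assert (np.argsort(margin) == np.argsort(prob)).all()  # same ordering either way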
@@ -1,4 +1,4 @@
 #!/bin/bash
 
-./higgs-numpy.py
-./higgs-pred.py
+python higgs-numpy.py
+python higgs-pred.py
demo/kaggle-higgs/speedtest.py (new executable file): 66 lines
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+# this is the example script to use xgboost to train
+import sys
+import numpy as np
+# add path of xgboost python module
+sys.path.append('../../python/')
+import xgboost as xgb
+from sklearn.ensemble import GradientBoostingClassifier
+import time
+test_size = 550000
+
+# path to where the data lies
+dpath = 'data'
+
+# load in training data, directly use numpy
+dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
+print 'finish loading from csv '
+
+label = dtrain[:,32]
+data = dtrain[:,1:31]
+# rescale weight to make it same as test set
+weight = dtrain[:,31] * float(test_size) / len(label)
+
+sum_wpos = sum( weight[i] for i in xrange(len(label)) if label[i] == 1.0 )
+sum_wneg = sum( weight[i] for i in xrange(len(label)) if label[i] == 0.0 )
+
+# print weight statistics
+print 'weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos )
+
+# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
+xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
+
+# setup parameters for xgboost
+param = {}
+# use logistic regression loss
+param['objective'] = 'binary:logitraw'
+# scale weight of positive examples
+param['scale_pos_weight'] = sum_wneg/sum_wpos
+param['bst:eta'] = 0.1
+param['bst:max_depth'] = 6
+param['eval_metric'] = 'auc'
+param['silent'] = 1
+param['nthread'] = 4
+
+plst = param.items()+[('eval_metric', 'ams@0.15')]
+
+watchlist = [ (xgmat,'train') ]
+# boost 10 trees
+num_round = 10
+print 'loading data end, start to boost trees'
+print "training GBM from sklearn"
+tmp = time.time()
+gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2)
+gbm.fit(data, label)
+print "sklearn.GBM costs: %s seconds" % str(time.time() - tmp)
+#raw_input()
+print "training xgboost"
+threads = [1, 2, 4, 16]
+for i in threads:
+    param['nthread'] = i
+    tmp = time.time()
+    plst = param.items()+[('eval_metric', 'ams@0.15')]
+    bst = xgb.train( plst, xgmat, num_round, watchlist );
+    print "XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp))
+
+print 'finish training'
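Assuming the challenge file data/training.csv is in place (the dpath above), running python speedtest.py trains sklearn's GradientBoostingClassifier once, then xgboost with 1, 2, 4, and 16 threads on the same DMatrix, printing the wall-clock time of each run. The extra eval_metric, ams@0.15, is the approximate median significance from the Kaggle Higgs challenge, evaluated on the top 15% of predictions; its standard definition (from the challenge documentation, not this diff), with s and b the weighted signal and background sums in the selected region and regularization term b_r = 10, is

    AMS = \sqrt{ 2 \left( (s + b + b_r) \ln\left(1 + \frac{s}{b + b_r}\right) - s \right) }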
@@ -1 +1,13 @@
-The dataset for ranking demo is from LETOR04 MQ2008 fold1, http://research.microsoft.com/en-us/um/beijing/projects/letor/letor4download.aspx
+Instructions:
+The dataset for ranking demo is from LETOR04 MQ2008 fold1,
+You can use the following commands to run the example
+
+Get the data: ./wgetdata.sh
+Run the example: ./runexp.sh
@@ -2,10 +2,8 @@
 # choose the tree booster, 0: tree, 1: linear
 booster_type = 0
 
 # so far, we have pairwise rank
 objective="rank:pairwise"
-#objective="rank:softmax"
-#objective="rank:map"
-#objective="rank:ndcg"
 
 # Tree Booster Parameters
 # step size shrinkage
@@ -16,8 +14,7 @@ bst:gamma = 1.0
 bst:min_child_weight = 0.1
 # maximum depth of a tree
 bst:max_depth = 6
-eval_metric = "ndcg"
 eval_metric = "map"
 
 # Task parameters
 # the number of round to do boosting
 num_round = 4
demo/rank/runexp.sh (mode changed: normal file → executable file): 0 changes
demo/rank/wgetdata.sh (new executable file): 4 lines
@@ -0,0 +1,4 @@
+#!/bin/bash
+wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar
+unrar x MQ2008.rar
+mv -f MQ2008/Fold1/*.txt .
@@ -1,9 +1,9 @@
 # General Parameters, see comment for each definition
 # choose the tree booster, 0: tree, 1: linear
 booster_type = 0
-# this is the only difference with classification, use 0: linear regression
-# when labels are in [0,1] we can also use 1: logistic regression
-loss_type = 0
+# this is the only difference with classification, use reg:linear to do linear regression
+# when labels are in [0,1] we can also use reg:logistic
+objective = reg:linear
 
 # Tree Booster Parameters
 # step size shrinkage
@@ -12,7 +12,7 @@ dtrain = xgb.DMatrix('agaricus.txt.train')
 dtest = xgb.DMatrix('agaricus.txt.test')
 
 # specify parameters via map, definitions are the same as the c++ version
-param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'loss_type':2 }
+param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
 
 # specify validation sets to watch performance
 evallist = [(dtest,'eval'), (dtrain,'train')]
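The walkthrough then trains against the watchlist; a hedged sketch of the next step (the train signature and round count follow the speedtest.py usage added in this same commit):

    num_round = 10
    bst = xgb.train(param, dtrain, num_round, evallist)
    preds = bst.predict(dtest)   # probabilities, since the objective is binary:logistic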
@@ -29,11 +29,6 @@ bst.dump_model('dump.raw.txt')
 # dump model with feature map
 bst.dump_model('dump.raw.txt','featmap.txt')
 
-# beta: interact mode
-bst.set_param('bst:interact:expand',4)
-bst.update_interact( dtrain, 'update', 0)
-bst.dump_model('dump.raw2.txt')
-
 ###
 # build dmatrix in python iteratively
 #
@@ -1,3 +1,4 @@
+# Author: Tianqi Chen, Bing Xu
 # module for xgboost
 import ctypes
 import os
@@ -1,3 +1,4 @@
 // implementations in ctypes
 #include "xgboost_python.h"
+#include "../regrank/xgboost_regrank.h"
 #include "../regrank/xgboost_regrank_data.h"
@@ -1,7 +1,8 @@
 #ifndef XGBOOST_PYTHON_H
 #define XGBOOST_PYTHON_H
 /*!
- * \file xgboost_regrank_data.h
+ * \file xgboost_python.h
  * \author Tianqi Chen
+ * \brief python wrapper for xgboost, using ctypes,
  *        hides everything behind functions
  * use c style interface
@@ -25,7 +25,7 @@ namespace xgboost{
 RegRankBoostLearner(void){
     silent = 0;
     obj_ = NULL;
-    name_obj_ = "reg";
+    name_obj_ = "reg:linear";
 }
 /*!
 * \brief a regression booster associated with training and evaluating data
@@ -129,7 +129,9 @@
 if( fs.Read(&nwt, sizeof(unsigned) ) != 0 ){
     utils::Assert( nwt == 0 || nwt == data.NumRow(), "invalid weight" );
     info.weights.resize( nwt );
-    utils::Assert( fs.Read(&info.weights[0], sizeof(unsigned) * nwt) != 0, "Load weight file");
+    if( nwt != 0 ){
+        utils::Assert( fs.Read(&info.weights[0], sizeof(unsigned) * nwt) != 0, "Load weight file");
+    }
 }
 }
 fs.Close();
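The added guard is the substantive fix here: for unweighted data nwt is 0, so the vector is empty and the old unconditional Assert both took the address of an element of an empty vector and asked fs.Read for zero bytes, which can return 0 and fail the "Load weight file" check even though there was nothing to load.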
@@ -1,164 +0,0 @@
-// some backup code
-
-class LambdaRankObj_NDCG : public LambdaRankObj{
-
-    static inline float CalcDCG(const std::vector< float > &rec) {
-        double sumdcg = 0.0;
-        for (size_t i = 0; i < rec.size(); i++){
-            const unsigned rel = static_cast<unsigned>(rec[i]);
-            if (rel != 0){
-                sumdcg += logf(2.0f) *((1 << rel) - 1) / logf(i + 2);
-            }
-        }
-        return static_cast<float>(sumdcg);
-    }
-
-    /*
-    * \brief Obtain the delta NDCG if trying to switch the positions of instances in index1 or index2
-    *        in sorted triples. Here DCG is calculated as sigma_i 2^rel_i/log(i + 1)
-    * \param sorted_triple the fields are prediction,label,original index
-    * \param index1,index2 the instances switched
-    * \param the IDCG of the list
-    */
-    inline float GetLambdaNDCG(const std::vector< Triple > sorted_triple,
-                               int index1,
-                               int index2, float IDCG){
-        double original = (1 << static_cast<int>(sorted_triple[index1].label_)) / log(index1 + 2)
-                        + (1 << static_cast<int>(sorted_triple[index2].label_)) / log(index2 + 2);
-        double changed = (1 << static_cast<int>(sorted_triple[index2].label_)) / log(index1 + 2)
-                       + (1 << static_cast<int>(sorted_triple[index1].label_)) / log(index2 + 2);
-        double ans = (original - changed) / IDCG;
-        if (ans < 0) ans = -ans;
-        return static_cast<float>(ans);
-    }
-
-    inline float GetIDCG(const std::vector< Triple > sorted_triple){
-        std::vector<float> labels;
-        for (size_t i = 0; i < sorted_triple.size(); i++){
-            labels.push_back(sorted_triple[i].label_);
-        }
-
-        std::sort(labels.begin(), labels.end(), std::greater<float>());
-        return CalcDCG(labels);
-    }
-
-    inline void GetLambda(const std::vector<float> &preds,
-                          const std::vector<float> &labels,
-                          const std::vector<unsigned> &group_index,
-                          const std::vector< std::pair<int, int> > &pairs, std::vector<float> &lambda, int group){
-        std::vector< Triple > sorted_triple;
-        std::vector<int> index_remap;
-        float IDCG;
-
-        GetSortedTuple(preds, labels, group_index, group, sorted_triple);
-        GetIndexMap(sorted_triple, group_index[group], index_remap);
-        IDCG = GetIDCG(sorted_triple);
-
-        lambda.resize(pairs.size());
-        for (size_t i = 0; i < pairs.size(); i++){
-            lambda[i] = GetLambdaNDCG(sorted_triple,
-                index_remap[pairs[i].first],index_remap[pairs[i].second],IDCG);
-        }
-    }
-};
-
-class LambdaRankObj_MAP : public LambdaRankObj{
-    class Quadruple{
-    public:
-        /* \brief the accumulated precision */
-        float ap_acc_;
-        /* \brief the accumulated precision assuming a positive instance is missing */
-        float ap_acc_miss_;
-        /* \brief the accumulated precision assuming that one more positive instance is inserted ahead */
-        float ap_acc_add_;
-        /* \brief the accumulated positive instance count */
-        float hits_;
-
-        Quadruple(){}
-
-        Quadruple(const Quadruple& q){
-            ap_acc_ = q.ap_acc_;
-            ap_acc_miss_ = q.ap_acc_miss_;
-            ap_acc_add_ = q.ap_acc_add_;
-            hits_ = q.hits_;
-        }
-
-        Quadruple(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits
-        ) :ap_acc_(ap_acc), ap_acc_miss_(ap_acc_miss), ap_acc_add_(ap_acc_add), hits_(hits){
-        }
-    };
-
-    /*
-    * \brief Obtain the delta MAP if trying to switch the positions of instances in index1 or index2
-    *        in sorted triples
-    * \param sorted_triple the fields are prediction,label,original index
-    * \param index1,index2 the instances switched
-    * \param map_acc a vector containing the accumulated precisions for each position in a list
-    */
-    inline float GetLambdaMAP(const std::vector< Triple > sorted_triple,
-                              int index1, int index2,
-                              std::vector< Quadruple > &map_acc){
-        if (index1 == index2 || sorted_triple[index1].label_ == sorted_triple[index2].label_) return 0.0;
-        if (index1 > index2) std::swap(index1, index2);
-        float original = map_acc[index2].ap_acc_; // The accumulated precision in the interval [index1,index2]
-        if (index1 != 0) original -= map_acc[index1 - 1].ap_acc_;
-        float changed = 0;
-        if (sorted_triple[index1].label_ < sorted_triple[index2].label_){
-            changed += map_acc[index2 - 1].ap_acc_add_ - map_acc[index1].ap_acc_add_;
-            changed += (map_acc[index1].hits_ + 1.0f) / (index1 + 1);
-        }
-        else{
-            changed += map_acc[index2 - 1].ap_acc_miss_ - map_acc[index1].ap_acc_miss_;
-            changed += map_acc[index2].hits_ / (index2 + 1);
-        }
-        float ans = (changed - original) / (map_acc[map_acc.size() - 1].hits_);
-        if (ans < 0) ans = -ans;
-        return ans;
-    }
-
-    /*
-    * \brief preprocessing results for calculating delta MAP
-    * \return The first field is the accumulated precision, the second field is the
-    *         accumulated precision assuming a positive instance is missing,
-    *         the third field is the accumulated precision assuming that one more positive
-    *         instance is inserted, the fourth field is the accumulated positive instance count
-    */
-    inline void GetMAPAcc(const std::vector< Triple > sorted_triple,
-                          std::vector< Quadruple > &map_acc){
-        map_acc.resize(sorted_triple.size());
-        float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0;
-        for (size_t i = 1; i <= sorted_triple.size(); i++){
-            if ((int)sorted_triple[i - 1].label_ == 1) {
-                hit++;
-                acc1 += hit / i;
-                acc2 += (hit - 1) / i;
-                acc3 += (hit + 1) / i;
-            }
-            map_acc[i-1] = Quadruple(acc1, acc2, acc3, hit);
-        }
-    }
-
-    inline void GetLambda(const std::vector<float> &preds,
-                          const std::vector<float> &labels,
-                          const std::vector<unsigned> &group_index,
-                          const std::vector< std::pair<int, int> > &pairs, std::vector<float> &lambda, int group){
-        std::vector< Triple > sorted_triple;
-        std::vector<int> index_remap;
-        std::vector< Quadruple > map_acc;
-
-        GetSortedTuple(preds, labels, group_index, group, sorted_triple);
-        GetIndexMap(sorted_triple, group_index[group], index_remap);
-        GetMAPAcc(sorted_triple, map_acc);
-
-        lambda.resize(pairs.size());
-        for (size_t i = 0; i < pairs.size(); i++){
-            lambda[i] = GetLambdaMAP(sorted_triple,
-                index_remap[pairs[i].first], index_remap[pairs[i].second], map_acc);
-        }
-    }
-};
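The deleted backup code computed LambdaRank weights from metric deltas. In GetLambdaNDCG above, with zero-based positions i and j and integer relevance labels l_i, l_j, the swap delta it returns simplifies to (my restatement of the arithmetic in the code, not text from the source):

    \Delta_{ij} = \frac{1}{\mathrm{IDCG}} \left| \left(2^{l_i} - 2^{l_j}\right) \left(\frac{1}{\log(i+2)} - \frac{1}{\log(j+2)}\right) \right|

GetLambdaMAP plays the same role for MAP, using the precomputed accumulated-precision fields of Quadruple and normalizing by the total hit count.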
@@ -109,6 +109,7 @@ namespace xgboost{
 namespace xgboost{
     namespace regrank{
         inline IObjFunction* CreateObjFunction( const char *name ){
+<<<<<<< HEAD
             if( !strcmp("reg", name ) ) return new RegressionObj();
             if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj();
             if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj();
@@ -118,6 +119,18 @@ namespace xgboost{
             utils::Error("unknown objective function type");
             return NULL;
         }
+=======
+            if( !strcmp("reg:linear", name ) ) return new RegressionObj( LossType::kLinearSquare );
+            if( !strcmp("reg:logistic", name ) ) return new RegressionObj( LossType::kLogisticNeglik );
+            if( !strcmp("binary:logistic", name ) ) return new RegressionObj( LossType::kLogisticClassify );
+            if( !strcmp("binary:logitraw", name ) ) return new RegressionObj( LossType::kLogisticRaw );
+            if( !strcmp("multi:softmax", name ) ) return new SoftmaxMultiClassObj();
+            if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj();
+            if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj();
+            utils::Error("unknown objective function type");
+            return NULL;
+        }
+>>>>>>> 9eabb5c7f912a326005aca53a76c2e53a1661842
     };
 };
 #endif
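The conflict markers are part of the committed file here; the dev side (below the =======) is the objective-name dispatch this merge is moving toward. As a hedged summary, with the names copied verbatim from the factory above and the mapping itself written as illustrative Python data:

    # objective name -> handler created by CreateObjFunction on the dev branch
    objectives = {
        'reg:linear':      'RegressionObj(kLinearSquare)',
        'reg:logistic':    'RegressionObj(kLogisticNeglik)',
        'binary:logistic': 'RegressionObj(kLogisticClassify)',
        'binary:logitraw': 'RegressionObj(kLogisticRaw)',
        'multi:softmax':   'SoftmaxMultiClassObj()',
        'rank:pairwise':   'PairwiseRankObj()',
        'rank:softmax':    'SoftmaxRankObj()',
    }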
@@ -14,8 +14,8 @@ namespace xgboost{
     namespace regrank{
         class RegressionObj : public IObjFunction{
        public:
-            RegressionObj(void){
-                loss.loss_type = LossType::kLinearSquare;
+            RegressionObj( int loss_type ){
+                loss.loss_type = loss_type;
             }
             virtual ~RegressionObj(){}
             virtual void SetParam(const char *name, const char *val){