From d05cb137515fe0c539f146a6057a00db901b7eaa Mon Sep 17 00:00:00 2001 From: antinucleon Date: Fri, 16 May 2014 20:57:42 -0600 Subject: [PATCH 01/18] demo --- demo/multi_classification/train.py | 42 +++++++++++++++++++++++++++ demo/multi_classification/wgetdata.sh | 2 ++ 2 files changed, 44 insertions(+) create mode 100755 demo/multi_classification/train.py create mode 100755 demo/multi_classification/wgetdata.sh diff --git a/demo/multi_classification/train.py b/demo/multi_classification/train.py new file mode 100755 index 000000000..2dc98f4d6 --- /dev/null +++ b/demo/multi_classification/train.py @@ -0,0 +1,42 @@ + +import sys +import numpy as np +sys.path.append('../../python/') +import xgboost as xgb + + + +data = np.loadtxt('./dermatology.data', delimiter=',',converters={33: lambda x:int(x == '?'), 34: lambda x:int(x) } ) +sz = data.shape + +train = data[:int(sz[0] * 0.7), :] +test = data[int(sz[0] * 0.7):, :] + +train_X = train[:,0:33] +train_Y = train[:, 34] + + +test_X = test[:,0:33] +test_Y = test[:, 34] + +xg_train = xgb.DMatrix( train_X, label=train_Y) +xg_test = xgb.DMatrix(test_X, label=test_Y) +# setup parameters for xgboost +param = {} +# use logistic regression loss, use raw prediction before logistic transformation +# since we only need the rank +param['objective'] = 'multi:softmax' +# scale weight of positive examples +param['bst:eta'] = 0.1 +param['bst:max_depth'] = 6 +param['eval_metric'] = 'auc' +param['silent'] = 1 +param['nthread'] = 4 +param['num_class'] = 5 + +watchlist = [ (xg_train,'train'), (xg_test, 'test') ] +num_round = 5 +bst = xgb.train(param, xg_train, num_round, watchlist ); + + + diff --git a/demo/multi_classification/wgetdata.sh b/demo/multi_classification/wgetdata.sh new file mode 100755 index 000000000..10dbcd8fb --- /dev/null +++ b/demo/multi_classification/wgetdata.sh @@ -0,0 +1,2 @@ +#! /bin/bash +wget https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data From 7537d691d9a2dd541a8a588e2eb988494fa0b1d0 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 16 May 2014 20:00:20 -0700 Subject: [PATCH 02/18] Update README.md --- demo/kaggle-higgs/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/demo/kaggle-higgs/README.md b/demo/kaggle-higgs/README.md index a3c208002..b3db23266 100644 --- a/demo/kaggle-higgs/README.md +++ b/demo/kaggle-higgs/README.md @@ -16,5 +16,6 @@ make - +Speed +===== speedtest.py compares xgboost's speed on this dataset with sklearn.GBM From f52f7b78995abd92d830bf8758e825750a76ac53 Mon Sep 17 00:00:00 2001 From: antinucleon Date: Fri, 16 May 2014 21:05:11 -0600 Subject: [PATCH 03/18] demo --- demo/multi_classification/train.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/demo/multi_classification/train.py b/demo/multi_classification/train.py index 2dc98f4d6..89a065450 100755 --- a/demo/multi_classification/train.py +++ b/demo/multi_classification/train.py @@ -1,4 +1,4 @@ - +#! /usr/bin/python import sys import numpy as np sys.path.append('../../python/') @@ -29,7 +29,6 @@ param['objective'] = 'multi:softmax' # scale weight of positive examples param['bst:eta'] = 0.1 param['bst:max_depth'] = 6 -param['eval_metric'] = 'auc' param['silent'] = 1 param['nthread'] = 4 param['num_class'] = 5 From 8e5e3340a29c9a0580326de5afd2407401cae72c Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 20:12:04 -0700 Subject: [PATCH 04/18] multi class --- demo/multi_classification/runexp.sh | 9 +++++++++ demo/multi_classification/train.py | 9 +++------ demo/multi_classification/wgetdata.sh | 2 -- 3 files changed, 12 insertions(+), 8 deletions(-) create mode 100755 demo/multi_classification/runexp.sh delete mode 100755 demo/multi_classification/wgetdata.sh diff --git a/demo/multi_classification/runexp.sh b/demo/multi_classification/runexp.sh new file mode 100755 index 000000000..0af814725 --- /dev/null +++ b/demo/multi_classification/runexp.sh @@ -0,0 +1,9 @@ +#!/bin/bash +if [ -f dermatology.data ] +then + echo "use existing data to run multi class classification" +else + echo "getting data from uci, make sure you are connected to internet" + wget https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data +fi +python train.py diff --git a/demo/multi_classification/train.py b/demo/multi_classification/train.py index 89a065450..d51824a16 100755 --- a/demo/multi_classification/train.py +++ b/demo/multi_classification/train.py @@ -4,9 +4,8 @@ import numpy as np sys.path.append('../../python/') import xgboost as xgb - - -data = np.loadtxt('./dermatology.data', delimiter=',',converters={33: lambda x:int(x == '?'), 34: lambda x:int(x) } ) +# label need to be 0 to num_class -1 +data = np.loadtxt('./dermatology.data', delimiter=',',converters={33: lambda x:int(x == '?'), 34: lambda x:int(x)-1 } ) sz = data.shape train = data[:int(sz[0] * 0.7), :] @@ -31,11 +30,9 @@ param['bst:eta'] = 0.1 param['bst:max_depth'] = 6 param['silent'] = 1 param['nthread'] = 4 -param['num_class'] = 5 +param['num_class'] = 6 watchlist = [ (xg_train,'train'), (xg_test, 'test') ] num_round = 5 bst = xgb.train(param, xg_train, num_round, watchlist ); - - diff --git a/demo/multi_classification/wgetdata.sh b/demo/multi_classification/wgetdata.sh deleted file mode 100755 index 10dbcd8fb..000000000 --- a/demo/multi_classification/wgetdata.sh +++ /dev/null @@ -1,2 +0,0 @@ -#! /bin/bash -wget https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data From cfd6c9e3b7c3fa7d97d86c9e40164be6b11f7605 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 16 May 2014 20:16:10 -0700 Subject: [PATCH 05/18] Update train.py --- demo/multi_classification/train.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/demo/multi_classification/train.py b/demo/multi_classification/train.py index d51824a16..d4cf2a0d4 100755 --- a/demo/multi_classification/train.py +++ b/demo/multi_classification/train.py @@ -22,8 +22,7 @@ xg_train = xgb.DMatrix( train_X, label=train_Y) xg_test = xgb.DMatrix(test_X, label=test_Y) # setup parameters for xgboost param = {} -# use logistic regression loss, use raw prediction before logistic transformation -# since we only need the rank +# use softmax multi-class classification param['objective'] = 'multi:softmax' # scale weight of positive examples param['bst:eta'] = 0.1 @@ -35,4 +34,9 @@ param['num_class'] = 6 watchlist = [ (xg_train,'train'), (xg_test, 'test') ] num_round = 5 bst = xgb.train(param, xg_train, num_round, watchlist ); +# get prediction +pred = bst.predict( xg_test ); + +print 'error=%f' % sum(int(pred[i]) != test_Y[i] for i in len(test_Y)) / float(len(test_Y)) + From 6c72d0220524ef9d42a8357493f32cdb4fb25388 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 20:18:34 -0700 Subject: [PATCH 06/18] chg --- demo/multi_classification/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/multi_classification/train.py b/demo/multi_classification/train.py index d4cf2a0d4..38d818890 100755 --- a/demo/multi_classification/train.py +++ b/demo/multi_classification/train.py @@ -37,6 +37,6 @@ bst = xgb.train(param, xg_train, num_round, watchlist ); # get prediction pred = bst.predict( xg_test ); -print 'error=%f' % sum(int(pred[i]) != test_Y[i] for i in len(test_Y)) / float(len(test_Y)) +print 'predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in xrange(len(test_Y))) / float(len(test_Y)) ) From 3e4dd2fce070fd6d03e59821b3a47bac25d81e9a Mon Sep 17 00:00:00 2001 From: antinucleon Date: Fri, 16 May 2014 21:27:37 -0600 Subject: [PATCH 07/18] chg --- demo/multiclass_classification/README.md | 8 ++++++++ .../runexp.sh | 0 .../train.py | 0 3 files changed, 8 insertions(+) create mode 100644 demo/multiclass_classification/README.md rename demo/{multi_classification => multiclass_classification}/runexp.sh (100%) rename demo/{multi_classification => multiclass_classification}/train.py (100%) diff --git a/demo/multiclass_classification/README.md b/demo/multiclass_classification/README.md new file mode 100644 index 000000000..4d66ee06a --- /dev/null +++ b/demo/multiclass_classification/README.md @@ -0,0 +1,8 @@ +Demonstrating how to use XGBoost accomplish Multi-Class classification task on [UCI Dermatology dataset](https://archive.ics.uci.edu/ml/datasets/Dermatology) + +1. Run runexp.sh +```bash +./runexp.sh +``` + +Explainations can be found in [wiki](https://github.com/tqchen/xgboost/wiki) diff --git a/demo/multi_classification/runexp.sh b/demo/multiclass_classification/runexp.sh similarity index 100% rename from demo/multi_classification/runexp.sh rename to demo/multiclass_classification/runexp.sh diff --git a/demo/multi_classification/train.py b/demo/multiclass_classification/train.py similarity index 100% rename from demo/multi_classification/train.py rename to demo/multiclass_classification/train.py From b07ff1ac8d4625c7b59ac2ce59d503d1a1c9c4c8 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 20:28:07 -0700 Subject: [PATCH 08/18] fix softmax --- regrank/xgboost_regrank.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regrank/xgboost_regrank.h b/regrank/xgboost_regrank.h index b2649735c..e3868002a 100644 --- a/regrank/xgboost_regrank.h +++ b/regrank/xgboost_regrank.h @@ -97,8 +97,8 @@ namespace xgboost{ */ inline void InitTrainer(void){ if( mparam.num_class != 0 ){ - if( name_obj_ != "softmax" ){ - name_obj_ = "softmax"; + if( name_obj_ != "multi:softmax" ){ + name_obj_ = "multi:softmax"; printf("auto select objective=softmax to support multi-class classification\n" ); } } From 255bad90cb8c7d4f95b61e5ba6c624cee06f551f Mon Sep 17 00:00:00 2001 From: yepyao Date: Sat, 17 May 2014 11:34:24 +0800 Subject: [PATCH 09/18] small change --- demo/rank/mq2008.conf | 4 +- demo/rank/runexp.sh | 12 +-- regrank/xgboost_regrank_obj.h | 6 +- regrank/xgboost_regrank_obj.hpp | 157 ++++++++++++++++++++++++++++++++ 4 files changed, 167 insertions(+), 12 deletions(-) diff --git a/demo/rank/mq2008.conf b/demo/rank/mq2008.conf index 0d26580c2..f355b0578 100644 --- a/demo/rank/mq2008.conf +++ b/demo/rank/mq2008.conf @@ -2,8 +2,8 @@ # choose the tree booster, 0: tree, 1: linear booster_type = 0 -# so far, we have pairwise rank -objective="rank:pairwise" +# specify objective +objective="rank:map" # Tree Booster Parameters # step size shrinkage diff --git a/demo/rank/runexp.sh b/demo/rank/runexp.sh index c17ebee05..3867047f3 100755 --- a/demo/rank/runexp.sh +++ b/demo/rank/runexp.sh @@ -1,14 +1,8 @@ -#Download the dataset from web site -wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar +python trans_data.py train.txt mq2008.train mq2008.train.group -#please first install the unrar package -unrar x MQ2008 +python trans_data.py test.txt mq2008.test mq2008.test.group -python trans_data.py MQ2008/Fold1/train.txt mq2008.train mq2008.train.group - -python trans_data.py MQ2008/Fold1/test.txt mq2008.test mq2008.test.group - -python trans_data.py MQ2008/Fold1/vali.txt mq2008.vali mq2008.vali.group +python trans_data.py vali.txt mq2008.vali mq2008.vali.group ../../xgboost mq2008.conf diff --git a/regrank/xgboost_regrank_obj.h b/regrank/xgboost_regrank_obj.h index 24396101b..a33b828d9 100644 --- a/regrank/xgboost_regrank_obj.h +++ b/regrank/xgboost_regrank_obj.h @@ -116,7 +116,11 @@ namespace xgboost{ if( !strcmp("multi:softmax", name ) ) return new SoftmaxMultiClassObj(); if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj(); if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj(); - utils::Error("unknown objective function type"); + if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj(); + if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj(); + if( !strcmp("rank:map", name ) ) return new LambdaRankObj_MAP(); + if( !strcmp("rank:ndcg", name ) ) return new LambdaRankObj_NDCG(); + utils::Error("unknown objective function type"); return NULL; } }; diff --git a/regrank/xgboost_regrank_obj.hpp b/regrank/xgboost_regrank_obj.hpp index 71ebec0ab..70d4347f7 100644 --- a/regrank/xgboost_regrank_obj.hpp +++ b/regrank/xgboost_regrank_obj.hpp @@ -330,6 +330,163 @@ namespace xgboost{ virtual ~PairwiseRankObj(void){} virtual void GetLambdaWeight( const std::vector &sorted_list, std::vector &pairs ){} }; + + + class LambdaRankObj_NDCG : public LambdaRankObj{ + + public: + virtual ~LambdaRankObj_NDCG(void){} + + inline float CalcDCG( const std::vector &labels ){ + double sumdcg = 0.0; + for( size_t i = 0; i < labels.size(); i ++ ){ + const unsigned rel = labels[i]; + if( rel != 0 ){ + sumdcg += logf(2.0f) * ((1<(sumdcg); + } + + inline float GetIDCG(const std::vector &sorted_list){ + std::vector labels; + for (size_t i = 0; i < sorted_list.size(); i++){ + labels.push_back(sorted_list[i].label); + } + + std::sort(labels.begin(), labels.end(), std::greater()); + return CalcDCG(labels); + } + + /* + * \brief Obtain the delta NDCG if trying to switch the positions of instances in index1 or index2 + * in sorted triples. Here DCG is calculated as sigma_i 2^rel_i/log(i + 1) + * \param sorted_list the list containing entry information + * \param index1,index2 the instances switched + * \param the IDCG of the list + */ + inline float GetLambdaNDCG(const std::vector &sorted_list, + int index1, + int index2, float IDCG){ + double original = (1 << static_cast(sorted_list[index1].label)) / log(index1 + 2) + + (1 << static_cast(sorted_list[index2].label)) / log(index2 + 2); + double changed = (1 << static_cast(sorted_list[index2].label)) / log(index1 + 2) + + (1 << static_cast(sorted_list[index1].label)) / log(index2 + 2); + double ans = (original - changed) / IDCG; + if (ans < 0) ans = -ans; + return static_cast(ans); + } + + virtual void GetLambdaWeight(const std::vector &sorted_list, std::vector &pairs){ + float IDCG = GetIDCG(sorted_list); + for (size_t i = 0; i < pairs.size(); i++){ + pairs[i].weight = GetLambdaNDCG(sorted_list, + pairs[i].pos_index, pairs[i].neg_index, IDCG); + } + } + + }; + + class LambdaRankObj_MAP : public LambdaRankObj{ + + class Quadruple{ + public: + /* \brief the accumulated precision */ + float ap_acc_; + /* \brief the accumulated precision assuming a positive instance is missing*/ + float ap_acc_miss_; + /* \brief the accumulated precision assuming that one more positive instance is inserted ahead*/ + float ap_acc_add_; + /* \brief the accumulated positive instance count */ + float hits_; + + Quadruple(){} + + Quadruple(const Quadruple& q){ + ap_acc_ = q.ap_acc_; + ap_acc_miss_ = q.ap_acc_miss_; + ap_acc_add_ = q.ap_acc_add_; + hits_ = q.hits_; + } + + Quadruple(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits + ) :ap_acc_(ap_acc), ap_acc_miss_(ap_acc_miss), ap_acc_add_(ap_acc_add), hits_(hits){ + + } + + }; + + public: + virtual ~LambdaRankObj_MAP(void){} + + /* + * \brief Obtain the delta MAP if trying to switch the positions of instances in index1 or index2 + * in sorted triples + * \param sorted_list the list containing entry information + * \param index1,index2 the instances switched + * \param map_acc a vector containing the accumulated precisions for each position in a list + */ + inline float GetLambdaMAP(const std::vector &sorted_list, + int index1, int index2, + std::vector< Quadruple > &map_acc){ + if (index1 == index2 + || sorted_list[index1].label == sorted_list[index2].label + || map_acc[map_acc.size() - 1].hits_ == 0 + ) return 0.0; + if (index1 > index2) std::swap(index1, index2); + float original = map_acc[index2].ap_acc_; // The accumulated precision in the interval [index1,index2] + if (index1 != 0) original -= map_acc[index1 - 1].ap_acc_; + float changed = 0; + if (sorted_list[index1].label < sorted_list[index2].label){ + changed += map_acc[index2 - 1].ap_acc_add_ - map_acc[index1].ap_acc_add_; + changed += (map_acc[index1].hits_ + 1.0f) / (index1 + 1); + } + else{ + changed += map_acc[index2 - 1].ap_acc_miss_ - map_acc[index1].ap_acc_miss_; + changed += map_acc[index2].hits_ / (index2 + 1); + } + if(map_acc[map_acc.size() - 1].hits_ == 0) printf("haha\n"); + + float ans = (changed - original) / (map_acc[map_acc.size() - 1].hits_); + if (ans < 0) ans = -ans; + return ans; + } + + /* + * \brief preprocessing results for calculating delta MAP + * \return The first field is the accumulated precision, the second field is the + * accumulated precision assuming a positive instance is missing, + * the third field is the accumulated precision assuming that one more positive + * instance is inserted, the fourth field is the accumulated positive instance count + */ + inline void GetMAPAcc(const std::vector &sorted_list, + std::vector< Quadruple > &map_acc){ + map_acc.resize(sorted_list.size()); + float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0; + for (size_t i = 1; i <= sorted_list.size(); i++){ + if ((int)sorted_list[i - 1].label > 0) { + hit++; + acc1 += hit / i; + acc2 += (hit - 1) / i; + acc3 += (hit + 1) / i; + } + map_acc[i - 1].ap_acc_ = acc1; + map_acc[i - 1].ap_acc_miss_ = acc2; + map_acc[i - 1].ap_acc_add_ = acc3; + map_acc[i - 1].hits_ = hit; + + } + } + virtual void GetLambdaWeight(const std::vector &sorted_list, std::vector &pairs){ + std::vector< Quadruple > map_acc; + GetMAPAcc(sorted_list, map_acc); + for (size_t i = 0; i < pairs.size(); i++){ + pairs[i].weight = GetLambdaMAP(sorted_list, pairs[i].pos_index, pairs[i].neg_index, map_acc); + } + } + + }; + }; }; #endif From 391be108061ccadb166813abdba3be59f3bb92b8 Mon Sep 17 00:00:00 2001 From: yepyao Date: Sat, 17 May 2014 11:35:43 +0800 Subject: [PATCH 10/18] small change --- demo/rank/mq2008.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/rank/mq2008.conf b/demo/rank/mq2008.conf index f355b0578..65ad19b8e 100644 --- a/demo/rank/mq2008.conf +++ b/demo/rank/mq2008.conf @@ -3,7 +3,7 @@ booster_type = 0 # specify objective -objective="rank:map" +objective="rank:pairwise" # Tree Booster Parameters # step size shrinkage From d429289ad35d1440c431176e2965af6fb7bd8200 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 20:37:45 -0700 Subject: [PATCH 11/18] ok --- README.md | 2 ++ demo/kaggle-higgs/README.md | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c775c9776..c7e22d706 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ Features - Sparse feature format allows easy handling of missing values, and improve computation efficiency. * Push the limit on single machine: - Efficient implementation that optimizes memory and computation. +* Speed: XGBoost is very fast + - IN [demo/higgs/speedtest.py](../blob/master/demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier * Layout of gradient boosting algorithm to support user defined objective * Python interface, works with numpy and scipy.sparse matrix diff --git a/demo/kaggle-higgs/README.md b/demo/kaggle-higgs/README.md index b3db23266..28472a848 100644 --- a/demo/kaggle-higgs/README.md +++ b/demo/kaggle-higgs/README.md @@ -14,8 +14,6 @@ make 3. Run ./run.sh - - Speed ===== speedtest.py compares xgboost's speed on this dataset with sklearn.GBM From 58cbfa06920c6c044ce43cc9e0755cb8646d1aab Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 16 May 2014 20:41:05 -0700 Subject: [PATCH 12/18] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c7e22d706..c8d57105c 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Features * Push the limit on single machine: - Efficient implementation that optimizes memory and computation. * Speed: XGBoost is very fast - - IN [demo/higgs/speedtest.py](../blob/master/demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier + - IN [demo/higgs/speedtest.py](../demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier * Layout of gradient boosting algorithm to support user defined objective * Python interface, works with numpy and scipy.sparse matrix From 32a33710737b0f239465f4f74c02e0770b357f6f Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 16 May 2014 20:41:21 -0700 Subject: [PATCH 13/18] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c8d57105c..b32ddbc18 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Features * Push the limit on single machine: - Efficient implementation that optimizes memory and computation. * Speed: XGBoost is very fast - - IN [demo/higgs/speedtest.py](../demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier + - IN [demo/higgs/speedtest.py](../demo/master/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier * Layout of gradient boosting algorithm to support user defined objective * Python interface, works with numpy and scipy.sparse matrix From 4218c1ef53d2a8902ad650525eaa155cdb0c3f8a Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 16 May 2014 20:41:43 -0700 Subject: [PATCH 14/18] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b32ddbc18..c8d57105c 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Features * Push the limit on single machine: - Efficient implementation that optimizes memory and computation. * Speed: XGBoost is very fast - - IN [demo/higgs/speedtest.py](../demo/master/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier + - IN [demo/higgs/speedtest.py](../demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier * Layout of gradient boosting algorithm to support user defined objective * Python interface, works with numpy and scipy.sparse matrix From 4dadc766527f3be034e9cc97a447a5295fc08f87 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 16 May 2014 20:41:59 -0700 Subject: [PATCH 15/18] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c8d57105c..659f8d5fa 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Features * Push the limit on single machine: - Efficient implementation that optimizes memory and computation. * Speed: XGBoost is very fast - - IN [demo/higgs/speedtest.py](../demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier + - IN [demo/higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier * Layout of gradient boosting algorithm to support user defined objective * Python interface, works with numpy and scipy.sparse matrix From d7bb10eb79715ab90189413cb03cfd410ccd0367 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 20:44:02 -0700 Subject: [PATCH 16/18] final check --- regrank/xgboost_regrank_obj.h | 9 +- regrank/xgboost_regrank_obj.hpp | 157 -------------------------------- 2 files changed, 3 insertions(+), 163 deletions(-) diff --git a/regrank/xgboost_regrank_obj.h b/regrank/xgboost_regrank_obj.h index a33b828d9..2778686a3 100644 --- a/regrank/xgboost_regrank_obj.h +++ b/regrank/xgboost_regrank_obj.h @@ -113,14 +113,11 @@ namespace xgboost{ if( !strcmp("reg:logistic", name ) ) return new RegressionObj( LossType::kLogisticNeglik ); if( !strcmp("binary:logistic", name ) ) return new RegressionObj( LossType::kLogisticClassify ); if( !strcmp("binary:logitraw", name ) ) return new RegressionObj( LossType::kLogisticRaw ); - if( !strcmp("multi:softmax", name ) ) return new SoftmaxMultiClassObj(); + if( !strcmp("multi:softmax", name ) ) return new SoftmaxMultiClassObj(); if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj(); - if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj(); if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj(); - if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj(); - if( !strcmp("rank:map", name ) ) return new LambdaRankObj_MAP(); - if( !strcmp("rank:ndcg", name ) ) return new LambdaRankObj_NDCG(); - utils::Error("unknown objective function type"); + if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj(); + utils::Error("unknown objective function type"); return NULL; } }; diff --git a/regrank/xgboost_regrank_obj.hpp b/regrank/xgboost_regrank_obj.hpp index 70d4347f7..71ebec0ab 100644 --- a/regrank/xgboost_regrank_obj.hpp +++ b/regrank/xgboost_regrank_obj.hpp @@ -330,163 +330,6 @@ namespace xgboost{ virtual ~PairwiseRankObj(void){} virtual void GetLambdaWeight( const std::vector &sorted_list, std::vector &pairs ){} }; - - - class LambdaRankObj_NDCG : public LambdaRankObj{ - - public: - virtual ~LambdaRankObj_NDCG(void){} - - inline float CalcDCG( const std::vector &labels ){ - double sumdcg = 0.0; - for( size_t i = 0; i < labels.size(); i ++ ){ - const unsigned rel = labels[i]; - if( rel != 0 ){ - sumdcg += logf(2.0f) * ((1<(sumdcg); - } - - inline float GetIDCG(const std::vector &sorted_list){ - std::vector labels; - for (size_t i = 0; i < sorted_list.size(); i++){ - labels.push_back(sorted_list[i].label); - } - - std::sort(labels.begin(), labels.end(), std::greater()); - return CalcDCG(labels); - } - - /* - * \brief Obtain the delta NDCG if trying to switch the positions of instances in index1 or index2 - * in sorted triples. Here DCG is calculated as sigma_i 2^rel_i/log(i + 1) - * \param sorted_list the list containing entry information - * \param index1,index2 the instances switched - * \param the IDCG of the list - */ - inline float GetLambdaNDCG(const std::vector &sorted_list, - int index1, - int index2, float IDCG){ - double original = (1 << static_cast(sorted_list[index1].label)) / log(index1 + 2) - + (1 << static_cast(sorted_list[index2].label)) / log(index2 + 2); - double changed = (1 << static_cast(sorted_list[index2].label)) / log(index1 + 2) - + (1 << static_cast(sorted_list[index1].label)) / log(index2 + 2); - double ans = (original - changed) / IDCG; - if (ans < 0) ans = -ans; - return static_cast(ans); - } - - virtual void GetLambdaWeight(const std::vector &sorted_list, std::vector &pairs){ - float IDCG = GetIDCG(sorted_list); - for (size_t i = 0; i < pairs.size(); i++){ - pairs[i].weight = GetLambdaNDCG(sorted_list, - pairs[i].pos_index, pairs[i].neg_index, IDCG); - } - } - - }; - - class LambdaRankObj_MAP : public LambdaRankObj{ - - class Quadruple{ - public: - /* \brief the accumulated precision */ - float ap_acc_; - /* \brief the accumulated precision assuming a positive instance is missing*/ - float ap_acc_miss_; - /* \brief the accumulated precision assuming that one more positive instance is inserted ahead*/ - float ap_acc_add_; - /* \brief the accumulated positive instance count */ - float hits_; - - Quadruple(){} - - Quadruple(const Quadruple& q){ - ap_acc_ = q.ap_acc_; - ap_acc_miss_ = q.ap_acc_miss_; - ap_acc_add_ = q.ap_acc_add_; - hits_ = q.hits_; - } - - Quadruple(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits - ) :ap_acc_(ap_acc), ap_acc_miss_(ap_acc_miss), ap_acc_add_(ap_acc_add), hits_(hits){ - - } - - }; - - public: - virtual ~LambdaRankObj_MAP(void){} - - /* - * \brief Obtain the delta MAP if trying to switch the positions of instances in index1 or index2 - * in sorted triples - * \param sorted_list the list containing entry information - * \param index1,index2 the instances switched - * \param map_acc a vector containing the accumulated precisions for each position in a list - */ - inline float GetLambdaMAP(const std::vector &sorted_list, - int index1, int index2, - std::vector< Quadruple > &map_acc){ - if (index1 == index2 - || sorted_list[index1].label == sorted_list[index2].label - || map_acc[map_acc.size() - 1].hits_ == 0 - ) return 0.0; - if (index1 > index2) std::swap(index1, index2); - float original = map_acc[index2].ap_acc_; // The accumulated precision in the interval [index1,index2] - if (index1 != 0) original -= map_acc[index1 - 1].ap_acc_; - float changed = 0; - if (sorted_list[index1].label < sorted_list[index2].label){ - changed += map_acc[index2 - 1].ap_acc_add_ - map_acc[index1].ap_acc_add_; - changed += (map_acc[index1].hits_ + 1.0f) / (index1 + 1); - } - else{ - changed += map_acc[index2 - 1].ap_acc_miss_ - map_acc[index1].ap_acc_miss_; - changed += map_acc[index2].hits_ / (index2 + 1); - } - if(map_acc[map_acc.size() - 1].hits_ == 0) printf("haha\n"); - - float ans = (changed - original) / (map_acc[map_acc.size() - 1].hits_); - if (ans < 0) ans = -ans; - return ans; - } - - /* - * \brief preprocessing results for calculating delta MAP - * \return The first field is the accumulated precision, the second field is the - * accumulated precision assuming a positive instance is missing, - * the third field is the accumulated precision assuming that one more positive - * instance is inserted, the fourth field is the accumulated positive instance count - */ - inline void GetMAPAcc(const std::vector &sorted_list, - std::vector< Quadruple > &map_acc){ - map_acc.resize(sorted_list.size()); - float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0; - for (size_t i = 1; i <= sorted_list.size(); i++){ - if ((int)sorted_list[i - 1].label > 0) { - hit++; - acc1 += hit / i; - acc2 += (hit - 1) / i; - acc3 += (hit + 1) / i; - } - map_acc[i - 1].ap_acc_ = acc1; - map_acc[i - 1].ap_acc_miss_ = acc2; - map_acc[i - 1].ap_acc_add_ = acc3; - map_acc[i - 1].hits_ = hit; - - } - } - virtual void GetLambdaWeight(const std::vector &sorted_list, std::vector &pairs){ - std::vector< Quadruple > map_acc; - GetMAPAcc(sorted_list, map_acc); - for (size_t i = 0; i < pairs.size(); i++){ - pairs[i].weight = GetLambdaMAP(sorted_list, pairs[i].pos_index, pairs[i].neg_index, map_acc); - } - } - - }; - }; }; #endif From 348d35a6689c773f97bc773ca3533515745de98a Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 20:46:08 -0700 Subject: [PATCH 17/18] add ignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index ae2c00e76..5227dbc87 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ xgboost *group *rar *vali +*data From 8e941b2a7964868f6fa20196ed1ab4171082c529 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 16 May 2014 20:49:05 -0700 Subject: [PATCH 18/18] Update README.md --- demo/multiclass_classification/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/demo/multiclass_classification/README.md b/demo/multiclass_classification/README.md index 4d66ee06a..72607de09 100644 --- a/demo/multiclass_classification/README.md +++ b/demo/multiclass_classification/README.md @@ -1,5 +1,7 @@ Demonstrating how to use XGBoost accomplish Multi-Class classification task on [UCI Dermatology dataset](https://archive.ics.uci.edu/ml/datasets/Dermatology) +Make sure you make make xgboost python module in ../../python + 1. Run runexp.sh ```bash ./runexp.sh