From e0a0343ae671138a0e119c7f954c7ce798554547 Mon Sep 17 00:00:00 2001 From: antinucleon Date: Fri, 16 May 2014 17:48:03 -0600 Subject: [PATCH 01/12] speedtest --- demo/kaggle-higgs/speedtest.py | 63 ++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100755 demo/kaggle-higgs/speedtest.py diff --git a/demo/kaggle-higgs/speedtest.py b/demo/kaggle-higgs/speedtest.py new file mode 100755 index 000000000..1b07d4619 --- /dev/null +++ b/demo/kaggle-higgs/speedtest.py @@ -0,0 +1,63 @@ +#!/usr/local/bin/python +# this is the example script to use xgboost to train +import sys +import numpy as np +# add path of xgboost python module +sys.path.append('../../python/') +import xgboost as xgb +from sklearn.ensemble import GradientBoostingClassifier +import time +test_size = 550000 + +# path to where the data lies +dpath = 'data' + +# load in training data, directly use numpy +dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } ) +print 'finish loading from csv ' + +label = dtrain[:,32] +data = dtrain[:,1:31] +# rescale weight to make it same as test set +weight = dtrain[:,31] * float(test_size) / len(label) + +sum_wpos = sum( weight[i] for i in xrange(len(label)) if label[i] == 1.0 ) +sum_wneg = sum( weight[i] for i in xrange(len(label)) if label[i] == 0.0 ) + +# print weight statistics +print 'weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ) + +# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value +xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight ) + +# setup parameters for xgboost +param = {} +# use logistic regression loss +param['loss_type'] = 1 +# scale weight of positive examples +param['scale_pos_weight'] = sum_wneg/sum_wpos +param['bst:eta'] = 0.1 +param['bst:max_depth'] = 6 +param['eval_metric'] = 'auc' +param['silent'] = 1 +param['nthread'] = 4 + +# you can directly throw param in, though we want to watch multiple metrics here +plst = param.items()+[('eval_metric', 'ams@0.15')] + +watchlist = [ (xgmat,'train') ] +# boost 135 tres +num_round = 135 +print 'loading data end, start to boost trees' +print "training GBM from sklearn" +tmp = time.time() +gbm = GradientBoostingClassifier(n_estimators=135, max_depth=6, verbose=2) +gbm.fit(data, label) +print "GBM costs: %s seconds" % str(time.time() - tmp) +#raw_input() +print "training xgboost" +tmp = time.time() +bst = xgb.train( plst, xgmat, num_round, watchlist ); +print "XGBoost costs: %s seconds" % str(time.time() - tmp) + +print 'finish training' From ae70b9b152c659d2d79ba9497e523430cf0b0f83 Mon Sep 17 00:00:00 2001 From: antinucleon Date: Fri, 16 May 2014 18:05:17 -0600 Subject: [PATCH 02/12] new speed test --- demo/kaggle-higgs/speedtest.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/demo/kaggle-higgs/speedtest.py b/demo/kaggle-higgs/speedtest.py index 1b07d4619..d17c2680b 100755 --- a/demo/kaggle-higgs/speedtest.py +++ b/demo/kaggle-higgs/speedtest.py @@ -42,22 +42,25 @@ param['eval_metric'] = 'auc' param['silent'] = 1 param['nthread'] = 4 -# you can directly throw param in, though we want to watch multiple metrics here plst = param.items()+[('eval_metric', 'ams@0.15')] watchlist = [ (xgmat,'train') ] -# boost 135 tres -num_round = 135 +# boost 10 tres +num_round = 10 print 'loading data end, start to boost trees' print "training GBM from sklearn" tmp = time.time() -gbm = GradientBoostingClassifier(n_estimators=135, max_depth=6, verbose=2) +gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2) gbm.fit(data, label) print "GBM costs: %s seconds" % str(time.time() - tmp) #raw_input() print "training xgboost" -tmp = time.time() -bst = xgb.train( plst, xgmat, num_round, watchlist ); -print "XGBoost costs: %s seconds" % str(time.time() - tmp) +threads = [1, 2, 4, 16] +for i in threads: + param['nthread'] = i + tmp = time.time() + plst = param.items()+[('eval_metric', 'ams@0.15')] + bst = xgb.train( plst, xgmat, num_round, watchlist ); + print "XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp)) print 'finish training' From c28a1be34c0c6ed32fb1c7db042fced9a13979e7 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 18:19:57 -0700 Subject: [PATCH 03/12] minor changes --- demo/kaggle-higgs/README.md | 2 +- demo/kaggle-higgs/run.sh | 4 ++-- demo/kaggle-higgs/speedtest.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/demo/kaggle-higgs/README.md b/demo/kaggle-higgs/README.md index b2ec971bd..2736b1326 100644 --- a/demo/kaggle-higgs/README.md +++ b/demo/kaggle-higgs/README.md @@ -7,4 +7,4 @@ run ./run.sh you need to compile xgboost python model in ../../python - +speedtest.py compares xgboost's speed on this dataset with sklearn diff --git a/demo/kaggle-higgs/run.sh b/demo/kaggle-higgs/run.sh index e6b5d91fa..c69426c25 100755 --- a/demo/kaggle-higgs/run.sh +++ b/demo/kaggle-higgs/run.sh @@ -1,4 +1,4 @@ #!/bin/bash -./higgs-numpy.py -./higgs-pred.py \ No newline at end of file +python higgs-numpy.py +python higgs-pred.py \ No newline at end of file diff --git a/demo/kaggle-higgs/speedtest.py b/demo/kaggle-higgs/speedtest.py index d17c2680b..212389c01 100755 --- a/demo/kaggle-higgs/speedtest.py +++ b/demo/kaggle-higgs/speedtest.py @@ -1,4 +1,4 @@ -#!/usr/local/bin/python +#!/usr/bin/python # this is the example script to use xgboost to train import sys import numpy as np @@ -52,7 +52,7 @@ print "training GBM from sklearn" tmp = time.time() gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2) gbm.fit(data, label) -print "GBM costs: %s seconds" % str(time.time() - tmp) +print "sklearn.GBM costs: %s seconds" % str(time.time() - tmp) #raw_input() print "training xgboost" threads = [1, 2, 4, 16] From b3c3ecd9c969aa81eee704a11e8928b9c9727430 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 18:25:01 -0700 Subject: [PATCH 04/12] chng few things --- demo/kaggle-higgs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/kaggle-higgs/README.md b/demo/kaggle-higgs/README.md index 2736b1326..ffd140d0a 100644 --- a/demo/kaggle-higgs/README.md +++ b/demo/kaggle-higgs/README.md @@ -7,4 +7,4 @@ run ./run.sh you need to compile xgboost python model in ../../python -speedtest.py compares xgboost's speed on this dataset with sklearn +speedtest.py compares xgboost's speed on this dataset with sklearn.GBM From 72d3a6a3cc2b2d5b322be82032da6847faa5504c Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 18:38:40 -0700 Subject: [PATCH 05/12] chg rank demo --- demo/rank/mq2008.conf | 6 +----- demo/rank/runexp.sh | 1 - 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/demo/rank/mq2008.conf b/demo/rank/mq2008.conf index 08b9d9679..1ec289ba1 100644 --- a/demo/rank/mq2008.conf +++ b/demo/rank/mq2008.conf @@ -3,9 +3,6 @@ booster_type = 0 objective="rank:pairwise" -#objective="rank:softmax" -#objective="lambdarank:map" -#objective="lambdarank:ndcg" # Tree Booster Parameters # step size shrinkage @@ -16,8 +13,7 @@ bst:gamma = 1.0 bst:min_child_weight = 0.1 # maximum depth of a tree bst:max_depth = 6 -eval_metric = "ndcg" -eval_metric = "map" + # Task parameters # the number of round to do boosting num_round = 4 diff --git a/demo/rank/runexp.sh b/demo/rank/runexp.sh index cb15f1dd0..201c287e9 100644 --- a/demo/rank/runexp.sh +++ b/demo/rank/runexp.sh @@ -8,4 +8,3 @@ python trans_data.py vali.txt mq2008.vali mq2008.vali.group ../../xgboost mq2008.conf task=pred model_in=0002.model -../../xgboost mq2008.conf task=dump model_in=0002.model name_dump=dump.raw.txt \ No newline at end of file From fd2774e133371570079f54d7e226044938556eed Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 18:40:46 -0700 Subject: [PATCH 06/12] cleanup --- demo/rank/README | 5 ++++- demo/rank/train | 0 2 files changed, 4 insertions(+), 1 deletion(-) delete mode 100644 demo/rank/train diff --git a/demo/rank/README b/demo/rank/README index 6b40516d1..499b01d51 100644 --- a/demo/rank/README +++ b/demo/rank/README @@ -1 +1,4 @@ -The dataset for ranking demo is from LETOR04 MQ2008 fold1,http://research.microsoft.com/en-us/um/beijing/projects/letor/letor4download.aspx +Instructions: +The dataset for ranking demo is from LETOR04 MQ2008 fold1, +http://research.microsoft.com/en-us/um/beijing/projects/letor/letor4download.aspx + diff --git a/demo/rank/train b/demo/rank/train deleted file mode 100644 index e69de29bb..000000000 From 9bc6e83afee56b2d728a3c4fcbf877a55669a5b8 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 18:46:43 -0700 Subject: [PATCH 07/12] chg scripts --- demo/rank/README | 13 +++++++++++-- demo/rank/mq2008.conf | 1 + demo/rank/runexp.sh | 2 -- demo/rank/wgetdata.sh | 4 ++++ 4 files changed, 16 insertions(+), 4 deletions(-) mode change 100644 => 100755 demo/rank/runexp.sh create mode 100755 demo/rank/wgetdata.sh diff --git a/demo/rank/README b/demo/rank/README index 499b01d51..357e96932 100644 --- a/demo/rank/README +++ b/demo/rank/README @@ -1,4 +1,13 @@ Instructions: -The dataset for ranking demo is from LETOR04 MQ2008 fold1, -http://research.microsoft.com/en-us/um/beijing/projects/letor/letor4download.aspx +The dataset for ranking demo is from LETOR04 MQ2008 fold1, +You can use the following command to run the example + + +Get the data: ./wgetdata.sh +Run the example: ./runexp.sh + + + + + diff --git a/demo/rank/mq2008.conf b/demo/rank/mq2008.conf index 1ec289ba1..0d26580c2 100644 --- a/demo/rank/mq2008.conf +++ b/demo/rank/mq2008.conf @@ -2,6 +2,7 @@ # choose the tree booster, 0: tree, 1: linear booster_type = 0 +# so far, we have pairwise rank objective="rank:pairwise" # Tree Booster Parameters diff --git a/demo/rank/runexp.sh b/demo/rank/runexp.sh old mode 100644 new mode 100755 index 201c287e9..edfb91110 --- a/demo/rank/runexp.sh +++ b/demo/rank/runexp.sh @@ -1,7 +1,5 @@ python trans_data.py train.txt mq2008.train mq2008.train.group - python trans_data.py test.txt mq2008.test mq2008.test.group - python trans_data.py vali.txt mq2008.vali mq2008.vali.group ../../xgboost mq2008.conf diff --git a/demo/rank/wgetdata.sh b/demo/rank/wgetdata.sh new file mode 100755 index 000000000..16f7a2e05 --- /dev/null +++ b/demo/rank/wgetdata.sh @@ -0,0 +1,4 @@ +#!/bin/bash +wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar +unrar x MQ2008.rar +mv -f MQ2008/Fold1/*.txt . From 1839e6efe99e19d86592bc2cfcf15f82a69b7713 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 18:49:02 -0700 Subject: [PATCH 08/12] pre-release version --- regrank/xgboost_regrank_dev.hpp | 164 -------------------------------- 1 file changed, 164 deletions(-) delete mode 100644 regrank/xgboost_regrank_dev.hpp diff --git a/regrank/xgboost_regrank_dev.hpp b/regrank/xgboost_regrank_dev.hpp deleted file mode 100644 index 1925e4c8d..000000000 --- a/regrank/xgboost_regrank_dev.hpp +++ /dev/null @@ -1,164 +0,0 @@ -// some backup code - - class LambdaRankObj_NDCG : public LambdaRankObj{ - - static inline float CalcDCG(const std::vector< float > &rec) { - double sumdcg = 0.0; - for (size_t i = 0; i < rec.size(); i++){ - const unsigned rel = static_cast(rec[i]); - if (rel != 0){ - sumdcg += logf(2.0f) *((1 << rel) - 1) / logf(i + 2); - } - } - return static_cast(sumdcg); - } - - /* - * \brief Obtain the delta NDCG if trying to switch the positions of instances in index1 or index2 - * in sorted triples. Here DCG is calculated as sigma_i 2^rel_i/log(i + 1) - * \param sorted_triple the fields are predition,label,original index - * \param index1,index2 the instances switched - * \param the IDCG of the list - */ - inline float GetLambdaNDCG(const std::vector< Triple > sorted_triple, - int index1, - int index2, float IDCG){ - double original = (1 << static_cast(sorted_triple[index1].label_)) / log(index1 + 2) - + (1 << static_cast(sorted_triple[index2].label_)) / log(index2 + 2); - double changed = (1 << static_cast(sorted_triple[index2].label_)) / log(index1 + 2) - + (1 << static_cast(sorted_triple[index1].label_)) / log(index2 + 2); - double ans = (original - changed) / IDCG; - if (ans < 0) ans = -ans; - return static_cast(ans); - } - - - inline float GetIDCG(const std::vector< Triple > sorted_triple){ - std::vector labels; - for (size_t i = 0; i < sorted_triple.size(); i++){ - labels.push_back(sorted_triple[i].label_); - } - - std::sort(labels.begin(), labels.end(), std::greater()); - return CalcDCG(labels); - } - - inline void GetLambda(const std::vector &preds, - const std::vector &labels, - const std::vector &group_index, - const std::vector< std::pair > &pairs, std::vector &lambda, int group){ - std::vector< Triple > sorted_triple; - std::vector index_remap; - float IDCG; - - GetSortedTuple(preds, labels, group_index, group, sorted_triple); - GetIndexMap(sorted_triple, group_index[group], index_remap); - IDCG = GetIDCG(sorted_triple); - - lambda.resize(pairs.size()); - for (size_t i = 0; i < pairs.size(); i++){ - lambda[i] = GetLambdaNDCG(sorted_triple, - index_remap[pairs[i].first],index_remap[pairs[i].second],IDCG); - } - } - }; - - class LambdaRankObj_MAP : public LambdaRankObj{ - class Quadruple{ - public: - /* \brief the accumulated precision */ - float ap_acc_; - /* \brief the accumulated precision assuming a positive instance is missing*/ - float ap_acc_miss_; - /* \brief the accumulated precision assuming that one more positive instance is inserted ahead*/ - float ap_acc_add_; - /* \brief the accumulated positive instance count */ - float hits_; - - Quadruple(){} - - Quadruple(const Quadruple& q){ - ap_acc_ = q.ap_acc_; - ap_acc_miss_ = q.ap_acc_miss_; - ap_acc_add_ = q.ap_acc_add_; - hits_ = q.hits_; - } - - Quadruple(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits - ) :ap_acc_(ap_acc), ap_acc_miss_(ap_acc_miss), ap_acc_add_(ap_acc_add), hits_(hits){ - - } - - }; - - /* - * \brief Obtain the delta MAP if trying to switch the positions of instances in index1 or index2 - * in sorted triples - * \param sorted_triple the fields are predition,label,original index - * \param index1,index2 the instances switched - * \param map_acc a vector containing the accumulated precisions for each position in a list - */ - inline float GetLambdaMAP(const std::vector< Triple > sorted_triple, - int index1, int index2, - std::vector< Quadruple > &map_acc){ - if (index1 == index2 || sorted_triple[index1].label_ == sorted_triple[index2].label_) return 0.0; - if (index1 > index2) std::swap(index1, index2); - float original = map_acc[index2].ap_acc_; // The accumulated precision in the interval [index1,index2] - if (index1 != 0) original -= map_acc[index1 - 1].ap_acc_; - float changed = 0; - if (sorted_triple[index1].label_ < sorted_triple[index2].label_){ - changed += map_acc[index2 - 1].ap_acc_add_ - map_acc[index1].ap_acc_add_; - changed += (map_acc[index1].hits_ + 1.0f) / (index1 + 1); - } - else{ - changed += map_acc[index2 - 1].ap_acc_miss_ - map_acc[index1].ap_acc_miss_; - changed += map_acc[index2].hits_ / (index2 + 1); - } - float ans = (changed - original) / (map_acc[map_acc.size() - 1].hits_); - if (ans < 0) ans = -ans; - return ans; - } - - - /* - * \brief preprocessing results for calculating delta MAP - * \return The first field is the accumulated precision, the second field is the - * accumulated precision assuming a positive instance is missing, - * the third field is the accumulated precision assuming that one more positive - * instance is inserted, the fourth field is the accumulated positive instance count - */ - inline void GetMAPAcc(const std::vector< Triple > sorted_triple, - std::vector< Quadruple > &map_acc){ - map_acc.resize(sorted_triple.size()); - float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0; - for (size_t i = 1; i <= sorted_triple.size(); i++){ - if ((int)sorted_triple[i - 1].label_ == 1) { - hit++; - acc1 += hit / i; - acc2 += (hit - 1) / i; - acc3 += (hit + 1) / i; - } - map_acc[i-1] = Quadruple(acc1, acc2, acc3, hit); - } - } - - inline void GetLambda(const std::vector &preds, - const std::vector &labels, - const std::vector &group_index, - const std::vector< std::pair > &pairs, std::vector &lambda, int group){ - std::vector< Triple > sorted_triple; - std::vector index_remap; - std::vector< Quadruple > map_acc; - - GetSortedTuple(preds, labels, group_index, group, sorted_triple); - GetIndexMap(sorted_triple, group_index[group], index_remap); - GetMAPAcc(sorted_triple, map_acc); - - lambda.resize(pairs.size()); - for (size_t i = 0; i < pairs.size(); i++){ - lambda[i] = GetLambdaMAP(sorted_triple, - index_remap[pairs[i].first], index_remap[pairs[i].second], map_acc); - } - } - }; - From ebcce4a2bf4e96622b13f12d9129cc23235afce8 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 19:10:52 -0700 Subject: [PATCH 09/12] chg all settings to obj --- .gitignore | 5 +++++ demo/binary_classification/mushroom.conf | 2 +- demo/kaggle-higgs/higgs-numpy.py | 5 +++-- demo/kaggle-higgs/speedtest.py | 2 +- demo/rank/runexp.sh | 3 +-- demo/regression/machine.conf | 6 +++--- regrank/xgboost_regrank.h | 2 +- regrank/xgboost_regrank_data.h | 4 +++- regrank/xgboost_regrank_obj.h | 19 ++++++++++--------- regrank/xgboost_regrank_obj.hpp | 4 ++-- 10 files changed, 30 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 5f3c96d0f..ae2c00e76 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,8 @@ *model xgboost *pyc +*train +*test +*group +*rar +*vali diff --git a/demo/binary_classification/mushroom.conf b/demo/binary_classification/mushroom.conf index 596857aee..dbc832244 100644 --- a/demo/binary_classification/mushroom.conf +++ b/demo/binary_classification/mushroom.conf @@ -2,7 +2,7 @@ # choose the tree booster, 0: tree, 1: linear booster_type = 0 # choose logistic regression loss function for binary classification -loss_type = 2 +objective = binary:logistic # Tree Booster Parameters # step size shrinkage diff --git a/demo/kaggle-higgs/higgs-numpy.py b/demo/kaggle-higgs/higgs-numpy.py index c16673da5..2bf4a82a5 100755 --- a/demo/kaggle-higgs/higgs-numpy.py +++ b/demo/kaggle-higgs/higgs-numpy.py @@ -31,8 +31,9 @@ xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight ) # setup parameters for xgboost param = {} -# use logistic regression loss -param['loss_type'] = 3 +# use logistic regression loss, use raw prediction before logistic transformation +# since we only need the rank +param['objective'] = 'binary:logitraw' # scale weight of positive examples param['scale_pos_weight'] = sum_wneg/sum_wpos param['bst:eta'] = 0.1 diff --git a/demo/kaggle-higgs/speedtest.py b/demo/kaggle-higgs/speedtest.py index 212389c01..8bef29ff2 100755 --- a/demo/kaggle-higgs/speedtest.py +++ b/demo/kaggle-higgs/speedtest.py @@ -33,7 +33,7 @@ xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight ) # setup parameters for xgboost param = {} # use logistic regression loss -param['loss_type'] = 1 +param['objective'] = 'binary:logitraw' # scale weight of positive examples param['scale_pos_weight'] = sum_wneg/sum_wpos param['bst:eta'] = 0.1 diff --git a/demo/rank/runexp.sh b/demo/rank/runexp.sh index edfb91110..d948ca5a0 100755 --- a/demo/rank/runexp.sh +++ b/demo/rank/runexp.sh @@ -3,6 +3,5 @@ python trans_data.py test.txt mq2008.test mq2008.test.group python trans_data.py vali.txt mq2008.vali mq2008.vali.group ../../xgboost mq2008.conf - -../../xgboost mq2008.conf task=pred model_in=0002.model +../../xgboost mq2008.conf task=pred model_in=0004.model diff --git a/demo/regression/machine.conf b/demo/regression/machine.conf index 88bb6102f..5142bcbcf 100644 --- a/demo/regression/machine.conf +++ b/demo/regression/machine.conf @@ -1,9 +1,9 @@ # General Parameters, see comment for each definition # choose the tree booster, 0: tree, 1: linear booster_type = 0 -# this is the only difference with classification, use 0: linear regression -# when labels are in [0,1] we can also use 1: logistic regression -loss_type = 0 +# this is the only difference with classification, use reg:linear to do linear classification +# when labels are in [0,1] we can also use reg:logistic +objective = reg:linear # Tree Booster Parameters # step size shrinkage diff --git a/regrank/xgboost_regrank.h b/regrank/xgboost_regrank.h index c7fa9a222..b2649735c 100644 --- a/regrank/xgboost_regrank.h +++ b/regrank/xgboost_regrank.h @@ -25,7 +25,7 @@ namespace xgboost{ RegRankBoostLearner(void){ silent = 0; obj_ = NULL; - name_obj_ = "reg"; + name_obj_ = "reg:linear"; } /*! * \brief a regression booter associated with training and evaluating data diff --git a/regrank/xgboost_regrank_data.h b/regrank/xgboost_regrank_data.h index d5cd95f3c..f9c78f51c 100644 --- a/regrank/xgboost_regrank_data.h +++ b/regrank/xgboost_regrank_data.h @@ -129,7 +129,9 @@ namespace xgboost{ if( fs.Read(&nwt, sizeof(unsigned) ) != 0 ){ utils::Assert( nwt == 0 || nwt == data.NumRow(), "invalid weight" ); info.weights.resize( nwt ); - utils::Assert( fs.Read(&info.weights[0], sizeof(unsigned) * nwt) != 0, "Load weight file"); + if( nwt != 0 ){ + utils::Assert( fs.Read(&info.weights[0], sizeof(unsigned) * nwt) != 0, "Load weight file"); + } } } fs.Close(); diff --git a/regrank/xgboost_regrank_obj.h b/regrank/xgboost_regrank_obj.h index f2fee0653..24396101b 100644 --- a/regrank/xgboost_regrank_obj.h +++ b/regrank/xgboost_regrank_obj.h @@ -109,15 +109,16 @@ namespace xgboost{ namespace xgboost{ namespace regrank{ inline IObjFunction* CreateObjFunction( const char *name ){ - if( !strcmp("reg", name ) ) return new RegressionObj(); - if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj(); - if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj(); - if( !strcmp("softmax", name ) ) return new SoftmaxMultiClassObj(); - // if (!strcmp("lambdarank:map", name)) return new LambdaRankObj_MAP(); - // if (!strcmp("lambdarank:ndcg", name)) return new LambdaRankObj_NDCG(); - utils::Error("unknown objective function type"); - return NULL; - } + if( !strcmp("reg:linear", name ) ) return new RegressionObj( LossType::kLinearSquare ); + if( !strcmp("reg:logistic", name ) ) return new RegressionObj( LossType::kLogisticNeglik ); + if( !strcmp("binary:logistic", name ) ) return new RegressionObj( LossType::kLogisticClassify ); + if( !strcmp("binary:logitraw", name ) ) return new RegressionObj( LossType::kLogisticRaw ); + if( !strcmp("multi:softmax", name ) ) return new SoftmaxMultiClassObj(); + if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj(); + if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj(); + utils::Error("unknown objective function type"); + return NULL; + } }; }; #endif diff --git a/regrank/xgboost_regrank_obj.hpp b/regrank/xgboost_regrank_obj.hpp index e4d99e0c7..71ebec0ab 100644 --- a/regrank/xgboost_regrank_obj.hpp +++ b/regrank/xgboost_regrank_obj.hpp @@ -14,8 +14,8 @@ namespace xgboost{ namespace regrank{ class RegressionObj : public IObjFunction{ public: - RegressionObj(void){ - loss.loss_type = LossType::kLinearSquare; + RegressionObj( int loss_type ){ + loss.loss_type = loss_type; } virtual ~RegressionObj(){} virtual void SetParam(const char *name, const char *val){ From 3429ab344769606e2bf87f244b23151f4b110762 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 19:24:53 -0700 Subject: [PATCH 10/12] chgs --- python/example/demo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/example/demo.py b/python/example/demo.py index f5e0aa2a7..c1cf82b0f 100755 --- a/python/example/demo.py +++ b/python/example/demo.py @@ -12,7 +12,7 @@ dtrain = xgb.DMatrix('agaricus.txt.train') dtest = xgb.DMatrix('agaricus.txt.test') # specify parameters via map, definition are same as c++ version -param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'loss_type':2 } +param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' } # specify validations set to watch performance evallist = [(dtest,'eval'), (dtrain,'train')] From b90d1dc92b6f2ffb30621063d485de147ab7c6fc Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 16 May 2014 19:30:32 -0700 Subject: [PATCH 11/12] Update demo.py --- python/example/demo.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/example/demo.py b/python/example/demo.py index c1cf82b0f..73935efab 100755 --- a/python/example/demo.py +++ b/python/example/demo.py @@ -29,11 +29,6 @@ bst.dump_model('dump.raw.txt') # dump model with feature map bst.dump_model('dump.raw.txt','featmap.txt') -# beta: interact mode -bst.set_param('bst:interact:expand',4) -bst.update_interact( dtrain, 'update', 0) -bst.dump_model('dump.raw2.txt') - ### # build dmatrix in python iteratively # From 21b21e69de0fa87387bf58e3dd1841e38bc6b859 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 19:33:59 -0700 Subject: [PATCH 12/12] add bing to author list --- README.md | 3 ++- python/xgboost.py | 1 + python/xgboost_python.cpp | 1 + python/xgboost_python.h | 3 ++- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 64cbf5159..c775c9776 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,10 @@ xgboost: eXtreme Gradient Boosting ======= An optimized general purpose gradient boosting (tree) library. -Authors: +Contributors: * Tianqi Chen, project creater * Kailong Chen, contributes regression module +* Bing Xu, contributes python interface, higgs example Turorial and Documentation: https://github.com/tqchen/xgboost/wiki diff --git a/python/xgboost.py b/python/xgboost.py index 5c3555770..d7cf9f63e 100644 --- a/python/xgboost.py +++ b/python/xgboost.py @@ -1,3 +1,4 @@ +# Author: Tianqi Chen, Bing Xu # module for xgboost import ctypes import os diff --git a/python/xgboost_python.cpp b/python/xgboost_python.cpp index 8dd210c52..7c63fc6ac 100644 --- a/python/xgboost_python.cpp +++ b/python/xgboost_python.cpp @@ -1,3 +1,4 @@ +// implementations in ctypes #include "xgboost_python.h" #include "../regrank/xgboost_regrank.h" #include "../regrank/xgboost_regrank_data.h" diff --git a/python/xgboost_python.h b/python/xgboost_python.h index ac3ca94ac..6c113a108 100644 --- a/python/xgboost_python.h +++ b/python/xgboost_python.h @@ -1,7 +1,8 @@ #ifndef XGBOOST_PYTHON_H #define XGBOOST_PYTHON_H /*! - * \file xgboost_regrank_data.h + * \file xgboost_python.h + * \author Tianqi Chen * \brief python wrapper for xgboost, using ctypes, * hides everything behind functions * use c style interface