diff --git a/.gitignore b/.gitignore
index 5f3c96d0f..ae2c00e76 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,8 @@
 *model
 xgboost
 *pyc
+*train
+*test
+*group
+*rar
+*vali
diff --git a/README.md b/README.md
index 64cbf5159..c775c9776 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,10 @@ xgboost: eXtreme Gradient Boosting
 =======
 An optimized general purpose gradient boosting (tree) library.
 
-Authors:
+Contributors:
 * Tianqi Chen, project creater
 * Kailong Chen, contributes regression module
+* Bing Xu, contributes python interface, higgs example
 
 Turorial and Documentation: https://github.com/tqchen/xgboost/wiki
 
diff --git a/demo/binary_classification/mushroom.conf b/demo/binary_classification/mushroom.conf
index 596857aee..dbc832244 100644
--- a/demo/binary_classification/mushroom.conf
+++ b/demo/binary_classification/mushroom.conf
@@ -2,7 +2,7 @@
 # choose the tree booster, 0: tree, 1: linear
 booster_type = 0
 # choose logistic regression loss function for binary classification
-loss_type = 2
+objective = binary:logistic
 
 # Tree Booster Parameters
 # step size shrinkage
diff --git a/demo/kaggle-higgs/README.md b/demo/kaggle-higgs/README.md
index 2413b2ad7..a3c208002 100644
--- a/demo/kaggle-higgs/README.md
+++ b/demo/kaggle-higgs/README.md
@@ -17,4 +17,4 @@ make
 
 
 
-
+speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
diff --git a/demo/kaggle-higgs/higgs-numpy.py b/demo/kaggle-higgs/higgs-numpy.py
index c16673da5..2bf4a82a5 100755
--- a/demo/kaggle-higgs/higgs-numpy.py
+++ b/demo/kaggle-higgs/higgs-numpy.py
@@ -31,8 +31,9 @@ xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
 
 # setup parameters for xgboost
 param = {}
-# use logistic regression loss
-param['loss_type'] = 3
+# use logistic regression loss, use raw prediction before logistic transformation
+# since we only need the rank
+param['objective'] = 'binary:logitraw'
 # scale weight of positive examples
 param['scale_pos_weight'] = sum_wneg/sum_wpos
 param['bst:eta'] = 0.1
diff --git a/demo/kaggle-higgs/run.sh b/demo/kaggle-higgs/run.sh
index e6b5d91fa..c69426c25 100755
--- a/demo/kaggle-higgs/run.sh
+++ b/demo/kaggle-higgs/run.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
 
-./higgs-numpy.py
-./higgs-pred.py
\ No newline at end of file
+python higgs-numpy.py
+python higgs-pred.py
\ No newline at end of file
diff --git a/demo/kaggle-higgs/speedtest.py b/demo/kaggle-higgs/speedtest.py
new file mode 100755
index 000000000..8bef29ff2
--- /dev/null
+++ b/demo/kaggle-higgs/speedtest.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+# example script comparing xgboost training speed with sklearn's GradientBoostingClassifier
+import sys
+import numpy as np
+# add path of xgboost python module
+sys.path.append('../../python/')
+import xgboost as xgb
+from sklearn.ensemble import GradientBoostingClassifier
+import time
+test_size = 550000
+
+# path to where the data lies
+dpath = 'data'
+
+# load in training data, directly use numpy
+dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
+print 'finish loading from csv '
+
+label  = dtrain[:,32]
+data   = dtrain[:,1:31]
+# rescale weight to make it same as test set
+weight = dtrain[:,31] * float(test_size) / len(label)
+
+sum_wpos = sum( weight[i] for i in xrange(len(label)) if label[i] == 1.0 )
+sum_wneg = sum( weight[i] for i in xrange(len(label)) if label[i] == 0.0 )
+
+# print weight statistics
+print 'weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos )
+
+# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
+xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
+
+# setup parameters for xgboost
+param = {}
+# use logistic regression loss
+param['objective'] = 'binary:logitraw'
+# scale weight of positive examples
+param['scale_pos_weight'] = sum_wneg/sum_wpos
+param['bst:eta'] = 0.1
+param['bst:max_depth'] = 6
+param['eval_metric'] = 'auc'
+param['silent'] = 1
+param['nthread'] = 4
+
+plst = param.items()+[('eval_metric', 'ams@0.15')]
+
+watchlist = [ (xgmat,'train') ]
+# boost 10 trees
+num_round = 10
+print 'loading data end, start to boost trees'
+print "training GBM from sklearn"
+tmp = time.time()
+gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2)
+gbm.fit(data, label)
+print "sklearn.GBM costs: %s seconds" % str(time.time() - tmp)
+#raw_input()
+print "training xgboost"
+threads = [1, 2, 4, 16]
+for i in threads:
+    param['nthread'] = i
+    tmp = time.time()
+    plst = param.items()+[('eval_metric', 'ams@0.15')]
+    bst = xgb.train( plst, xgmat, num_round, watchlist )
+    print "XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp))
+
+print 'finish training'
diff --git a/demo/rank/README b/demo/rank/README
index 6b40516d1..357e96932 100644
--- a/demo/rank/README
+++ b/demo/rank/README
@@ -1 +1,7 @@
-The dataset for ranking demo is from LETOR04 MQ2008 fold1,http://research.microsoft.com/en-us/um/beijing/projects/letor/letor4download.aspx
+Instructions:
+The dataset for the ranking demo is from LETOR04 MQ2008 fold1:
+http://research.microsoft.com/en-us/um/beijing/projects/letor/letor4download.aspx
+You can use the following commands to run the example:
+
+Get the data: ./wgetdata.sh
+Run the example: ./runexp.sh
diff --git a/demo/rank/mq2008.conf b/demo/rank/mq2008.conf
index 0853fcb4e..0d26580c2 100644
--- a/demo/rank/mq2008.conf
+++ b/demo/rank/mq2008.conf
@@ -2,10 +2,8 @@
 # choose the tree booster, 0: tree, 1: linear
 booster_type = 0
 
+# so far only the pairwise rank objective is supported
 objective="rank:pairwise"
-#objective="rank:softmax"
-#objective="rank:map"
-#objective="rank:ndcg"
 
 # Tree Booster Parameters
 # step size shrinkage
@@ -16,8 +14,7 @@ bst:gamma = 1.0
 bst:min_child_weight = 0.1
 # maximum depth of a tree
 bst:max_depth = 6
-eval_metric = "ndcg"
-eval_metric = "map"
+
 # Task parameters
 # the number of round to do boosting
 num_round = 4
diff --git a/demo/rank/runexp.sh b/demo/rank/runexp.sh
old mode 100644
new mode 100755
diff --git a/demo/rank/wgetdata.sh b/demo/rank/wgetdata.sh
new file mode 100755
index 000000000..16f7a2e05
--- /dev/null
+++ b/demo/rank/wgetdata.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar
+unrar x MQ2008.rar
+mv -f MQ2008/Fold1/*.txt .
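For reference, the net effect of the config changes in this diff is that the old numeric loss_type is replaced by a named objective. Below is a minimal sketch of the correspondence as it can be read off the hunks above; the dict and helper are illustrative only, not part of this commit:

# hypothetical mapping, not part of this commit: old numeric loss_type
# values from the demo configs -> new objective strings
OLD_LOSS_TYPE_TO_OBJECTIVE = {
    0: 'reg:linear',       # demo/regression/machine.conf
    2: 'binary:logistic',  # demo/binary_classification/mushroom.conf
    3: 'binary:logitraw',  # demo/kaggle-higgs/higgs-numpy.py
}

def to_objective(loss_type):
    # translate an old-style numeric config value to the new spelling
    return OLD_LOSS_TYPE_TO_OBJECTIVE[loss_type]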
diff --git a/demo/regression/machine.conf b/demo/regression/machine.conf
index 88bb6102f..5142bcbcf 100644
--- a/demo/regression/machine.conf
+++ b/demo/regression/machine.conf
@@ -1,9 +1,9 @@
 # General Parameters, see comment for each definition
 # choose the tree booster, 0: tree, 1: linear
 booster_type = 0
-# this is the only difference with classification, use 0: linear regression
-# when labels are in [0,1] we can also use 1: logistic regression
-loss_type = 0
+# this is the only difference from classification, use reg:linear to do linear regression
+# when labels are in [0,1] we can also use reg:logistic
+objective = reg:linear
 
 # Tree Booster Parameters
 # step size shrinkage
diff --git a/python/example/demo.py b/python/example/demo.py
index f5e0aa2a7..73935efab 100755
--- a/python/example/demo.py
+++ b/python/example/demo.py
@@ -12,7 +12,7 @@ dtrain = xgb.DMatrix('agaricus.txt.train')
 dtest = xgb.DMatrix('agaricus.txt.test')
 
 # specify parameters via map, definition are same as c++ version
-param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'loss_type':2 }
+param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
 
 # specify validations set to watch performance
 evallist = [(dtest,'eval'), (dtrain,'train')]
@@ -29,11 +29,6 @@ bst.dump_model('dump.raw.txt')
 # dump model with feature map
 bst.dump_model('dump.raw.txt','featmap.txt')
 
-# beta: interact mode
-bst.set_param('bst:interact:expand',4)
-bst.update_interact( dtrain, 'update', 0)
-bst.dump_model('dump.raw2.txt')
-
 ###
 # build dmatrix in python iteratively
 #
diff --git a/python/xgboost.py b/python/xgboost.py
index 5c3555770..d7cf9f63e 100644
--- a/python/xgboost.py
+++ b/python/xgboost.py
@@ -1,3 +1,4 @@
+# Authors: Tianqi Chen, Bing Xu
 # module for xgboost
 import ctypes
 import os
diff --git a/python/xgboost_python.cpp b/python/xgboost_python.cpp
index 8dd210c52..7c63fc6ac 100644
--- a/python/xgboost_python.cpp
+++ b/python/xgboost_python.cpp
@@ -1,3 +1,4 @@
+// implementation of the C-style interface exposed to Python via ctypes
 #include "xgboost_python.h"
 #include "../regrank/xgboost_regrank.h"
 #include "../regrank/xgboost_regrank_data.h"
diff --git a/python/xgboost_python.h b/python/xgboost_python.h
index ac3ca94ac..6c113a108 100644
--- a/python/xgboost_python.h
+++ b/python/xgboost_python.h
@@ -1,7 +1,8 @@
 #ifndef XGBOOST_PYTHON_H
 #define XGBOOST_PYTHON_H
 /*!
- * \file xgboost_regrank_data.h
+ * \file xgboost_python.h
+ * \author Tianqi Chen
  * \brief python wrapper for xgboost, using ctypes,
  *        hides everything behind functions
  *        use c style interface
diff --git a/regrank/xgboost_regrank.h b/regrank/xgboost_regrank.h
index c7fa9a222..b2649735c 100644
--- a/regrank/xgboost_regrank.h
+++ b/regrank/xgboost_regrank.h
@@ -25,7 +25,7 @@ namespace xgboost{
         RegRankBoostLearner(void){
             silent = 0;
             obj_ = NULL;
-            name_obj_ = "reg";
+            name_obj_ = "reg:linear";
         }
         /*!
          * \brief a regression booter associated with training and evaluating data
diff --git a/regrank/xgboost_regrank_data.h b/regrank/xgboost_regrank_data.h
index d5cd95f3c..f9c78f51c 100644
--- a/regrank/xgboost_regrank_data.h
+++ b/regrank/xgboost_regrank_data.h
@@ -129,7 +129,9 @@ namespace xgboost{
             if( fs.Read(&nwt, sizeof(unsigned) ) != 0 ){
                 utils::Assert( nwt == 0 || nwt == data.NumRow(), "invalid weight" );
                 info.weights.resize( nwt );
-                utils::Assert( fs.Read(&info.weights[0], sizeof(unsigned) * nwt) != 0, "Load weight file");
+                if( nwt != 0 ){
+                    utils::Assert( fs.Read(&info.weights[0], sizeof(unsigned) * nwt) != 0, "Load weight file");
+                }
             }
         }
         fs.Close();
diff --git a/regrank/xgboost_regrank_dev.hpp b/regrank/xgboost_regrank_dev.hpp
deleted file mode 100644
index 1925e4c8d..000000000
--- a/regrank/xgboost_regrank_dev.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-// some backup code
-
-    class LambdaRankObj_NDCG : public LambdaRankObj{
-
-        static inline float CalcDCG(const std::vector< float > &rec) {
-            double sumdcg = 0.0;
-            for (size_t i = 0; i < rec.size(); i++){
-                const unsigned rel = static_cast<unsigned>(rec[i]);
-                if (rel != 0){
-                    sumdcg += logf(2.0f) * ((1 << rel) - 1) / logf(i + 2);
-                }
-            }
-            return static_cast<float>(sumdcg);
-        }
-
-        /*
-         * \brief Obtain the delta NDCG if trying to switch the positions of instances in index1 or index2
-         *        in sorted triples. Here DCG is calculated as sigma_i 2^rel_i/log(i + 1)
-         * \param sorted_triple the fields are predition,label,original index
-         * \param index1,index2 the instances switched
-         * \param the IDCG of the list
-         */
-        inline float GetLambdaNDCG(const std::vector< Triple > sorted_triple,
-            int index1,
-            int index2, float IDCG){
-            double original = (1 << static_cast<int>(sorted_triple[index1].label_)) / log(index1 + 2)
-                + (1 << static_cast<int>(sorted_triple[index2].label_)) / log(index2 + 2);
-            double changed = (1 << static_cast<int>(sorted_triple[index2].label_)) / log(index1 + 2)
-                + (1 << static_cast<int>(sorted_triple[index1].label_)) / log(index2 + 2);
-            double ans = (original - changed) / IDCG;
-            if (ans < 0) ans = -ans;
-            return static_cast<float>(ans);
-        }
-
-        inline float GetIDCG(const std::vector< Triple > sorted_triple){
-            std::vector<float> labels;
-            for (size_t i = 0; i < sorted_triple.size(); i++){
-                labels.push_back(sorted_triple[i].label_);
-            }
-
-            std::sort(labels.begin(), labels.end(), std::greater<float>());
-            return CalcDCG(labels);
-        }
-
-        inline void GetLambda(const std::vector<float> &preds,
-            const std::vector<float> &labels,
-            const std::vector<unsigned> &group_index,
-            const std::vector< std::pair<int, int> > &pairs, std::vector<float> &lambda, int group){
-            std::vector< Triple > sorted_triple;
-            std::vector<int> index_remap;
-            float IDCG;
-
-            GetSortedTuple(preds, labels, group_index, group, sorted_triple);
-            GetIndexMap(sorted_triple, group_index[group], index_remap);
-            IDCG = GetIDCG(sorted_triple);
-
-            lambda.resize(pairs.size());
-            for (size_t i = 0; i < pairs.size(); i++){
-                lambda[i] = GetLambdaNDCG(sorted_triple,
-                    index_remap[pairs[i].first], index_remap[pairs[i].second], IDCG);
-            }
-        }
-    };
-
-    class LambdaRankObj_MAP : public LambdaRankObj{
-        class Quadruple{
-        public:
-            /* \brief the accumulated precision */
-            float ap_acc_;
-            /* \brief the accumulated precision assuming a positive instance is missing*/
-            float ap_acc_miss_;
-            /* \brief the accumulated precision assuming that one more positive instance is inserted ahead*/
-            float ap_acc_add_;
-            /* \brief the accumulated positive instance count */
-            float hits_;
-
-            Quadruple(){}
-
-            Quadruple(const Quadruple& q){
-                ap_acc_ = q.ap_acc_;
-                ap_acc_miss_ = q.ap_acc_miss_;
-                ap_acc_add_ = q.ap_acc_add_;
-                hits_ = q.hits_;
-            }
-
-            Quadruple(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits
-                ) :ap_acc_(ap_acc), ap_acc_miss_(ap_acc_miss), ap_acc_add_(ap_acc_add), hits_(hits){
-
-            }
-
-        };
-
-        /*
-         * \brief Obtain the delta MAP if trying to switch the positions of instances in index1 or index2
-         *        in sorted triples
-         * \param sorted_triple the fields are predition,label,original index
-         * \param index1,index2 the instances switched
-         * \param map_acc a vector containing the accumulated precisions for each position in a list
-         */
-        inline float GetLambdaMAP(const std::vector< Triple > sorted_triple,
-            int index1, int index2,
-            std::vector< Quadruple > &map_acc){
-            if (index1 == index2 || sorted_triple[index1].label_ == sorted_triple[index2].label_) return 0.0;
-            if (index1 > index2) std::swap(index1, index2);
-            float original = map_acc[index2].ap_acc_; // The accumulated precision in the interval [index1,index2]
-            if (index1 != 0) original -= map_acc[index1 - 1].ap_acc_;
-            float changed = 0;
-            if (sorted_triple[index1].label_ < sorted_triple[index2].label_){
-                changed += map_acc[index2 - 1].ap_acc_add_ - map_acc[index1].ap_acc_add_;
-                changed += (map_acc[index1].hits_ + 1.0f) / (index1 + 1);
-            }
-            else{
-                changed += map_acc[index2 - 1].ap_acc_miss_ - map_acc[index1].ap_acc_miss_;
-                changed += map_acc[index2].hits_ / (index2 + 1);
-            }
-            float ans = (changed - original) / (map_acc[map_acc.size() - 1].hits_);
-            if (ans < 0) ans = -ans;
-            return ans;
-        }
-
-        /*
-         * \brief preprocessing results for calculating delta MAP
-         * \return The first field is the accumulated precision, the second field is the
-         *         accumulated precision assuming a positive instance is missing,
-         *         the third field is the accumulated precision assuming that one more positive
-         *         instance is inserted, the fourth field is the accumulated positive instance count
-         */
-        inline void GetMAPAcc(const std::vector< Triple > sorted_triple,
-            std::vector< Quadruple > &map_acc){
-            map_acc.resize(sorted_triple.size());
-            float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0;
-            for (size_t i = 1; i <= sorted_triple.size(); i++){
-                if ((int)sorted_triple[i - 1].label_ == 1) {
-                    hit++;
-                    acc1 += hit / i;
-                    acc2 += (hit - 1) / i;
-                    acc3 += (hit + 1) / i;
-                }
-                map_acc[i-1] = Quadruple(acc1, acc2, acc3, hit);
-            }
-        }
-
-        inline void GetLambda(const std::vector<float> &preds,
-            const std::vector<float> &labels,
-            const std::vector<unsigned> &group_index,
-            const std::vector< std::pair<int, int> > &pairs, std::vector<float> &lambda, int group){
-            std::vector< Triple > sorted_triple;
-            std::vector<int> index_remap;
-            std::vector< Quadruple > map_acc;
-
-            GetSortedTuple(preds, labels, group_index, group, sorted_triple);
-            GetIndexMap(sorted_triple, group_index[group], index_remap);
-            GetMAPAcc(sorted_triple, map_acc);
-
-            lambda.resize(pairs.size());
-            for (size_t i = 0; i < pairs.size(); i++){
-                lambda[i] = GetLambdaMAP(sorted_triple,
-                    index_remap[pairs[i].first], index_remap[pairs[i].second], map_acc);
-            }
-        }
-    };
-
diff --git a/regrank/xgboost_regrank_obj.h b/regrank/xgboost_regrank_obj.h
index 5a7bad9dd..195646729 100644
--- a/regrank/xgboost_regrank_obj.h
+++ b/regrank/xgboost_regrank_obj.h
@@ -109,6 +109,7 @@ namespace xgboost{
 namespace xgboost{
     namespace regrank{
         inline IObjFunction* CreateObjFunction( const char *name ){
+<<<<<<< HEAD
             if( !strcmp("reg", name ) ) return new RegressionObj();
             if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj();
             if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj();
@@ -118,6 +119,18 @@ namespace xgboost{
             utils::Error("unknown objective function type");
             return NULL;
         }
+=======
+            if( !strcmp("reg:linear", name ) ) return new RegressionObj( LossType::kLinearSquare );
+            if( !strcmp("reg:logistic", name ) ) return new RegressionObj( LossType::kLogisticNeglik );
+            if( !strcmp("binary:logistic", name ) ) return new RegressionObj( LossType::kLogisticClassify );
+            if( !strcmp("binary:logitraw", name ) ) return new RegressionObj( LossType::kLogisticRaw );
+            if( !strcmp("multi:softmax", name ) ) return new SoftmaxMultiClassObj();
+            if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj();
+            if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj();
+            utils::Error("unknown objective function type");
+            return NULL;
+        }
+>>>>>>> 9eabb5c7f912a326005aca53a76c2e53a1661842
     };
 };
 #endif
diff --git a/regrank/xgboost_regrank_obj.hpp b/regrank/xgboost_regrank_obj.hpp
index 5272b42fa..7350541cb 100644
--- a/regrank/xgboost_regrank_obj.hpp
+++ b/regrank/xgboost_regrank_obj.hpp
@@ -14,8 +14,8 @@ namespace xgboost{
     namespace regrank{
         class RegressionObj : public IObjFunction{
         public:
-            RegressionObj(void){
-                loss.loss_type = LossType::kLinearSquare;
+            RegressionObj( int loss_type ){
+                loss.loss_type = loss_type;
             }
             virtual ~RegressionObj(){}
             virtual void SetParam(const char *name, const char *val){
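Taken together, objectives are now selected by name everywhere: the demo .conf files set objective=..., and the Python wrapper passes the same strings through to CreateObjFunction above, which maps them to concrete loss implementations. A minimal sketch of the new-style Python usage, modeled on python/example/demo.py from this diff (it assumes the agaricus demo files and the '../../python' module path; num_round is illustrative):

import sys
sys.path.append('../../python/')  # path of the xgboost python module, as in the demos
import xgboost as xgb

dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')
# the string 'objective' replaces the old numeric 'loss_type'
param = {'bst:max_depth': 2, 'bst:eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
evallist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2  # assumed round count for illustration
bst = xgb.train(param, dtrain, num_round, evallist)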