From 42267807f58239bbe06f5ad7f2af8d7925159e97 Mon Sep 17 00:00:00 2001 From: antinucleon Date: Fri, 16 May 2014 20:57:42 -0600 Subject: [PATCH 1/8] demo --- demo/multi_classification/train.py | 42 +++++++++++++++++++++++++++ demo/multi_classification/wgetdata.sh | 2 ++ 2 files changed, 44 insertions(+) create mode 100755 demo/multi_classification/train.py create mode 100755 demo/multi_classification/wgetdata.sh diff --git a/demo/multi_classification/train.py b/demo/multi_classification/train.py new file mode 100755 index 000000000..2dc98f4d6 --- /dev/null +++ b/demo/multi_classification/train.py @@ -0,0 +1,42 @@ + +import sys +import numpy as np +sys.path.append('../../python/') +import xgboost as xgb + + + +data = np.loadtxt('./dermatology.data', delimiter=',',converters={33: lambda x:int(x == '?'), 34: lambda x:int(x) } ) +sz = data.shape + +train = data[:int(sz[0] * 0.7), :] +test = data[int(sz[0] * 0.7):, :] + +train_X = train[:,0:33] +train_Y = train[:, 34] + + +test_X = test[:,0:33] +test_Y = test[:, 34] + +xg_train = xgb.DMatrix( train_X, label=train_Y) +xg_test = xgb.DMatrix(test_X, label=test_Y) +# setup parameters for xgboost +param = {} +# use logistic regression loss, use raw prediction before logistic transformation +# since we only need the rank +param['objective'] = 'multi:softmax' +# scale weight of positive examples +param['bst:eta'] = 0.1 +param['bst:max_depth'] = 6 +param['eval_metric'] = 'auc' +param['silent'] = 1 +param['nthread'] = 4 +param['num_class'] = 5 + +watchlist = [ (xg_train,'train'), (xg_test, 'test') ] +num_round = 5 +bst = xgb.train(param, xg_train, num_round, watchlist ); + + + diff --git a/demo/multi_classification/wgetdata.sh b/demo/multi_classification/wgetdata.sh new file mode 100755 index 000000000..10dbcd8fb --- /dev/null +++ b/demo/multi_classification/wgetdata.sh @@ -0,0 +1,2 @@ +#! 
/bin/bash +wget https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data From 945b336fc6d4fe924b63f3fff8eabb1aaf1bee6f Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 16 May 2014 20:00:20 -0700 Subject: [PATCH 2/8] Update README.md --- demo/kaggle-higgs/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/demo/kaggle-higgs/README.md b/demo/kaggle-higgs/README.md index a3c208002..b3db23266 100644 --- a/demo/kaggle-higgs/README.md +++ b/demo/kaggle-higgs/README.md @@ -16,5 +16,6 @@ make - +Speed +===== speedtest.py compares xgboost's speed on this dataset with sklearn.GBM From 2fcd875675467edc4fe7e8e0c437b07e9ec4465c Mon Sep 17 00:00:00 2001 From: antinucleon Date: Fri, 16 May 2014 21:05:11 -0600 Subject: [PATCH 3/8] demo --- demo/multi_classification/train.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/demo/multi_classification/train.py b/demo/multi_classification/train.py index 2dc98f4d6..89a065450 100755 --- a/demo/multi_classification/train.py +++ b/demo/multi_classification/train.py @@ -1,4 +1,4 @@ - +#! 
/usr/bin/python import sys import numpy as np sys.path.append('../../python/') @@ -29,7 +29,6 @@ param['objective'] = 'multi:softmax' # scale weight of positive examples param['bst:eta'] = 0.1 param['bst:max_depth'] = 6 -param['eval_metric'] = 'auc' param['silent'] = 1 param['nthread'] = 4 param['num_class'] = 5 From d3c0ed14f3620b385c29a6325e0539eb826fbf19 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 20:12:04 -0700 Subject: [PATCH 4/8] multi class --- demo/multi_classification/runexp.sh | 9 +++++++++ demo/multi_classification/train.py | 9 +++------ demo/multi_classification/wgetdata.sh | 2 -- 3 files changed, 12 insertions(+), 8 deletions(-) create mode 100755 demo/multi_classification/runexp.sh delete mode 100755 demo/multi_classification/wgetdata.sh diff --git a/demo/multi_classification/runexp.sh b/demo/multi_classification/runexp.sh new file mode 100755 index 000000000..0af814725 --- /dev/null +++ b/demo/multi_classification/runexp.sh @@ -0,0 +1,9 @@ +#!/bin/bash +if [ -f dermatology.data ] +then + echo "use existing data to run multi class classification" +else + echo "getting data from uci, make sure you are connected to internet" + wget https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data +fi +python train.py diff --git a/demo/multi_classification/train.py b/demo/multi_classification/train.py index 89a065450..d51824a16 100755 --- a/demo/multi_classification/train.py +++ b/demo/multi_classification/train.py @@ -4,9 +4,8 @@ import numpy as np sys.path.append('../../python/') import xgboost as xgb - - -data = np.loadtxt('./dermatology.data', delimiter=',',converters={33: lambda x:int(x == '?'), 34: lambda x:int(x) } ) +# label need to be 0 to num_class -1 +data = np.loadtxt('./dermatology.data', delimiter=',',converters={33: lambda x:int(x == '?'), 34: lambda x:int(x)-1 } ) sz = data.shape train = data[:int(sz[0] * 0.7), :] @@ -31,11 +30,9 @@ param['bst:eta'] = 0.1 param['bst:max_depth'] = 6 param['silent'] = 1 
param['nthread'] = 4 -param['num_class'] = 5 +param['num_class'] = 6 watchlist = [ (xg_train,'train'), (xg_test, 'test') ] num_round = 5 bst = xgb.train(param, xg_train, num_round, watchlist ); - - diff --git a/demo/multi_classification/wgetdata.sh b/demo/multi_classification/wgetdata.sh deleted file mode 100755 index 10dbcd8fb..000000000 --- a/demo/multi_classification/wgetdata.sh +++ /dev/null @@ -1,2 +0,0 @@ -#! /bin/bash -wget https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data From 7ea988a76bd8ab0e49921d3fab161387ae1f5813 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 16 May 2014 20:16:10 -0700 Subject: [PATCH 5/8] Update train.py --- demo/multi_classification/train.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/demo/multi_classification/train.py b/demo/multi_classification/train.py index d51824a16..d4cf2a0d4 100755 --- a/demo/multi_classification/train.py +++ b/demo/multi_classification/train.py @@ -22,8 +22,7 @@ xg_train = xgb.DMatrix( train_X, label=train_Y) xg_test = xgb.DMatrix(test_X, label=test_Y) # setup parameters for xgboost param = {} -# use logistic regression loss, use raw prediction before logistic transformation -# since we only need the rank +# use softmax multi-class classification param['objective'] = 'multi:softmax' # scale weight of positive examples param['bst:eta'] = 0.1 @@ -35,4 +34,9 @@ param['num_class'] = 6 watchlist = [ (xg_train,'train'), (xg_test, 'test') ] num_round = 5 bst = xgb.train(param, xg_train, num_round, watchlist ); +# get prediction +pred = bst.predict( xg_test ); + +print 'error=%f' % sum(int(pred[i]) != test_Y[i] for i in len(test_Y)) / float(len(test_Y)) + From 23f4c410354ad0f49fba0a792b813116bce54971 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 20:18:34 -0700 Subject: [PATCH 6/8] chg --- demo/multi_classification/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/multi_classification/train.py 
b/demo/multi_classification/train.py index d4cf2a0d4..38d818890 100755 --- a/demo/multi_classification/train.py +++ b/demo/multi_classification/train.py @@ -37,6 +37,6 @@ bst = xgb.train(param, xg_train, num_round, watchlist ); # get prediction pred = bst.predict( xg_test ); -print 'error=%f' % sum(int(pred[i]) != test_Y[i] for i in len(test_Y)) / float(len(test_Y)) +print 'predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in xrange(len(test_Y))) / float(len(test_Y)) ) From d5f6fba82de4d8530daffb56bd98b176ab98d40d Mon Sep 17 00:00:00 2001 From: antinucleon Date: Fri, 16 May 2014 21:27:37 -0600 Subject: [PATCH 7/8] chg --- demo/multiclass_classification/README.md | 8 ++++++++ .../runexp.sh | 0 .../train.py | 0 3 files changed, 8 insertions(+) create mode 100644 demo/multiclass_classification/README.md rename demo/{multi_classification => multiclass_classification}/runexp.sh (100%) rename demo/{multi_classification => multiclass_classification}/train.py (100%) diff --git a/demo/multiclass_classification/README.md b/demo/multiclass_classification/README.md new file mode 100644 index 000000000..4d66ee06a --- /dev/null +++ b/demo/multiclass_classification/README.md @@ -0,0 +1,8 @@ +Demonstrating how to use XGBoost accomplish Multi-Class classification task on [UCI Dermatology dataset](https://archive.ics.uci.edu/ml/datasets/Dermatology) + +1. 
Run runexp.sh
+```bash
+./runexp.sh
+```
+
+Explanations can be found in [wiki](https://github.com/tqchen/xgboost/wiki) diff --git a/demo/multi_classification/runexp.sh b/demo/multiclass_classification/runexp.sh similarity index 100% rename from demo/multi_classification/runexp.sh rename to demo/multiclass_classification/runexp.sh diff --git a/demo/multi_classification/train.py b/demo/multiclass_classification/train.py similarity index 100% rename from demo/multi_classification/train.py rename to demo/multiclass_classification/train.py From 416050d5c0f8dd11c5e5b75f274b6cdafef83251 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 16 May 2014 20:28:07 -0700 Subject: [PATCH 8/8] fix softmax --- regrank/xgboost_regrank.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regrank/xgboost_regrank.h b/regrank/xgboost_regrank.h index b2649735c..e3868002a 100644 --- a/regrank/xgboost_regrank.h +++ b/regrank/xgboost_regrank.h @@ -97,8 +97,8 @@ namespace xgboost{ */ inline void InitTrainer(void){ if( mparam.num_class != 0 ){ - if( name_obj_ != "softmax" ){ - name_obj_ = "softmax"; + if( name_obj_ != "multi:softmax" ){ + name_obj_ = "multi:softmax"; printf("auto select objective=softmax to support multi-class classification\n" ); } }