commit
59a9b6b325
1
.gitignore
vendored
1
.gitignore
vendored
@ -23,3 +23,4 @@ xgboost
|
|||||||
*group
|
*group
|
||||||
*rar
|
*rar
|
||||||
*vali
|
*vali
|
||||||
|
*data
|
||||||
|
|||||||
@ -15,6 +15,8 @@ Features
|
|||||||
- Sparse feature format allows easy handling of missing values, and improves computation efficiency.
|
- Sparse feature format allows easy handling of missing values, and improves computation efficiency.
|
||||||
* Push the limit on single machine:
|
* Push the limit on single machine:
|
||||||
- Efficient implementation that optimizes memory and computation.
|
- Efficient implementation that optimizes memory and computation.
|
||||||
|
* Speed: XGBoost is very fast
|
||||||
|
- In [demo/kaggle-higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), on the Kaggle Higgs data, it is faster (on our machine, 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier
|
||||||
* Layout of gradient boosting algorithm to support user defined objective
|
* Layout of gradient boosting algorithm to support user defined objective
|
||||||
* Python interface, works with numpy and scipy.sparse matrix
|
* Python interface, works with numpy and scipy.sparse matrix
|
||||||
|
|
||||||
|
|||||||
@ -14,7 +14,6 @@ make
|
|||||||
|
|
||||||
3. Run ./run.sh
|
3. Run ./run.sh
|
||||||
|
|
||||||
|
Speed
|
||||||
|
=====
|
||||||
|
|
||||||
speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
|
speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
|
||||||
|
|||||||
10
demo/multiclass_classification/README.md
Normal file
10
demo/multiclass_classification/README.md
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
Demonstrating how to use XGBoost to accomplish a multi-class classification task on the [UCI Dermatology dataset](https://archive.ics.uci.edu/ml/datasets/Dermatology)
|
||||||
|
|
||||||
|
Make sure you build the xgboost python module in ../../python
|
||||||
|
|
||||||
|
1. Run runexp.sh
|
||||||
|
```bash
|
||||||
|
./runexp.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Explanations can be found in the [wiki](https://github.com/tqchen/xgboost/wiki)
|
||||||
9
demo/multiclass_classification/runexp.sh
Executable file
9
demo/multiclass_classification/runexp.sh
Executable file
@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
if [ -f dermatology.data ]
|
||||||
|
then
|
||||||
|
echo "use existing data to run multi class classification"
|
||||||
|
else
|
||||||
|
echo "getting data from uci, make sure you are connected to internet"
|
||||||
|
wget https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data
|
||||||
|
fi
|
||||||
|
python train.py
|
||||||
42
demo/multiclass_classification/train.py
Executable file
42
demo/multiclass_classification/train.py
Executable file
@ -0,0 +1,42 @@
|
|||||||
|
#! /usr/bin/python
|
||||||
|
import sys
|
||||||
|
import numpy as np
|
||||||
|
sys.path.append('../../python/')
|
||||||
|
import xgboost as xgb
|
||||||
|
|
||||||
|
# labels need to be in the range 0 to num_class - 1
|
||||||
|
data = np.loadtxt('./dermatology.data', delimiter=',',converters={33: lambda x:int(x == '?'), 34: lambda x:int(x)-1 } )
|
||||||
|
sz = data.shape
|
||||||
|
|
||||||
|
train = data[:int(sz[0] * 0.7), :]
|
||||||
|
test = data[int(sz[0] * 0.7):, :]
|
||||||
|
|
||||||
|
train_X = train[:,0:33]
|
||||||
|
train_Y = train[:, 34]
|
||||||
|
|
||||||
|
|
||||||
|
test_X = test[:,0:33]
|
||||||
|
test_Y = test[:, 34]
|
||||||
|
|
||||||
|
xg_train = xgb.DMatrix( train_X, label=train_Y)
|
||||||
|
xg_test = xgb.DMatrix(test_X, label=test_Y)
|
||||||
|
# setup parameters for xgboost
|
||||||
|
param = {}
|
||||||
|
# use softmax multi-class classification
|
||||||
|
param['objective'] = 'multi:softmax'
|
||||||
|
# booster parameters: shrinkage step size (eta) and maximum tree depth
|
||||||
|
param['bst:eta'] = 0.1
|
||||||
|
param['bst:max_depth'] = 6
|
||||||
|
param['silent'] = 1
|
||||||
|
param['nthread'] = 4
|
||||||
|
param['num_class'] = 6
|
||||||
|
|
||||||
|
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
|
||||||
|
num_round = 5
|
||||||
|
bst = xgb.train(param, xg_train, num_round, watchlist );
|
||||||
|
# get prediction
|
||||||
|
pred = bst.predict( xg_test );
|
||||||
|
|
||||||
|
print 'predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in xrange(len(test_Y))) / float(len(test_Y)) )
|
||||||
|
|
||||||
|
|
||||||
@ -2,7 +2,7 @@
|
|||||||
# choose the tree booster, 0: tree, 1: linear
|
# choose the tree booster, 0: tree, 1: linear
|
||||||
booster_type = 0
|
booster_type = 0
|
||||||
|
|
||||||
# so far, we have pairwise rank
|
# specify objective
|
||||||
objective="rank:pairwise"
|
objective="rank:pairwise"
|
||||||
|
|
||||||
# Tree Booster Parameters
|
# Tree Booster Parameters
|
||||||
|
|||||||
@ -1,14 +1,8 @@
|
|||||||
#Download the dataset from web site
|
python trans_data.py train.txt mq2008.train mq2008.train.group
|
||||||
wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar
|
|
||||||
|
|
||||||
#please first install the unrar package
|
python trans_data.py test.txt mq2008.test mq2008.test.group
|
||||||
unrar x MQ2008
|
|
||||||
|
|
||||||
python trans_data.py MQ2008/Fold1/train.txt mq2008.train mq2008.train.group
|
python trans_data.py vali.txt mq2008.vali mq2008.vali.group
|
||||||
|
|
||||||
python trans_data.py MQ2008/Fold1/test.txt mq2008.test mq2008.test.group
|
|
||||||
|
|
||||||
python trans_data.py MQ2008/Fold1/vali.txt mq2008.vali mq2008.vali.group
|
|
||||||
|
|
||||||
../../xgboost mq2008.conf
|
../../xgboost mq2008.conf
|
||||||
|
|
||||||
|
|||||||
@ -97,8 +97,8 @@ namespace xgboost{
|
|||||||
*/
|
*/
|
||||||
inline void InitTrainer(void){
|
inline void InitTrainer(void){
|
||||||
if( mparam.num_class != 0 ){
|
if( mparam.num_class != 0 ){
|
||||||
if( name_obj_ != "softmax" ){
|
if( name_obj_ != "multi:softmax" ){
|
||||||
name_obj_ = "softmax";
|
name_obj_ = "multi:softmax";
|
||||||
printf("auto select objective=softmax to support multi-class classification\n" );
|
printf("auto select objective=softmax to support multi-class classification\n" );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -113,9 +113,10 @@ namespace xgboost{
|
|||||||
if( !strcmp("reg:logistic", name ) ) return new RegressionObj( LossType::kLogisticNeglik );
|
if( !strcmp("reg:logistic", name ) ) return new RegressionObj( LossType::kLogisticNeglik );
|
||||||
if( !strcmp("binary:logistic", name ) ) return new RegressionObj( LossType::kLogisticClassify );
|
if( !strcmp("binary:logistic", name ) ) return new RegressionObj( LossType::kLogisticClassify );
|
||||||
if( !strcmp("binary:logitraw", name ) ) return new RegressionObj( LossType::kLogisticRaw );
|
if( !strcmp("binary:logitraw", name ) ) return new RegressionObj( LossType::kLogisticRaw );
|
||||||
if( !strcmp("multi:softmax", name ) ) return new SoftmaxMultiClassObj();
|
if( !strcmp("multi:softmax", name ) ) return new SoftmaxMultiClassObj();
|
||||||
if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj();
|
if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj();
|
||||||
if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj();
|
if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj();
|
||||||
|
if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj();
|
||||||
utils::Error("unknown objective function type");
|
utils::Error("unknown objective function type");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user