diff --git a/.gitignore b/.gitignore
index ae2c00e76..5227dbc87 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,4 @@ xgboost
 *group
 *rar
 *vali
+*data
diff --git a/README.md b/README.md
index c775c9776..659f8d5fa 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ Features
 - Sparse feature format allows easy handling of missing values, and improve computation efficiency.
 * Push the limit on single machine:
   - Efficient implementation that optimizes memory and computation.
+* Speed: XGBoost is very fast
+  - In [demo/kaggle-higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), on the Kaggle Higgs data it runs about 20 times faster (on our machine, using 4 threads) than sklearn.ensemble.GradientBoostingClassifier
 * Layout of gradient boosting algorithm to support user defined objective
 * Python interface, works with numpy and scipy.sparse matrix
diff --git a/demo/kaggle-higgs/README.md b/demo/kaggle-higgs/README.md
index a3c208002..28472a848 100644
--- a/demo/kaggle-higgs/README.md
+++ b/demo/kaggle-higgs/README.md
@@ -14,7 +14,6 @@ make
 3. Run
 ./run.sh
 
-
-
-
+Speed
+=====
 speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
diff --git a/demo/multiclass_classification/README.md b/demo/multiclass_classification/README.md
new file mode 100644
index 000000000..72607de09
--- /dev/null
+++ b/demo/multiclass_classification/README.md
@@ -0,0 +1,10 @@
+Demonstrating how to use XGBoost to accomplish a multi-class classification task on the [UCI Dermatology dataset](https://archive.ics.uci.edu/ml/datasets/Dermatology)
+
+Make sure you build the xgboost python module in ../../python
+
+1. Run runexp.sh
+```bash
+./runexp.sh
+```
+
+Explanations can be found in the [wiki](https://github.com/tqchen/xgboost/wiki)
diff --git a/demo/multiclass_classification/runexp.sh b/demo/multiclass_classification/runexp.sh
new file mode 100755
index 000000000..0af814725
--- /dev/null
+++ b/demo/multiclass_classification/runexp.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+if [ -f dermatology.data ]
+then
+    echo "using existing data to run multi-class classification"
+else
+    echo "getting data from UCI, make sure you are connected to the internet"
+    wget https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data
+fi
+python train.py
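The README addition above points to demo/kaggle-higgs/speedtest.py for the sklearn comparison. As a rough sketch of what such a head-to-head timing looks like (synthetic data and modern-style parameter names assumed here; this is not the actual contents of speedtest.py):

```python
# Hedged timing sketch: random data stands in for the Kaggle Higgs CSV,
# and the number of rounds/trees is chosen arbitrarily for illustration.
import time
import numpy as np
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier

X = np.random.rand(10000, 30)                  # stand-in for the Higgs features
y = (np.random.rand(10000) > 0.5).astype(int)  # binary signal/background label

tic = time.time()
param = {'objective': 'binary:logistic', 'nthread': 4, 'max_depth': 6, 'eta': 0.1}
xgb.train(param, xgb.DMatrix(X, label=y), 100)  # 100 boosting rounds
print('xgboost:     %.1f s' % (time.time() - tic))

tic = time.time()
GradientBoostingClassifier(n_estimators=100, max_depth=6, learning_rate=0.1).fit(X, y)
print('sklearn GBM: %.1f s' % (time.time() - tic))
```

Both learners get the same number of trees, depth, and learning rate, so wall-clock time is the only variable being compared.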
diff --git a/demo/multiclass_classification/train.py b/demo/multiclass_classification/train.py
new file mode 100755
index 000000000..38d818890
--- /dev/null
+++ b/demo/multiclass_classification/train.py
@@ -0,0 +1,42 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../python/')
+import xgboost as xgb
+
+# labels need to be in 0 to num_class - 1
+data = np.loadtxt('./dermatology.data', delimiter=',', converters={33: lambda x: int(x == '?'), 34: lambda x: int(x) - 1})
+sz = data.shape
+
+train = data[:int(sz[0] * 0.7), :]
+test = data[int(sz[0] * 0.7):, :]
+
+train_X = train[:, 0:33]
+train_Y = train[:, 34]
+
+
+test_X = test[:, 0:33]
+test_Y = test[:, 34]
+
+xg_train = xgb.DMatrix(train_X, label=train_Y)
+xg_test = xgb.DMatrix(test_X, label=test_Y)
+# setup parameters for xgboost
+param = {}
+# use softmax multi-class classification
+param['objective'] = 'multi:softmax'
+# booster parameters: shrinkage and tree depth
+param['bst:eta'] = 0.1
+param['bst:max_depth'] = 6
+param['silent'] = 1
+param['nthread'] = 4
+param['num_class'] = 6
+
+watchlist = [(xg_train, 'train'), (xg_test, 'test')]
+num_round = 5
+bst = xgb.train(param, xg_train, num_round, watchlist)
+# get prediction
+pred = bst.predict(xg_test)
+
+print 'predicting, classification error=%f' % (sum(int(pred[i]) != test_Y[i] for i in xrange(len(test_Y))) / float(len(test_Y)))
+
+
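A note on the np.loadtxt call in train.py above: xgboost expects multi-class labels to be consecutive integers starting at 0, while dermatology.data stores the class in the last column as 1..6 and marks missing age values in column 33 with '?'. A minimal standalone illustration of the same two conversions (toy four-column rows, not the real dataset):

```python
# Toy rows: the third field plays the role of the age column (may be '?'),
# the last field is a 1-based class label as in dermatology.data.
rows = ['0.5,1,55,2',
        '0.1,0,?,6']

def parse(line):
    a, b, age, label = line.split(',')
    return [float(a), float(b),
            int(age == '?'),   # same trick as train.py: 1 flags a missing age
            int(label) - 1]    # shift labels 1..6 down to 0..5 for xgboost

print([parse(r) for r in rows])
# [[0.5, 1.0, 0, 1], [0.1, 0.0, 1, 5]]
```

Note also that train_X takes columns 0:33, so the converted column 33 (the missing-age flag) is excluded from the features; only columns 0..32 are fed to the booster.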
!strcmp("rank:pairwise", name ) ) return new PairwiseRankObj(); + if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj(); utils::Error("unknown objective function type"); return NULL; }