Merge pull request #1 from tqchen/dev

2.0 version, lots of changes
Tianqi Chen 2014-05-16 20:53:19 -07:00
commit 495e37e0dc
10 changed files with 75 additions and 17 deletions

.gitignore

@@ -23,3 +23,4 @@ xgboost
 *group
 *rar
 *vali
+*data

README.md

@@ -15,6 +15,8 @@ Features
 - Sparse feature format allows easy handling of missing values, and improves computation efficiency.
 * Push the limit on single machine:
 - Efficient implementation that optimizes memory and computation.
+* Speed: XGBoost is very fast
+- In [demo/kaggle-higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), on the Kaggle Higgs data it is faster than sklearn.ensemble.GradientBoostingClassifier (on our machine, 20 times faster using 4 threads).
 * Layout of gradient boosting algorithm to support user-defined objectives (see the sketch after this list)
 * Python interface, works with numpy and scipy.sparse matrix
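A user-defined objective works by supplying the gradient and second-order gradient of a custom loss to the trainer. A minimal Python sketch, assuming the `obj=` keyword of the current `xgb.train` API (the exact Python signature at the time of this commit may have differed):

```python
# minimal sketch of a user-defined objective (squared error), assuming
# the modern obj= keyword of xgb.train; API details are not from this commit
import numpy as np
import xgboost as xgb

def squared_error(preds, dtrain):
    # gradient and hessian of 0.5 * (pred - label)^2, evaluated per example
    labels = dtrain.get_label()
    grad = preds - labels
    hess = np.ones_like(preds)
    return grad, hess

# random stand-in data, just to make the sketch runnable
X = np.random.rand(100, 5)
y = np.random.rand(100)
dtrain = xgb.DMatrix(X, label=y)
bst = xgb.train({'max_depth': 2}, dtrain, num_boost_round=10, obj=squared_error)
```

Because the booster only consumes gradient/hessian pairs, any twice-differentiable loss can be plugged in without touching the C++ core.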

demo/kaggle-higgs/README.md

@@ -14,7 +14,6 @@ make
3. Run ./run.sh
Speed
=====
speedtest.py compares xgboost's speed on this dataset with sklearn.GBM
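A rough sketch of the kind of head-to-head timing speedtest.py runs; the data here is a random stand-in and every parameter value is an assumption, not taken from the actual script:

```python
# hypothetical timing comparison in the spirit of speedtest.py
import time
import numpy as np
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier

# random stand-in for the Higgs data; the real script loads the Kaggle CSV
X = np.random.rand(10000, 30)
y = np.random.randint(0, 2, 10000)

start = time.time()
xgb.train({'objective': 'binary:logitraw', 'nthread': 4, 'max_depth': 6},
          xgb.DMatrix(X, label=y), num_boost_round=120)
print('xgboost: %.2f sec' % (time.time() - start))

start = time.time()
GradientBoostingClassifier(n_estimators=120, max_depth=6).fit(X, y)
print('sklearn: %.2f sec' % (time.time() - start))
```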

demo/multiclass_classification/README.md

@@ -0,0 +1,10 @@
Demonstrating how to use XGBoost to accomplish a multi-class classification task on the [UCI Dermatology dataset](https://archive.ics.uci.edu/ml/datasets/Dermatology).
Make sure you have built the xgboost python module in ../../python first.
1. Run runexp.sh
```bash
./runexp.sh
```
Explanations can be found in the [wiki](https://github.com/tqchen/xgboost/wiki).

demo/multiclass_classification/runexp.sh

@@ -0,0 +1,9 @@
#!/bin/bash
if [ -f dermatology.data ]
then
echo "use existing data to run multi class classification"
else
echo "getting data from uci, make sure you are connected to internet"
wget https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data
fi
python train.py

demo/multiclass_classification/train.py

@@ -0,0 +1,42 @@
#! /usr/bin/python
import sys
import numpy as np
sys.path.append('../../python/')
import xgboost as xgb
# labels must be integers in [0, num_class); the dermatology classes are 1-6, so shift them to 0-5
# column 33 (age) contains '?' for missing values; the converter only keeps loadtxt from failing on it
data = np.loadtxt('./dermatology.data', delimiter=',', converters={33: lambda x: int(x == '?'), 34: lambda x: int(x) - 1})
sz = data.shape
# 70% / 30% train-test split
train = data[:int(sz[0] * 0.7), :]
test = data[int(sz[0] * 0.7):, :]
# columns 0-32 are the features (column 33 is excluded); column 34 is the label
train_X = train[:, 0:33]
train_Y = train[:, 34]
test_X = test[:, 0:33]
test_Y = test[:, 34]
xg_train = xgb.DMatrix(train_X, label=train_Y)
xg_test = xgb.DMatrix(test_X, label=test_Y)
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# step size shrinkage and maximum tree depth
param['bst:eta'] = 0.1
param['bst:max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 6
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 5
bst = xgb.train(param, xg_train, num_round, watchlist)
# get prediction
pred = bst.predict(xg_test)
print('predicting, classification error=%f' % (sum(int(pred[i]) != test_Y[i] for i in xrange(len(test_Y))) / float(len(test_Y))))
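Since `multi:softmax` makes `predict` return a single class index per row, the loop above simply counts mismatches. A vectorized equivalent using numpy (not part of the original demo):

```python
import numpy as np

# fraction of test rows whose predicted class index differs from the label
err = np.mean(pred.astype(int) != test_Y)
print('classification error=%f' % err)
```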

demo/rank/mq2008.conf

@@ -2,7 +2,7 @@
 # choose the tree booster, 0: tree, 1: linear
 booster_type = 0
-# so far, we have pairwise rank
+# specify objective
 objective="rank:pairwise"
 # Tree Booster Parameters
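For reference, the same ranking setup driven from Python instead of the CLI config, assuming the current `DMatrix.set_group` API and `?format=libsvm` loading syntax (this commit's demo uses the ../../xgboost binary, not this code):

```python
import xgboost as xgb

# libsvm-style features produced by trans_data.py (see wgetdata.sh below)
dtrain = xgb.DMatrix('mq2008.train?format=libsvm')
# one entry per query: how many consecutive rows belong to it
dtrain.set_group([int(n) for n in open('mq2008.train.group')])

param = {'objective': 'rank:pairwise', 'eta': 0.1}
bst = xgb.train(param, dtrain, 4)
```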

demo/rank/wgetdata.sh

@@ -1,14 +1,8 @@
 #Download the dataset from web site
 wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar
 #please first install the unrar package
 unrar x MQ2008
-python trans_data.py train.txt mq2008.train mq2008.train.group
-python trans_data.py test.txt mq2008.test mq2008.test.group
-python trans_data.py vali.txt mq2008.vali mq2008.vali.group
+python trans_data.py MQ2008/Fold1/train.txt mq2008.train mq2008.train.group
+python trans_data.py MQ2008/Fold1/test.txt mq2008.test mq2008.test.group
+python trans_data.py MQ2008/Fold1/vali.txt mq2008.vali mq2008.vali.group
 ../../xgboost mq2008.conf
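trans_data.py itself is not included in this diff. As orientation only, a hypothetical sketch of the conversion it performs: LETOR lines of the form `label qid:<id> <feat>:<val> ... #comment` become a libsvm-style feature file plus a .group file listing the number of consecutive documents per query. Every name and detail below is an assumption, not the real script:

```python
# hypothetical stand-in for trans_data.py: LETOR -> libsvm features + group file
import sys

def trans(in_path, feat_path, group_path):
    group_sizes = []                            # consecutive document count per query
    last_qid = None
    with open(in_path) as fin, open(feat_path, 'w') as ffeat:
        for line in fin:
            fields = line.split('#')[0].split() # strip trailing LETOR comment
            if not fields:
                continue
            label, qid = fields[0], fields[1].split(':')[1]
            if qid != last_qid:
                group_sizes.append(0)
                last_qid = qid
            group_sizes[-1] += 1
            ffeat.write(' '.join([label] + fields[2:]) + '\n')
    with open(group_path, 'w') as fgroup:
        fgroup.writelines('%d\n' % n for n in group_sizes)

if __name__ == '__main__':
    trans(sys.argv[1], sys.argv[2], sys.argv[3])
```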


@@ -97,8 +97,8 @@ namespace xgboost{
         */
        inline void InitTrainer(void){
            if( mparam.num_class != 0 ){
-               if( name_obj_ != "softmax" ){
-                   name_obj_ = "softmax";
+               if( name_obj_ != "multi:softmax" ){
+                   name_obj_ = "multi:softmax";
                    printf("auto select objective=softmax to support multi-class classification\n" );
                }
            }
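The practical effect of this hunk: setting `num_class` is enough to get multi-class training even if the objective is left unset or set to something else. Illustrative usage, reusing `xg_train`, `num_round`, and `watchlist` from the train.py demo above:

```python
# 'objective' deliberately omitted: because num_class != 0, InitTrainer
# overrides whatever objective is configured with multi:softmax
param = {'num_class': 6, 'bst:eta': 0.1, 'silent': 1}
bst = xgb.train(param, xg_train, num_round, watchlist)
```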


@@ -113,9 +113,10 @@ namespace xgboost{
            if( !strcmp("reg:logistic", name ) ) return new RegressionObj( LossType::kLogisticNeglik );
            if( !strcmp("binary:logistic", name ) ) return new RegressionObj( LossType::kLogisticClassify );
            if( !strcmp("binary:logitraw", name ) ) return new RegressionObj( LossType::kLogisticRaw );
+           if( !strcmp("multi:softmax", name ) ) return new SoftmaxMultiClassObj();
            if( !strcmp("rank:pairwise", name ) ) return new PairwiseRankObj();
            if( !strcmp("rank:softmax", name ) ) return new SoftmaxRankObj();
            utils::Error("unknown objective function type");
            return NULL;
        }