add python interface for xgboost

This commit is contained in:
tqchen 2014-05-03 23:04:02 -07:00
parent adc9400736
commit 6fd77cbb24
10 changed files with 8348 additions and 9 deletions

1
.gitignore vendored
View File

@ -17,3 +17,4 @@
*buffer *buffer
*model *model
xgboost xgboost
*pyc

View File

@ -1,2 +1,4 @@
beta version:
python wrapper for xgboost using ctypes python wrapper for xgboost using ctypes
see example for usage

3
python/example/README.md Normal file
View File

@ -0,0 +1,3 @@
Example of using the xgboost python wrapper. The data is generated from demo/binary_classification and is in libsvm format.
For usage, see demo.py and the comments inside it.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

74
python/example/demo.py Executable file
View File

@ -0,0 +1,74 @@
#!/usr/bin/python
"""Demo of the xgboost python wrapper.

Walks through four ways of obtaining a DMatrix and training on it:
  1. load it directly from a libsvm text file,
  2. build it row by row with DMatrix.add_row,
  3. build it from a scipy.sparse CSR matrix,
  4. build it from a dense matrix obtained via csr.todense().

All paths ('../', 'agaricus.txt.train', 'agaricus.txt.test', '0001.model')
are relative to the current working directory.
"""
import sys
import scipy.sparse
# append the path to xgboost
sys.path.append('../')
import xgboost as xgb

### simple example
# load file from text file, also binary buffer generated by xgboost
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')
# specify parameters via map, definitions are the same as in the c++ version
param = {'bst:max_depth':4, 'bst:eta':1, 'silent':1, 'loss_type':2 }
# specify validation sets to watch performance
evallist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
bst = xgb.train( param, dtrain, num_round, evallist )
# this is prediction
preds = bst.predict( dtest )
labels = dtest.get_label()
# error rate: fraction of rows whose prediction, thresholded at 0.5,
# disagrees with the label
num_wrong = sum(1 for p, y in zip(preds, labels) if int(p > 0.5) != y)
print('error=%f' % (num_wrong / float(len(preds))))
bst.save_model('0001.model')

###
# build dmatrix in python iteratively
#
print('start running example of build DMatrix in python')
dtrain = xgb.DMatrix()
labels = []
# each line is "<label> <index>:<value> <index>:<value> ..."
with open('agaricus.txt.train') as fin:
    for line in fin:
        arr = line.split()
        labels.append( int(arr[0]))
        feats = []
        for it in arr[1:]:
            k,v = it.split(':')
            feats.append( (int(k), float(v)) )
        dtrain.add_row( feats )
dtrain.set_label( labels )
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )

###
# build dmatrix from scipy.sparse
print('start running example of build DMatrix from scipy.sparse')
labels = []
# coordinate triplets: dat[n] is the value stored at (row[n], col[n])
row = []; col = []; dat = []
with open('agaricus.txt.train') as fin:
    # i is the zero-based row index of the current line
    for i, line in enumerate(fin):
        arr = line.split()
        labels.append( int(arr[0]))
        for it in arr[1:]:
            k,v = it.split(':')
            row.append(i); col.append(int(k)); dat.append(float(v))
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix( csr )
dtrain.set_label(labels)
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )

print('start running example of build DMatrix from numpy array')
# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix( npymat )
dtrain.set_label(labels)
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )

126
python/example/featmap.txt Normal file
View File

@ -0,0 +1,126 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i

View File

@ -1,10 +1,12 @@
# module for xgboost # module for xgboost
import ctypes import ctypes
import os
# optinally have scipy sparse, though not necessary # optinally have scipy sparse, though not necessary
import numpy as np import numpy as np
import scipy.sparse as scp import scipy.sparse as scp
# set this line correctly # set this line correctly
XGBOOST_PATH = './libxgboostpy.so' XGBOOST_PATH = os.path.dirname(__file__)+'/libxgboostpy.so'
# entry type of sparse matrix # entry type of sparse matrix
class REntry(ctypes.Structure): class REntry(ctypes.Structure):
@ -34,9 +36,9 @@ class DMatrix:
else: else:
try: try:
csr = scp.csr_matrix(data) csr = scp.csr_matrix(data)
self.__init_from_csr(data) self.__init_from_csr(csr)
except: except:
raise "DMatrix", "can not intialize DMatrix from"+type(data) raise Exception, "can not intialize DMatrix from"+str(type(data))
if label != None: if label != None:
self.set_label(label) self.set_label(label)

View File

@ -32,6 +32,7 @@ namespace xgboost{
mat.row_data_.resize( mat.row_ptr_.back() + len ); mat.row_data_.resize( mat.row_ptr_.back() + len );
memcpy( &mat.row_data_[mat.row_ptr_.back()], data, sizeof(XGEntry)*len ); memcpy( &mat.row_data_[mat.row_ptr_.back()], data, sizeof(XGEntry)*len );
mat.row_ptr_.push_back( mat.row_ptr_.back() + len ); mat.row_ptr_.push_back( mat.row_ptr_.back() + len );
init_col_ = false;
} }
inline const XGEntry* GetRow(unsigned ridx, size_t* len) const{ inline const XGEntry* GetRow(unsigned ridx, size_t* len) const{
const xgboost::booster::FMatrixS &mat = this->data; const xgboost::booster::FMatrixS &mat = this->data;
@ -72,7 +73,7 @@ namespace xgboost{
return &(this->info.labels[0]); return &(this->info.labels[0]);
} }
inline void CheckInit(void){ inline void CheckInit(void){
if(!this->data.HaveColAccess()){ if(!init_col_){
this->data.InitData(); this->data.InitData();
} }
utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix"); utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix");
@ -163,7 +164,9 @@ extern "C"{
void *XGBoosterCreate( void *dmats[], size_t len ){ void *XGBoosterCreate( void *dmats[], size_t len ){
std::vector<const xgboost::regrank::DMatrix*> mats; std::vector<const xgboost::regrank::DMatrix*> mats;
for( size_t i = 0; i < len; ++i ){ for( size_t i = 0; i < len; ++i ){
mats.push_back( static_cast<DMatrix*>(dmats[i]) ); DMatrix *dtr = static_cast<DMatrix*>(dmats[i]);
dtr->CheckInit();
mats.push_back( dtr );
} }
return new Booster( mats ); return new Booster( mats );
} }

View File

@ -112,9 +112,11 @@ namespace xgboost{
unsigned ngptr; unsigned ngptr;
if( fs.Read(&ngptr, sizeof(unsigned) ) != 0 ){ if( fs.Read(&ngptr, sizeof(unsigned) ) != 0 ){
info.group_ptr.resize( ngptr ); info.group_ptr.resize( ngptr );
if( ngptr != 0 ){
utils::Assert( fs.Read(&info.group_ptr[0], sizeof(unsigned) * ngptr) != 0, "Load group file"); utils::Assert( fs.Read(&info.group_ptr[0], sizeof(unsigned) * ngptr) != 0, "Load group file");
} }
} }
}
fs.Close(); fs.Close();
if (!silent){ if (!silent){
@ -143,8 +145,10 @@ namespace xgboost{
{// write out group ptr {// write out group ptr
unsigned ngptr = static_cast<unsigned>( info.group_ptr.size() ); unsigned ngptr = static_cast<unsigned>( info.group_ptr.size() );
fs.Write(&ngptr, sizeof(unsigned) ); fs.Write(&ngptr, sizeof(unsigned) );
if( ngptr != 0 ){
fs.Write(&info.group_ptr[0], sizeof(unsigned) * ngptr); fs.Write(&info.group_ptr[0], sizeof(unsigned) * ngptr);
} }
}
fs.Close(); fs.Close();
if (!silent){ if (!silent){
printf("%ux%u matrix with %lu entries is saved to %s\n", printf("%ux%u matrix with %lu entries is saved to %s\n",