#!/usr/bin/python
"""Basic walkthrough of the xgboost Python wrapper.

The script:
  1. loads the agaricus train/test sets from libsvm-style text files,
  2. trains a small booster with a binary logistic objective,
  3. prints the test error, saves/dumps the model and the test DMatrix,
  4. reloads both and asserts the predictions are unchanged,
  5. rebuilds the training DMatrix from a scipy.sparse CSR matrix and from
     its dense form, training again on each.

All data and output paths are relative to the current working directory.
"""
import sys

import numpy as np
import scipy.sparse

# append the path to xgboost, you may need to change the following line
# alternatively, you can add the path to PYTHONPATH environment variable
sys.path.append('../../wrapper')
import xgboost as xgb

### simple example
# load data from text files; a binary buffer generated by xgboost also works
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')

# specify parameters via a map; definitions are the same as the C++ version
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}

# specify validation sets to watch performance during training
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)

# predict on the test set and report the fraction of misclassified rows,
# thresholding the predicted value at 0.5
preds = bst.predict(dtest)
labels = dtest.get_label()
num_wrong = sum(
    1 for pred, label in zip(preds, labels) if int(pred > 0.5) != label
)
print('error=%f' % (num_wrong / float(len(preds))))
bst.save_model('0001.model')

# dump model
bst.dump_model('dump.raw.txt')
# dump model with feature map
bst.dump_model('dump.nice.txt', '../data/featmap.txt')

# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
bst.save_model('xgb.model')

# load model and data back in
bst2 = xgb.Booster(model_file='xgb.model')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2)
# assert the reloaded model/data give exactly the same predictions
assert np.sum(np.abs(preds2 - preds)) == 0

###
# build dmatrix from scipy.sparse
print('start running example of build DMatrix from scipy.sparse')
labels = []
row = []
col = []
dat = []
# Each line is "<label> <col>:<value> <col>:<value> ...".
# Use a context manager so the file handle is closed once parsing is done.
with open('../data/agaricus.txt.train') as train_file:
    for row_idx, line in enumerate(train_file):
        fields = line.split()
        labels.append(int(fields[0]))
        for entry in fields[1:]:
            col_idx, value = entry.split(':')
            row.append(row_idx)
            col.append(int(col_idx))
            dat.append(float(value))

# shape is not given, so scipy infers it from the largest row/col index
csr = scipy.sparse.csr_matrix((dat, (row, col)))
dtrain = xgb.DMatrix(csr)
dtrain.set_label(labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)

print('start running example of build DMatrix from numpy array')
# NOTE: npymat is a dense numpy matrix (csr.todense() returns numpy.matrix);
# the wrapper converts it into scipy.sparse.csr_matrix internally, then
# converts that to DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix(npymat)
dtrain.set_label(labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)