diff --git a/demo/data/README.md b/demo/data/README.md
new file mode 100644
index 000000000..d2d63ec11
--- /dev/null
+++ b/demo/data/README.md
@@ -0,0 +1,2 @@
+This folder contains the processed example dataset used by the demos.
+Copyright of the dataset belongs to the original copyright holder.
diff --git a/wrapper/python-example/agaricus.txt.test b/demo/data/agaricus.txt.test
similarity index 100%
rename from wrapper/python-example/agaricus.txt.test
rename to demo/data/agaricus.txt.test
diff --git a/wrapper/python-example/agaricus.txt.train b/demo/data/agaricus.txt.train
similarity index 100%
rename from wrapper/python-example/agaricus.txt.train
rename to demo/data/agaricus.txt.train
diff --git a/wrapper/python-example/featmap.txt b/demo/data/featmap.txt
similarity index 100%
rename from wrapper/python-example/featmap.txt
rename to demo/data/featmap.txt
diff --git a/demo/guide-python/README.md b/demo/guide-python/README.md
new file mode 100644
index 000000000..7eaec6155
--- /dev/null
+++ b/demo/guide-python/README.md
@@ -0,0 +1,6 @@
+XGBoost Python Feature Walkthrough
+====
+* [Basic walkthrough of wrappers](basic_walkthrough.py)
+* [Customize loss function and evaluation metric](custom_objective.py)
+* [Boosting from existing prediction](boost_from_prediction.py)
+* [Predicting using first n trees](predict_first_ntree.py)
diff --git a/demo/guide-python/basic_walkthrough.py b/demo/guide-python/basic_walkthrough.py
new file mode 100755
index 000000000..f542954ce
--- /dev/null
+++ b/demo/guide-python/basic_walkthrough.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+import scipy.sparse
+# append the path to xgboost; you may need to change the following line
+# alternatively, you can add the path to the PYTHONPATH environment variable
+sys.path.append('../../wrapper')
+import xgboost as xgb
+
+### simple example
+# load data from a text file; binary buffers generated by xgboost can also be loaded
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+dtest = xgb.DMatrix('../data/agaricus.txt.test')
+
+# specify parameters via map; definitions are the same as the C++ version
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
+
+# specify validation sets to watch performance
+watchlist = [(dtest,'eval'), (dtrain,'train')]
+num_round = 2
+bst = xgb.train(param, dtrain, num_round, watchlist)
+
+# make prediction
+preds = bst.predict(dtest)
+labels = dtest.get_label()
+print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
+bst.save_model('0001.model')
+# dump model
+bst.dump_model('dump.raw.txt')
+# dump model with feature map
+bst.dump_model('dump.nice.txt','../data/featmap.txt')
+
+# save dmatrix into binary buffer
+dtest.save_binary('dtest.buffer')
+bst.save_model('xgb.model')
+# load model and data back in
+bst2 = xgb.Booster(model_file='xgb.model')
+dtest2 = xgb.DMatrix('dtest.buffer')
+preds2 = bst2.predict(dtest2)
+# assert they are the same
+assert np.sum(np.abs(preds2-preds)) == 0
+
+###
+# build dmatrix from scipy.sparse
+print ('start running example of building DMatrix from scipy.sparse')
+labels = []
+row = []; col = []; dat = []
+i = 0
+for l in open('../data/agaricus.txt.train'):
+    arr = l.split()
+    labels.append( int(arr[0]))
+    for it in arr[1:]:
+        k,v = it.split(':')
+        row.append(i); col.append(int(k)); dat.append(float(v))
+    i += 1
+csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
+dtrain = xgb.DMatrix( csr )
+dtrain.set_label(labels)
+watchlist = [(dtest,'eval'), (dtrain,'train')]
+bst = xgb.train( param, dtrain, num_round, watchlist )
+
+print ('start running example of building DMatrix from numpy array')
+# NOTE: npymat is a numpy array; internally it is converted to scipy.sparse.csr_matrix and then to DMatrix
+npymat = csr.todense()
+dtrain = xgb.DMatrix( npymat)
+dtrain.set_label(labels)
+watchlist = [(dtest,'eval'), (dtrain,'train')]
+bst = xgb.train( param, dtrain, num_round, watchlist )
+
+
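In `basic_walkthrough.py` above, the CSR matrix is built by parsing the libsvm file by hand. An alternative sketch, assuming scikit-learn is installed (it is not a dependency of these demos): `sklearn.datasets.load_svmlight_file` returns the same kind of scipy.sparse CSR matrix, which `xgb.DMatrix` accepts directly. Note that its auto-detection of zero-/one-based feature indices may shift column ids by one relative to the manual loop.

```python
import sys
sys.path.append('../../wrapper')    # same relative path used by the demos

from sklearn.datasets import load_svmlight_file  # assumes scikit-learn is installed
import xgboost as xgb

# X is a scipy.sparse CSR matrix, y is a numpy array of labels
X, y = load_svmlight_file('../data/agaricus.txt.train')
dtrain = xgb.DMatrix(X)
dtrain.set_label(y)
```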
diff --git a/demo/guide-python/boost_from_prediction.py b/demo/guide-python/boost_from_prediction.py
new file mode 100755
index 000000000..0aa2e56ab
--- /dev/null
+++ b/demo/guide-python/boost_from_prediction.py
@@ -0,0 +1,26 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+dtest = xgb.DMatrix('../data/agaricus.txt.test')
+watchlist = [(dtest,'eval'), (dtrain,'train')]
+###
+# advanced: start from an initial base prediction
+#
+print ('start running example to start from an initial prediction')
+# specify parameters via map; definitions are the same as the C++ version
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
+# train xgboost for 1 round
+bst = xgb.train( param, dtrain, 1, watchlist )
+# Note: set_base_margin needs the margin value instead of the transformed prediction
+# predicting with output_margin=True always gives you margin values before the logistic transformation
+ptrain = bst.predict(dtrain, output_margin=True)
+ptest = bst.predict(dtest, output_margin=True)
+dtrain.set_base_margin(ptrain)
+dtest.set_base_margin(ptest)
+
+print ('this is the result of running from the initial prediction')
+bst = xgb.train( param, dtrain, 1, watchlist )
diff --git a/demo/guide-python/custom_objective.py b/demo/guide-python/custom_objective.py
new file mode 100755
index 000000000..5a7f110f4
--- /dev/null
+++ b/demo/guide-python/custom_objective.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+###
+# advanced: customized loss function
+#
+print ('start running example to use customized objective function')
+
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+dtest = xgb.DMatrix('../data/agaricus.txt.test')
+
+# note: for a customized objective function, we leave the objective as default
+# note: what we get from prediction is the margin value
+# you must know what you are doing
+param = {'max_depth':2, 'eta':1, 'silent':1 }
+watchlist = [(dtest,'eval'), (dtrain,'train')]
+num_round = 2
+
+# user-defined objective function: given prediction, return gradient and second order gradient
+# this is the log-likelihood loss
+def logregobj(preds, dtrain):
+    labels = dtrain.get_label()
+    preds = 1.0 / (1.0 + np.exp(-preds))
+    grad = preds - labels
+    hess = preds * (1.0-preds)
+    return grad, hess
+
+# user-defined evaluation function; returns a pair (metric_name, result)
+# NOTE: when you use a customized loss function, the default prediction value is the margin
+# this may make the built-in evaluation metric not function properly
+# for example, with logistic loss the prediction is the score before the logistic transformation
+# while the built-in evaluation error assumes the input is after the logistic transformation
+# keep this in mind when you use the customization; you may need to write a customized evaluation function
+def evalerror(preds, dtrain):
+    labels = dtrain.get_label()
+    # return a pair (metric_name, result)
+    # since preds are margins (before the logistic transformation, cutoff at 0)
+    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+
+# training with a customized objective; we can also do step-by-step training
+# simply look at xgboost.py's implementation of train
+bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
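The comments in `custom_objective.py` state `grad = preds - labels` and `hess = preds * (1 - preds)` for the logistic loss without deriving them. As a quick sanity check, here is a minimal numpy-only sketch (not part of any demo script; the margin value 0.3 and label 1.0 are arbitrary test points) comparing those formulas against finite differences of the negative log-likelihood:

```python
import numpy as np

def logistic_loss(margin, label):
    # negative log-likelihood of the label given the raw margin
    p = 1.0 / (1.0 + np.exp(-margin))
    return -(label * np.log(p) + (1.0 - label) * np.log(1.0 - p))

margin, label, eps = 0.3, 1.0, 1e-4   # arbitrary test point
p = 1.0 / (1.0 + np.exp(-margin))
grad = p - label                      # formula used in logregobj
hess = p * (1.0 - p)                  # formula used in logregobj

# central finite differences of the loss with respect to the margin
num_grad = (logistic_loss(margin + eps, label) - logistic_loss(margin - eps, label)) / (2 * eps)
num_hess = (logistic_loss(margin + eps, label) - 2 * logistic_loss(margin, label)
            + logistic_loss(margin - eps, label)) / eps ** 2

assert abs(grad - num_grad) < 1e-6
assert abs(hess - num_hess) < 1e-4
```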
diff --git a/demo/guide-python/predict_first_ntree.py b/demo/guide-python/predict_first_ntree.py
new file mode 100755
index 000000000..03f327e7f
--- /dev/null
+++ b/demo/guide-python/predict_first_ntree.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+
+### load data and do training
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+dtest = xgb.DMatrix('../data/agaricus.txt.test')
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
+watchlist = [(dtest,'eval'), (dtrain,'train')]
+num_round = 3
+bst = xgb.train(param, dtrain, num_round, watchlist)
+
+print ('start testing prediction from first n trees')
+### predict using only the first tree
+label = dtest.get_label()
+ypred1 = bst.predict(dtest, ntree_limit=1)
+# by default, we predict using all the trees
+ypred2 = bst.predict(dtest)
+print ('error of ypred1=%f' % (np.sum((ypred1>0.5)!=label) /float(len(label))))
+print ('error of ypred2=%f' % (np.sum((ypred2>0.5)!=label) /float(len(label))))
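`predict_first_ntree.py` compares `ntree_limit=1` against the full model; a natural extension is to sweep `ntree_limit` over every boosting round and watch the error change as trees are added. A minimal sketch of that sweep, reusing the same data paths and parameters as the demo (the loop itself is illustrative and not part of the demo scripts):

```python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
num_round = 3
bst = xgb.train(param, dtrain, num_round, [(dtest,'eval'), (dtrain,'train')])
label = dtest.get_label()

# sweep ntree_limit: predict with only the first k trees and report the error
for k in range(1, num_round + 1):
    ypred = bst.predict(dtest, ntree_limit=k)
    err = np.sum((ypred > 0.5) != label) / float(len(label))
    print ('error using first %d tree(s)=%f' % (k, err))
```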
diff --git a/demo/guide-python/runall.sh b/demo/guide-python/runall.sh
new file mode 100755
index 000000000..6b37c68ca
--- /dev/null
+++ b/demo/guide-python/runall.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+python basic_walkthrough.py
+python custom_objective.py
+python boost_from_prediction.py
+python predict_first_ntree.py
+rm *~ *.model *.buffer
\ No newline at end of file
diff --git a/wrapper/README.md b/wrapper/README.md
index 3f43fa629..e736b9b6a 100644
--- a/wrapper/README.md
+++ b/wrapper/README.md
@@ -2,11 +2,10 @@ Wrapper of XGBoost
 =====
 This folder provides wrapper of xgboost to other languages
-
 Python
 =====
 * To make the python module, type ```make``` in the root directory of project
-* Refer to the walk through example in [python-example/demo.py](python-example/demo.py)
+* Refer also to the walkthrough examples in the [demo folder](../demo/guide-python)
 
 R
 =====
 
diff --git a/wrapper/python-example/README.md b/wrapper/python-example/README.md
deleted file mode 100644
index be5350dd2..000000000
--- a/wrapper/python-example/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-example to use python xgboost, the data is generated from demo/binary_classification, in libsvm format
-
-for usage: see demo.py and comments in demo.py
diff --git a/wrapper/python-example/demo.py b/wrapper/python-example/demo.py
deleted file mode 100755
index 687b491a4..000000000
--- a/wrapper/python-example/demo.py
+++ /dev/null
@@ -1,121 +0,0 @@
-#!/usr/bin/python
-import sys
-import numpy as np
-import scipy.sparse
-# append the path to xgboost, you may need to change the following line
-# alternatively, you can add the path to PYTHONPATH environment variable
-sys.path.append('../')
-import xgboost as xgb
-
-### simple example
-# load file from text file, also binary buffer generated by xgboost
-dtrain = xgb.DMatrix('agaricus.txt.train')
-dtest = xgb.DMatrix('agaricus.txt.test')
-
-# specify parameters via map, definition are same as c++ version
-param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
-
-# specify validations set to watch performance
-evallist = [(dtest,'eval'), (dtrain,'train')]
-num_round = 2
-bst = xgb.train(param, dtrain, num_round, evallist)
-
-# this is prediction
-preds = bst.predict(dtest)
-labels = dtest.get_label()
-print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
-bst.save_model('0001.model')
-# dump model
-bst.dump_model('dump.raw.txt')
-# dump model with feature map
-bst.dump_model('dump.nice.txt','featmap.txt')
-
-# save dmatrix into binary buffer
-dtest.save_binary('dtest.buffer')
-bst.save_model('xgb.model')
-# load model and data in
-bst2 = xgb.Booster(model_file='xgb.model')
-dtest2 = xgb.DMatrix('dtest.buffer')
-preds2 = bst2.predict(dtest2)
-# assert they are the same
-assert np.sum(np.abs(preds2-preds)) == 0
-
-###
-# build dmatrix from scipy.sparse
-print ('start running example of build DMatrix from scipy.sparse')
-labels = []
-row = []; col = []; dat = []
-i = 0
-for l in open('agaricus.txt.train'):
-    arr = l.split()
-    labels.append( int(arr[0]))
-    for it in arr[1:]:
-        k,v = it.split(':')
-        row.append(i); col.append(int(k)); dat.append(float(v))
-    i += 1
-csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
-dtrain = xgb.DMatrix( csr )
-dtrain.set_label(labels)
-evallist = [(dtest,'eval'), (dtrain,'train')]
-bst = xgb.train( param, dtrain, num_round, evallist )
-
-print ('start running example of build DMatrix from numpy array')
-# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix
-npymat = csr.todense()
-dtrain = xgb.DMatrix( npymat)
-dtrain.set_label(labels)
-evallist = [(dtest,'eval'), (dtrain,'train')]
-bst = xgb.train( param, dtrain, num_round, evallist )
-
-###
-# advanced: cutomsized loss function
-#
-print ('start running example to used cutomized objective function')
-
-# note: for customized objective function, we leave objective as default
-# note: what we are getting is margin value in prediction
-# you must know what you are doing
-param = {'max_depth':2, 'eta':1, 'silent':1 }
-
-# user define objective function, given prediction, return gradient and second order gradient
-# this is loglikelihood loss
-def logregobj(preds, dtrain):
-    labels = dtrain.get_label()
-    preds = 1.0 / (1.0 + np.exp(-preds))
-    grad = preds - labels
-    hess = preds * (1.0-preds)
-    return grad, hess
-
-# user defined evaluation function, return a pair metric_name, result
-# NOTE: when you do customized loss function, the default prediction value is margin
-# this may make buildin evalution metric not function properly
-# for example, we are doing logistic loss, the prediction is score before logistic transformation
-# the buildin evaluation error assumes input is after logistic transformation
-# Take this in mind when you use the customization, and maybe you need write customized evaluation function
-def evalerror(preds, dtrain):
-    labels = dtrain.get_label()
-    # return a pair metric_name, result
-    # since preds are margin(before logistic transformation, cutoff at 0)
-    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
-
-# training with customized objective, we can also do step by step training
-# simply look at xgboost.py's implementation of train
-bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror)
-
-###
-# advanced: start from a initial base prediction
-#
-print ('start running example to start from a initial prediction')
-# specify parameters via map, definition are same as c++ version
-param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
-# train xgboost for 1 round
-bst = xgb.train( param, dtrain, 1, evallist )
-# Note: we need the margin value instead of transformed prediction in set_base_margin
-# do predict with output_margin=True, will always give you margin values before logistic transformation
-ptrain = bst.predict(dtrain, output_margin=True)
-ptest = bst.predict(dtest, output_margin=True)
-dtrain.set_base_margin(ptrain)
-dtest.set_base_margin(ptest)
-
-print ('this is result of running from initial prediction')
-bst = xgb.train( param, dtrain, 1, evallist )