138 lines
3.8 KiB
Python
138 lines
3.8 KiB
Python
#pylint: skip-file
|
|
import numpy as np
|
|
import xgboost as xgb
|
|
import os
|
|
import pandas as pd
|
|
import urllib2
|
|
|
|
class bcolors:
    """ANSI escape sequences used to colour and style terminal output.

    Concatenate one of the style codes in front of a message and append
    ``ENDC`` afterwards so the style does not leak into later output.
    """

    # Bright foreground colours (SGR codes 91-95).
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour/attribute set by the codes above.
    ENDC = '\033[0m'
|
|
|
|
|
|
def get_last_eval_callback(result):
    """Create a callback that collects the last evaluation value per call.

    Args:
        result: A list that is extended in place. Every time the returned
            callback is invoked, element ``[1]`` of the final entry of
            ``env.evaluation_result_list`` is appended to it.

    Returns:
        A one-argument callable (taking the callback environment) whose
        ``after_iteration`` attribute is set to ``True``.
    """
    def record_last_eval(env):
        # Each entry is indexable; position 1 holds the value that is kept.
        last_entry = env.evaluation_result_list[-1]
        result.append(last_entry[1])

    # Presumably read by xgboost to schedule the callback after each
    # boosting iteration rather than before it -- confirm against the
    # xgboost version in use.
    record_last_eval.after_iteration = True
    return record_last_eval
|
|
|
|
|
|
def load_adult():
    """Load the UCI Adult data set as an ``xgb.DMatrix``.

    The CSV is read from ``../../demo/data/adult.data``. When that file does
    not exist it is first downloaded from the UCI repository and stored at
    that path. Rows containing the ``' ?'`` missing-value marker are dropped,
    every string column is replaced by integer category codes, and the
    ``wage_class`` column is split off as the label.

    Returns:
        xgb.DMatrix: The encoded feature columns with the encoded
        ``wage_class`` column as label.
    """
    # Local import keeps this block self-contained (works on Python 2 and 3).
    from contextlib import closing

    path = "../../demo/data/adult.data"

    if not os.path.isfile(path):
        url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
        # closing() releases the HTTP response even when reading fails.
        with closing(urllib2.urlopen(url)) as response:
            payload = response.read()
        # The file is only created once the download has fully succeeded, so
        # a failed transfer cannot leave an empty file behind that later runs
        # would mistake for a valid cached copy.
        with open(path, 'wb') as output:
            output.write(payload)

    train_set = pd.read_csv(path, header=None)

    train_set.columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
        'wage_class',
    ]

    # ' ?' (with the leading space) is the missing-value marker in this CSV.
    train_nomissing = train_set.replace(' ?', np.nan).dropna()

    for feature in train_nomissing.columns:
        # Only string (object dtype) columns need encoding.
        if train_nomissing[feature].dtype == 'object':
            # Replace each distinct string with its integer category code.
            train_nomissing[feature] = pd.Categorical(train_nomissing[feature]).codes

    # pop() removes the label column so only features remain in the frame.
    y_train = train_nomissing.pop('wage_class')

    return xgb.DMatrix(train_nomissing, label=y_train)
|
|
|
|
|
|
def load_higgs():
    """Load the Higgs training CSV as a weighted ``xgb.DMatrix``.

    Reads ``../../demo/data/training.csv``, skipping its header row.
    Column 0 is not used, columns 1-30 are the features, column 31 is the
    per-row weight and column 32 is a textual label that is converted to
    ``1`` for ``s`` and ``0`` for anything else. The value ``-999.0`` is
    declared as the missing-value marker.

    Returns:
        xgb.DMatrix: Features, labels and instance weights.
    """
    higgs_path = '../../demo/data/training.csv'

    def label_to_int(token):
        # np.loadtxt hands converters either bytes or str depending on the
        # NumPy version / ``encoding`` setting. Comparing only against
        # b's' would silently turn every label into 0 when str is passed.
        if isinstance(token, bytes):
            token = token.decode('utf-8')
        return int(token == 's')

    dtrain = np.loadtxt(higgs_path, delimiter=',', skiprows=1,
                        converters={32: label_to_int})

    # To work on a subset only: dtrain = dtrain[0:200000, :]
    label = dtrain[:, 32]
    data = dtrain[:, 1:31]
    weight = dtrain[:, 31]

    return xgb.DMatrix(data, label=label, missing=-999.0, weight=weight)
|
|
|
|
def load_dermatology():
    """Load the dermatology data set as an ``xgb.DMatrix``.

    Reads ``../../demo/data/dermatology.data`` (comma separated, no header).
    Columns 0-32 are used as features and column 34 as the label, shifted
    down by one (presumably so that class ids start at 0 -- confirm against
    the data file).

    Returns:
        xgb.DMatrix: Feature matrix with the shifted class labels.
    """
    def missing_flag(token):
        # Column 33 may hold a '?' placeholder that cannot be parsed as a
        # number, so it is mapped to a 0/1 flag purely to let the parse
        # succeed; the column is not part of the features below. Accept both
        # bytes and str because np.loadtxt may pass either to converters.
        if isinstance(token, bytes):
            token = token.decode('utf-8')
        return int(token == '?')

    data = np.loadtxt('../../demo/data/dermatology.data', delimiter=',',
                      converters={33: missing_flag, 34: lambda x: int(x) - 1})

    X = data[:, 0:33]
    Y = data[:, 34]

    return xgb.DMatrix(X, label=Y)
|
|
|
|
def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    """Return True when ``a`` and ``b`` are approximately equal.

    The values count as close when their absolute difference does not exceed
    the larger of ``rel_tol`` times the bigger magnitude and ``abs_tol``.

    Args:
        a: First value.
        b: Second value.
        rel_tol: Allowed difference relative to the larger magnitude.
        abs_tol: Minimum absolute difference that is always tolerated.
    """
    difference = abs(a - b)
    larger_magnitude = max(abs(a), abs(b))
    allowed = max(rel_tol * larger_magnitude, abs_tol)
    return difference <= allowed
|
|
|
|
#Check GPU test evaluation is approximately equal to CPU test evaluation
|
|
def check_result(cpu_result, gpu_result):
    """Check that GPU evaluation values approximately match the CPU ones.

    Values are compared pairwise with ``isclose`` using a relative tolerance
    of 0.1 and an absolute tolerance of 0.02.

    Args:
        cpu_result: Sequence of evaluation values from the CPU run.
        gpu_result: Sequence of evaluation values from the GPU run.

    Returns:
        bool: True when both sequences have the same length and every pair
        is close. Two empty sequences compare as True. Sequences of
        different length are never considered matching.
    """
    # A length mismatch means the runs are not comparable; report a failure
    # instead of raising IndexError or silently ignoring surplus entries.
    if len(cpu_result) != len(gpu_result):
        return False

    return all(isclose(cpu_value, gpu_value, 0.1, 0.02)
               for cpu_value, gpu_value in zip(cpu_result, gpu_result))
|
|
|
|
|
|
# Data sets under test. ``params`` holds, index for index, the xgboost
# parameters each data set is trained with.
# NOTE: the dermatology file is loaded unconditionally; there is no check
# that "../../demo/data/dermatology.data" exists.
data = [
    load_higgs(),
    load_adult(),
    xgb.DMatrix('../../demo/data/agaricus.txt.test'),
    load_dermatology(),
]
params = [
    {},
    {},
    {'objective': 'binary:logistic'},
    {'objective': 'multi:softmax', 'num_class': 6},
]

# Passed to every xgb.cv call as its third positional argument.
num_round = 5

# Running tally of comparison outcomes.
num_pass = 0
num_fail = 0

# Values assigned to the ``max_depth`` parameter, one test run per value.
# For a quick run use: test_depth = [1]
test_depth = [1, 6, 9, 11, 15]
|
|
|
|
# Run every data set at every depth with both updaters and compare the
# evaluation values collected from cross-validation.
for test, xgmat in enumerate(data):
    for depth in test_depth:
        # The parameter dict is shared across depths and updated in place.
        param = params[test]
        param['max_depth'] = depth

        # Reference run.
        cpu_result = []
        param['updater'] = 'grow_colmaker'
        xgb.cv(param, xgmat, num_round, verbose_eval=False, nfold=5,
               callbacks=[get_last_eval_callback(cpu_result)])

        # Run under test. To debug a mismatch, train a single round with
        # xgb.train(param, xgmat, 1) after either run and dump the model
        # with bst.dump_model(<file>, '', True).
        gpu_result = []
        param['updater'] = 'grow_gpu'
        xgb.cv(param, xgmat, num_round, verbose_eval=False, nfold=5,
               callbacks=[get_last_eval_callback(gpu_result)])

        if check_result(cpu_result, gpu_result):
            print(bcolors.OKGREEN + "Pass" + bcolors.ENDC)
            num_pass += 1
        else:
            print(bcolors.FAIL + "Fail" + bcolors.ENDC)
            num_fail += 1

        # NOTE(review): the label says "rmse" for every data set, although
        # two of them set a classification objective -- confirm which metric
        # xgb.cv actually reports there.
        print("cpu rmse: " + str(cpu_result))
        print("gpu rmse: " + str(gpu_result))

print(str(num_pass) + "/" + str(num_pass + num_fail) + " passed")
|