Group CLI demo into subdirectory. (#6258)
The CLI is not the most developed interface. Grouping its demos into a dedicated directory helps new users avoid it, since most use cases go through a language binding.
This commit is contained in:
parent
6383757dca
commit
dfac5f89e9
@ -18,7 +18,7 @@ max_depth = 3
|
|||||||
# the number of round to do boosting
|
# the number of round to do boosting
|
||||||
num_round = 2
|
num_round = 2
|
||||||
# 0 means do not save any model except the final round model
|
# 0 means do not save any model except the final round model
|
||||||
save_period = 0
|
save_period = 2
|
||||||
# The path of training data
|
# The path of training data
|
||||||
data = "agaricus.txt.train"
|
data = "agaricus.txt.train"
|
||||||
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
||||||
@ -3,13 +3,15 @@
|
|||||||
python mapfeat.py
|
python mapfeat.py
|
||||||
# split train and test
|
# split train and test
|
||||||
python mknfold.py agaricus.txt 1
|
python mknfold.py agaricus.txt 1
|
||||||
# training and output the models
|
|
||||||
../../xgboost mushroom.conf
|
|
||||||
# output prediction task=pred
|
|
||||||
../../xgboost mushroom.conf task=pred model_in=0002.model
|
|
||||||
# print the boosters of 00002.model in dump.raw.txt
|
|
||||||
../../xgboost mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt
|
|
||||||
# use the feature map in printing for better visualization
|
|
||||||
../../xgboost mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
|
|
||||||
cat dump.nice.txt
|
|
||||||
|
|
||||||
|
XGBOOST=../../../xgboost
|
||||||
|
|
||||||
|
# training and output the models
|
||||||
|
$XGBOOST mushroom.conf
|
||||||
|
# output prediction task=pred
|
||||||
|
$XGBOOST mushroom.conf task=pred model_in=0002.model
|
||||||
|
# print the boosters of 0002.model in dump.raw.txt
|
||||||
|
$XGBOOST mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt
|
||||||
|
# use the feature map in printing for better visualization
|
||||||
|
$XGBOOST mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
|
||||||
|
cat dump.nice.txt
|
||||||
11
demo/CLI/distributed-training/run_aws.sh
Normal file
11
demo/CLI/distributed-training/run_aws.sh
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
# This is the example script to run distributed xgboost on AWS.
|
||||||
|
# Change the following line for configuration
|
||||||
|
|
||||||
|
export BUCKET=mybucket
|
||||||
|
|
||||||
|
# submit the job to YARN
|
||||||
|
../../../dmlc-core/tracker/dmlc-submit --cluster=yarn --num-workers=2 --worker-cores=2\
|
||||||
|
../../../xgboost mushroom.aws.conf nthread=2\
|
||||||
|
data=s3://${BUCKET}/xgb-demo/train\
|
||||||
|
eval[test]=s3://${BUCKET}/xgb-demo/test\
|
||||||
|
model_dir=s3://${BUCKET}/xgb-demo/model
|
||||||
33
demo/CLI/regression/mapfeat.py
Executable file
33
demo/CLI/regression/mapfeat.py
Executable file
@ -0,0 +1,33 @@
|
|||||||
|
#!/usr/bin/python
"""Convert ``machine.data`` (comma separated) into libsvm text plus a feature map.

Reads ``machine.data`` from the current directory and writes:

* ``machine.txt``  - one libsvm row per input line: column 8 is the label,
  columns 2..7 become features 0..5, and the vendor (column 0) becomes a
  one-hot indicator feature whose index starts at 6.
* ``featmap.txt``  - the feature map describing those indices.
"""

cnt = 6     # next free feature index; 0..5 are taken by the numeric columns
fmap = {}   # vendor name -> one-hot feature index

# Context managers make sure both handles are closed, even if a malformed
# row raises part-way through the conversion.
with open('machine.txt', 'w') as fo, open('machine.data') as fi:
    for l in fi:
        arr = l.split(',')
        fo.write(arr[8])
        for i in range(0, 6):
            fo.write(' %d:%s' % (i, arr[i + 2]))

        # First time a vendor is seen it is assigned the next free index.
        if arr[0] not in fmap:
            fmap[arr[0]] = cnt
            cnt += 1

        fo.write(' %d:1' % fmap[arr[0]])
        fo.write('\n')

# list from machine.names
names = [
    'vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP'
]

# create feature map for machine data
with open('featmap.txt', 'w') as fo:
    for i in range(0, 6):
        fo.write('%d\t%s\tint\n' % (i, names[i + 1]))

    # Emit the vendor indicators ordered by their feature index.
    for v, k in sorted(fmap.items(), key=lambda x: x[1]):
        fo.write('%d\tvendor=%s\ti\n' % (k, v))
|
||||||
28
demo/CLI/regression/mknfold.py
Executable file
28
demo/CLI/regression/mknfold.py
Executable file
@ -0,0 +1,28 @@
|
|||||||
|
#!/usr/bin/python
"""Randomly split a text file into ``<filename>.train`` and ``<filename>.test``.

Usage: ``mknfold.py <filename> <k> [nfold = 5]``

Every input line draws a random fold number in ``1..nfold``; lines that draw
fold ``k`` go to the test file, all others to the train file.  The random
generator is seeded with a fixed value so the split is reproducible.
"""
import sys
import random

# Both <filename> and <k> are required (argv[2] is read below), so anything
# shorter than three entries is a usage error; report it as a failure.
if len(sys.argv) < 3:
    print('Usage:<filename> <k> [nfold = 5]')
    sys.exit(1)

random.seed(10)

k = int(sys.argv[2])
if len(sys.argv) > 3:
    nfold = int(sys.argv[3])
else:
    nfold = 5

# The context manager closes all three files even if a write fails.
with open(sys.argv[1], 'r') as fi, \
        open(sys.argv[1] + '.train', 'w') as ftr, \
        open(sys.argv[1] + '.test', 'w') as fte:
    for l in fi:
        # Exactly one random draw per line keeps the split identical to
        # earlier runs with the same seed.
        if random.randint(1, nfold) == k:
            fte.write(l)
        else:
            ftr.write(l)
|
||||||
@ -1,14 +1,9 @@
|
|||||||
#!/usr/bin/python
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
if len(sys.argv) < 3:
|
|
||||||
print 'Usage: <csv> <libsvm>'
|
|
||||||
print 'convert a all numerical csv to libsvm'
|
|
||||||
|
|
||||||
fo = open(sys.argv[2], 'w')
|
fo = open(sys.argv[2], 'w')
|
||||||
|
|
||||||
for l in open(sys.argv[1]):
|
for l in open(sys.argv[1]):
|
||||||
arr = l.split(',')
|
arr = l.split(',')
|
||||||
fo.write('%s' % arr[0])
|
fo.write('%s' % arr[0])
|
||||||
for i in xrange(len(arr) - 1):
|
for i in range(len(arr) - 1):
|
||||||
fo.write(' %d:%s' % (i, arr[i+1]))
|
fo.write(' %d:%s' % (i, arr[i+1]))
|
||||||
fo.close()
|
fo.close()
|
||||||
@ -14,4 +14,4 @@ python csv2libsvm.py YearPredictionMSD.txt yearpredMSD.libsvm
|
|||||||
head -n 463715 yearpredMSD.libsvm > yearpredMSD.libsvm.train
|
head -n 463715 yearpredMSD.libsvm > yearpredMSD.libsvm.train
|
||||||
tail -n 51630 yearpredMSD.libsvm > yearpredMSD.libsvm.test
|
tail -n 51630 yearpredMSD.libsvm > yearpredMSD.libsvm.test
|
||||||
echo "finish making the data"
|
echo "finish making the data"
|
||||||
../../xgboost yearpredMSD.conf
|
../../../xgboost yearpredMSD.conf
|
||||||
@ -1,11 +0,0 @@
|
|||||||
# This is the example script to run distributed xgboost on AWS.
|
|
||||||
# Change the following two lines for configuration
|
|
||||||
|
|
||||||
export BUCKET=mybucket
|
|
||||||
|
|
||||||
# submit the job to YARN
|
|
||||||
../../dmlc-core/tracker/dmlc-submit --cluster=yarn --num-workers=2 --worker-cores=2\
|
|
||||||
../../xgboost mushroom.aws.conf nthread=2\
|
|
||||||
data=s3://${BUCKET}/xgb-demo/train\
|
|
||||||
eval[test]=s3://${BUCKET}/xgb-demo/test\
|
|
||||||
model_dir=s3://${BUCKET}/xgb-demo/model
|
|
||||||
@ -1,5 +1,4 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
../../xgboost mq2008.conf
|
../../xgboost mq2008.conf
|
||||||
|
|
||||||
../../xgboost mq2008.conf task=pred model_in=0004.model
|
../../xgboost mq2008.conf task=pred model_in=0004.model
|
||||||
|
|||||||
@ -38,4 +38,3 @@ if __name__ == "__main__":
|
|||||||
fi.close()
|
fi.close()
|
||||||
output_feature.close()
|
output_feature.close()
|
||||||
output_group.close()
|
output_group.close()
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,13 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
wget https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.rar
|
if [ -f MQ2008.rar ]
|
||||||
unrar x MQ2008.rar
|
then
|
||||||
mv -f MQ2008/Fold1/*.txt .
|
echo "Use downloaded data to run experiment."
|
||||||
|
else
|
||||||
|
echo "Downloading data."
|
||||||
|
wget https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.rar
|
||||||
|
unrar x MQ2008.rar
|
||||||
|
mv -f MQ2008/Fold1/*.txt .
|
||||||
|
fi
|
||||||
|
|
||||||
python trans_data.py train.txt mq2008.train mq2008.train.group
|
python trans_data.py train.txt mq2008.train mq2008.train.group
|
||||||
|
|
||||||
|
|||||||
@ -1,31 +0,0 @@
|
|||||||
#!/usr/bin/python
|
|
||||||
|
|
||||||
fo = open( 'machine.txt', 'w' )
|
|
||||||
cnt = 6
|
|
||||||
fmap = {}
|
|
||||||
for l in open( 'machine.data' ):
|
|
||||||
arr = l.split(',')
|
|
||||||
fo.write(arr[8])
|
|
||||||
for i in range( 0,6 ):
|
|
||||||
fo.write( ' %d:%s' %(i,arr[i+2]) )
|
|
||||||
|
|
||||||
if arr[0] not in fmap:
|
|
||||||
fmap[arr[0]] = cnt
|
|
||||||
cnt += 1
|
|
||||||
|
|
||||||
fo.write( ' %d:1' % fmap[arr[0]] )
|
|
||||||
fo.write('\n')
|
|
||||||
|
|
||||||
fo.close()
|
|
||||||
|
|
||||||
# create feature map for machine data
|
|
||||||
fo = open('featmap.txt', 'w')
|
|
||||||
# list from machine.names
|
|
||||||
names = ['vendor','MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' ];
|
|
||||||
|
|
||||||
for i in range(0,6):
|
|
||||||
fo.write( '%d\t%s\tint\n' % (i, names[i+1]))
|
|
||||||
|
|
||||||
for v, k in sorted( fmap.items(), key = lambda x:x[1] ):
|
|
||||||
fo.write( '%d\tvendor=%s\ti\n' % (k, v))
|
|
||||||
fo.close()
|
|
||||||
@ -1,29 +0,0 @@
|
|||||||
#!/usr/bin/python
|
|
||||||
import sys
|
|
||||||
import random
|
|
||||||
|
|
||||||
if len(sys.argv) < 2:
|
|
||||||
print ('Usage:<filename> <k> [nfold = 5]')
|
|
||||||
exit(0)
|
|
||||||
|
|
||||||
random.seed( 10 )
|
|
||||||
|
|
||||||
k = int( sys.argv[2] )
|
|
||||||
if len(sys.argv) > 3:
|
|
||||||
nfold = int( sys.argv[3] )
|
|
||||||
else:
|
|
||||||
nfold = 5
|
|
||||||
|
|
||||||
fi = open( sys.argv[1], 'r' )
|
|
||||||
ftr = open( sys.argv[1]+'.train', 'w' )
|
|
||||||
fte = open( sys.argv[1]+'.test', 'w' )
|
|
||||||
for l in fi:
|
|
||||||
if random.randint( 1 , nfold ) == k:
|
|
||||||
fte.write( l )
|
|
||||||
else:
|
|
||||||
ftr.write( l )
|
|
||||||
|
|
||||||
fi.close()
|
|
||||||
ftr.close()
|
|
||||||
fte.close()
|
|
||||||
|
|
||||||
@ -116,7 +116,7 @@ if __name__ == "__main__":
|
|||||||
run("cmake .. " + " ".join(args) + maybe_generator)
|
run("cmake .. " + " ".join(args) + maybe_generator)
|
||||||
run("cmake --build . --config Release" + maybe_parallel_build)
|
run("cmake --build . --config Release" + maybe_parallel_build)
|
||||||
|
|
||||||
with cd("demo/regression"):
|
with cd("demo/CLI/regression"):
|
||||||
run(sys.executable + " mapfeat.py")
|
run(sys.executable + " mapfeat.py")
|
||||||
run(sys.executable + " mknfold.py machine.txt 1")
|
run(sys.executable + " mknfold.py machine.txt 1")
|
||||||
|
|
||||||
@ -138,11 +138,11 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
print("copying train/test files")
|
print("copying train/test files")
|
||||||
maybe_makedirs("{}/src/test/resources".format(xgboost4j_spark))
|
maybe_makedirs("{}/src/test/resources".format(xgboost4j_spark))
|
||||||
with cd("../demo/regression"):
|
with cd("../demo/CLI/regression"):
|
||||||
run("{} mapfeat.py".format(sys.executable))
|
run("{} mapfeat.py".format(sys.executable))
|
||||||
run("{} mknfold.py machine.txt 1".format(sys.executable))
|
run("{} mknfold.py machine.txt 1".format(sys.executable))
|
||||||
|
|
||||||
for file in glob.glob("../demo/regression/machine.txt.t*"):
|
for file in glob.glob("../demo/CLI/regression/machine.txt.t*"):
|
||||||
cp(file, "{}/src/test/resources".format(xgboost4j_spark))
|
cp(file, "{}/src/test/resources".format(xgboost4j_spark))
|
||||||
for file in glob.glob("../demo/data/agaricus.*"):
|
for file in glob.glob("../demo/data/agaricus.*"):
|
||||||
cp(file, "{}/src/test/resources".format(xgboost4j_spark))
|
cp(file, "{}/src/test/resources".format(xgboost4j_spark))
|
||||||
|
|||||||
@ -2,11 +2,13 @@ import os
|
|||||||
import subprocess
|
import subprocess
|
||||||
import pytest
|
import pytest
|
||||||
import testing as tm
|
import testing as tm
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
ROOT_DIR = tm.PROJECT_ROOT
|
ROOT_DIR = tm.PROJECT_ROOT
|
||||||
DEMO_DIR = os.path.join(ROOT_DIR, 'demo')
|
DEMO_DIR = os.path.join(ROOT_DIR, 'demo')
|
||||||
PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python')
|
PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python')
|
||||||
|
CLI_DEMO_DIR = os.path.join(DEMO_DIR, 'CLI')
|
||||||
|
|
||||||
|
|
||||||
def test_basic_walkthrough():
|
def test_basic_walkthrough():
|
||||||
@ -132,7 +134,7 @@ def test_callbacks_demo():
|
|||||||
|
|
||||||
|
|
||||||
def test_cli_regression_demo():
|
def test_cli_regression_demo():
|
||||||
reg_dir = os.path.join(DEMO_DIR, 'regression')
|
reg_dir = os.path.join(CLI_DEMO_DIR, 'regression')
|
||||||
script = os.path.join(reg_dir, 'mapfeat.py')
|
script = os.path.join(reg_dir, 'mapfeat.py')
|
||||||
cmd = ['python', script]
|
cmd = ['python', script]
|
||||||
subprocess.check_call(cmd, cwd=reg_dir)
|
subprocess.check_call(cmd, cwd=reg_dir)
|
||||||
@ -144,3 +146,15 @@ def test_cli_regression_demo():
|
|||||||
exe = os.path.join(tm.PROJECT_ROOT, 'xgboost')
|
exe = os.path.join(tm.PROJECT_ROOT, 'xgboost')
|
||||||
conf = os.path.join(reg_dir, 'machine.conf')
|
conf = os.path.join(reg_dir, 'machine.conf')
|
||||||
subprocess.check_call([exe, conf], cwd=reg_dir)
|
subprocess.check_call([exe, conf], cwd=reg_dir)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(condition=sys.platform.startswith("win"),
                    reason='Test requires sh execution.')
def test_cli_binary_classification():
    """Run the binary-classification CLI demo script from its own directory."""
    demo_dir = os.path.join(CLI_DEMO_DIR, 'binary_classification')
    excursion = tm.DirectoryExcursion(demo_dir, cleanup=True)
    with excursion:
        # The script is invoked by relative path, hence the directory change.
        subprocess.check_call(['./runexp.sh'])
        os.remove('0002.model')
|
||||||
|
|
||||||
|
# year prediction is not tested due to data size being too large.
|
||||||
|
# rank is not tested as it requires unrar command.
|
||||||
|
|||||||
@ -251,6 +251,36 @@ def eval_error_metric(predt, dtrain: xgb.DMatrix):
|
|||||||
return 'CustomErr', np.sum(r)
|
return 'CustomErr', np.sum(r)
|
||||||
|
|
||||||
|
|
||||||
|
class DirectoryExcursion:
    '''Context manager that temporarily changes the working directory.

    On entry the process changes into ``path``; on exit it changes back to
    the directory that was current when the object was constructed.  With
    ``cleanup=True`` every file that appeared under ``path`` during the
    excursion is removed on exit; files that already existed are kept.

    Parameters
    ----------
    path :
        Directory to change into.  A relative path is resolved against the
        working directory at construction time.
    cleanup :
        Whether to delete files created under ``path`` while inside the
        ``with`` block.
    '''
    def __init__(self, path: os.PathLike, cleanup=False):
        self.path = path
        self.curdir = os.path.normpath(os.path.abspath(os.path.curdir))
        # Resolve once, up front.  The tree is walked both after chdir-ing
        # into ``path`` and after chdir-ing back, so a relative path would
        # otherwise name two different locations; the entry snapshot would
        # come back empty and pre-existing files would be deleted on exit.
        self._root = os.path.normpath(os.path.abspath(path))
        self.cleanup = cleanup
        self.files = set()

    def _snapshot(self):
        '''Return the set of paths of all files currently under the directory.'''
        return {
            os.path.join(root, f)
            for root, subdir, files in os.walk(self._root) for f in files
        }

    def __enter__(self):
        os.chdir(self._root)
        if self.cleanup:
            self.files = self._snapshot()

    def __exit__(self, *args):
        os.chdir(self.curdir)
        if self.cleanup:
            # Only files absent from the entry snapshot are removed.
            for f in self._snapshot().difference(self.files):
                os.remove(f)
|
||||||
|
|
||||||
|
|
||||||
CURDIR = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
|
CURDIR = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
|
||||||
PROJECT_ROOT = os.path.normpath(
|
PROJECT_ROOT = os.path.normpath(
|
||||||
os.path.join(CURDIR, os.path.pardir, os.path.pardir))
|
os.path.join(CURDIR, os.path.pardir, os.path.pardir))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user