diff --git a/demo/binary_classification/README.md b/demo/CLI/binary_classification/README.md
similarity index 100%
rename from demo/binary_classification/README.md
rename to demo/CLI/binary_classification/README.md
diff --git a/demo/binary_classification/agaricus-lepiota.data b/demo/CLI/binary_classification/agaricus-lepiota.data
similarity index 100%
rename from demo/binary_classification/agaricus-lepiota.data
rename to demo/CLI/binary_classification/agaricus-lepiota.data
diff --git a/demo/binary_classification/agaricus-lepiota.fmap b/demo/CLI/binary_classification/agaricus-lepiota.fmap
similarity index 100%
rename from demo/binary_classification/agaricus-lepiota.fmap
rename to demo/CLI/binary_classification/agaricus-lepiota.fmap
diff --git a/demo/binary_classification/agaricus-lepiota.names b/demo/CLI/binary_classification/agaricus-lepiota.names
similarity index 100%
rename from demo/binary_classification/agaricus-lepiota.names
rename to demo/CLI/binary_classification/agaricus-lepiota.names
diff --git a/demo/binary_classification/mapfeat.py b/demo/CLI/binary_classification/mapfeat.py
similarity index 100%
rename from demo/binary_classification/mapfeat.py
rename to demo/CLI/binary_classification/mapfeat.py
diff --git a/demo/binary_classification/mknfold.py b/demo/CLI/binary_classification/mknfold.py
similarity index 100%
rename from demo/binary_classification/mknfold.py
rename to demo/CLI/binary_classification/mknfold.py
diff --git a/demo/binary_classification/mushroom.conf b/demo/CLI/binary_classification/mushroom.conf
similarity index 98%
rename from demo/binary_classification/mushroom.conf
rename to demo/CLI/binary_classification/mushroom.conf
index 435c9bf8d..3cf865465 100644
--- a/demo/binary_classification/mushroom.conf
+++ b/demo/CLI/binary_classification/mushroom.conf
@@ -18,7 +18,7 @@ max_depth = 3
 # the number of round to do boosting
 num_round = 2
 # 0 means do not save any model except the final round model
-save_period = 0
+save_period = 2
 # The path of training data
 data = "agaricus.txt.train"
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
diff --git a/demo/binary_classification/runexp.sh b/demo/CLI/binary_classification/runexp.sh
similarity index 50%
rename from demo/binary_classification/runexp.sh
rename to demo/CLI/binary_classification/runexp.sh
index 68c3e6fb9..4a33f0ed8 100755
--- a/demo/binary_classification/runexp.sh
+++ b/demo/CLI/binary_classification/runexp.sh
@@ -3,13 +3,15 @@ python mapfeat.py
 # split train and test
 python mknfold.py agaricus.txt 1
-# training and output the models
-../../xgboost mushroom.conf
-# output prediction task=pred
-../../xgboost mushroom.conf task=pred model_in=0002.model
-# print the boosters of 00002.model in dump.raw.txt
-../../xgboost mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt
-# use the feature map in printing for better visualization
-../../xgboost mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
-cat dump.nice.txt
+XGBOOST=../../../xgboost
+
+# training and output the models
+$XGBOOST mushroom.conf
+# output prediction task=pred
+$XGBOOST mushroom.conf task=pred model_in=0002.model
+# print the boosters of 00002.model in dump.raw.txt
+$XGBOOST mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+$XGBOOST mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt
diff --git a/demo/distributed-training/README.md b/demo/CLI/distributed-training/README.md
similarity index 100%
rename from demo/distributed-training/README.md
rename to demo/CLI/distributed-training/README.md
diff --git a/demo/distributed-training/mushroom.aws.conf b/demo/CLI/distributed-training/mushroom.aws.conf
similarity index 100%
rename from demo/distributed-training/mushroom.aws.conf
rename to demo/CLI/distributed-training/mushroom.aws.conf
diff --git a/demo/distributed-training/plot_model.ipynb b/demo/CLI/distributed-training/plot_model.ipynb
similarity index 100%
rename from demo/distributed-training/plot_model.ipynb
rename to demo/CLI/distributed-training/plot_model.ipynb
diff --git a/demo/CLI/distributed-training/run_aws.sh b/demo/CLI/distributed-training/run_aws.sh
new file mode 100644
index 000000000..d7223ea54
--- /dev/null
+++ b/demo/CLI/distributed-training/run_aws.sh
@@ -0,0 +1,11 @@
+# This is the example script to run distributed xgboost on AWS.
+# Change the following two lines for configuration
+
+export BUCKET=mybucket
+
+# submit the job to YARN
+../../../dmlc-core/tracker/dmlc-submit --cluster=yarn --num-workers=2 --worker-cores=2\
+    ../../../xgboost mushroom.aws.conf nthread=2\
+    data=s3://${BUCKET}/xgb-demo/train\
+    eval[test]=s3://${BUCKET}/xgb-demo/test\
+    model_dir=s3://${BUCKET}/xgb-demo/model
diff --git a/demo/regression/README.md b/demo/CLI/regression/README.md
similarity index 100%
rename from demo/regression/README.md
rename to demo/CLI/regression/README.md
diff --git a/demo/regression/machine.conf b/demo/CLI/regression/machine.conf
similarity index 100%
rename from demo/regression/machine.conf
rename to demo/CLI/regression/machine.conf
diff --git a/demo/regression/machine.data b/demo/CLI/regression/machine.data
similarity index 100%
rename from demo/regression/machine.data
rename to demo/CLI/regression/machine.data
diff --git a/demo/regression/machine.names b/demo/CLI/regression/machine.names
similarity index 100%
rename from demo/regression/machine.names
rename to demo/CLI/regression/machine.names
diff --git a/demo/CLI/regression/mapfeat.py b/demo/CLI/regression/mapfeat.py
new file mode 100755
index 000000000..01c7035d6
--- /dev/null
+++ b/demo/CLI/regression/mapfeat.py
@@ -0,0 +1,33 @@
+#!/usr/bin/python
+
+fo = open('machine.txt', 'w')
+cnt = 6
+fmap = {}
+for l in open('machine.data'):
+    arr = l.split(',')
+    fo.write(arr[8])
+    for i in range(0, 6):
+        fo.write(' %d:%s' % (i, arr[i + 2]))
+
+    if arr[0] not in fmap:
+        fmap[arr[0]] = cnt
+        cnt += 1
+
+    fo.write(' %d:1' % fmap[arr[0]])
+    fo.write('\n')
+
+fo.close()
+
+# create feature map for machine data
+fo = open('featmap.txt', 'w')
+# list from machine.names
+names = [
+    'vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP'
+]
+
+for i in range(0, 6):
+    fo.write('%d\t%s\tint\n' % (i, names[i + 1]))
+
+for v, k in sorted(fmap.items(), key=lambda x: x[1]):
+    fo.write('%d\tvendor=%s\ti\n' % (k, v))
+fo.close()
diff --git a/demo/CLI/regression/mknfold.py b/demo/CLI/regression/mknfold.py
new file mode 100755
index 000000000..3a3c0f647
--- /dev/null
+++ b/demo/CLI/regression/mknfold.py
@@ -0,0 +1,28 @@
+#!/usr/bin/python
+import sys
+import random
+
+if len(sys.argv) < 2:
+    print('Usage: [nfold = 5]')
+    exit(0)
+
+random.seed(10)
+
+k = int(sys.argv[2])
+if len(sys.argv) > 3:
+    nfold = int(sys.argv[3])
+else:
+    nfold = 5
+
+fi = open(sys.argv[1], 'r')
+ftr = open(sys.argv[1] + '.train', 'w')
+fte = open(sys.argv[1] + '.test', 'w')
+for l in fi:
+    if random.randint(1, nfold) == k:
+        fte.write(l)
+    else:
+        ftr.write(l)
+
+fi.close()
+ftr.close()
+fte.close()
diff --git a/demo/regression/runexp.sh b/demo/CLI/regression/runexp.sh
similarity index 100%
rename from demo/regression/runexp.sh
rename to demo/CLI/regression/runexp.sh
diff --git a/demo/yearpredMSD/README.md b/demo/CLI/yearpredMSD/README.md
similarity index 100%
rename from demo/yearpredMSD/README.md
rename to demo/CLI/yearpredMSD/README.md
diff --git a/demo/yearpredMSD/csv2libsvm.py b/demo/CLI/yearpredMSD/csv2libsvm.py
similarity index 51%
rename from demo/yearpredMSD/csv2libsvm.py
rename to demo/CLI/yearpredMSD/csv2libsvm.py
index d7c1d15c1..828c95495 100755
--- a/demo/yearpredMSD/csv2libsvm.py
+++ b/demo/CLI/yearpredMSD/csv2libsvm.py
@@ -1,14 +1,9 @@
-#!/usr/bin/python
 import sys
-
-if len(sys.argv) < 3:
-    print 'Usage: '
-    print 'convert a all numerical csv to libsvm'
-
 fo = open(sys.argv[2], 'w')
+
 for l in open(sys.argv[1]):
     arr = l.split(',')
     fo.write('%s' % arr[0])
-    for i in xrange(len(arr) - 1):
+    for i in range(len(arr) - 1):
         fo.write(' %d:%s' % (i, arr[i+1]))
 fo.close()
diff --git a/demo/yearpredMSD/runexp.sh b/demo/CLI/yearpredMSD/runexp.sh
similarity index 94%
rename from demo/yearpredMSD/runexp.sh
rename to demo/CLI/yearpredMSD/runexp.sh
index 8853c3f20..4ec58025e 100755
--- a/demo/yearpredMSD/runexp.sh
+++ b/demo/CLI/yearpredMSD/runexp.sh
@@ -14,4 +14,4 @@ python csv2libsvm.py YearPredictionMSD.txt yearpredMSD.libsvm
 head -n 463715 yearpredMSD.libsvm > yearpredMSD.libsvm.train
 tail -n 51630 yearpredMSD.libsvm > yearpredMSD.libsvm.test
 echo "finish making the data"
-../../xgboost yearpredMSD.conf
+../../../xgboost yearpredMSD.conf
diff --git a/demo/yearpredMSD/yearpredMSD.conf b/demo/CLI/yearpredMSD/yearpredMSD.conf
similarity index 100%
rename from demo/yearpredMSD/yearpredMSD.conf
rename to demo/CLI/yearpredMSD/yearpredMSD.conf
diff --git a/demo/distributed-training/run_aws.sh b/demo/distributed-training/run_aws.sh
deleted file mode 100644
index 0b7cb17d2..000000000
--- a/demo/distributed-training/run_aws.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-# This is the example script to run distributed xgboost on AWS.
-# Change the following two lines for configuration
-
-export BUCKET=mybucket
-
-# submit the job to YARN
-../../dmlc-core/tracker/dmlc-submit --cluster=yarn --num-workers=2 --worker-cores=2\
-    ../../xgboost mushroom.aws.conf nthread=2\
-    data=s3://${BUCKET}/xgb-demo/train\
-    eval[test]=s3://${BUCKET}/xgb-demo/test\
-    model_dir=s3://${BUCKET}/xgb-demo/model
diff --git a/demo/rank/runexp.sh b/demo/rank/runexp.sh
index 5b299f925..a5ed5d1e0 100755
--- a/demo/rank/runexp.sh
+++ b/demo/rank/runexp.sh
@@ -1,5 +1,4 @@
 #!/bin/bash
 ../../xgboost mq2008.conf
-
 ../../xgboost mq2008.conf task=pred model_in=0004.model
diff --git a/demo/rank/trans_data.py b/demo/rank/trans_data.py
index 7282848c4..aa72276c0 100644
--- a/demo/rank/trans_data.py
+++ b/demo/rank/trans_data.py
@@ -7,7 +7,7 @@ def save_data(group_data,output_feature,output_group):
     output_group.write(str(len(group_data))+"\n")
     for data in group_data:
         # only include nonzero features
-        feats = [ p for p in data[2:] if float(p.split(':')[1]) != 0.0 ]
+        feats = [ p for p in data[2:] if float(p.split(':')[1]) != 0.0 ]
         output_feature.write(data[0] + " " + " ".join(feats) + "\n")
 
 if __name__ == "__main__":
@@ -18,7 +18,7 @@ if __name__ == "__main__":
     fi = open(sys.argv[1])
     output_feature = open(sys.argv[2],"w")
     output_group = open(sys.argv[3],"w")
-
+
     group_data = []
     group = ""
     for line in fi:
@@ -38,4 +38,3 @@ if __name__ == "__main__":
     fi.close()
     output_feature.close()
     output_group.close()
-
diff --git a/demo/rank/wgetdata.sh b/demo/rank/wgetdata.sh
index 3bd5bd3b2..613d0183c 100755
--- a/demo/rank/wgetdata.sh
+++ b/demo/rank/wgetdata.sh
@@ -1,7 +1,13 @@
 #!/bin/bash
-wget https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.rar
-unrar x MQ2008.rar
-mv -f MQ2008/Fold1/*.txt .
+if [ -f MQ2008.rar ]
+then
+    echo "Use downloaded data to run experiment."
+else
+    echo "Downloading data."
+    wget https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.rar
+    unrar x MQ2008.rar
+    mv -f MQ2008/Fold1/*.txt .
+fi
 python trans_data.py train.txt mq2008.train mq2008.train.group
diff --git a/demo/regression/mapfeat.py b/demo/regression/mapfeat.py
deleted file mode 100755
index c747c7b49..000000000
--- a/demo/regression/mapfeat.py
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/usr/bin/python
-
-fo = open( 'machine.txt', 'w' )
-cnt = 6
-fmap = {}
-for l in open( 'machine.data' ):
-    arr = l.split(',')
-    fo.write(arr[8])
-    for i in range( 0,6 ):
-        fo.write( ' %d:%s' %(i,arr[i+2]) )
-
-    if arr[0] not in fmap:
-        fmap[arr[0]] = cnt
-        cnt += 1
-
-    fo.write( ' %d:1' % fmap[arr[0]] )
-    fo.write('\n')
-
-fo.close()
-
-# create feature map for machine data
-fo = open('featmap.txt', 'w')
-# list from machine.names
-names = ['vendor','MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' ];
-
-for i in range(0,6):
-    fo.write( '%d\t%s\tint\n' % (i, names[i+1]))
-
-for v, k in sorted( fmap.items(), key = lambda x:x[1] ):
-    fo.write( '%d\tvendor=%s\ti\n' % (k, v))
-fo.close()
diff --git a/demo/regression/mknfold.py b/demo/regression/mknfold.py
deleted file mode 100755
index a941f8609..000000000
--- a/demo/regression/mknfold.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/python
-import sys
-import random
-
-if len(sys.argv) < 2:
-    print ('Usage: [nfold = 5]')
-    exit(0)
-
-random.seed( 10 )
-
-k = int( sys.argv[2] )
-if len(sys.argv) > 3:
-    nfold = int( sys.argv[3] )
-else:
-    nfold = 5
-
-fi = open( sys.argv[1], 'r' )
-ftr = open( sys.argv[1]+'.train', 'w' )
-fte = open( sys.argv[1]+'.test', 'w' )
-for l in fi:
-    if random.randint( 1 , nfold ) == k:
-        fte.write( l )
-    else:
-        ftr.write( l )
-
-fi.close()
-ftr.close()
-fte.close()
-
diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py
index 96fabc450..fc0efb8ab 100755
--- a/jvm-packages/create_jni.py
+++ b/jvm-packages/create_jni.py
@@ -116,7 +116,7 @@ if __name__ == "__main__":
         run("cmake .. " + " ".join(args) + maybe_generator)
         run("cmake --build . --config Release" + maybe_parallel_build)
 
-    with cd("demo/regression"):
+    with cd("demo/CLI/regression"):
         run(sys.executable + " mapfeat.py")
         run(sys.executable + " mknfold.py machine.txt 1")
@@ -138,11 +138,11 @@ if __name__ == "__main__":
     print("copying train/test files")
     maybe_makedirs("{}/src/test/resources".format(xgboost4j_spark))
-    with cd("../demo/regression"):
+    with cd("../demo/CLI/regression"):
         run("{} mapfeat.py".format(sys.executable))
         run("{} mknfold.py machine.txt 1".format(sys.executable))
 
-    for file in glob.glob("../demo/regression/machine.txt.t*"):
+    for file in glob.glob("../demo/CLI/regression/machine.txt.t*"):
         cp(file, "{}/src/test/resources".format(xgboost4j_spark))
     for file in glob.glob("../demo/data/agaricus.*"):
         cp(file, "{}/src/test/resources".format(xgboost4j_spark))
diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py
index 9ecf3aace..c37adc23c 100644
--- a/tests/python/test_demos.py
+++ b/tests/python/test_demos.py
@@ -2,11 +2,13 @@ import os
 import subprocess
 import pytest
 import testing as tm
+import sys
 
 ROOT_DIR = tm.PROJECT_ROOT
 DEMO_DIR = os.path.join(ROOT_DIR, 'demo')
 PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python')
+CLI_DEMO_DIR = os.path.join(DEMO_DIR, 'CLI')
 
 
 def test_basic_walkthrough():
@@ -132,7 +134,7 @@ def test_callbacks_demo():
 
 
 def test_cli_regression_demo():
-    reg_dir = os.path.join(DEMO_DIR, 'regression')
+    reg_dir = os.path.join(CLI_DEMO_DIR, 'regression')
     script = os.path.join(reg_dir, 'mapfeat.py')
     cmd = ['python', script]
     subprocess.check_call(cmd, cwd=reg_dir)
@@ -144,3 +146,15 @@
     exe = os.path.join(tm.PROJECT_ROOT, 'xgboost')
     conf = os.path.join(reg_dir, 'machine.conf')
     subprocess.check_call([exe, conf], cwd=reg_dir)
+
+
+@pytest.mark.skipif(condition=sys.platform.startswith("win"),
+                    reason='Test requires sh execution.')
+def test_cli_binary_classification():
+    cls_dir = os.path.join(CLI_DEMO_DIR, 'binary_classification')
+    with tm.DirectoryExcursion(cls_dir, cleanup=True):
+        subprocess.check_call(['./runexp.sh'])
+        os.remove('0002.model')
+
+# year prediction is not tested due to data size being too large.
+# rank is not tested as it requires unrar command.
diff --git a/tests/python/testing.py b/tests/python/testing.py
index 5f301bc1f..f4b8654e3 100644
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -251,6 +251,36 @@ def eval_error_metric(predt, dtrain: xgb.DMatrix):
     return 'CustomErr', np.sum(r)
 
 
+class DirectoryExcursion:
+    def __init__(self, path: os.PathLike, cleanup=False):
+        '''Change directory. Change back and optionally cleaning up the directory when exit.
+
+        '''
+        self.path = path
+        self.curdir = os.path.normpath(os.path.abspath(os.path.curdir))
+        self.cleanup = cleanup
+        self.files = {}
+
+    def __enter__(self):
+        os.chdir(self.path)
+        if self.cleanup:
+            self.files = {
+                os.path.join(root, f)
+                for root, subdir, files in os.walk(self.path) for f in files
+            }
+
+    def __exit__(self, *args):
+        os.chdir(self.curdir)
+        if self.cleanup:
+            files = {
+                os.path.join(root, f)
+                for root, subdir, files in os.walk(self.path) for f in files
+            }
+            diff = files.difference(self.files)
+            for f in diff:
+                os.remove(f)
+
+
 CURDIR = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
 PROJECT_ROOT = os.path.normpath(
     os.path.join(CURDIR, os.path.pardir, os.path.pardir))
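
A minimal usage sketch (not part of the patch above) of the DirectoryExcursion helper added to tests/python/testing.py, assuming it is imported as `testing` the same way tests/python/test_demos.py does; the demo directory chosen here is only an illustration.

#!/usr/bin/python
import os
import subprocess

import testing as tm  # the tests/python/testing.py module patched above

# Illustrative target: any directory whose generated files should be cleaned up works.
demo_dir = os.path.join(tm.PROJECT_ROOT, 'demo', 'CLI', 'binary_classification')

with tm.DirectoryExcursion(demo_dir, cleanup=True):
    # Body runs with demo_dir as the current working directory.
    subprocess.check_call(['./runexp.sh'])
# On exit the previous working directory is restored and, because cleanup=True,
# any files created under demo_dir during the block are removed.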