From 8f23eb11d7ff81c0fa51d2109a51e9565fa77985 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 15 Jan 2015 10:22:59 -0800 Subject: [PATCH] change convention --- test/Makefile | 16 ++- test/lazy_recover.cc | 126 ++++++++++++++++++ ...est_local_recover.cpp => local_recover.cc} | 0 ...est_model_recover.cpp => model_recover.cc} | 0 test/{speed_test.cpp => speed_test.cc} | 0 test/test.mk | 17 ++- test/test_local_recover.py | 25 ---- 7 files changed, 146 insertions(+), 38 deletions(-) create mode 100644 test/lazy_recover.cc rename test/{test_local_recover.cpp => local_recover.cc} (100%) rename test/{test_model_recover.cpp => model_recover.cc} (100%) rename test/{speed_test.cpp => speed_test.cc} (100%) delete mode 100755 test/test_local_recover.py diff --git a/test/Makefile b/test/Makefile index fdb57c0cd..5ff983e81 100644 --- a/test/Makefile +++ b/test/Makefile @@ -5,8 +5,8 @@ export LDFLAGS= -pthread -lm -lrt -L../lib export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++11 # specify tensor path -BIN = speed_test test_model_recover test_local_recover -OBJ = $(RABIT_OBJ) speed_test.o test_model_recover.o test_local_recover.o +BIN = speed_test model_recover local_recover lazy_recover +OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o MPIBIN = speed_test.mpi .PHONY: clean all lib mpi @@ -16,15 +16,17 @@ lib: mpi: cd ..;make mpi;cd - # programs -speed_test.o: speed_test.cpp ../include/*.h lib mpi -test_model_recover.o: test_model_recover.cpp ../include/*.h lib -test_local_recover.o: test_local_recover.cpp ../include/*.h lib +speed_test.o: speed_test.cc ../include/*.h lib mpi +model_recover.o: model_recover.cc ../include/*.h lib +local_recover.o: local_recover.cc ../include/*.h lib +lazy_recover.o: lazy_recover.cc ../include/*.h lib # we can link against MPI version to get use MPI speed_test: speed_test.o $(RABIT_OBJ) speed_test.mpi: speed_test.o $(MPIOBJ) -test_model_recover: test_model_recover.o $(RABIT_OBJ) -test_local_recover: test_local_recover.o $(RABIT_OBJ) +model_recover: model_recover.o $(RABIT_OBJ) +local_recover: local_recover.o $(RABIT_OBJ) +lazy_recover: lazy_recover.o $(RABIT_OBJ) $(BIN) : $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) $(LDFLAGS) -lrabit_mock diff --git a/test/lazy_recover.cc b/test/lazy_recover.cc new file mode 100644 index 000000000..d20e4f994 --- /dev/null +++ b/test/lazy_recover.cc @@ -0,0 +1,126 @@ +// this is a test case to test whether rabit can recover model when +// facing an exception +#include +#include +#include +#include +#include +using namespace rabit; + +// dummy model +class Model : public rabit::ISerializable { + public: + // iterations + std::vector data; + // load from stream + virtual void Load(rabit::IStream &fi) { + fi.Read(&data); + } + /*! \brief save the model to the stream */ + virtual void Save(rabit::IStream &fo) const { + fo.Write(data); + } + virtual void InitModel(size_t n) { + data.clear(); + data.resize(n, 1.0f); + } +}; + +inline void TestMax(Model *model, int ntrial, int iter) { + int rank = rabit::GetRank(); + int nproc = rabit::GetWorldSize(); + const int z = iter + 111; + + std::vector ndata(model->data.size()); + for (size_t i = 0; i < ndata.size(); ++i) { + ndata[i] = (i * (rank+1)) % z + model->data[i]; + } + rabit::Allreduce(&ndata[0], ndata.size()); + + for (size_t i = 0; i < ndata.size(); ++i) { + float rmax = (i * 1) % z + model->data[i]; + for (int r = 0; r < nproc; ++r) { + rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i]); + } + utils::Check(rmax == ndata[i], "[%d] TestMax check failurem i=%lu, rmax=%f, ndata=%f", rank, i, rmax, ndata[i]); + } +} + +inline void TestSum(Model *model, int ntrial, int iter) { + int rank = rabit::GetRank(); + int nproc = rabit::GetWorldSize(); + const int z = 131 + iter; + + std::vector ndata(model->data.size()); + for (size_t i = 0; i < ndata.size(); ++i) { + ndata[i] = (i * (rank+1)) % z + model->data[i]; + } + Allreduce(&ndata[0], ndata.size()); + + for (size_t i = 0; i < ndata.size(); ++i) { + float rsum = model->data[i] * nproc; + for (int r = 0; r < nproc; ++r) { + rsum += (float)((i * (r+1)) % z); + } + utils::Check(fabsf(rsum - ndata[i]) < 1e-5 , + "[%d] TestSum check failure, local=%g, allreduce=%g", rank, rsum, ndata[i]); + } + model->data = ndata; +} + +inline void TestBcast(size_t n, int root, int ntrial, int iter) { + int rank = rabit::GetRank(); + std::string s; s.resize(n); + for (size_t i = 0; i < n; ++i) { + s[i] = char(i % 126 + 1); + } + std::string res; + if (root == rank) { + res = s; + rabit::Broadcast(&res, root); + } else { + rabit::Broadcast(&res, root); + } + utils::Check(res == s, "[%d] TestBcast fail", rank); +} + +int main(int argc, char *argv[]) { + if (argc < 3) { + printf("Usage: \n"); + return 0; + } + int n = atoi(argv[1]); + rabit::Init(argc, argv); + int rank = rabit::GetRank(); + int nproc = rabit::GetWorldSize(); + std::string name = rabit::GetProcessorName(); + Model model; + srand(0); + int ntrial = 0; + for (int i = 1; i < argc; ++i) { + int n; + if (sscanf(argv[i], "rabit_num_trial=%d", &n) == 1) ntrial = n; + } + int iter = rabit::LoadCheckPoint(&model); + if (iter == 0) { + model.InitModel(n); + printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter); + } else { + printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter); + } + for (int r = iter; r < 3; ++r) { + TestMax(&model, ntrial, r); + printf("[%d] !!!TestMax pass, iter=%d\n", rank, r); + int step = std::max(nproc / 3, 1); + for (int i = 0; i < nproc; i += step) { + TestBcast(n, i, ntrial, r); + } + printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r); + TestSum(&model, ntrial, r); + printf("[%d] !!!TestSum pass, iter=%d\n", rank, r); + rabit::LazyCheckPoint(&model); + printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r); + } + rabit::Finalize(); + return 0; +} diff --git a/test/test_local_recover.cpp b/test/local_recover.cc similarity index 100% rename from test/test_local_recover.cpp rename to test/local_recover.cc diff --git a/test/test_model_recover.cpp b/test/model_recover.cc similarity index 100% rename from test/test_model_recover.cpp rename to test/model_recover.cc diff --git a/test/speed_test.cpp b/test/speed_test.cc similarity index 100% rename from test/speed_test.cpp rename to test/speed_test.cc diff --git a/test/test.mk b/test/test.mk index 99d146d08..e08cb19f0 100644 --- a/test/test.mk +++ b/test/test.mk @@ -1,18 +1,23 @@ # this is a makefile used to show testcases of rabit -.PHONY:all +.PHONY: all all: # this experiment test recovery with actually process exit, use keepalive to keep program alive model_recover_10_10k: - ../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 + ../tracker/rabit_demo.py -n 10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 model_recover_10_10k_die_same: - ../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 + ../tracker/rabit_demo.py -n 10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 model_recover_10_10k_die_hard: - ../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 - + ../tracker/rabit_demo.py -n 10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 local_recover_10_10k: - ../tracker/rabit_demo.py -n 10 test_local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 + ../tracker/rabit_demo.py -n 10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 + +lazy_recover_10_10k_die_hard: + ../tracker/rabit_demo.py -n 10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 + +lazy_recover_10_10k_die_same: + ../tracker/rabit_demo.py -n 10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 \ No newline at end of file diff --git a/test/test_local_recover.py b/test/test_local_recover.py deleted file mode 100755 index e35bd3177..000000000 --- a/test/test_local_recover.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/python -import rabit -import numpy as np - -rabit.init(lib='mock') -rank = rabit.get_rank() -n = 10 -nround = 3 -data = np.ones(n) * rank - -version, model, local = rabit.load_checkpoint(True) -if version == 0: - model = np.zeros(n) - local = np.ones(n) -else: - print '[%d] restart from version %d' % (rank, version) - -for i in xrange(version, nround): - res = rabit.allreduce(data + model+local, rabit.SUM) - print '[%d] iter=%d: %s' % (rank, i, str(res)) - model = res - local[:] = i - rabit.checkpoint(model, local) - -rabit.finalize()