allow setup from env variables
This commit is contained in:
parent
9b6bf57e79
commit
67ebf81e7a
4
Makefile
4
Makefile
@ -2,7 +2,7 @@ ifndef CXX
|
|||||||
export CXX = g++
|
export CXX = g++
|
||||||
endif
|
endif
|
||||||
export MPICXX = mpicxx
|
export MPICXX = mpicxx
|
||||||
export LDFLAGS= -Llib
|
export LDFLAGS= -Llib -lrt
|
||||||
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -pedantic
|
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -pedantic
|
||||||
export CFLAGS = -O3 -msse2 -fPIC $(WARNFLAGS)
|
export CFLAGS = -O3 -msse2 -fPIC $(WARNFLAGS)
|
||||||
|
|
||||||
@ -50,7 +50,7 @@ $(ALIB):
|
|||||||
ar cr $@ $+
|
ar cr $@ $+
|
||||||
|
|
||||||
$(SLIB) :
|
$(SLIB) :
|
||||||
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^)
|
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) *~ src/*~ include/*~ include/*/*~ wrapper/*~
|
$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) *~ src/*~ include/*~ include/*/*~ wrapper/*~
|
||||||
|
|||||||
@ -29,11 +29,24 @@ AllreduceBase::AllreduceBase(void) {
|
|||||||
task_id = "NULL";
|
task_id = "NULL";
|
||||||
err_link = NULL;
|
err_link = NULL;
|
||||||
this->SetParam("rabit_reduce_buffer", "256MB");
|
this->SetParam("rabit_reduce_buffer", "256MB");
|
||||||
|
// set up the environment variables of possible interest
|
||||||
|
env_vars.push_back("rabit_task_id");
|
||||||
|
env_vars.push_back("rabit_num_trial");
|
||||||
|
env_vars.push_back("rabit_reduce_buffer");
|
||||||
|
env_vars.push_back("rabit_tracker_uri");
|
||||||
|
env_vars.push_back("rabit_tracker_port");
|
||||||
}
|
}
|
||||||
|
|
||||||
// initialization function
|
// initialization function
|
||||||
void AllreduceBase::Init(void) {
|
void AllreduceBase::Init(void) {
|
||||||
// setup from environment variables
|
// setup from environment variables
|
||||||
|
// handler to get variables from env
|
||||||
|
for (size_t i = 0; i < env_vars.size(); ++i) {
|
||||||
|
const char *value = getenv(env_vars[i].c_str());
|
||||||
|
if (value != NULL) {
|
||||||
|
this->SetParam(env_vars[i].c_str(), value);
|
||||||
|
}
|
||||||
|
}
|
||||||
{
|
{
|
||||||
// handling for hadoop
|
// handling for hadoop
|
||||||
const char *task_id = getenv("mapred_tip_id");
|
const char *task_id = getenv("mapred_tip_id");
|
||||||
|
|||||||
@ -413,6 +413,8 @@ class AllreduceBase : public IEngine {
|
|||||||
// pointer to links in the ring
|
// pointer to links in the ring
|
||||||
LinkRecord *ring_prev, *ring_next;
|
LinkRecord *ring_prev, *ring_next;
|
||||||
//----- meta information-----
|
//----- meta information-----
|
||||||
|
// list of environment variables that are of possible interest
|
||||||
|
std::vector<std::string> env_vars;
|
||||||
// unique identifier of the possible job this process is doing
|
// unique identifier of the possible job this process is doing
|
||||||
// used to assign ranks, optional, default to NULL
|
// used to assign ranks, optional, default to NULL
|
||||||
std::string task_id;
|
std::string task_id;
|
||||||
|
|||||||
@ -28,6 +28,8 @@ AllreduceRobust::AllreduceRobust(void) {
|
|||||||
global_lazycheck = NULL;
|
global_lazycheck = NULL;
|
||||||
use_local_model = -1;
|
use_local_model = -1;
|
||||||
recover_counter = 0;
|
recover_counter = 0;
|
||||||
|
env_vars.push_back("rabit_global_replica");
|
||||||
|
env_vars.push_back("rabit_local_replica");
|
||||||
}
|
}
|
||||||
void AllreduceRobust::Init(void) {
|
void AllreduceRobust::Init(void) {
|
||||||
AllreduceBase::Init();
|
AllreduceBase::Init();
|
||||||
|
|||||||
@ -31,35 +31,38 @@ nrep=0
|
|||||||
rc=254
|
rc=254
|
||||||
while [ $rc -eq 254 ];
|
while [ $rc -eq 254 ];
|
||||||
do
|
do
|
||||||
|
export rabit_num_trial=$nrep
|
||||||
|
%s
|
||||||
%s
|
%s
|
||||||
%s %s rabit_num_trial=$nrep
|
|
||||||
rc=$?;
|
rc=$?;
|
||||||
nrep=$((nrep+1));
|
nrep=$((nrep+1));
|
||||||
done
|
done
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def exec_cmd(cmd, taskid):
|
def exec_cmd(cmd, taskid, worker_env):
|
||||||
if cmd[0].find('/') == -1 and os.path.exists(cmd[0]) and os.name != 'nt':
|
if cmd[0].find('/') == -1 and os.path.exists(cmd[0]) and os.name != 'nt':
|
||||||
cmd[0] = './' + cmd[0]
|
cmd[0] = './' + cmd[0]
|
||||||
cmd = ' '.join(cmd)
|
cmd = ' '.join(cmd)
|
||||||
arg = ' rabit_task_id=%d' % (taskid)
|
env = {}
|
||||||
cmd = cmd + arg
|
for k, v in worker_env.items():
|
||||||
|
env[k] = str(v)
|
||||||
|
env['rabit_task_id'] = str(taskid)
|
||||||
|
env['PYTHONPATH'] = WRAPPER_PATH
|
||||||
|
|
||||||
ntrial = 0
|
ntrial = 0
|
||||||
while True:
|
while True:
|
||||||
if os.name == 'nt':
|
if os.name == 'nt':
|
||||||
prep = 'SET PYTHONPATH=\"%s\"\n' % WRAPPER_PATH
|
env['rabit_num_trial'] = str(ntrial)
|
||||||
ret = subprocess.call(prep + cmd + ('rabit_num_trial=%d' % ntrial), shell=True)
|
ret = subprocess.call(cmd, shell=True, env = env)
|
||||||
if ret == 254:
|
if ret == 254:
|
||||||
ntrial += 1
|
ntrial += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
else:
|
else:
|
||||||
prep = 'PYTHONPATH=\"%s\" ' % WRAPPER_PATH
|
|
||||||
if args.verbose != 0:
|
if args.verbose != 0:
|
||||||
bash = keepalive % (echo % cmd, prep, cmd)
|
bash = keepalive % (echo % cmd, cmd)
|
||||||
else:
|
else:
|
||||||
bash = keepalive % ('', prep, cmd)
|
bash = keepalive % ('', cmd)
|
||||||
ret = subprocess.call(bash, shell=True, executable='bash')
|
ret = subprocess.call(bash, shell=True, executable='bash', env = env)
|
||||||
if ret == 0:
|
if ret == 0:
|
||||||
if args.verbose != 0:
|
if args.verbose != 0:
|
||||||
print 'Thread %d exit with 0' % taskid
|
print 'Thread %d exit with 0' % taskid
|
||||||
@ -73,7 +76,7 @@ def exec_cmd(cmd, taskid):
|
|||||||
# Note: this submit script is only used for demo purpose
|
# Note: this submit script is only used for demo purpose
|
||||||
# submission script using python multi-threading
|
# submission script using python multi-threading
|
||||||
#
|
#
|
||||||
def mthread_submit(nslave, worker_args):
|
def mthread_submit(nslave, worker_args, worker_envs):
|
||||||
"""
|
"""
|
||||||
customized submit script, that submit nslave jobs, each must contain args as parameter
|
customized submit script, that submit nslave jobs, each must contain args as parameter
|
||||||
note this can be a lambda function containing additional parameters in input
|
note this can be a lambda function containing additional parameters in input
|
||||||
@ -84,7 +87,7 @@ def mthread_submit(nslave, worker_args):
|
|||||||
"""
|
"""
|
||||||
procs = {}
|
procs = {}
|
||||||
for i in range(nslave):
|
for i in range(nslave):
|
||||||
procs[i] = Thread(target = exec_cmd, args = (args.command + worker_args, i))
|
procs[i] = Thread(target = exec_cmd, args = (args.command + worker_args, i, worker_envs))
|
||||||
procs[i].daemon = True
|
procs[i].daemon = True
|
||||||
procs[i].start()
|
procs[i].start()
|
||||||
for i in range(nslave):
|
for i in range(nslave):
|
||||||
|
|||||||
@ -94,7 +94,7 @@ use_yarn = int(hadoop_version[0]) >= 2
|
|||||||
|
|
||||||
print 'Current Hadoop Version is %s' % out[1]
|
print 'Current Hadoop Version is %s' % out[1]
|
||||||
|
|
||||||
def hadoop_streaming(nworker, worker_args, use_yarn):
|
def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
|
||||||
fset = set()
|
fset = set()
|
||||||
if args.auto_file_cache:
|
if args.auto_file_cache:
|
||||||
for i in range(len(args.command)):
|
for i in range(len(args.command)):
|
||||||
@ -113,6 +113,7 @@ def hadoop_streaming(nworker, worker_args, use_yarn):
|
|||||||
if os.path.exists(f):
|
if os.path.exists(f):
|
||||||
fset.add(f)
|
fset.add(f)
|
||||||
kmap = {}
|
kmap = {}
|
||||||
|
kmap['env'] = 'mapred.child.env'
|
||||||
# setup keymaps
|
# setup keymaps
|
||||||
if use_yarn:
|
if use_yarn:
|
||||||
kmap['nworker'] = 'mapreduce.job.maps'
|
kmap['nworker'] = 'mapreduce.job.maps'
|
||||||
@ -129,6 +130,8 @@ def hadoop_streaming(nworker, worker_args, use_yarn):
|
|||||||
cmd = '%s jar %s' % (args.hadoop_binary, args.hadoop_streaming_jar)
|
cmd = '%s jar %s' % (args.hadoop_binary, args.hadoop_streaming_jar)
|
||||||
cmd += ' -D%s=%d' % (kmap['nworker'], nworker)
|
cmd += ' -D%s=%d' % (kmap['nworker'], nworker)
|
||||||
cmd += ' -D%s=%s' % (kmap['jobname'], args.jobname)
|
cmd += ' -D%s=%s' % (kmap['jobname'], args.jobname)
|
||||||
|
envstr = ','.join('%s=%s' % (k, str(v)) for k, v in worker_envs.items())
|
||||||
|
cmd += ' -D%s=\"%s\"' % (kmap['env'], envstr)
|
||||||
if args.nthread != -1:
|
if args.nthread != -1:
|
||||||
if kmap['nthread'] is None:
|
if kmap['nthread'] is None:
|
||||||
warnings.warn('nthread can only be set in Yarn(Hadoop version greater than 2.0),'\
|
warnings.warn('nthread can only be set in Yarn(Hadoop version greater than 2.0),'\
|
||||||
|
|||||||
@ -22,7 +22,7 @@ args = parser.parse_args()
|
|||||||
#
|
#
|
||||||
# submission script using MPI
|
# submission script using MPI
|
||||||
#
|
#
|
||||||
def mpi_submit(nslave, worker_args):
|
def mpi_submit(nslave, worker_args, worker_envs):
|
||||||
"""
|
"""
|
||||||
customized submit script, that submit nslave jobs, each must contain args as parameter
|
customized submit script, that submit nslave jobs, each must contain args as parameter
|
||||||
note this can be a lambda function containing additional parameters in input
|
note this can be a lambda function containing additional parameters in input
|
||||||
@ -31,6 +31,7 @@ def mpi_submit(nslave, worker_args):
|
|||||||
args arguments to launch each job
|
args arguments to launch each job
|
||||||
this usually includes the parameters of master_uri and parameters passed into submit
|
this usually includes the parameters of master_uri and parameters passed into submit
|
||||||
"""
|
"""
|
||||||
|
worker_args += ['%s=%s' % (k, str(v)) for k, v in worker_envs.items()]
|
||||||
sargs = ' '.join(args.command + worker_args)
|
sargs = ' '.join(args.command + worker_args)
|
||||||
if args.hostfile is None:
|
if args.hostfile is None:
|
||||||
cmd = ' '.join(['mpirun -n %d' % (nslave)] + args.command + worker_args)
|
cmd = ' '.join(['mpirun -n %d' % (nslave)] + args.command + worker_args)
|
||||||
|
|||||||
@ -140,15 +140,19 @@ class Tracker:
|
|||||||
self.log_print('start listen on %s:%d' % (socket.gethostname(), self.port), 1)
|
self.log_print('start listen on %s:%d' % (socket.gethostname(), self.port), 1)
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
self.sock.close()
|
self.sock.close()
|
||||||
def slave_args(self):
|
def slave_envs(self):
|
||||||
|
"""
|
||||||
|
get environment variables for slaves
|
||||||
|
can be passed in as args or envs
|
||||||
|
"""
|
||||||
if self.hostIP == 'dns':
|
if self.hostIP == 'dns':
|
||||||
host = socket.gethostname()
|
host = socket.gethostname()
|
||||||
elif self.hostIP == 'ip':
|
elif self.hostIP == 'ip':
|
||||||
host = socket.gethostbyname(socket.getfqdn())
|
host = socket.gethostbyname(socket.getfqdn())
|
||||||
else:
|
else:
|
||||||
host = self.hostIP
|
host = self.hostIP
|
||||||
return ['rabit_tracker_uri=%s' % host,
|
return {'rabit_tracker_uri': host,
|
||||||
'rabit_tracker_port=%s' % self.port]
|
'rabit_tracker_port': self.port}
|
||||||
def get_neighbor(self, rank, nslave):
|
def get_neighbor(self, rank, nslave):
|
||||||
rank = rank + 1
|
rank = rank + 1
|
||||||
ret = []
|
ret = []
|
||||||
@ -265,7 +269,7 @@ class Tracker:
|
|||||||
|
|
||||||
def submit(nslave, args, fun_submit, verbose, hostIP = 'auto'):
|
def submit(nslave, args, fun_submit, verbose, hostIP = 'auto'):
|
||||||
master = Tracker(verbose = verbose, hostIP = hostIP)
|
master = Tracker(verbose = verbose, hostIP = hostIP)
|
||||||
submit_thread = Thread(target = fun_submit, args = (nslave, args + master.slave_args()))
|
submit_thread = Thread(target = fun_submit, args = (nslave, args, master.slave_envs()))
|
||||||
submit_thread.daemon = True
|
submit_thread.daemon = True
|
||||||
submit_thread.start()
|
submit_thread.start()
|
||||||
master.accept_slaves(nslave)
|
master.accept_slaves(nslave)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user