Squashed 'subtree/rabit/' changes from 091634b..59e63bc
59e63bc minor
6233050 ok
14477f9 add namenode
75a6d34 add libhdfs opts
e3c76bf minmum fix
8b3c435 chg
2035799 test code
7751b2b add debug
7690313 ok
bd346b4 ok
faba1dc add testload
6f7783e add testload
e5f0340 ok
3ed9ec8 chg
e552ac4 ask for more ram in am
b2505e3 only stop nm when sucess
bc696c9 add queue info
f3e867e add option queue
5dc843c refactor fileio
cd9c81b quick fix
1e23af2 add virtual destructor to iseekstream
f165ffb fix hdfs
8cc6508 allow demo to pass in env
fad4d69 ok
0fd6197 fix more
7423837 fix more
d25de54 add temporal solution, run_yarn_prog.py
e5a9e31 final attempt
ed3bee8 add command back
0774000 add hdfs to resource
9b66e7e fix hadoop
6812f14 ok
08e1c16 change hadoop prefix back to hadoop home
d6b6828 Update build.sh
146e069 bugfix: logical boundary for ring buffer
19cb685 ok
4cf3c13 Merge branch 'master' of ssh://github.com/tqchen/rabit
20daddb add tracker
c57dad8 add ringbased passing and batch schedule
295d8a1 update
994cb02 add sge
014c866 OK

git-subtree-dir: subtree/rabit
git-subtree-split: 59e63bc135
This commit is contained in:
@@ -9,4 +9,4 @@ the example guidelines are in the script themselfs
|
||||
* Yarn (Hadoop): [rabit_yarn.py](rabit_yarn.py)
|
||||
- It is also possible to submit via hadoop streaming with rabit_hadoop_streaming.py
|
||||
- However, it is highly recommended to use rabit_yarn.py because this will allocate resources more precisely and fits machine learning scenarios
|
||||
|
||||
* Sun Grid engine: [rabit_sge.py](rabit_sge.py)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
This is the demo submission script of rabit, it is created to
|
||||
submit rabit jobs using hadoop streaming
|
||||
This is the demo submission script of rabit for submitting jobs in local machine
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
@@ -43,7 +42,7 @@ def exec_cmd(cmd, taskid, worker_env):
|
||||
if cmd[0].find('/') == -1 and os.path.exists(cmd[0]) and os.name != 'nt':
|
||||
cmd[0] = './' + cmd[0]
|
||||
cmd = ' '.join(cmd)
|
||||
env = {}
|
||||
env = os.environ.copy()
|
||||
for k, v in worker_env.items():
|
||||
env[k] = str(v)
|
||||
env['rabit_task_id'] = str(taskid)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Deprecated
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
This is the demo submission script of rabit, it is created to
|
||||
submit rabit jobs using hadoop streaming
|
||||
Submission script to submit rabit jobs using MPI
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
69
tracker/rabit_sge.py
Executable file
69
tracker/rabit_sge.py
Executable file
@@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Submit rabit jobs to Sun Grid Engine
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import rabit_tracker as tracker
|
||||
|
||||
# Command line interface of the Sun Grid Engine launcher.
parser = argparse.ArgumentParser(description='Rabit script to submit rabit job using Sun Grid Engine')
parser.add_argument('-n', '--nworker', required=True, type=int,
                    help = 'number of worker proccess to be launched')
parser.add_argument('-q', '--queue', default='default', type=str,
                    help = 'the queue we want to submit the job to')
parser.add_argument('-hip', '--host_ip', default='auto', type=str,
                    help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
parser.add_argument('--vcores', default = 1, type=int,
                    help = 'number of vcores to request for each worker, set it if each rabit job is multi-threaded')
parser.add_argument('--jobname', default='auto', help = 'customize jobname in tracker')
parser.add_argument('--logdir', default='auto', help = 'customize the directory to place the logs')
parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
                    help = 'print more messages into the console')
parser.add_argument('command', nargs='+',
                    help = 'command for rabit program')
args = parser.parse_args()

# derive the job name from the worker count and the basename of the program
if args.jobname == 'auto':
    args.jobname = ('rabit%d.' % args.nworker) + args.command[0].split('/')[-1]
# by default the logs go to a directory named after the job
if args.logdir == 'auto':
    args.logdir = args.jobname + '.log'

if os.path.exists(args.logdir):
    if not os.path.isdir(args.logdir):
        raise RuntimeError('specified logdir %s is a file instead of directory' % args.logdir)
else:
    # makedirs also creates missing parent directories of a nested --logdir
    os.makedirs(args.logdir)

# wrapper script handed to qsub: it simply executes its own arguments
runscript = '%s/runrabit.sh' % args.logdir
with open(runscript, 'w') as fo:
    fo.write('\"$@\"\n')
|
||||
#
|
||||
# submission script using Sun Grid Engine
|
||||
#
|
||||
def sge_submit(nslave, worker_args, worker_envs):
    """
    customized submit script, that submits nslave workers as one qsub array job
    note this can be a lambda function containing additional parameters in input

    Parameters
      nslave       number of slave process to start up, used as the array task range 1-nslave
      worker_args  list of extra arguments appended after the user command
      worker_envs  dict of environment variables, passed to qsub as a name="value" list

    The queue, job name, log directory, vcores and command are read from the
    module level args, the wrapper script path from the module level runscript.
    Raises subprocess.CalledProcessError when qsub exits with non-zero status.
    """
    env_arg = ','.join(['%s=\"%s\"' % (k, str(v)) for k, v in worker_envs.items()])
    cmd = 'qsub -cwd -t 1-%d -S /bin/bash' % nslave
    if args.queue != 'default':
        # the leading space is required, otherwise the flag is glued onto '/bin/bash'
        cmd += ' -q %s' % args.queue
    cmd += ' -N %s ' % args.jobname
    cmd += ' -e %s -o %s' % (args.logdir, args.logdir)
    cmd += ' -pe orte %d' % (args.vcores)
    # ${PATH} is expanded by the local shell, hence shell = True below
    cmd += ' -v %s,PATH=${PATH}:.' % env_arg
    cmd += ' %s %s' % (runscript, ' '.join(args.command + worker_args))
    # single-argument print call behaves the same on python 2 and python 3
    print(cmd)
    subprocess.check_call(cmd, shell = True)
    print('Waiting for the jobs to get up...')
|
||||
|
||||
# call submit, with nslave, the commands to run each job and submit function;
# --host_ip is forwarded so that a user specified tracker address is not ignored
tracker.submit(args.nworker, [], fun_submit = sge_submit, verbose = args.verbose,
               hostIP = args.host_ip)
|
||||
@@ -13,6 +13,7 @@ import socket
|
||||
import struct
|
||||
import subprocess
|
||||
import random
|
||||
import time
|
||||
from threading import Thread
|
||||
|
||||
"""
|
||||
@@ -188,6 +189,7 @@ class Tracker:
|
||||
vlst.reverse()
|
||||
rlst += vlst
|
||||
return rlst
|
||||
|
||||
def get_ring(self, tree_map, parent_map):
|
||||
"""
|
||||
get a ring connection used to recover local data
|
||||
@@ -202,14 +204,44 @@ class Tracker:
|
||||
rnext = (r + 1) % nslave
|
||||
ring_map[rlst[r]] = (rlst[rprev], rlst[rnext])
|
||||
return ring_map
|
||||
|
||||
def get_link_map(self, nslave):
    """
    build the tree, parent and ring maps with the nodes renumbered in ring order

    the maps are obtained from self.get_tree and self.get_ring; each node is
    then relabelled with its position when the ring is walked from node 0
    along the second entry of every ring_map value

    returns the tuple (tree_map, parent_map, ring_map) in the new numbering,
    where the parent of original node 0 is reported as -1
    """
    tree_map, parent_map = self.get_tree(nslave)
    ring_map = self.get_ring(tree_map, parent_map)

    # walk the ring starting at node 0 and number the nodes in visiting order
    order = {0 : 0}
    node = 0
    for pos in range(1, nslave):
        node = ring_map[node][1]
        order[node] = pos

    # translate every map into the new numbering
    new_ring = {}
    for node, nbr in ring_map.items():
        new_ring[order[node]] = (order[nbr[0]], order[nbr[1]])

    new_tree = {}
    for node, links in tree_map.items():
        new_tree[order[node]] = [order[x] for x in links]

    new_parent = {}
    for node, parent in parent_map.items():
        # node 0 has no parent entry to translate, it is always mapped to -1
        new_parent[order[node]] = -1 if node == 0 else order[parent]

    return new_tree, new_parent, new_ring
|
||||
|
||||
def handle_print(self,slave, msg):
    """
    write msg to standard output exactly as given, no newline is appended;
    the slave argument is accepted but not used
    """
    out = sys.stdout
    out.write(msg)
|
||||
|
||||
def log_print(self, msg, level):
    """
    write msg followed by a newline to standard error;
    level 1 messages are only written when self.verbose is set,
    messages of any other level are always written
    """
    if level == 1 and not self.verbose:
        return
    sys.stderr.write(msg + '\n')
|
||||
|
||||
def accept_slaves(self, nslave):
|
||||
# set of nodes that finishs the job
|
||||
shutdown = {}
|
||||
@@ -241,31 +273,40 @@ class Tracker:
|
||||
assert s.cmd == 'start'
|
||||
if s.world_size > 0:
|
||||
nslave = s.world_size
|
||||
tree_map, parent_map = self.get_tree(nslave)
|
||||
ring_map = self.get_ring(tree_map, parent_map)
|
||||
tree_map, parent_map, ring_map = self.get_link_map(nslave)
|
||||
# set of nodes that is pending for getting up
|
||||
todo_nodes = range(nslave)
|
||||
random.shuffle(todo_nodes)
|
||||
else:
|
||||
assert s.world_size == -1 or s.world_size == nslave
|
||||
if s.cmd == 'recover':
|
||||
assert s.rank >= 0
|
||||
|
||||
rank = s.decide_rank(job_map)
|
||||
# batch assignment of ranks
|
||||
if rank == -1:
|
||||
assert len(todo_nodes) != 0
|
||||
rank = todo_nodes.pop(0)
|
||||
if s.jobid != 'NULL':
|
||||
job_map[s.jobid] = rank
|
||||
pending.append(s)
|
||||
if len(pending) == len(todo_nodes):
|
||||
pending.sort(key = lambda x : x.host)
|
||||
for s in pending:
|
||||
rank = todo_nodes.pop(0)
|
||||
if s.jobid != 'NULL':
|
||||
job_map[s.jobid] = rank
|
||||
s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
|
||||
if s.wait_accept > 0:
|
||||
wait_conn[rank] = s
|
||||
self.log_print('Recieve %s signal from %s; assign rank %d' % (s.cmd, s.host, s.rank), 1)
|
||||
if len(todo_nodes) == 0:
|
||||
self.log_print('@tracker All of %d nodes getting started' % nslave, 2)
|
||||
s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
|
||||
if s.cmd != 'start':
|
||||
self.log_print('Recieve %s signal from %d' % (s.cmd, s.rank), 1)
|
||||
self.start_time = time.time()
|
||||
else:
|
||||
self.log_print('Recieve %s signal from %s; assign rank %d' % (s.cmd, s.host, s.rank), 1)
|
||||
if s.wait_accept > 0:
|
||||
wait_conn[rank] = s
|
||||
s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
|
||||
self.log_print('Recieve %s signal from %d' % (s.cmd, s.rank), 1)
|
||||
if s.wait_accept > 0:
|
||||
wait_conn[rank] = s
|
||||
self.log_print('@tracker All nodes finishes job', 2)
|
||||
self.end_time = time.time()
|
||||
self.log_print('@tracker %s secs between node start and job finish' % str(self.end_time - self.start_time), 2)
|
||||
|
||||
def submit(nslave, args, fun_submit, verbose, hostIP = 'auto'):
|
||||
master = Tracker(verbose = verbose, hostIP = hostIP)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
This is a script to submit rabit job via Yarn
|
||||
rabit will run as a Yarn application
|
||||
@@ -13,6 +13,7 @@ import rabit_tracker as tracker
|
||||
|
||||
WRAPPER_PATH = os.path.dirname(__file__) + '/../wrapper'
|
||||
YARN_JAR_PATH = os.path.dirname(__file__) + '/../yarn/rabit-yarn.jar'
|
||||
YARN_BOOT_PY = os.path.dirname(__file__) + '/../yarn/run_hdfs_prog.py'
|
||||
|
||||
if not os.path.exists(YARN_JAR_PATH):
|
||||
warnings.warn("cannot find \"%s\", I will try to run build" % YARN_JAR_PATH)
|
||||
@@ -21,7 +22,7 @@ if not os.path.exists(YARN_JAR_PATH):
|
||||
subprocess.check_call(cmd, shell = True, env = os.environ)
|
||||
assert os.path.exists(YARN_JAR_PATH), "failed to build rabit-yarn.jar, try it manually"
|
||||
|
||||
hadoop_binary = 'hadoop'
|
||||
hadoop_binary = None
|
||||
# code
|
||||
hadoop_home = os.getenv('HADOOP_HOME')
|
||||
|
||||
@@ -38,6 +39,8 @@ parser.add_argument('-hip', '--host_ip', default='auto', type=str,
|
||||
help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
|
||||
parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
|
||||
help = 'print more messages into the console')
|
||||
parser.add_argument('-q', '--queue', default='default', type=str,
|
||||
help = 'the queue we want to submit the job to')
|
||||
parser.add_argument('-ac', '--auto_file_cache', default=1, choices=[0, 1], type=int,
|
||||
help = 'whether automatically cache the files in the command to hadoop localfile, this is on by default')
|
||||
parser.add_argument('-f', '--files', default = [], action='append',
|
||||
@@ -56,6 +59,11 @@ parser.add_argument('-mem', '--memory_mb', default=1024, type=int,
|
||||
help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
|
||||
'if you are running multi-threading rabit,'\
|
||||
'so that each node can occupy all the mapper slots in a machine for maximum performance')
|
||||
parser.add_argument('--libhdfs-opts', default='-Xmx128m', type=str,
|
||||
help = 'setting to be passed to libhdfs')
|
||||
parser.add_argument('--name-node', default='default', type=str,
|
||||
help = 'the namenode address of hdfs, libhdfs should connect to, normally leave it as default')
|
||||
|
||||
parser.add_argument('command', nargs='+',
|
||||
help = 'command for rabit program')
|
||||
args = parser.parse_args()
|
||||
@@ -87,7 +95,7 @@ if hadoop_version < 2:
|
||||
print 'Current Hadoop Version is %s, rabit_yarn will need Yarn(Hadoop 2.0)' % out[1]
|
||||
|
||||
def submit_yarn(nworker, worker_args, worker_env):
|
||||
fset = set([YARN_JAR_PATH])
|
||||
fset = set([YARN_JAR_PATH, YARN_BOOT_PY])
|
||||
if args.auto_file_cache != 0:
|
||||
for i in range(len(args.command)):
|
||||
f = args.command[i]
|
||||
@@ -96,7 +104,7 @@ def submit_yarn(nworker, worker_args, worker_env):
|
||||
if i == 0:
|
||||
args.command[i] = './' + args.command[i].split('/')[-1]
|
||||
else:
|
||||
args.command[i] = args.command[i].split('/')[-1]
|
||||
args.command[i] = './' + args.command[i].split('/')[-1]
|
||||
if args.command[0].endswith('.py'):
|
||||
flst = [WRAPPER_PATH + '/rabit.py',
|
||||
WRAPPER_PATH + '/librabit_wrapper.so',
|
||||
@@ -112,6 +120,8 @@ def submit_yarn(nworker, worker_args, worker_env):
|
||||
env['rabit_cpu_vcores'] = str(args.vcores)
|
||||
env['rabit_memory_mb'] = str(args.memory_mb)
|
||||
env['rabit_world_size'] = str(args.nworker)
|
||||
env['rabit_hdfs_opts'] = str(args.libhdfs_opts)
|
||||
env['rabit_hdfs_namenode'] = str(args.name_node)
|
||||
|
||||
if args.files != None:
|
||||
for flst in args.files:
|
||||
@@ -121,7 +131,8 @@ def submit_yarn(nworker, worker_args, worker_env):
|
||||
cmd += ' -file %s' % f
|
||||
cmd += ' -jobname %s ' % args.jobname
|
||||
cmd += ' -tempdir %s ' % args.tempdir
|
||||
cmd += (' '.join(args.command + worker_args))
|
||||
cmd += ' -queue %s ' % args.queue
|
||||
cmd += (' '.join(['./run_hdfs_prog.py'] + args.command + worker_args))
|
||||
if args.verbose != 0:
|
||||
print cmd
|
||||
subprocess.check_call(cmd, shell = True, env = env)
|
||||
|
||||
Reference in New Issue
Block a user