Merge commit '75bf97b57539e5572e7ae8eba72bac6562c63c07'
Conflicts: subtree/rabit/rabit-learn/io/line_split-inl.h subtree/rabit/yarn/build.sh
This commit is contained in:
@@ -9,4 +9,4 @@ the example guidelines are in the script themselfs
|
||||
* Yarn (Hadoop): [rabit_yarn.py](rabit_yarn.py)
|
||||
- It is also possible to submit via hadoop streaming with rabit_hadoop_streaming.py
|
||||
- However, it is higly recommended to use rabit_yarn.py because this will allocate resources more precisely and fits machine learning scenarios
|
||||
|
||||
* Sun Grid engine: [rabit_sge.py](rabit_sge.py)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
This is the demo submission script of rabit, it is created to
|
||||
submit rabit jobs using hadoop streaming
|
||||
This is the demo submission script of rabit for submitting jobs in local machine
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
@@ -43,7 +42,7 @@ def exec_cmd(cmd, taskid, worker_env):
|
||||
if cmd[0].find('/') == -1 and os.path.exists(cmd[0]) and os.name != 'nt':
|
||||
cmd[0] = './' + cmd[0]
|
||||
cmd = ' '.join(cmd)
|
||||
env = {}
|
||||
env = os.environ.copy()
|
||||
for k, v in worker_env.items():
|
||||
env[k] = str(v)
|
||||
env['rabit_task_id'] = str(taskid)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Deprecated
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
This is the demo submission script of rabit, it is created to
|
||||
submit rabit jobs using hadoop streaming
|
||||
Submission script to submit rabit jobs using MPI
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
69
subtree/rabit/tracker/rabit_sge.py
Executable file
69
subtree/rabit/tracker/rabit_sge.py
Executable file
@@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Submit rabit jobs to Sun Grid Engine
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import rabit_tracker as tracker
|
||||
|
||||
parser = argparse.ArgumentParser(description='Rabit script to submit rabit job using MPI')
|
||||
parser.add_argument('-n', '--nworker', required=True, type=int,
|
||||
help = 'number of worker proccess to be launched')
|
||||
parser.add_argument('-q', '--queue', default='default', type=str,
|
||||
help = 'the queue we want to submit the job to')
|
||||
parser.add_argument('-hip', '--host_ip', default='auto', type=str,
|
||||
help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
|
||||
parser.add_argument('--vcores', default = 1, type=int,
|
||||
help = 'number of vcpores to request in each mapper, set it if each rabit job is multi-threaded')
|
||||
parser.add_argument('--jobname', default='auto', help = 'customize jobname in tracker')
|
||||
parser.add_argument('--logdir', default='auto', help = 'customize the directory to place the logs')
|
||||
parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
|
||||
help = 'print more messages into the console')
|
||||
parser.add_argument('command', nargs='+',
|
||||
help = 'command for rabit program')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.jobname == 'auto':
|
||||
args.jobname = ('rabit%d.' % args.nworker) + args.command[0].split('/')[-1];
|
||||
if args.logdir == 'auto':
|
||||
args.logdir = args.jobname + '.log'
|
||||
|
||||
if os.path.exists(args.logdir):
|
||||
if not os.path.isdir(args.logdir):
|
||||
raise RuntimeError('specified logdir %s is a file instead of directory' % args.logdir)
|
||||
else:
|
||||
os.mkdir(args.logdir)
|
||||
|
||||
runscript = '%s/runrabit.sh' % args.logdir
|
||||
fo = open(runscript, 'w')
|
||||
fo.write('\"$@\"\n')
|
||||
fo.close()
|
||||
#
|
||||
# submission script using MPI
|
||||
#
|
||||
def sge_submit(nslave, worker_args, worker_envs):
|
||||
"""
|
||||
customized submit script, that submit nslave jobs, each must contain args as parameter
|
||||
note this can be a lambda function containing additional parameters in input
|
||||
Parameters
|
||||
nslave number of slave process to start up
|
||||
args arguments to launch each job
|
||||
this usually includes the parameters of master_uri and parameters passed into submit
|
||||
"""
|
||||
env_arg = ','.join(['%s=\"%s\"' % (k, str(v)) for k, v in worker_envs.items()])
|
||||
cmd = 'qsub -cwd -t 1-%d -S /bin/bash' % nslave
|
||||
if args.queue != 'default':
|
||||
cmd += '-q %s' % args.queue
|
||||
cmd += ' -N %s ' % args.jobname
|
||||
cmd += ' -e %s -o %s' % (args.logdir, args.logdir)
|
||||
cmd += ' -pe orte %d' % (args.vcores)
|
||||
cmd += ' -v %s,PATH=${PATH}:.' % env_arg
|
||||
cmd += ' %s %s' % (runscript, ' '.join(args.command + worker_args))
|
||||
print cmd
|
||||
subprocess.check_call(cmd, shell = True)
|
||||
print 'Waiting for the jobs to get up...'
|
||||
|
||||
# call submit, with nslave, the commands to run each job and submit function
|
||||
tracker.submit(args.nworker, [], fun_submit = sge_submit, verbose = args.verbose)
|
||||
@@ -13,6 +13,7 @@ import socket
|
||||
import struct
|
||||
import subprocess
|
||||
import random
|
||||
import time
|
||||
from threading import Thread
|
||||
|
||||
"""
|
||||
@@ -188,6 +189,7 @@ class Tracker:
|
||||
vlst.reverse()
|
||||
rlst += vlst
|
||||
return rlst
|
||||
|
||||
def get_ring(self, tree_map, parent_map):
|
||||
"""
|
||||
get a ring connection used to recover local data
|
||||
@@ -202,14 +204,44 @@ class Tracker:
|
||||
rnext = (r + 1) % nslave
|
||||
ring_map[rlst[r]] = (rlst[rprev], rlst[rnext])
|
||||
return ring_map
|
||||
|
||||
def get_link_map(self, nslave):
|
||||
"""
|
||||
get the link map, this is a bit hacky, call for better algorithm
|
||||
to place similar nodes together
|
||||
"""
|
||||
tree_map, parent_map = self.get_tree(nslave)
|
||||
ring_map = self.get_ring(tree_map, parent_map)
|
||||
rmap = {0 : 0}
|
||||
k = 0
|
||||
for i in range(nslave - 1):
|
||||
k = ring_map[k][1]
|
||||
rmap[k] = i + 1
|
||||
|
||||
ring_map_ = {}
|
||||
tree_map_ = {}
|
||||
parent_map_ ={}
|
||||
for k, v in ring_map.items():
|
||||
ring_map_[rmap[k]] = (rmap[v[0]], rmap[v[1]])
|
||||
for k, v in tree_map.items():
|
||||
tree_map_[rmap[k]] = [rmap[x] for x in v]
|
||||
for k, v in parent_map.items():
|
||||
if k != 0:
|
||||
parent_map_[rmap[k]] = rmap[v]
|
||||
else:
|
||||
parent_map_[rmap[k]] = -1
|
||||
return tree_map_, parent_map_, ring_map_
|
||||
|
||||
def handle_print(self,slave, msg):
|
||||
sys.stdout.write(msg)
|
||||
|
||||
def log_print(self, msg, level):
|
||||
if level == 1:
|
||||
if self.verbose:
|
||||
sys.stderr.write(msg + '\n')
|
||||
else:
|
||||
sys.stderr.write(msg + '\n')
|
||||
|
||||
def accept_slaves(self, nslave):
|
||||
# set of nodes that finishs the job
|
||||
shutdown = {}
|
||||
@@ -241,31 +273,40 @@ class Tracker:
|
||||
assert s.cmd == 'start'
|
||||
if s.world_size > 0:
|
||||
nslave = s.world_size
|
||||
tree_map, parent_map = self.get_tree(nslave)
|
||||
ring_map = self.get_ring(tree_map, parent_map)
|
||||
tree_map, parent_map, ring_map = self.get_link_map(nslave)
|
||||
# set of nodes that is pending for getting up
|
||||
todo_nodes = range(nslave)
|
||||
random.shuffle(todo_nodes)
|
||||
else:
|
||||
assert s.world_size == -1 or s.world_size == nslave
|
||||
if s.cmd == 'recover':
|
||||
assert s.rank >= 0
|
||||
|
||||
rank = s.decide_rank(job_map)
|
||||
# batch assignment of ranks
|
||||
if rank == -1:
|
||||
assert len(todo_nodes) != 0
|
||||
rank = todo_nodes.pop(0)
|
||||
if s.jobid != 'NULL':
|
||||
job_map[s.jobid] = rank
|
||||
pending.append(s)
|
||||
if len(pending) == len(todo_nodes):
|
||||
pending.sort(key = lambda x : x.host)
|
||||
for s in pending:
|
||||
rank = todo_nodes.pop(0)
|
||||
if s.jobid != 'NULL':
|
||||
job_map[s.jobid] = rank
|
||||
s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
|
||||
if s.wait_accept > 0:
|
||||
wait_conn[rank] = s
|
||||
self.log_print('Recieve %s signal from %s; assign rank %d' % (s.cmd, s.host, s.rank), 1)
|
||||
if len(todo_nodes) == 0:
|
||||
self.log_print('@tracker All of %d nodes getting started' % nslave, 2)
|
||||
s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
|
||||
if s.cmd != 'start':
|
||||
self.log_print('Recieve %s signal from %d' % (s.cmd, s.rank), 1)
|
||||
self.start_time = time.time()
|
||||
else:
|
||||
self.log_print('Recieve %s signal from %s; assign rank %d' % (s.cmd, s.host, s.rank), 1)
|
||||
if s.wait_accept > 0:
|
||||
wait_conn[rank] = s
|
||||
s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map)
|
||||
self.log_print('Recieve %s signal from %d' % (s.cmd, s.rank), 1)
|
||||
if s.wait_accept > 0:
|
||||
wait_conn[rank] = s
|
||||
self.log_print('@tracker All nodes finishes job', 2)
|
||||
self.end_time = time.time()
|
||||
self.log_print('@tracker %s secs between node start and job finish' % str(self.end_time - self.start_time), 2)
|
||||
|
||||
def submit(nslave, args, fun_submit, verbose, hostIP = 'auto'):
|
||||
master = Tracker(verbose = verbose, hostIP = hostIP)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
This is a script to submit rabit job via Yarn
|
||||
rabit will run as a Yarn application
|
||||
@@ -13,6 +13,7 @@ import rabit_tracker as tracker
|
||||
|
||||
WRAPPER_PATH = os.path.dirname(__file__) + '/../wrapper'
|
||||
YARN_JAR_PATH = os.path.dirname(__file__) + '/../yarn/rabit-yarn.jar'
|
||||
YARN_BOOT_PY = os.path.dirname(__file__) + '/../yarn/run_hdfs_prog.py'
|
||||
|
||||
if not os.path.exists(YARN_JAR_PATH):
|
||||
warnings.warn("cannot find \"%s\", I will try to run build" % YARN_JAR_PATH)
|
||||
@@ -21,7 +22,7 @@ if not os.path.exists(YARN_JAR_PATH):
|
||||
subprocess.check_call(cmd, shell = True, env = os.environ)
|
||||
assert os.path.exists(YARN_JAR_PATH), "failed to build rabit-yarn.jar, try it manually"
|
||||
|
||||
hadoop_binary = 'hadoop'
|
||||
hadoop_binary = None
|
||||
# code
|
||||
hadoop_home = os.getenv('HADOOP_HOME')
|
||||
|
||||
@@ -38,6 +39,8 @@ parser.add_argument('-hip', '--host_ip', default='auto', type=str,
|
||||
help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
|
||||
parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
|
||||
help = 'print more messages into the console')
|
||||
parser.add_argument('-q', '--queue', default='default', type=str,
|
||||
help = 'the queue we want to submit the job to')
|
||||
parser.add_argument('-ac', '--auto_file_cache', default=1, choices=[0, 1], type=int,
|
||||
help = 'whether automatically cache the files in the command to hadoop localfile, this is on by default')
|
||||
parser.add_argument('-f', '--files', default = [], action='append',
|
||||
@@ -56,6 +59,11 @@ parser.add_argument('-mem', '--memory_mb', default=1024, type=int,
|
||||
help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
|
||||
'if you are running multi-threading rabit,'\
|
||||
'so that each node can occupy all the mapper slots in a machine for maximum performance')
|
||||
parser.add_argument('--libhdfs-opts', default='-Xmx128m', type=str,
|
||||
help = 'setting to be passed to libhdfs')
|
||||
parser.add_argument('--name-node', default='default', type=str,
|
||||
help = 'the namenode address of hdfs, libhdfs should connect to, normally leave it as default')
|
||||
|
||||
parser.add_argument('command', nargs='+',
|
||||
help = 'command for rabit program')
|
||||
args = parser.parse_args()
|
||||
@@ -87,7 +95,7 @@ if hadoop_version < 2:
|
||||
print 'Current Hadoop Version is %s, rabit_yarn will need Yarn(Hadoop 2.0)' % out[1]
|
||||
|
||||
def submit_yarn(nworker, worker_args, worker_env):
|
||||
fset = set([YARN_JAR_PATH])
|
||||
fset = set([YARN_JAR_PATH, YARN_BOOT_PY])
|
||||
if args.auto_file_cache != 0:
|
||||
for i in range(len(args.command)):
|
||||
f = args.command[i]
|
||||
@@ -96,7 +104,7 @@ def submit_yarn(nworker, worker_args, worker_env):
|
||||
if i == 0:
|
||||
args.command[i] = './' + args.command[i].split('/')[-1]
|
||||
else:
|
||||
args.command[i] = args.command[i].split('/')[-1]
|
||||
args.command[i] = './' + args.command[i].split('/')[-1]
|
||||
if args.command[0].endswith('.py'):
|
||||
flst = [WRAPPER_PATH + '/rabit.py',
|
||||
WRAPPER_PATH + '/librabit_wrapper.so',
|
||||
@@ -112,6 +120,8 @@ def submit_yarn(nworker, worker_args, worker_env):
|
||||
env['rabit_cpu_vcores'] = str(args.vcores)
|
||||
env['rabit_memory_mb'] = str(args.memory_mb)
|
||||
env['rabit_world_size'] = str(args.nworker)
|
||||
env['rabit_hdfs_opts'] = str(args.libhdfs_opts)
|
||||
env['rabit_hdfs_namenode'] = str(args.name_node)
|
||||
|
||||
if args.files != None:
|
||||
for flst in args.files:
|
||||
@@ -121,7 +131,8 @@ def submit_yarn(nworker, worker_args, worker_env):
|
||||
cmd += ' -file %s' % f
|
||||
cmd += ' -jobname %s ' % args.jobname
|
||||
cmd += ' -tempdir %s ' % args.tempdir
|
||||
cmd += (' '.join(args.command + worker_args))
|
||||
cmd += ' -queue %s ' % args.queue
|
||||
cmd += (' '.join(['./run_hdfs_prog.py'] + args.command + worker_args))
|
||||
if args.verbose != 0:
|
||||
print cmd
|
||||
subprocess.check_call(cmd, shell = True, env = env)
|
||||
|
||||
Reference in New Issue
Block a user