Squashed 'subtree/rabit/' changes from d4ec037..28ca7be
28ca7be add linear readme
ca4b20f add linear readme
1133628 add linear readme
6a11676 update docs
a607047 Update build.sh
2c1cfd8 complete yarn
4f28e32 change formater
2fbda81 fix stdin input
3258bcf checkin yarn master
67ebf81 allow setup from env variables
9b6bf57 fix hdfs
395d5c2 add make system
88ce767 refactor io, initial hdfs file access need test
19be870 chgs
a1bd3c6 Merge branch 'master' of ssh://github.com/tqchen/rabit
1a573f9 introduce input split
29476f1 fix timer issue

git-subtree-dir: subtree/rabit
git-subtree-split: 28ca7becbd
tracker/README.md (new file)
@@ -0,0 +1,12 @@
+Trackers
+=====
+This folder contains tracker scripts that can be used to submit rabit jobs to different platforms;
+the usage guidelines are in the scripts themselves.
+
+***Supported Platforms***
+* Local demo: [rabit_demo.py](rabit_demo.py)
+* MPI: [rabit_mpi.py](rabit_mpi.py)
+* Yarn (Hadoop): [rabit_yarn.py](rabit_yarn.py)
+  - It is also possible to submit via Hadoop Streaming with rabit_hadoop_streaming.py.
+  - However, it is highly recommended to use rabit_yarn.py, because it allocates resources more precisely and fits machine learning scenarios better.
+
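For orientation: every script below is a submitter with the same shape. A Yarn submission, for example, would look like `./rabit_yarn.py -n 4 ./my_program` (hypothetical program name), with `-n`/`--nworker` and the trailing command defined by the argparse blocks shown further down in this diff.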
@@ -31,35 +31,38 @@ nrep=0
 rc=254
 while [ $rc -eq 254 ];
 do
+    export rabit_num_trial=$nrep
     %s
-    %s %s rabit_num_trial=$nrep
+    %s
     rc=$?;
     nrep=$((nrep+1));
 done
 """

-def exec_cmd(cmd, taskid):
+def exec_cmd(cmd, taskid, worker_env):
     if cmd[0].find('/') == -1 and os.path.exists(cmd[0]) and os.name != 'nt':
         cmd[0] = './' + cmd[0]
     cmd = ' '.join(cmd)
-    arg = ' rabit_task_id=%d' % (taskid)
-    cmd = cmd + arg
+    env = {}
+    for k, v in worker_env.items():
+        env[k] = str(v)
+    env['rabit_task_id'] = str(taskid)
+    env['PYTHONPATH'] = WRAPPER_PATH
+
     ntrial = 0
     while True:
         if os.name == 'nt':
-            prep = 'SET PYTHONPATH=\"%s\"\n' % WRAPPER_PATH
-            ret = subprocess.call(prep + cmd + ('rabit_num_trial=%d' % ntrial), shell=True)
+            env['rabit_num_trial'] = str(ntrial)
+            ret = subprocess.call(cmd, shell=True, env = env)
             if ret == 254:
                 ntrial += 1
                 continue
-
         else:
-            prep = 'PYTHONPATH=\"%s\" ' % WRAPPER_PATH
             if args.verbose != 0:
-                bash = keepalive % (echo % cmd, prep, cmd)
+                bash = keepalive % (echo % cmd, cmd)
             else:
-                bash = keepalive % ('', prep, cmd)
-            ret = subprocess.call(bash, shell=True, executable='bash')
+                bash = keepalive % ('', cmd)
+            ret = subprocess.call(bash, shell=True, executable='bash', env = env)
         if ret == 0:
             if args.verbose != 0:
                 print 'Thread %d exit with 0' % taskid
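This hunk changes how per-trial state reaches the worker: the old template spliced `rabit_num_trial=$nrep` and a `PYTHONPATH` prefix into the command string (three `%s` slots: echo, prep, cmd), while the new one exports `rabit_num_trial` in the shell and moves `PYTHONPATH` and the tracker settings into the `env` dict handed to `subprocess.call`, so the worker's command line stays identical across restarts.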
@@ -73,7 +76,7 @@ def exec_cmd(cmd, taskid):
 # Note: this submit script is only used for demo purpose
 # submission script using python multi-threading
 #
-def mthread_submit(nslave, worker_args):
+def mthread_submit(nslave, worker_args, worker_envs):
     """
     customized submit script that submits nslave jobs; each job must contain args as a parameter
     note this can be a lambda function containing additional parameters in input
@@ -84,7 +87,7 @@ def mthread_submit(nslave, worker_args):
     """
     procs = {}
     for i in range(nslave):
-        procs[i] = Thread(target = exec_cmd, args = (args.command + worker_args, i))
+        procs[i] = Thread(target = exec_cmd, args = (args.command + worker_args, i, worker_envs))
         procs[i].daemon = True
         procs[i].start()
     for i in range(nslave):
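To make the retry contract concrete, here is a minimal self-contained sketch of the loop `exec_cmd` implements. The name `run_with_keepalive` and the use of `os.environ.copy()` are illustrative choices, not the script's exact code (the script builds its environment from scratch); exit code 254 is the recoverable-failure code used by the keepalive template above.

```python
import os
import subprocess

def run_with_keepalive(cmd, taskid, worker_envs):
    # rabit parameters reach the worker through its environment
    env = os.environ.copy()
    for k, v in worker_envs.items():
        env[k] = str(v)            # environment values must be strings
    env['rabit_task_id'] = str(taskid)
    ntrial = 0
    while True:
        env['rabit_num_trial'] = str(ntrial)
        ret = subprocess.call(cmd, shell=True, env=env)
        if ret != 254:             # 254 signals a recoverable failure
            return ret
        ntrial += 1                # restart the worker and count the retry
```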
@@ -1,7 +1,11 @@
 #!/usr/bin/python
 """
+Deprecated
+
 This is a script to submit rabit job using hadoop streaming.
 It will submit the rabit process as mappers of MapReduce.
+
+This script is deprecated; it is highly recommended to use rabit_yarn.py instead.
 """
 import argparse
 import sys
@@ -34,13 +38,11 @@ if hadoop_binary == None or hadoop_streaming_jar == None:
                   ', or modify rabit_hadoop.py line 16', stacklevel = 2)

 parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs using Hadoop Streaming.'\
-                                 'This script support both Hadoop 1.0 and Yarn(MRv2), Yarn is recommended')
+                                 'It is highly recommended to use rabit_yarn.py instead')
 parser.add_argument('-n', '--nworker', required=True, type=int,
                     help = 'number of worker processes to be launched')
 parser.add_argument('-hip', '--host_ip', default='auto', type=str,
                     help = 'host IP address if it cannot be automatically guessed; specify the IP of the submission machine')
-parser.add_argument('-nt', '--nthread', default = -1, type=int,
-                    help = 'number of thread in each mapper to be launched, set it if each rabit job is multi-threaded')
 parser.add_argument('-i', '--input', required=True,
                     help = 'input path in HDFS')
 parser.add_argument('-o', '--output', required=True,
@@ -61,6 +63,8 @@ parser.add_argument('--jobname', default='auto', help = 'customize jobname in tracker')
 parser.add_argument('--timeout', default=600000000, type=int,
                     help = 'timeout (in milliseconds) of each mapper job, automatically set to a very long time,'\
                     'normally you do not need to set this')
+parser.add_argument('--vcores', default = -1, type=int,
+                    help = 'number of vcores to request in each mapper, set it if each rabit job is multi-threaded')
 parser.add_argument('-mem', '--memory_mb', default=-1, type=int,
                     help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
                     'if you are running multi-threaded rabit,'\
@@ -91,10 +95,14 @@ out = out.split('\n')[0].split()
 assert out[0] == 'Hadoop', 'cannot parse hadoop version string'
 hadoop_version = out[1].split('.')
 use_yarn = int(hadoop_version[0]) >= 2
+if use_yarn:
+    warnings.warn('It is highly recommended to use rabit_yarn.py to submit jobs to yarn instead', stacklevel = 2)

 print 'Current Hadoop Version is %s' % out[1]

-def hadoop_streaming(nworker, worker_args, use_yarn):
+def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
+    worker_envs['CLASSPATH'] = '`$HADOOP_HOME/bin/hadoop classpath --glob` '
+    worker_envs['LD_LIBRARY_PATH'] = '{LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server'
     fset = set()
     if args.auto_file_cache:
         for i in range(len(args.command)):
@@ -113,6 +121,7 @@ def hadoop_streaming(nworker, worker_args, use_yarn):
         if os.path.exists(f):
             fset.add(f)
     kmap = {}
+    kmap['env'] = 'mapred.child.env'
     # setup keymaps
     if use_yarn:
         kmap['nworker'] = 'mapreduce.job.maps'
@@ -129,12 +138,14 @@ def hadoop_streaming(nworker, worker_args, use_yarn):
     cmd = '%s jar %s' % (args.hadoop_binary, args.hadoop_streaming_jar)
     cmd += ' -D%s=%d' % (kmap['nworker'], nworker)
     cmd += ' -D%s=%s' % (kmap['jobname'], args.jobname)
-    if args.nthread != -1:
+    envstr = ','.join('%s=%s' % (k, str(v)) for k, v in worker_envs.items())
+    cmd += ' -D%s=\"%s\"' % (kmap['env'], envstr)
+    if args.vcores != -1:
         if kmap['nthread'] is None:
             warnings.warn('nthread can only be set in Yarn(Hadoop version greater than 2.0),'\
                           'it is recommended to use Yarn to submit rabit jobs', stacklevel = 2)
         else:
-            cmd += ' -D%s=%d' % (kmap['nthread'], args.nthread)
+            cmd += ' -D%s=%d' % (kmap['nthread'], args.vcores)
     cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
     if args.memory_mb != -1:
         cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
@@ -150,5 +161,5 @@ def hadoop_streaming(nworker, worker_args, use_yarn):
     print cmd
     subprocess.check_call(cmd, shell = True)

-fun_submit = lambda nworker, worker_args: hadoop_streaming(nworker, worker_args, int(hadoop_version[0]) >= 2)
+fun_submit = lambda nworker, worker_args, worker_envs: hadoop_streaming(nworker, worker_args, worker_envs, int(hadoop_version[0]) >= 2)
 tracker.submit(args.nworker, [], fun_submit = fun_submit, verbose = args.verbose, hostIP = args.host_ip)
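The essential change in this file is how tracker settings reach the mappers: instead of extra argv tokens, they are serialized into the `mapred.child.env` job property. A small sketch of the serialization, with made-up host and port values:

```python
# worker_envs as handed over by the tracker (values are illustrative)
worker_envs = {'rabit_tracker_uri': 'submit-host.example.com',
               'rabit_tracker_port': 9091}
envstr = ','.join('%s=%s' % (k, str(v)) for k, v in worker_envs.items())
# yields something like (key order may vary):
#   -Dmapred.child.env="rabit_tracker_uri=submit-host.example.com,rabit_tracker_port=9091"
opt = ' -D%s=\"%s\"' % ('mapred.child.env', envstr)
print opt
```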
@@ -22,7 +22,7 @@ args = parser.parse_args()
 #
 # submission script using MPI
 #
-def mpi_submit(nslave, worker_args):
+def mpi_submit(nslave, worker_args, worker_envs):
     """
     customized submit script that submits nslave jobs; each job must contain args as a parameter
     note this can be a lambda function containing additional parameters in input
@@ -31,6 +31,7 @@ def mpi_submit(nslave, worker_args):
         args arguments to launch each job
             this usually includes the parameters of master_uri and parameters passed into submit
     """
+    worker_args += ['%s=%s' % (k, str(v)) for k, v in worker_envs.items()]
     sargs = ' '.join(args.command + worker_args)
     if args.hostfile is None:
         cmd = ' '.join(['mpirun -n %d' % (nslave)] + args.command + worker_args)
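A hedged sketch of the contrast: MPI submission has no per-process environment channel in this script, so `mpi_submit` flattens the same dictionary into `key=value` argv tokens (values and program name below are illustrative):

```python
# the same tracker settings, flattened into argv for mpirun
worker_envs = {'rabit_tracker_uri': 'submit-host.example.com',
               'rabit_tracker_port': 9091}
worker_args = ['%s=%s' % (k, str(v)) for k, v in worker_envs.items()]
cmd = ' '.join(['mpirun -n %d' % 4] + ['./my_program'] + worker_args)
# -> mpirun -n 4 ./my_program rabit_tracker_uri=... rabit_tracker_port=9091
print cmd
```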
@@ -134,19 +134,25 @@ class Tracker:
         sock.listen(16)
         self.sock = sock
         self.verbose = verbose
+        if hostIP == 'auto':
+            hostIP = 'dns'
         self.hostIP = hostIP
         self.log_print('start listening on %s:%d' % (socket.gethostname(), self.port), 1)
     def __del__(self):
         self.sock.close()
-    def slave_args(self):
-        if self.hostIP == 'auto':
+    def slave_envs(self):
+        """
+        get environment variables for slaves;
+        these can be passed in as args or envs
+        """
+        if self.hostIP == 'dns':
             host = socket.gethostname()
         elif self.hostIP == 'ip':
             host = socket.gethostbyname(socket.getfqdn())
         else:
             host = self.hostIP
-        return ['rabit_tracker_uri=%s' % host,
-                'rabit_tracker_port=%s' % self.port]
+        return {'rabit_tracker_uri': host,
+                'rabit_tracker_port': self.port}
     def get_neighbor(self, rank, nslave):
         rank = rank + 1
         ret = []
@@ -261,9 +267,9 @@ class Tracker:
             wait_conn[rank] = s
         self.log_print('@tracker all nodes finish the job', 2)

-def submit(nslave, args, fun_submit, verbose, hostIP):
+def submit(nslave, args, fun_submit, verbose, hostIP = 'auto'):
     master = Tracker(verbose = verbose, hostIP = hostIP)
-    submit_thread = Thread(target = fun_submit, args = (nslave, args + master.slave_args()))
+    submit_thread = Thread(target = fun_submit, args = (nslave, args, master.slave_envs()))
     submit_thread.daemon = True
     submit_thread.start()
     master.accept_slaves(nslave)
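Putting the tracker changes together: a submitter now receives `(nslave, args, worker_envs)` and must deliver `worker_envs` (the `rabit_tracker_uri`/`rabit_tracker_port` pair from `slave_envs`) plus a unique `rabit_task_id` to every worker. A minimal custom submitter might look like the following sketch (`./my_program` is hypothetical):

```python
import subprocess
import rabit_tracker as tracker

def my_submit(nslave, worker_args, worker_envs):
    # launch one local process per worker; a real submitter would target a cluster
    for i in range(nslave):
        env = dict((k, str(v)) for k, v in worker_envs.items())
        env['rabit_task_id'] = str(i)   # each worker needs a unique task id
        subprocess.Popen(['./my_program'] + worker_args, env=env)

# hostIP now defaults to 'auto', so it can be omitted
tracker.submit(4, [], fun_submit=my_submit, verbose=0)
```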
tracker/rabit_yarn.py (new executable file)
@@ -0,0 +1,117 @@
+#!/usr/bin/python
+"""
+This is a script to submit rabit jobs via Yarn;
+rabit will run as a Yarn application.
+"""
+import argparse
+import sys
+import os
+import time
+import subprocess
+import warnings
+import rabit_tracker as tracker
+
+WRAPPER_PATH = os.path.dirname(__file__) + '/../wrapper'
+YARN_JAR_PATH = os.path.dirname(__file__) + '/../yarn/rabit-yarn.jar'
+
+assert os.path.exists(YARN_JAR_PATH), ("cannot find \"%s\", please run build.sh in the yarn folder" % YARN_JAR_PATH)
+hadoop_binary = 'hadoop'
+# code
+hadoop_home = os.getenv('HADOOP_HOME')
+
+if hadoop_home != None:
+    if hadoop_binary == None:
+        hadoop_binary = hadoop_home + '/bin/hadoop'
+        assert os.path.exists(hadoop_binary), "HADOOP_HOME does not contain the hadoop binary"
+
+parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs to Yarn.')
+parser.add_argument('-n', '--nworker', required=True, type=int,
+                    help = 'number of worker processes to be launched')
+parser.add_argument('-hip', '--host_ip', default='auto', type=str,
+                    help = 'host IP address if it cannot be automatically guessed; specify the IP of the submission machine')
+parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
+                    help = 'print more messages into the console')
+parser.add_argument('-ac', '--auto_file_cache', default=1, choices=[0, 1], type=int,
+                    help = 'whether to automatically cache the files in the command to the hadoop local file cache; this is on by default')
+parser.add_argument('-f', '--files', default = [], action='append',
+                    help = 'the cached file list in mapreduce;'\
+                    ' the submission script will automatically cache all the files which appear in the command.'\
+                    ' This will also cause all the file names in the command to be rewritten to the current path,'\
+                    ' for example `../../kmeans ../kmeans.conf` will be rewritten to `./kmeans kmeans.conf`'\
+                    ' because the two files are cached to the running folder.'\
+                    ' You may need this option to cache additional files.'\
+                    ' You can also use it to manually cache files when auto_file_cache is off.')
+parser.add_argument('--jobname', default='auto', help = 'customize jobname in tracker')
+parser.add_argument('--tempdir', default='/tmp', help = 'temporary directory in HDFS that can be used to store intermediate results')
+parser.add_argument('--vcores', default = 1, type=int,
+                    help = 'number of vcores to request in each mapper, set it if each rabit job is multi-threaded')
+parser.add_argument('-mem', '--memory_mb', default=1024, type=int,
+                    help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
+                    'if you are running multi-threaded rabit,'\
+                    'so that each node can occupy all the mapper slots in a machine for maximum performance')
+parser.add_argument('command', nargs='+',
+                    help = 'command for rabit program')
+
+if hadoop_binary == None:
+    parser.add_argument('-hb', '--hadoop_binary', required = True,
+                        help="path to hadoop binary file")
+else:
+    parser.add_argument('-hb', '--hadoop_binary', default = hadoop_binary,
+                        help="path to hadoop binary file")
+
+args = parser.parse_args()
+
+if args.jobname == 'auto':
+    args.jobname = ('Rabit[nworker=%d]:' % args.nworker) + args.command[0].split('/')[-1]
+
+# detect hadoop version
+(out, err) = subprocess.Popen('%s version' % args.hadoop_binary, shell = True, stdout=subprocess.PIPE).communicate()
+out = out.split('\n')[0].split()
+assert out[0] == 'Hadoop', 'cannot parse hadoop version string'
+hadoop_version = out[1].split('.')
+
+(classpath, err) = subprocess.Popen('%s classpath --glob' % args.hadoop_binary, shell = True, stdout=subprocess.PIPE).communicate()
+
+if int(hadoop_version[0]) < 2:
+    print 'Current Hadoop Version is %s, rabit_yarn will need Yarn(Hadoop 2.0)' % out[1]
+
+def submit_yarn(nworker, worker_args, worker_env):
+    fset = set([YARN_JAR_PATH])
+    if args.auto_file_cache != 0:
+        for i in range(len(args.command)):
+            f = args.command[i]
+            if os.path.exists(f):
+                fset.add(f)
+                if i == 0:
+                    args.command[i] = './' + args.command[i].split('/')[-1]
+                else:
+                    args.command[i] = args.command[i].split('/')[-1]
+    if args.command[0].endswith('.py'):
+        flst = [WRAPPER_PATH + '/rabit.py',
+                WRAPPER_PATH + '/librabit_wrapper.so',
+                WRAPPER_PATH + '/librabit_wrapper_mock.so']
+        for f in flst:
+            if os.path.exists(f):
+                fset.add(f)
+
+    cmd = 'java -cp `%s classpath`:%s org.apache.hadoop.yarn.rabit.Client ' % (args.hadoop_binary, YARN_JAR_PATH)
+    env = os.environ.copy()
+    for k, v in worker_env.items():
+        env[k] = str(v)
+    env['rabit_cpu_vcores'] = str(args.vcores)
+    env['rabit_memory_mb'] = str(args.memory_mb)
+    env['rabit_world_size'] = str(args.nworker)
+
+    if args.files != None:
+        for flst in args.files:
+            for f in flst.split('#'):
+                fset.add(f)
+    for f in fset:
+        cmd += ' -file %s' % f
+    cmd += ' -jobname %s ' % args.jobname
+    cmd += ' -tempdir %s ' % args.tempdir
+    cmd += (' '.join(args.command + worker_args))
+    print cmd
+    subprocess.check_call(cmd, shell = True, env = env)
+
+tracker.submit(args.nworker, [], fun_submit = submit_yarn, verbose = args.verbose, hostIP = args.host_ip)
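A typical invocation (hypothetical program and config names) is `./rabit_yarn.py -n 4 --vcores 2 -mem 2048 ./my_program my.conf`: the script caches the command files together with rabit-yarn.jar, starts the tracker locally, and passes the resource requests to org.apache.hadoop.yarn.rabit.Client through the `rabit_cpu_vcores`, `rabit_memory_mb`, and `rabit_world_size` environment variables.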