change formater

This commit is contained in:
tqchen
2015-03-08 12:29:07 -07:00
parent 2fbda812bc
commit 4f28e32ebd
5 changed files with 543 additions and 461 deletions

View File

@@ -34,13 +34,11 @@ if hadoop_binary == None or hadoop_streaming_jar == None:
', or modify rabit_hadoop.py line 16', stacklevel = 2)
parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs using Hadoop Streaming.'\
'This script support both Hadoop 1.0 and Yarn(MRv2), Yarn is recommended')
'It is Highly recommended to use rabit_yarn.py instead')
parser.add_argument('-n', '--nworker', required=True, type=int,
help = 'number of worker proccess to be launched')
parser.add_argument('-hip', '--host_ip', default='auto', type=str,
help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
parser.add_argument('-nt', '--nthread', default = -1, type=int,
help = 'number of thread in each mapper to be launched, set it if each rabit job is multi-threaded')
parser.add_argument('-i', '--input', required=True,
help = 'input path in HDFS')
parser.add_argument('-o', '--output', required=True,
@@ -61,6 +59,8 @@ parser.add_argument('--jobname', default='auto', help = 'customize jobname in tr
parser.add_argument('--timeout', default=600000000, type=int,
help = 'timeout (in million seconds) of each mapper job, automatically set to a very long time,'\
'normally you do not need to set this ')
parser.add_argument('--vcores', default = -1, type=int,
help = 'number of vcpores to request in each mapper, set it if each rabit job is multi-threaded')
parser.add_argument('-mem', '--memory_mb', default=-1, type=int,
help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
'if you are running multi-threading rabit,'\
@@ -95,7 +95,9 @@ use_yarn = int(hadoop_version[0]) >= 2
print 'Current Hadoop Version is %s' % out[1]
def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
fset = set()
worker_envs['CLASSPATH'] = '`$HADOOP_HOME/bin/hadoop classpath --glob` '
worker_envs['LD_LIBRARY_PATH'] = '{LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server'
fset = set()
if args.auto_file_cache:
for i in range(len(args.command)):
f = args.command[i]
@@ -132,12 +134,12 @@ def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
cmd += ' -D%s=%s' % (kmap['jobname'], args.jobname)
envstr = ','.join('%s=%s' % (k, str(v)) for k, v in worker_envs.items())
cmd += ' -D%s=\"%s\"' % (kmap['env'], envstr)
if args.nthread != -1:
if args.vcores != -1:
if kmap['nthread'] is None:
warnings.warn('nthread can only be set in Yarn(Hadoop version greater than 2.0),'\
'it is recommended to use Yarn to submit rabit jobs', stacklevel = 2)
else:
cmd += ' -D%s=%d' % (kmap['nthread'], args.nthread)
cmd += ' -D%s=%d' % (kmap['nthread'], args.vcores)
cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
if args.memory_mb != -1:
cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
@@ -153,5 +155,5 @@ def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
print cmd
subprocess.check_call(cmd, shell = True)
fun_submit = lambda nworker, worker_args: hadoop_streaming(nworker, worker_args, int(hadoop_version[0]) >= 2)
fun_submit = lambda nworker, worker_args, worker_envs: hadoop_streaming(nworker, worker_args, worker_envs, int(hadoop_version[0]) >= 2)
tracker.submit(args.nworker, [], fun_submit = fun_submit, verbose = args.verbose, hostIP = args.host_ip)