change formater

2015-03-08 12:29:07 -07:00
parent 2fbda812bc
commit 4f28e32ebd
5 changed files with 543 additions and 461 deletions
--- a/tracker/rabit_hadoop_streaming.py
+++ b/tracker/rabit_hadoop_streaming.py
@@ -34,13 +34,11 @@ if hadoop_binary == None or hadoop_streaming_jar == None:
                      ', or modify rabit_hadoop.py line 16', stacklevel = 2)

 parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs using Hadoop Streaming.'\
-                                     'This script support both Hadoop 1.0 and Yarn(MRv2), Yarn is recommended')
+                                     'It is Highly recommended to use rabit_yarn.py instead')
 parser.add_argument('-n', '--nworker', required=True, type=int,
                    help = 'number of worker proccess to be launched')
 parser.add_argument('-hip', '--host_ip', default='auto', type=str,
                    help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
-parser.add_argument('-nt', '--nthread', default = -1, type=int,
-                    help = 'number of thread in each mapper to be launched, set it if each rabit job is multi-threaded')
 parser.add_argument('-i', '--input', required=True,
                    help = 'input path in HDFS')
 parser.add_argument('-o', '--output', required=True,
@@ -61,6 +59,8 @@ parser.add_argument('--jobname', default='auto', help = 'customize jobname in tr
 parser.add_argument('--timeout', default=600000000, type=int,
                    help = 'timeout (in million seconds) of each mapper job, automatically set to a very long time,'\
                        'normally you do not need to set this ')
+parser.add_argument('--vcores', default = -1, type=int,
+                    help = 'number of vcpores to request in each mapper, set it if each rabit job is multi-threaded')
 parser.add_argument('-mem', '--memory_mb', default=-1, type=int,
                    help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
                        'if you are running multi-threading rabit,'\
@@ -95,7 +95,9 @@ use_yarn = int(hadoop_version[0]) >= 2
 print 'Current Hadoop Version is %s' % out[1]

 def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
-    fset = set()    
+    worker_envs['CLASSPATH'] = '`$HADOOP_HOME/bin/hadoop classpath --glob` '
+    worker_envs['LD_LIBRARY_PATH'] = '{LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server'
+    fset = set()
    if args.auto_file_cache:
        for i in range(len(args.command)):
            f = args.command[i]
@@ -132,12 +134,12 @@ def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
    cmd += ' -D%s=%s' % (kmap['jobname'], args.jobname)
    envstr = ','.join('%s=%s' % (k, str(v)) for k, v in worker_envs.items())
    cmd += ' -D%s=\"%s\"' % (kmap['env'], envstr)
-    if args.nthread != -1:
+    if args.vcores != -1:
        if kmap['nthread'] is None:
            warnings.warn('nthread can only be set in Yarn(Hadoop version greater than 2.0),'\
                              'it is recommended to use Yarn to submit rabit jobs', stacklevel = 2)
        else:
-            cmd += ' -D%s=%d' % (kmap['nthread'], args.nthread)
+            cmd += ' -D%s=%d' % (kmap['nthread'], args.vcores)
    cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
    if args.memory_mb != -1:
        cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
@@ -153,5 +155,5 @@ def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
    print cmd
    subprocess.check_call(cmd, shell = True)

-fun_submit = lambda nworker, worker_args: hadoop_streaming(nworker, worker_args, int(hadoop_version[0]) >= 2)
+fun_submit = lambda nworker, worker_args, worker_envs: hadoop_streaming(nworker, worker_args, worker_envs, int(hadoop_version[0]) >= 2)
 tracker.submit(args.nworker, [], fun_submit = fun_submit, verbose = args.verbose, hostIP = args.host_ip)