add auto cache

This commit is contained in:
tqchen 2014-12-29 06:50:17 -08:00
parent c731e82fae
commit b1340bf310

View File

@@ -33,6 +33,8 @@ parser.add_argument('-n', '--nslaves', required=True, type=int,
help = 'number of slaves proccess to be launched') help = 'number of slaves proccess to be launched')
parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int, parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
help = 'print more messages into the console') help = 'print more messages into the console')
parser.add_argument('-ac', '--auto_file_cache', default=1, choices=[0, 1], type=int,
help = 'whether automatically cache the files in the command list to hadoop localfile')
parser.add_argument('-i', '--input', required=True, parser.add_argument('-i', '--input', required=True,
help = 'input path in HDFS') help = 'input path in HDFS')
parser.add_argument('-o', '--output', required=True, parser.add_argument('-o', '--output', required=True,
@@ -61,8 +63,10 @@ def hadoop_streaming(nslaves, slave_args):
cmd += ' -input %s -output %s' % (args.input, args.output) cmd += ' -input %s -output %s' % (args.input, args.output)
cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(args.command + slave_args)) cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(args.command + slave_args))
fset = set() fset = set()
if os.path.exists(args.command[0]): if args.auto_file_cache:
fset.add(args.command[0]) for f in args.command:
if os.path.exists(f):
fset.add(f)
for flst in args.files: for flst in args.files:
for f in flst.split('#'): for f in flst.split('#'):
fset.add(f) fset.add(f)