change formater

commit 4f28e32ebd
parent 2fbda812bc
tqchen 2015-03-08 12:29:07 -07:00
5 changed files with 543 additions and 461 deletions

View File

@@ -6,15 +6,15 @@ then
 fi
 # put the local training file to HDFS
-#hadoop fs -rm -r -f $2/data
+hadoop fs -rm -r -f $2/data
 hadoop fs -rm -r -f $2/mushroom.linear.model
-#hadoop fs -mkdir $2/data
-#hadoop fs -put ../data/agaricus.txt.train $2/data
+hadoop fs -mkdir $2/data
+hadoop fs -put ../data/agaricus.txt.train $2/data
 # submit to hadoop
 ../../tracker/rabit_hadoop_streaming.py -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
 # get the final model file
-#hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model
-#./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model
+hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model
+./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model
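The demo script takes the worker count as $1 and an HDFS working directory as $2, and forwards any remaining arguments to the rabit program via "${*:3}". A hypothetical invocation (the script's file name is an assumption here, and max_iter is an illustrative pass-through argument):

    # launch 2 workers against HDFS working dir /user/me/rabit
    ./run_hadoop.sh 2 /user/me/rabit max_iter=100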

View File

@@ -15,7 +15,7 @@ export CXX = g++
 export MPICXX = mpicxx
 # whether use HDFS support during compile
-USE_HDFS = 1
+USE_HDFS = 0
 # path to libjvm.so
 LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
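This hunk makes the demo build compile without HDFS support by default. Assuming the Makefile honors command-line overrides for these variables (a sketch, not confirmed by the diff), either configuration can be selected at build time:

    # build without HDFS support (the new default)
    make USE_HDFS=0
    # build with HDFS support; libhdfs must be able to find libjvm.so
    make USE_HDFS=1 LIBJVM=$JAVA_HOME/jre/lib/amd64/server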

View File tracker/rabit_hadoop_streaming.py

@@ -34,13 +34,11 @@ if hadoop_binary == None or hadoop_streaming_jar == None:
               ', or modify rabit_hadoop.py line 16', stacklevel = 2)
 parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs using Hadoop Streaming.'\
-                                 'This script support both Hadoop 1.0 and Yarn(MRv2), Yarn is recommended')
+                                 'It is Highly recommended to use rabit_yarn.py instead')
 parser.add_argument('-n', '--nworker', required=True, type=int,
                     help = 'number of worker proccess to be launched')
 parser.add_argument('-hip', '--host_ip', default='auto', type=str,
                     help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
-parser.add_argument('-nt', '--nthread', default = -1, type=int,
-                    help = 'number of thread in each mapper to be launched, set it if each rabit job is multi-threaded')
 parser.add_argument('-i', '--input', required=True,
                     help = 'input path in HDFS')
 parser.add_argument('-o', '--output', required=True,
@@ -61,6 +59,8 @@ parser.add_argument('--jobname', default='auto', help = 'customize jobname in tr
 parser.add_argument('--timeout', default=600000000, type=int,
                     help = 'timeout (in million seconds) of each mapper job, automatically set to a very long time,'\
                     'normally you do not need to set this ')
+parser.add_argument('--vcores', default = -1, type=int,
+                    help = 'number of vcpores to request in each mapper, set it if each rabit job is multi-threaded')
 parser.add_argument('-mem', '--memory_mb', default=-1, type=int,
                     help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
                     'if you are running multi-threading rabit,'\
@@ -95,7 +95,9 @@ use_yarn = int(hadoop_version[0]) >= 2
 print 'Current Hadoop Version is %s' % out[1]
 def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
+    worker_envs['CLASSPATH'] = '`$HADOOP_HOME/bin/hadoop classpath --glob` '
+    worker_envs['LD_LIBRARY_PATH'] = '{LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server'
     fset = set()
     if args.auto_file_cache:
         for i in range(len(args.command)):
             f = args.command[i]
@@ -132,12 +134,12 @@ def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
     cmd += ' -D%s=%s' % (kmap['jobname'], args.jobname)
     envstr = ','.join('%s=%s' % (k, str(v)) for k, v in worker_envs.items())
     cmd += ' -D%s=\"%s\"' % (kmap['env'], envstr)
-    if args.nthread != -1:
+    if args.vcores != -1:
         if kmap['nthread'] is None:
             warnings.warn('nthread can only be set in Yarn(Hadoop version greater than 2.0),'\
                           'it is recommended to use Yarn to submit rabit jobs', stacklevel = 2)
         else:
-            cmd += ' -D%s=%d' % (kmap['nthread'], args.nthread)
+            cmd += ' -D%s=%d' % (kmap['nthread'], args.vcores)
     cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
     if args.memory_mb != -1:
         cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
@@ -153,5 +155,5 @@ def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
     print cmd
     subprocess.check_call(cmd, shell = True)
-fun_submit = lambda nworker, worker_args: hadoop_streaming(nworker, worker_args, int(hadoop_version[0]) >= 2)
+fun_submit = lambda nworker, worker_args, worker_envs: hadoop_streaming(nworker, worker_args, worker_envs, int(hadoop_version[0]) >= 2)
 tracker.submit(args.nworker, [], fun_submit = fun_submit, verbose = args.verbose, hostIP = args.host_ip)
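After this change, per-mapper parallelism is requested through --vcores instead of the removed --nthread flag, and the workers' CLASSPATH and LD_LIBRARY_PATH are injected automatically. A hypothetical submission mirroring the demo script (the HDFS paths are placeholders):

    # 4 workers, 2 vcores each; trailing args are the program and its arguments
    ../../tracker/rabit_hadoop_streaming.py -n 4 --vcores 2 \
        -i hdfs:///user/me/data/agaricus.txt.train \
        -o hdfs:///user/me/model \
        ./linear.rabit stdin model_out=stdout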

View File ApplicationMaster.java

@@ -34,340 +34,413 @@ import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
(Formatter pass over the whole class; rendered below as the file reads after this commit. The one substantive change is the addition of $JAVA_HOME/jre/lib/amd64/server to the LD_LIBRARY_PATH handed to task containers.)
 * @author Tianqi Chen
 */
public class ApplicationMaster {
    // logger
    private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
    // configuration
    private Configuration conf = new YarnConfiguration();
    // number of cores allocated for each task
    private int numVCores = 1;
    // memory needed requested for the task
    private int numMemoryMB = 10;
    // priority of the app master
    private int appPriority = 0;
    // total number of tasks
    private int numTasks = 1;
    // maximum number of attempts to try in each task
    private int maxNumAttempt = 10;
    // command to launch
    private String command = "";
    // application tracker hostname
    private String appHostName = "";
    // tracker URL to do
    private String appTrackerUrl = "";
    // tracker port
    private int appTrackerPort = 0;
    // whether we start to abort the application, due to whatever fatal reasons
    private boolean startAbort = false;
    // record the aborting reason
    private String abortDiagnosis = "";
    // resource manager
    private AMRMClientAsync<ContainerRequest> rmClient = null;
    // node manager
    private NMClientAsync nmClient = null;
    // list of tasks that pending for resources to be allocated
    private final Queue<TaskRecord> pendingTasks = new java.util.LinkedList<TaskRecord>();
    // map containerId->task record of tasks that was running
    private final Map<ContainerId, TaskRecord> runningTasks = new java.util.HashMap<ContainerId, TaskRecord>();
    // collection of tasks
    private final Collection<TaskRecord> finishedTasks = new java.util.LinkedList<TaskRecord>();
    // collection of killed tasks
    private final Collection<TaskRecord> killedTasks = new java.util.LinkedList<TaskRecord>();

    public static void main(String[] args) throws Exception {
        new ApplicationMaster().run(args);
    }

    /**
     * get integer argument from environment variable
     *
     * @param name
     *            name of key
     * @param required
     *            whether this is required
     * @param defv
     *            default value
     * @return the requested result
     */
    private int getEnvInteger(String name, boolean required, int defv) {
        String value = System.getenv(name);
        if (value == null) {
            if (required)
                LOG.fatal("environment variable " + name + "not set");
        }
        return Integer.valueOf(value);
    }

    /**
     * initialize from arguments and command lines
     *
     * @param args
     */
    private void initArgs(String args[]) {
        for (String c : args) {
            this.command += c + " ";
        }
        numVCores = this.getEnvInteger("rabit_cpu_vcores", true, numVCores);
        numMemoryMB = this.getEnvInteger("rabit_memory_mb", true, numMemoryMB);
        maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false,
                maxNumAttempt);
    }

    /**
     * called to start the application
     */
    private void run(String args[]) throws Exception {
        this.initArgs(args);
        // list of tasks that waits to be submit
        java.util.Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
        // add waiting tasks
        for (int i = 0; i < this.numTasks; ++i) {
            tasks.add(new TaskRecord(i));
        }
        this.rmClient = AMRMClientAsync.createAMRMClientAsync(1000,
                new RMCallbackHandler());
        this.nmClient = NMClientAsync
                .createNMClientAsync(new NMCallbackHandler());
        this.rmClient.init(conf);
        this.rmClient.start();
        this.nmClient.init(conf);
        this.nmClient.start();
        RegisterApplicationMasterResponse response = this.rmClient
                .registerApplicationMaster(this.appHostName,
                        this.appTrackerPort, this.appTrackerUrl);
        Resource maxResource = response.getMaximumResourceCapability();
        if (maxResource.getMemory() < this.numMemoryMB) {
            LOG.warn("[Rabit] memory requested exceed bound "
                    + maxResource.getMemory());
            this.numMemoryMB = maxResource.getMemory();
        }
        if (maxResource.getVirtualCores() < this.numVCores) {
            LOG.warn("[Rabit] memory requested exceed bound "
                    + maxResource.getVirtualCores());
            this.numVCores = maxResource.getVirtualCores();
        }
        this.submitTasks(tasks);
        LOG.info("[Rabit] ApplicationMaster started");
        while (!this.doneAllJobs()) {
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
            }
        }
        assert (killedTasks.size() + finishedTasks.size() == numTasks);
        boolean success = finishedTasks.size() == numTasks;
        LOG.info("Application completed. Stopping running containers");
        nmClient.stop();
        String diagnostics = "Diagnostics." + ", num_tasks" + this.numTasks
                + ", finished=" + this.finishedTasks.size() + ", failed="
                + this.killedTasks.size() + "\n" + this.abortDiagnosis;
        rmClient.unregisterApplicationMaster(
                success ? FinalApplicationStatus.SUCCEEDED
                        : FinalApplicationStatus.FAILED, diagnostics,
                appTrackerUrl);
    }

    /**
     * check if the job finishes
     *
     * @return whether we finished all the jobs
     */
    private synchronized boolean doneAllJobs() {
        return pendingTasks.size() == 0 && runningTasks.size() == 0;
    }

    /**
     * submit tasks to request containers for the tasks
     *
     * @param tasks
     *            a collection of tasks we want to ask container for
     */
    private synchronized void submitTasks(Collection<TaskRecord> tasks) {
        for (TaskRecord r : tasks) {
            Resource resource = Records.newRecord(Resource.class);
            resource.setMemory(numMemoryMB);
            resource.setVirtualCores(numVCores);
            Priority priority = Records.newRecord(Priority.class);
            priority.setPriority(this.appPriority);
            r.containerRequest = new ContainerRequest(resource, null, null,
                    priority);
            rmClient.addContainerRequest(r.containerRequest);
            pendingTasks.add(r);
        }
    }

    /**
     * launch the task on container
     *
     * @param container
     *            container to run the task
     * @param task
     *            the task
     */
    private void launchTask(Container container, TaskRecord task) {
        task.container = container;
        task.containerRequest = null;
        ContainerLaunchContext ctx = Records
                .newRecord(ContainerLaunchContext.class);
        String cmd = command + " 1>"
                + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
                + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
                + "/stderr";
        ctx.setCommands(Collections.singletonList(cmd));
        // setup environment variables
        Map<String, String> env = new java.util.HashMap<String, String>();
        // setup class path
        StringBuilder cpath = new StringBuilder("${CLASSPATH}:./*");
        for (String c : conf.getStrings(
                YarnConfiguration.YARN_APPLICATION_CLASSPATH,
                YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
            cpath.append(':');
            cpath.append(c.trim());
        }
        env.put("CLASSPATH", cpath.toString());
        // setup LD_LIBRARY_PATH path for libhdfs
        env.put("LD_LIBRARY_PATH",
                "${LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server");
        // inherit all rabit variables
        for (Map.Entry<String, String> e : System.getenv().entrySet()) {
            if (e.getKey().startsWith("rabit_")) {
                env.put(e.getKey(), e.getValue());
            }
        }
        env.put("rabit_task_id", String.valueOf(task.taskId));
        env.put("rabit_num_trial", String.valueOf(task.attemptCounter));
        ctx.setEnvironment(env);
        synchronized (this) {
            assert (!this.runningTasks.containsKey(container.getId()));
            this.runningTasks.put(container.getId(), task);
            this.nmClient.startContainerAsync(container, ctx);
        }
    }

    /**
     * free the containers that have not yet been launched
     *
     * @param containers
     */
    private synchronized void freeUnusedContainers(
            Collection<Container> containers) {
    }

    /**
     * handle method for AMRMClientAsync.CallbackHandler container allocation
     *
     * @param containers
     */
    private synchronized void onContainersAllocated(List<Container> containers) {
        if (this.startAbort) {
            this.freeUnusedContainers(containers);
            return;
        }
        Collection<Container> freelist = new java.util.LinkedList<Container>();
        for (Container c : containers) {
            TaskRecord task;
            task = pendingTasks.poll();
            if (task == null) {
                freelist.add(c);
                continue;
            }
            this.launchTask(c, task);
        }
        this.freeUnusedContainers(freelist);
    }

    /**
     * start aborting the job
     *
     * @param msg
     *            the fatal message
     */
    private synchronized void abortJob(String msg) {
        if (!this.startAbort)
            this.abortDiagnosis = msg;
        this.startAbort = true;
        for (TaskRecord r : this.runningTasks.values()) {
            if (!r.abortRequested) {
                nmClient.stopContainerAsync(r.container.getId(),
                        r.container.getNodeId());
                r.abortRequested = true;
            }
        }
        this.killedTasks.addAll(this.pendingTasks);
        for (TaskRecord r : this.pendingTasks) {
            rmClient.removeContainerRequest(r.containerRequest);
        }
        this.pendingTasks.clear();
        LOG.info(msg);
    }

    /**
     * handle non fatal failures
     *
     * @param cid
     */
    private synchronized void handleFailure(Collection<ContainerId> failed) {
        Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
        for (ContainerId cid : failed) {
            TaskRecord r = runningTasks.remove(cid);
            if (r == null)
                continue;
            r.attemptCounter += 1;
            r.container = null;
            tasks.add(r);
            if (r.attemptCounter >= this.maxNumAttempt) {
                this.abortJob("[Rabit] Task " + r.taskId + " failed more than "
                        + r.attemptCounter + "times");
            }
        }
        if (this.startAbort) {
            this.killedTasks.addAll(tasks);
        } else {
            this.submitTasks(tasks);
        }
    }

    /**
     * handle method for AMRMClientAsync.CallbackHandler container allocation
     *
     * @param status
     *            list of status
     */
    private synchronized void onContainersCompleted(List<ContainerStatus> status) {
        Collection<ContainerId> failed = new java.util.LinkedList<ContainerId>();
        for (ContainerStatus s : status) {
            assert (s.getState().equals(ContainerState.COMPLETE));
            int exstatus = s.getExitStatus();
            TaskRecord r = runningTasks.get(s.getContainerId());
            if (r == null)
                continue;
            if (exstatus == ContainerExitStatus.SUCCESS) {
                finishedTasks.add(r);
                runningTasks.remove(s.getContainerId());
            } else {
                switch (exstatus) {
                case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
                    this.abortJob("[Rabit] Task "
                            + r.taskId
                            + " killed because of exceeding allocated physical memory");
                    break;
                case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
                    this.abortJob("[Rabit] Task "
                            + r.taskId
                            + " killed because of exceeding allocated virtual memory");
                    break;
                default:
                    LOG.info("[Rabit] Task " + r.taskId
                            + " exited with status " + exstatus);
                    failed.add(s.getContainerId());
                }
            }
        }
        this.handleFailure(failed);
    }

    /**
     * callback handler for resource manager
     */
    private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
        @Override
        public float getProgress() {
            return 1.0f - (float) (pendingTasks.size()) / numTasks;
        }

        @Override
        public void onContainersAllocated(List<Container> containers) {
            ApplicationMaster.this.onContainersAllocated(containers);
        }

        @Override
        public void onContainersCompleted(List<ContainerStatus> status) {
            ApplicationMaster.this.onContainersCompleted(status);
        }

        @Override
        public void onError(Throwable ex) {
            ApplicationMaster.this.abortJob("[Rabit] Resource manager Error "
                    + ex.toString());
        }

        @Override
        public void onNodesUpdated(List<NodeReport> nodereport) {
        }

        @Override
        public void onShutdownRequest() {
            ApplicationMaster.this
                    .abortJob("[Rabit] Get shutdown request, start to shutdown...");
        }
    }

    private class NMCallbackHandler implements NMClientAsync.CallbackHandler {
        @Override
        public void onContainerStarted(ContainerId cid,
                Map<String, ByteBuffer> services) {
            LOG.debug("onContainerStarted Invoked");
        }

        @Override
        public void onContainerStatusReceived(ContainerId cid,
                ContainerStatus status) {
            LOG.debug("onContainerStatusReceived Invoked");
        }

        @Override
        public void onContainerStopped(ContainerId cid) {
            LOG.debug("onContainerStopped Invoked");
        }

        @Override
        public void onGetContainerStatusError(ContainerId cid, Throwable ex) {
            LOG.debug("onGetContainerStatusError Invoked: " + ex.toString());
            ApplicationMaster.this
                    .handleFailure(Collections.singletonList(cid));
        }

        @Override
        public void onStartContainerError(ContainerId cid, Throwable ex) {
            LOG.debug("onStartContainerError Invoked: " + ex.toString());
            ApplicationMaster.this
                    .handleFailure(Collections.singletonList(cid));
        }

        @Override
        public void onStopContainerError(ContainerId cid, Throwable ex) {
            LOG.info("onStopContainerError Invoked: " + ex.toString());
        }
    }
}
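The application master takes its resource requests from rabit_* environment variables (see getEnvInteger and initArgs) rather than command-line flags, and forwards every rabit_* variable on to the task containers. A minimal sketch of the environment a launcher would set; the values are illustrative:

    export rabit_cpu_vcores=1     # read into numVCores (required)
    export rabit_memory_mb=1024   # read into numMemoryMB (required)
    export rabit_max_attempt=10   # optional cap on per-task retries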

View File Client.java

@@ -26,149 +26,156 @@ import org.apache.hadoop.yarn.util.ConverterUtils;
(Formatter-only hunk; rendered below as the file reads after this commit.)
import org.apache.hadoop.yarn.util.Records;

public class Client {
    // logger
    private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
    // configuration
    private YarnConfiguration conf = new YarnConfiguration();
    // cached maps
    private Map<String, Path> cacheFiles = new java.util.HashMap<String, Path>();
    // args to pass to application master
    private String appArgs;

    /**
     * get the local resource setting
     *
     * @param fmaps
     *            the file maps
     * @return the resource map
     * @throws IOException
     */
    private Map<String, LocalResource> getLocalResource() throws IOException {
        Map<String, LocalResource> rmap = new java.util.HashMap<String, LocalResource>();
        for (Map.Entry<String, Path> e : cacheFiles.entrySet()) {
            LocalResource r = Records.newRecord(LocalResource.class);
            Path path = e.getValue();
            FileStatus status = FileSystem.get(conf).getFileStatus(path);
            r.setResource(ConverterUtils.getYarnUrlFromPath(path));
            r.setSize(status.getLen());
            r.setTimestamp(status.getModificationTime());
            r.setType(LocalResourceType.FILE);
            r.setVisibility(LocalResourceVisibility.PUBLIC);
            rmap.put(e.getKey(), r);
        }
        return rmap;
    }

    /**
     * get the environment variables for container
     *
     * @return the env variable for child class
     */
    private Map<String, String> getEnvironment() {
        // Setup environment variables
        Map<String, String> env = new java.util.HashMap<String, String>();
        String cpath = "${CLASSPATH}:./*";
        for (String c : conf.getStrings(
                YarnConfiguration.YARN_APPLICATION_CLASSPATH,
                YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
            cpath += ':';
            cpath += c.trim();
        }
        env.put("CLASSPATH", cpath);
        for (Map.Entry<String, String> e : System.getenv().entrySet()) {
            if (e.getKey().startsWith("rabit_")) {
                env.put(e.getKey(), e.getValue());
            }
        }
        return env;
    }

    /**
     * initialize the settings
     *
     * @param args
     */
    private void initArgs(String[] args) {
        // directly pass all args except args0
        StringBuilder sargs = new StringBuilder("");
        for (int i = 0; i < args.length; ++i) {
            if (args[i] == "-file") {
                String[] arr = args[i + 1].split("#");
                Path path = new Path(arr[0]);
                if (arr.length == 1) {
                    cacheFiles.put(path.getName(), path);
                } else {
                    cacheFiles.put(arr[1], path);
                }
                ++i;
            } else {
                sargs.append(" ");
                sargs.append(args[i]);
            }
        }
        this.appArgs = sargs.toString();
    }

    private void run(String[] args) throws Exception {
        if (args.length == 0) {
            System.out.println("Usage: [options] [commands..]");
            System.out.println("options: [-file filename]");
            return;
        }
        this.initArgs(args);
        // Create yarnClient
        YarnConfiguration conf = new YarnConfiguration();
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(conf);
        yarnClient.start();

        // Create application via yarnClient
        YarnClientApplication app = yarnClient.createApplication();

        // Set up the container launch context for the application master
        ContainerLaunchContext amContainer = Records
                .newRecord(ContainerLaunchContext.class);
        amContainer.setCommands(Collections.singletonList("$JAVA_HOME/bin/java"
                + " -Xmx256M"
                + " org.apache.hadoop.yarn.rabit.ApplicationMaster"
                + this.appArgs + " 1>"
                + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
                + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
                + "/stderr"));

        // setup cache files
        amContainer.setLocalResources(this.getLocalResource());
        amContainer.setEnvironment(this.getEnvironment());

        // Set up resource type requirements for ApplicationMaster
        Resource capability = Records.newRecord(Resource.class);
        capability.setMemory(256);
        capability.setVirtualCores(1);

        ApplicationSubmissionContext appContext = app
                .getApplicationSubmissionContext();
        appContext.setApplicationName("Rabit-YARN");
        appContext.setAMContainerSpec(amContainer);
        appContext.setResource(capability);
        appContext.setQueue("default");

        // Submit application
        ApplicationId appId = appContext.getApplicationId();
        LOG.info("Submitting application " + appId);
        yarnClient.submitApplication(appContext);

        ApplicationReport appReport = yarnClient.getApplicationReport(appId);
        YarnApplicationState appState = appReport.getYarnApplicationState();
        while (appState != YarnApplicationState.FINISHED
                && appState != YarnApplicationState.KILLED
                && appState != YarnApplicationState.FAILED) {
            Thread.sleep(100);
            appReport = yarnClient.getApplicationReport(appId);
            appState = appReport.getYarnApplicationState();
        }

        System.out.println("Application " + appId + " finished with"
                + " state " + appState + " at " + appReport.getFinishTime());
        if (!appReport.getFinalApplicationStatus().equals(
                FinalApplicationStatus.SUCCEEDED)) {
            System.err.println(appReport.getDiagnostics());
        }
    }

    public static void main(String[] args) throws Exception {
        new Client().run(args);
    }
}
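Client consumes repeated -file options (an optional '#alias' suffix renames the file in the distributed cache, see initArgs) and passes everything else through as the command the application master will launch. A hypothetical launch via the hadoop jar runner; the jar name and the max_iter argument are assumptions:

    $HADOOP_HOME/bin/hadoop jar rabit-yarn.jar org.apache.hadoop.yarn.rabit.Client \
        -file ./linear.rabit \
        ./linear.rabit max_iter=100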