change formater
This commit is contained in:
parent
2fbda812bc
commit
4f28e32ebd
@ -6,15 +6,15 @@ then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# put the local training file to HDFS
|
# put the local training file to HDFS
|
||||||
#hadoop fs -rm -r -f $2/data
|
hadoop fs -rm -r -f $2/data
|
||||||
hadoop fs -rm -r -f $2/mushroom.linear.model
|
hadoop fs -rm -r -f $2/mushroom.linear.model
|
||||||
#hadoop fs -mkdir $2/data
|
hadoop fs -mkdir $2/data
|
||||||
#hadoop fs -put ../data/agaricus.txt.train $2/data
|
hadoop fs -put ../data/agaricus.txt.train $2/data
|
||||||
|
|
||||||
# submit to hadoop
|
# submit to hadoop
|
||||||
../../tracker/rabit_hadoop_streaming.py -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
|
../../tracker/rabit_hadoop_streaming.py -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
|
||||||
|
|
||||||
# get the final model file
|
# get the final model file
|
||||||
#hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model
|
hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model
|
||||||
|
|
||||||
#./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model
|
./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model
|
||||||
@ -15,7 +15,7 @@ export CXX = g++
|
|||||||
export MPICXX = mpicxx
|
export MPICXX = mpicxx
|
||||||
|
|
||||||
# whether use HDFS support during compile
|
# whether use HDFS support during compile
|
||||||
USE_HDFS = 1
|
USE_HDFS = 0
|
||||||
|
|
||||||
# path to libjvm.so
|
# path to libjvm.so
|
||||||
LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
|
LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
|
||||||
|
|||||||
@ -34,13 +34,11 @@ if hadoop_binary == None or hadoop_streaming_jar == None:
|
|||||||
', or modify rabit_hadoop.py line 16', stacklevel = 2)
|
', or modify rabit_hadoop.py line 16', stacklevel = 2)
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs using Hadoop Streaming.'\
|
parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs using Hadoop Streaming.'\
|
||||||
'This script support both Hadoop 1.0 and Yarn(MRv2), Yarn is recommended')
|
'It is Highly recommended to use rabit_yarn.py instead')
|
||||||
parser.add_argument('-n', '--nworker', required=True, type=int,
|
parser.add_argument('-n', '--nworker', required=True, type=int,
|
||||||
help = 'number of worker proccess to be launched')
|
help = 'number of worker proccess to be launched')
|
||||||
parser.add_argument('-hip', '--host_ip', default='auto', type=str,
|
parser.add_argument('-hip', '--host_ip', default='auto', type=str,
|
||||||
help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
|
help = 'host IP address if cannot be automatically guessed, specify the IP of submission machine')
|
||||||
parser.add_argument('-nt', '--nthread', default = -1, type=int,
|
|
||||||
help = 'number of thread in each mapper to be launched, set it if each rabit job is multi-threaded')
|
|
||||||
parser.add_argument('-i', '--input', required=True,
|
parser.add_argument('-i', '--input', required=True,
|
||||||
help = 'input path in HDFS')
|
help = 'input path in HDFS')
|
||||||
parser.add_argument('-o', '--output', required=True,
|
parser.add_argument('-o', '--output', required=True,
|
||||||
@ -61,6 +59,8 @@ parser.add_argument('--jobname', default='auto', help = 'customize jobname in tr
|
|||||||
parser.add_argument('--timeout', default=600000000, type=int,
|
parser.add_argument('--timeout', default=600000000, type=int,
|
||||||
help = 'timeout (in million seconds) of each mapper job, automatically set to a very long time,'\
|
help = 'timeout (in million seconds) of each mapper job, automatically set to a very long time,'\
|
||||||
'normally you do not need to set this ')
|
'normally you do not need to set this ')
|
||||||
|
parser.add_argument('--vcores', default = -1, type=int,
|
||||||
|
help = 'number of vcpores to request in each mapper, set it if each rabit job is multi-threaded')
|
||||||
parser.add_argument('-mem', '--memory_mb', default=-1, type=int,
|
parser.add_argument('-mem', '--memory_mb', default=-1, type=int,
|
||||||
help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
|
help = 'maximum memory used by the process. Guide: set it large (near mapred.cluster.max.map.memory.mb)'\
|
||||||
'if you are running multi-threading rabit,'\
|
'if you are running multi-threading rabit,'\
|
||||||
@ -95,7 +95,9 @@ use_yarn = int(hadoop_version[0]) >= 2
|
|||||||
print 'Current Hadoop Version is %s' % out[1]
|
print 'Current Hadoop Version is %s' % out[1]
|
||||||
|
|
||||||
def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
|
def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
|
||||||
fset = set()
|
worker_envs['CLASSPATH'] = '`$HADOOP_HOME/bin/hadoop classpath --glob` '
|
||||||
|
worker_envs['LD_LIBRARY_PATH'] = '{LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server'
|
||||||
|
fset = set()
|
||||||
if args.auto_file_cache:
|
if args.auto_file_cache:
|
||||||
for i in range(len(args.command)):
|
for i in range(len(args.command)):
|
||||||
f = args.command[i]
|
f = args.command[i]
|
||||||
@ -132,12 +134,12 @@ def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
|
|||||||
cmd += ' -D%s=%s' % (kmap['jobname'], args.jobname)
|
cmd += ' -D%s=%s' % (kmap['jobname'], args.jobname)
|
||||||
envstr = ','.join('%s=%s' % (k, str(v)) for k, v in worker_envs.items())
|
envstr = ','.join('%s=%s' % (k, str(v)) for k, v in worker_envs.items())
|
||||||
cmd += ' -D%s=\"%s\"' % (kmap['env'], envstr)
|
cmd += ' -D%s=\"%s\"' % (kmap['env'], envstr)
|
||||||
if args.nthread != -1:
|
if args.vcores != -1:
|
||||||
if kmap['nthread'] is None:
|
if kmap['nthread'] is None:
|
||||||
warnings.warn('nthread can only be set in Yarn(Hadoop version greater than 2.0),'\
|
warnings.warn('nthread can only be set in Yarn(Hadoop version greater than 2.0),'\
|
||||||
'it is recommended to use Yarn to submit rabit jobs', stacklevel = 2)
|
'it is recommended to use Yarn to submit rabit jobs', stacklevel = 2)
|
||||||
else:
|
else:
|
||||||
cmd += ' -D%s=%d' % (kmap['nthread'], args.nthread)
|
cmd += ' -D%s=%d' % (kmap['nthread'], args.vcores)
|
||||||
cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
|
cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
|
||||||
if args.memory_mb != -1:
|
if args.memory_mb != -1:
|
||||||
cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
|
cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
|
||||||
@ -153,5 +155,5 @@ def hadoop_streaming(nworker, worker_args, worker_envs, use_yarn):
|
|||||||
print cmd
|
print cmd
|
||||||
subprocess.check_call(cmd, shell = True)
|
subprocess.check_call(cmd, shell = True)
|
||||||
|
|
||||||
fun_submit = lambda nworker, worker_args: hadoop_streaming(nworker, worker_args, int(hadoop_version[0]) >= 2)
|
fun_submit = lambda nworker, worker_args, worker_envs: hadoop_streaming(nworker, worker_args, worker_envs, int(hadoop_version[0]) >= 2)
|
||||||
tracker.submit(args.nworker, [], fun_submit = fun_submit, verbose = args.verbose, hostIP = args.host_ip)
|
tracker.submit(args.nworker, [], fun_submit = fun_submit, verbose = args.verbose, hostIP = args.host_ip)
|
||||||
@ -34,340 +34,413 @@ import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
|
|||||||
* @author Tianqi Chen
|
* @author Tianqi Chen
|
||||||
*/
|
*/
|
||||||
public class ApplicationMaster {
|
public class ApplicationMaster {
|
||||||
// logger
|
// logger
|
||||||
private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
|
private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
|
||||||
// configuration
|
// configuration
|
||||||
private Configuration conf = new YarnConfiguration();
|
private Configuration conf = new YarnConfiguration();
|
||||||
|
|
||||||
// number of cores allocated for each task
|
// number of cores allocated for each task
|
||||||
private int numVCores = 1;
|
private int numVCores = 1;
|
||||||
// memory needed requested for the task
|
// memory needed requested for the task
|
||||||
private int numMemoryMB = 10;
|
private int numMemoryMB = 10;
|
||||||
// priority of the app master
|
// priority of the app master
|
||||||
private int appPriority = 0;
|
private int appPriority = 0;
|
||||||
// total number of tasks
|
// total number of tasks
|
||||||
private int numTasks = 1;
|
private int numTasks = 1;
|
||||||
// maximum number of attempts to try in each task
|
// maximum number of attempts to try in each task
|
||||||
private int maxNumAttempt = 10;
|
private int maxNumAttempt = 10;
|
||||||
// command to launch
|
// command to launch
|
||||||
private String command = "";
|
private String command = "";
|
||||||
|
|
||||||
// application tracker hostname
|
// application tracker hostname
|
||||||
private String appHostName = "";
|
private String appHostName = "";
|
||||||
// tracker URL to do
|
// tracker URL to do
|
||||||
private String appTrackerUrl = "";
|
private String appTrackerUrl = "";
|
||||||
// tracker port
|
// tracker port
|
||||||
private int appTrackerPort = 0;
|
private int appTrackerPort = 0;
|
||||||
|
|
||||||
|
// whether we start to abort the application, due to whatever fatal reasons
|
||||||
|
private boolean startAbort = false;
|
||||||
|
// record the aborting reason
|
||||||
|
private String abortDiagnosis = "";
|
||||||
|
// resource manager
|
||||||
|
private AMRMClientAsync<ContainerRequest> rmClient = null;
|
||||||
|
// node manager
|
||||||
|
private NMClientAsync nmClient = null;
|
||||||
|
|
||||||
|
// list of tasks that pending for resources to be allocated
|
||||||
|
private final Queue<TaskRecord> pendingTasks = new java.util.LinkedList<TaskRecord>();
|
||||||
|
// map containerId->task record of tasks that was running
|
||||||
|
private final Map<ContainerId, TaskRecord> runningTasks = new java.util.HashMap<ContainerId, TaskRecord>();
|
||||||
|
// collection of tasks
|
||||||
|
private final Collection<TaskRecord> finishedTasks = new java.util.LinkedList<TaskRecord>();
|
||||||
|
// collection of killed tasks
|
||||||
|
private final Collection<TaskRecord> killedTasks = new java.util.LinkedList<TaskRecord>();
|
||||||
|
|
||||||
// whether we start to abort the application, due to whatever fatal reasons
|
|
||||||
private boolean startAbort = false;
|
|
||||||
// record the aborting reason
|
|
||||||
private String abortDiagnosis = "";
|
|
||||||
// resource manager
|
|
||||||
private AMRMClientAsync<ContainerRequest> rmClient = null;
|
|
||||||
// node manager
|
|
||||||
private NMClientAsync nmClient = null;
|
|
||||||
|
|
||||||
// list of tasks that pending for resources to be allocated
|
|
||||||
private final Queue<TaskRecord> pendingTasks = new java.util.LinkedList<TaskRecord>();
|
|
||||||
// map containerId->task record of tasks that was running
|
|
||||||
private final Map<ContainerId, TaskRecord> runningTasks = new java.util.HashMap<ContainerId, TaskRecord>();
|
|
||||||
// collection of tasks
|
|
||||||
private final Collection<TaskRecord> finishedTasks = new java.util.LinkedList<TaskRecord>();
|
|
||||||
// collection of killed tasks
|
|
||||||
private final Collection<TaskRecord> killedTasks = new java.util.LinkedList<TaskRecord>();
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
new ApplicationMaster().run(args);
|
new ApplicationMaster().run(args);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get integer argument from environment variable
|
* get integer argument from environment variable
|
||||||
* @param name name of key
|
*
|
||||||
* @param required whether this is required
|
* @param name
|
||||||
* @param defv default value
|
* name of key
|
||||||
|
* @param required
|
||||||
|
* whether this is required
|
||||||
|
* @param defv
|
||||||
|
* default value
|
||||||
* @return the requested result
|
* @return the requested result
|
||||||
*/
|
*/
|
||||||
private int getEnvInteger(String name, boolean required, int defv) {
|
private int getEnvInteger(String name, boolean required, int defv) {
|
||||||
String value = System.getenv(name);
|
String value = System.getenv(name);
|
||||||
if (value == null) {
|
if (value == null) {
|
||||||
if (required) LOG.fatal("environment variable " + name + "not set");
|
if (required)
|
||||||
}
|
LOG.fatal("environment variable " + name + "not set");
|
||||||
return Integer.valueOf(value);
|
}
|
||||||
|
return Integer.valueOf(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* initialize from arguments and command lines
|
* initialize from arguments and command lines
|
||||||
|
*
|
||||||
* @param args
|
* @param args
|
||||||
*/
|
*/
|
||||||
private void initArgs(String args[]) {
|
private void initArgs(String args[]) {
|
||||||
for (String c : args) {
|
for (String c : args) {
|
||||||
this.command += c + " ";
|
this.command += c + " ";
|
||||||
}
|
}
|
||||||
numVCores = this.getEnvInteger("rabit_cpu_vcores", true, numVCores);
|
numVCores = this.getEnvInteger("rabit_cpu_vcores", true, numVCores);
|
||||||
numMemoryMB = this.getEnvInteger("rabit_memory_mb", true, numMemoryMB);
|
numMemoryMB = this.getEnvInteger("rabit_memory_mb", true, numMemoryMB);
|
||||||
maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false, maxNumAttempt);
|
maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false,
|
||||||
}
|
maxNumAttempt);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* called to start the application
|
* called to start the application
|
||||||
*/
|
*/
|
||||||
private void run(String args[]) throws Exception {
|
private void run(String args[]) throws Exception {
|
||||||
this.initArgs(args);
|
this.initArgs(args);
|
||||||
// list of tasks that waits to be submit
|
// list of tasks that waits to be submit
|
||||||
java.util.Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
|
java.util.Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
|
||||||
// add waiting tasks
|
// add waiting tasks
|
||||||
for (int i = 0; i < this.numTasks; ++i) {
|
for (int i = 0; i < this.numTasks; ++i) {
|
||||||
tasks.add(new TaskRecord(i));
|
tasks.add(new TaskRecord(i));
|
||||||
}
|
}
|
||||||
this.rmClient = AMRMClientAsync.createAMRMClientAsync(1000, new RMCallbackHandler());
|
this.rmClient = AMRMClientAsync.createAMRMClientAsync(1000,
|
||||||
this.nmClient = NMClientAsync.createNMClientAsync(new NMCallbackHandler());
|
new RMCallbackHandler());
|
||||||
this.rmClient.init(conf);
|
this.nmClient = NMClientAsync
|
||||||
this.rmClient.start();
|
.createNMClientAsync(new NMCallbackHandler());
|
||||||
this.nmClient.init(conf);
|
this.rmClient.init(conf);
|
||||||
this.nmClient.start();
|
this.rmClient.start();
|
||||||
RegisterApplicationMasterResponse response =
|
this.nmClient.init(conf);
|
||||||
this.rmClient.registerApplicationMaster(this.appHostName, this.appTrackerPort, this.appTrackerUrl);
|
this.nmClient.start();
|
||||||
Resource maxResource = response.getMaximumResourceCapability();
|
RegisterApplicationMasterResponse response = this.rmClient
|
||||||
if (maxResource.getMemory() < this.numMemoryMB) {
|
.registerApplicationMaster(this.appHostName,
|
||||||
LOG.warn("[Rabit] memory requested exceed bound " + maxResource.getMemory());
|
this.appTrackerPort, this.appTrackerUrl);
|
||||||
this.numMemoryMB = maxResource.getMemory();
|
Resource maxResource = response.getMaximumResourceCapability();
|
||||||
}
|
if (maxResource.getMemory() < this.numMemoryMB) {
|
||||||
if (maxResource.getVirtualCores() < this.numVCores) {
|
LOG.warn("[Rabit] memory requested exceed bound "
|
||||||
LOG.warn("[Rabit] memory requested exceed bound " + maxResource.getVirtualCores());
|
+ maxResource.getMemory());
|
||||||
this.numVCores = maxResource.getVirtualCores();
|
this.numMemoryMB = maxResource.getMemory();
|
||||||
}
|
}
|
||||||
this.submitTasks(tasks);
|
if (maxResource.getVirtualCores() < this.numVCores) {
|
||||||
LOG.info("[Rabit] ApplicationMaster started");
|
LOG.warn("[Rabit] memory requested exceed bound "
|
||||||
while (!this.doneAllJobs()) {
|
+ maxResource.getVirtualCores());
|
||||||
try {
|
this.numVCores = maxResource.getVirtualCores();
|
||||||
Thread.sleep(100);;
|
}
|
||||||
} catch (InterruptedException e) {
|
this.submitTasks(tasks);
|
||||||
}
|
LOG.info("[Rabit] ApplicationMaster started");
|
||||||
}
|
while (!this.doneAllJobs()) {
|
||||||
assert (killedTasks.size() + finishedTasks.size() == numTasks);
|
try {
|
||||||
boolean success = finishedTasks.size() == numTasks;
|
Thread.sleep(100);
|
||||||
LOG.info("Application completed. Stopping running containers");
|
;
|
||||||
nmClient.stop();
|
} catch (InterruptedException e) {
|
||||||
String diagnostics = "Diagnostics." + ", num_tasks" + this.numTasks
|
}
|
||||||
+ ", finished=" + this.finishedTasks.size() + ", failed="
|
}
|
||||||
+ this.killedTasks.size() + "\n" + this.abortDiagnosis;
|
assert (killedTasks.size() + finishedTasks.size() == numTasks);
|
||||||
rmClient.unregisterApplicationMaster
|
boolean success = finishedTasks.size() == numTasks;
|
||||||
(success ? FinalApplicationStatus.SUCCEEDED : FinalApplicationStatus.FAILED,
|
LOG.info("Application completed. Stopping running containers");
|
||||||
diagnostics, appTrackerUrl);
|
nmClient.stop();
|
||||||
}
|
String diagnostics = "Diagnostics." + ", num_tasks" + this.numTasks
|
||||||
/**
|
+ ", finished=" + this.finishedTasks.size() + ", failed="
|
||||||
* check if the job finishes
|
+ this.killedTasks.size() + "\n" + this.abortDiagnosis;
|
||||||
* @return whether we finished all the jobs
|
rmClient.unregisterApplicationMaster(
|
||||||
*/
|
success ? FinalApplicationStatus.SUCCEEDED
|
||||||
private synchronized boolean doneAllJobs() {
|
: FinalApplicationStatus.FAILED, diagnostics,
|
||||||
return pendingTasks.size() == 0 && runningTasks.size() == 0;
|
appTrackerUrl);
|
||||||
}
|
}
|
||||||
/**
|
|
||||||
* submit tasks to request containers for the tasks
|
/**
|
||||||
* @param tasks a collection of tasks we want to ask container for
|
* check if the job finishes
|
||||||
*/
|
*
|
||||||
private synchronized void submitTasks(Collection<TaskRecord> tasks) {
|
* @return whether we finished all the jobs
|
||||||
for (TaskRecord r : tasks) {
|
*/
|
||||||
Resource resource = Records.newRecord(Resource.class);
|
private synchronized boolean doneAllJobs() {
|
||||||
resource.setMemory(numMemoryMB);
|
return pendingTasks.size() == 0 && runningTasks.size() == 0;
|
||||||
resource.setVirtualCores(numVCores);
|
}
|
||||||
Priority priority = Records.newRecord(Priority.class);
|
|
||||||
priority.setPriority(this.appPriority);
|
/**
|
||||||
r.containerRequest = new ContainerRequest(resource, null, null, priority);
|
* submit tasks to request containers for the tasks
|
||||||
rmClient.addContainerRequest(r.containerRequest);
|
*
|
||||||
pendingTasks.add(r);
|
* @param tasks
|
||||||
}
|
* a collection of tasks we want to ask container for
|
||||||
}
|
*/
|
||||||
/**
|
private synchronized void submitTasks(Collection<TaskRecord> tasks) {
|
||||||
* launch the task on container
|
for (TaskRecord r : tasks) {
|
||||||
* @param container container to run the task
|
Resource resource = Records.newRecord(Resource.class);
|
||||||
* @param task the task
|
resource.setMemory(numMemoryMB);
|
||||||
*/
|
resource.setVirtualCores(numVCores);
|
||||||
private void launchTask(Container container, TaskRecord task) {
|
Priority priority = Records.newRecord(Priority.class);
|
||||||
task.container = container;
|
priority.setPriority(this.appPriority);
|
||||||
task.containerRequest = null;
|
r.containerRequest = new ContainerRequest(resource, null, null,
|
||||||
ContainerLaunchContext ctx =
|
priority);
|
||||||
Records.newRecord(ContainerLaunchContext.class);
|
rmClient.addContainerRequest(r.containerRequest);
|
||||||
String cmd = command + " 1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
|
pendingTasks.add(r);
|
||||||
+ " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr";
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* launch the task on container
|
||||||
|
*
|
||||||
|
* @param container
|
||||||
|
* container to run the task
|
||||||
|
* @param task
|
||||||
|
* the task
|
||||||
|
*/
|
||||||
|
private void launchTask(Container container, TaskRecord task) {
|
||||||
|
task.container = container;
|
||||||
|
task.containerRequest = null;
|
||||||
|
ContainerLaunchContext ctx = Records
|
||||||
|
.newRecord(ContainerLaunchContext.class);
|
||||||
|
String cmd = command + " 1>"
|
||||||
|
+ ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
|
||||||
|
+ " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
|
||||||
|
+ "/stderr";
|
||||||
ctx.setCommands(Collections.singletonList(cmd));
|
ctx.setCommands(Collections.singletonList(cmd));
|
||||||
// setup environment variables
|
// setup environment variables
|
||||||
Map<String, String> env = new java.util.HashMap<String, String>();
|
Map<String, String> env = new java.util.HashMap<String, String>();
|
||||||
// setup class path
|
// setup class path
|
||||||
StringBuilder cpath = new StringBuilder("${CLASSPATH}:./*");
|
StringBuilder cpath = new StringBuilder("${CLASSPATH}:./*");
|
||||||
for (String c : conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH,
|
for (String c : conf.getStrings(
|
||||||
YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
|
YarnConfiguration.YARN_APPLICATION_CLASSPATH,
|
||||||
cpath.append(':');
|
YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
|
||||||
cpath.append(c.trim());
|
cpath.append(':');
|
||||||
}
|
cpath.append(c.trim());
|
||||||
env.put("CLASSPATH", cpath.toString());
|
}
|
||||||
// setup LD_LIBARY_pATH path for libhdfs
|
env.put("CLASSPATH", cpath.toString());
|
||||||
env.put("LD_LIBRARY_PATH", "${LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native");
|
// setup LD_LIBARY_pATH path for libhdfs
|
||||||
|
env.put("LD_LIBRARY_PATH",
|
||||||
|
"${LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server");
|
||||||
// inherit all rabit variables
|
// inherit all rabit variables
|
||||||
for (Map.Entry<String, String> e : System.getenv().entrySet()) {
|
for (Map.Entry<String, String> e : System.getenv().entrySet()) {
|
||||||
if (e.getKey().startsWith("rabit_")) {
|
if (e.getKey().startsWith("rabit_")) {
|
||||||
env.put(e.getKey(), e.getValue());
|
env.put(e.getKey(), e.getValue());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
env.put("rabit_task_id", String.valueOf(task.taskId));
|
env.put("rabit_task_id", String.valueOf(task.taskId));
|
||||||
env.put("rabit_num_trial", String.valueOf(task.attemptCounter));
|
env.put("rabit_num_trial", String.valueOf(task.attemptCounter));
|
||||||
ctx.setEnvironment(env);
|
ctx.setEnvironment(env);
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
assert (!this.runningTasks.containsKey(container.getId()));
|
assert (!this.runningTasks.containsKey(container.getId()));
|
||||||
this.runningTasks.put(container.getId(), task);
|
this.runningTasks.put(container.getId(), task);
|
||||||
this.nmClient.startContainerAsync(container, ctx);
|
this.nmClient.startContainerAsync(container, ctx);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/**
|
|
||||||
* free the containers that have not yet been launched
|
/**
|
||||||
* @param containers
|
* free the containers that have not yet been launched
|
||||||
*/
|
*
|
||||||
private synchronized void freeUnusedContainers(Collection<Container> containers) {
|
* @param containers
|
||||||
}
|
*/
|
||||||
/**
|
private synchronized void freeUnusedContainers(
|
||||||
* handle method for AMRMClientAsync.CallbackHandler container allocation
|
Collection<Container> containers) {
|
||||||
* @param containers
|
}
|
||||||
*/
|
|
||||||
private synchronized void onContainersAllocated(List<Container> containers) {
|
/**
|
||||||
if (this.startAbort) {
|
* handle method for AMRMClientAsync.CallbackHandler container allocation
|
||||||
this.freeUnusedContainers(containers);
|
*
|
||||||
return;
|
* @param containers
|
||||||
}
|
*/
|
||||||
Collection<Container> freelist = new java.util.LinkedList<Container>();
|
private synchronized void onContainersAllocated(List<Container> containers) {
|
||||||
for (Container c : containers) {
|
if (this.startAbort) {
|
||||||
TaskRecord task;
|
this.freeUnusedContainers(containers);
|
||||||
task = pendingTasks.poll();
|
return;
|
||||||
if (task == null) {
|
}
|
||||||
freelist.add(c); continue;
|
Collection<Container> freelist = new java.util.LinkedList<Container>();
|
||||||
}
|
for (Container c : containers) {
|
||||||
this.launchTask(c, task);
|
TaskRecord task;
|
||||||
}
|
task = pendingTasks.poll();
|
||||||
this.freeUnusedContainers(freelist);
|
if (task == null) {
|
||||||
}
|
freelist.add(c);
|
||||||
/**
|
continue;
|
||||||
* start aborting the job
|
}
|
||||||
* @param msg the fatal message
|
this.launchTask(c, task);
|
||||||
*/
|
}
|
||||||
private synchronized void abortJob(String msg) {
|
this.freeUnusedContainers(freelist);
|
||||||
if (!this.startAbort) this.abortDiagnosis = msg;
|
}
|
||||||
this.startAbort = true;
|
|
||||||
for (TaskRecord r : this.runningTasks.values()) {
|
/**
|
||||||
if (!r.abortRequested) {
|
* start aborting the job
|
||||||
nmClient.stopContainerAsync(r.container.getId(), r.container.getNodeId());
|
*
|
||||||
r.abortRequested = true;
|
* @param msg
|
||||||
}
|
* the fatal message
|
||||||
}
|
*/
|
||||||
this.killedTasks.addAll(this.pendingTasks);
|
private synchronized void abortJob(String msg) {
|
||||||
for (TaskRecord r : this.pendingTasks) {
|
if (!this.startAbort)
|
||||||
rmClient.removeContainerRequest(r.containerRequest);
|
this.abortDiagnosis = msg;
|
||||||
}
|
this.startAbort = true;
|
||||||
this.pendingTasks.clear();
|
for (TaskRecord r : this.runningTasks.values()) {
|
||||||
LOG.info(msg);
|
if (!r.abortRequested) {
|
||||||
}
|
nmClient.stopContainerAsync(r.container.getId(),
|
||||||
/**
|
r.container.getNodeId());
|
||||||
* handle non fatal failures
|
r.abortRequested = true;
|
||||||
* @param cid
|
}
|
||||||
*/
|
}
|
||||||
private synchronized void handleFailure(Collection<ContainerId> failed) {
|
this.killedTasks.addAll(this.pendingTasks);
|
||||||
Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
|
for (TaskRecord r : this.pendingTasks) {
|
||||||
for (ContainerId cid : failed) {
|
rmClient.removeContainerRequest(r.containerRequest);
|
||||||
TaskRecord r = runningTasks.remove(cid);
|
}
|
||||||
if (r == null) continue;
|
this.pendingTasks.clear();
|
||||||
r.attemptCounter += 1;
|
LOG.info(msg);
|
||||||
r.container = null;
|
}
|
||||||
tasks.add(r);
|
|
||||||
if (r.attemptCounter >= this.maxNumAttempt) {
|
/**
|
||||||
this.abortJob("[Rabit] Task " + r.taskId + " failed more than " + r.attemptCounter + "times");
|
* handle non fatal failures
|
||||||
}
|
*
|
||||||
}
|
* @param cid
|
||||||
if (this.startAbort) {
|
*/
|
||||||
this.killedTasks.addAll(tasks);
|
private synchronized void handleFailure(Collection<ContainerId> failed) {
|
||||||
} else {
|
Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
|
||||||
this.submitTasks(tasks);
|
for (ContainerId cid : failed) {
|
||||||
}
|
TaskRecord r = runningTasks.remove(cid);
|
||||||
}
|
if (r == null)
|
||||||
/**
|
continue;
|
||||||
* handle method for AMRMClientAsync.CallbackHandler container allocation
|
r.attemptCounter += 1;
|
||||||
* @param status list of status
|
r.container = null;
|
||||||
*/
|
tasks.add(r);
|
||||||
private synchronized void onContainersCompleted(List<ContainerStatus> status) {
|
if (r.attemptCounter >= this.maxNumAttempt) {
|
||||||
Collection<ContainerId> failed = new java.util.LinkedList<ContainerId>();
|
this.abortJob("[Rabit] Task " + r.taskId + " failed more than "
|
||||||
for (ContainerStatus s : status) {
|
+ r.attemptCounter + "times");
|
||||||
assert (s.getState().equals(ContainerState.COMPLETE));
|
}
|
||||||
int exstatus = s.getExitStatus();
|
}
|
||||||
TaskRecord r = runningTasks.get(s.getContainerId());
|
if (this.startAbort) {
|
||||||
if (r == null) continue;
|
this.killedTasks.addAll(tasks);
|
||||||
if (exstatus == ContainerExitStatus.SUCCESS) {
|
} else {
|
||||||
finishedTasks.add(r);
|
this.submitTasks(tasks);
|
||||||
runningTasks.remove(s.getContainerId());
|
}
|
||||||
} else {
|
}
|
||||||
switch (exstatus) {
|
|
||||||
case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
|
/**
|
||||||
this.abortJob("[Rabit] Task " + r.taskId + " killed because of exceeding allocated physical memory");
|
* handle method for AMRMClientAsync.CallbackHandler container allocation
|
||||||
break;
|
*
|
||||||
case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
|
* @param status
|
||||||
this.abortJob("[Rabit] Task " + r.taskId + " killed because of exceeding allocated virtual memory");
|
* list of status
|
||||||
break;
|
*/
|
||||||
default:
|
private synchronized void onContainersCompleted(List<ContainerStatus> status) {
|
||||||
LOG.info("[Rabit] Task " + r.taskId + " exited with status " + exstatus);
|
Collection<ContainerId> failed = new java.util.LinkedList<ContainerId>();
|
||||||
failed.add(s.getContainerId());
|
for (ContainerStatus s : status) {
|
||||||
}
|
assert (s.getState().equals(ContainerState.COMPLETE));
|
||||||
}
|
int exstatus = s.getExitStatus();
|
||||||
}
|
TaskRecord r = runningTasks.get(s.getContainerId());
|
||||||
this.handleFailure(failed);
|
if (r == null)
|
||||||
}
|
continue;
|
||||||
/**
|
if (exstatus == ContainerExitStatus.SUCCESS) {
|
||||||
* callback handler for resource manager
|
finishedTasks.add(r);
|
||||||
*/
|
runningTasks.remove(s.getContainerId());
|
||||||
private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
|
} else {
|
||||||
@Override
|
switch (exstatus) {
|
||||||
public float getProgress() {
|
case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
|
||||||
return 1.0f - (float)(pendingTasks.size()) / numTasks;
|
this.abortJob("[Rabit] Task "
|
||||||
}
|
+ r.taskId
|
||||||
@Override
|
+ " killed because of exceeding allocated physical memory");
|
||||||
public void onContainersAllocated(List<Container> containers) {
|
break;
|
||||||
ApplicationMaster.this.onContainersAllocated(containers);
|
case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
|
||||||
}
|
this.abortJob("[Rabit] Task "
|
||||||
@Override
|
+ r.taskId
|
||||||
public void onContainersCompleted(List<ContainerStatus> status) {
|
+ " killed because of exceeding allocated virtual memory");
|
||||||
ApplicationMaster.this.onContainersCompleted(status);
|
break;
|
||||||
}
|
default:
|
||||||
@Override
|
LOG.info("[Rabit] Task " + r.taskId
|
||||||
public void onError(Throwable ex) {
|
+ " exited with status " + exstatus);
|
||||||
ApplicationMaster.this.abortJob("[Rabit] Resource manager Error " + ex.toString());
|
failed.add(s.getContainerId());
|
||||||
}
|
}
|
||||||
@Override
|
}
|
||||||
public void onNodesUpdated(List<NodeReport> nodereport) {
|
}
|
||||||
}
|
this.handleFailure(failed);
|
||||||
@Override
|
}
|
||||||
public void onShutdownRequest() {
|
|
||||||
ApplicationMaster.this.abortJob("[Rabit] Get shutdown request, start to shutdown...");
|
/**
|
||||||
}
|
* callback handler for resource manager
|
||||||
}
|
*/
|
||||||
private class NMCallbackHandler implements NMClientAsync.CallbackHandler {
|
private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
|
||||||
@Override
|
@Override
|
||||||
public void onContainerStarted(ContainerId cid, Map<String, ByteBuffer> services) {
|
public float getProgress() {
|
||||||
LOG.debug("onContainerStarted Invoked");
|
return 1.0f - (float) (pendingTasks.size()) / numTasks;
|
||||||
}
|
}
|
||||||
@Override
|
|
||||||
public void onContainerStatusReceived(ContainerId cid, ContainerStatus status) {
|
@Override
|
||||||
LOG.debug("onContainerStatusReceived Invoked");
|
public void onContainersAllocated(List<Container> containers) {
|
||||||
}
|
ApplicationMaster.this.onContainersAllocated(containers);
|
||||||
@Override
|
}
|
||||||
public void onContainerStopped(ContainerId cid) {
|
|
||||||
LOG.debug("onContainerStopped Invoked");
|
@Override
|
||||||
}
|
public void onContainersCompleted(List<ContainerStatus> status) {
|
||||||
@Override
|
ApplicationMaster.this.onContainersCompleted(status);
|
||||||
public void onGetContainerStatusError(ContainerId cid, Throwable ex) {
|
}
|
||||||
LOG.debug("onGetContainerStatusError Invoked: " + ex.toString());
|
|
||||||
ApplicationMaster.this.handleFailure(Collections.singletonList(cid));
|
@Override
|
||||||
}
|
public void onError(Throwable ex) {
|
||||||
@Override
|
ApplicationMaster.this.abortJob("[Rabit] Resource manager Error "
|
||||||
public void onStartContainerError(ContainerId cid, Throwable ex) {
|
+ ex.toString());
|
||||||
LOG.debug("onStartContainerError Invoked: " + ex.toString());
|
}
|
||||||
ApplicationMaster.this.handleFailure(Collections.singletonList(cid));
|
|
||||||
}
|
@Override
|
||||||
@Override
|
public void onNodesUpdated(List<NodeReport> nodereport) {
|
||||||
public void onStopContainerError(ContainerId cid, Throwable ex) {
|
}
|
||||||
LOG.info("onStopContainerError Invoked: " + ex.toString());
|
|
||||||
}
|
@Override
|
||||||
}
|
public void onShutdownRequest() {
|
||||||
|
ApplicationMaster.this
|
||||||
|
.abortJob("[Rabit] Get shutdown request, start to shutdown...");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class NMCallbackHandler implements NMClientAsync.CallbackHandler {
|
||||||
|
@Override
|
||||||
|
public void onContainerStarted(ContainerId cid,
|
||||||
|
Map<String, ByteBuffer> services) {
|
||||||
|
LOG.debug("onContainerStarted Invoked");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onContainerStatusReceived(ContainerId cid,
|
||||||
|
ContainerStatus status) {
|
||||||
|
LOG.debug("onContainerStatusReceived Invoked");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onContainerStopped(ContainerId cid) {
|
||||||
|
LOG.debug("onContainerStopped Invoked");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onGetContainerStatusError(ContainerId cid, Throwable ex) {
|
||||||
|
LOG.debug("onGetContainerStatusError Invoked: " + ex.toString());
|
||||||
|
ApplicationMaster.this
|
||||||
|
.handleFailure(Collections.singletonList(cid));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onStartContainerError(ContainerId cid, Throwable ex) {
|
||||||
|
LOG.debug("onStartContainerError Invoked: " + ex.toString());
|
||||||
|
ApplicationMaster.this
|
||||||
|
.handleFailure(Collections.singletonList(cid));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onStopContainerError(ContainerId cid, Throwable ex) {
|
||||||
|
LOG.info("onStopContainerError Invoked: " + ex.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -26,149 +26,156 @@ import org.apache.hadoop.yarn.util.ConverterUtils;
|
|||||||
import org.apache.hadoop.yarn.util.Records;
|
import org.apache.hadoop.yarn.util.Records;
|
||||||
|
|
||||||
public class Client {
|
public class Client {
|
||||||
// logger
|
// logger
|
||||||
private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
|
private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
|
||||||
// configuration
|
// configuration
|
||||||
private YarnConfiguration conf = new YarnConfiguration();
|
private YarnConfiguration conf = new YarnConfiguration();
|
||||||
// cached maps
|
// cached maps
|
||||||
private Map<String, Path> cacheFiles = new java.util.HashMap<String, Path>();
|
private Map<String, Path> cacheFiles = new java.util.HashMap<String, Path>();
|
||||||
// args to pass to application master
|
// args to pass to application master
|
||||||
private String appArgs;
|
private String appArgs;
|
||||||
/**
|
|
||||||
* get the local resource setting
|
/**
|
||||||
* @param fmaps the file maps
|
* get the local resource setting
|
||||||
* @return the resource map
|
*
|
||||||
* @throws IOException
|
* @param fmaps
|
||||||
*/
|
* the file maps
|
||||||
private Map<String, LocalResource> getLocalResource() throws IOException {
|
* @return the resource map
|
||||||
Map<String, LocalResource> rmap = new java.util.HashMap<String, LocalResource>();
|
* @throws IOException
|
||||||
for (Map.Entry<String, Path> e : cacheFiles.entrySet()){
|
*/
|
||||||
LocalResource r = Records.newRecord(LocalResource.class);
|
private Map<String, LocalResource> getLocalResource() throws IOException {
|
||||||
Path path = e.getValue();
|
Map<String, LocalResource> rmap = new java.util.HashMap<String, LocalResource>();
|
||||||
FileStatus status = FileSystem.get(conf).getFileStatus(path);
|
for (Map.Entry<String, Path> e : cacheFiles.entrySet()) {
|
||||||
r.setResource(ConverterUtils.getYarnUrlFromPath(path));
|
LocalResource r = Records.newRecord(LocalResource.class);
|
||||||
r.setSize(status.getLen());
|
Path path = e.getValue();
|
||||||
r.setTimestamp(status.getModificationTime());
|
FileStatus status = FileSystem.get(conf).getFileStatus(path);
|
||||||
r.setType(LocalResourceType.FILE);
|
r.setResource(ConverterUtils.getYarnUrlFromPath(path));
|
||||||
r.setVisibility(LocalResourceVisibility.PUBLIC);
|
r.setSize(status.getLen());
|
||||||
rmap.put(e.getKey(), r);
|
r.setTimestamp(status.getModificationTime());
|
||||||
}
|
r.setType(LocalResourceType.FILE);
|
||||||
return rmap;
|
r.setVisibility(LocalResourceVisibility.PUBLIC);
|
||||||
}
|
rmap.put(e.getKey(), r);
|
||||||
/**
|
}
|
||||||
* get the environment variables for container
|
return rmap;
|
||||||
* @return the env variable for child class
|
}
|
||||||
*/
|
|
||||||
private Map<String, String> getEnvironment() {
|
/**
|
||||||
// Setup environment variables
|
* get the environment variables for container
|
||||||
Map<String, String> env = new java.util.HashMap<String, String>();
|
*
|
||||||
String cpath = "${CLASSPATH}:./*";
|
* @return the env variable for child class
|
||||||
for (String c : conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH,
|
*/
|
||||||
YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
|
private Map<String, String> getEnvironment() {
|
||||||
cpath += ':';
|
// Setup environment variables
|
||||||
cpath += c.trim();
|
Map<String, String> env = new java.util.HashMap<String, String>();
|
||||||
}
|
String cpath = "${CLASSPATH}:./*";
|
||||||
env.put("CLASSPATH", cpath);
|
for (String c : conf.getStrings(
|
||||||
|
YarnConfiguration.YARN_APPLICATION_CLASSPATH,
|
||||||
|
YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
|
||||||
|
cpath += ':';
|
||||||
|
cpath += c.trim();
|
||||||
|
}
|
||||||
|
env.put("CLASSPATH", cpath);
|
||||||
for (Map.Entry<String, String> e : System.getenv().entrySet()) {
|
for (Map.Entry<String, String> e : System.getenv().entrySet()) {
|
||||||
if (e.getKey().startsWith("rabit_")) {
|
if (e.getKey().startsWith("rabit_")) {
|
||||||
env.put(e.getKey(), e.getValue());
|
env.put(e.getKey(), e.getValue());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return env;
|
return env;
|
||||||
}
|
}
|
||||||
/**
|
|
||||||
* initialize the settings
|
|
||||||
* @param args
|
|
||||||
*/
|
|
||||||
private void initArgs(String[] args) {
|
|
||||||
// directly pass all args except args0
|
|
||||||
StringBuilder sargs = new StringBuilder("");
|
|
||||||
for (int i = 0; i < args.length; ++i) {
|
|
||||||
if (args[i] == "-file") {
|
|
||||||
String[] arr = args[i + 1].split("#");
|
|
||||||
Path path = new Path(arr[0]);
|
|
||||||
if (arr.length == 1) {
|
|
||||||
cacheFiles.put(path.getName(), path);
|
|
||||||
} else {
|
|
||||||
cacheFiles.put(arr[1], path);
|
|
||||||
}
|
|
||||||
++ i;
|
|
||||||
} else {
|
|
||||||
sargs.append(" ");
|
|
||||||
sargs.append(args[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
this.appArgs = sargs.toString();
|
|
||||||
}
|
|
||||||
private void run(String[] args) throws Exception {
|
|
||||||
if (args.length == 0) {
|
|
||||||
System.out.println("Usage: [options] [commands..]");
|
|
||||||
System.out.println("options: [-file filename]");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
this.initArgs(args);
|
|
||||||
// Create yarnClient
|
|
||||||
YarnConfiguration conf = new YarnConfiguration();
|
|
||||||
YarnClient yarnClient = YarnClient.createYarnClient();
|
|
||||||
yarnClient.init(conf);
|
|
||||||
yarnClient.start();
|
|
||||||
|
|
||||||
// Create application via yarnClient
|
|
||||||
YarnClientApplication app = yarnClient.createApplication();
|
|
||||||
|
|
||||||
// Set up the container launch context for the application master
|
/**
|
||||||
ContainerLaunchContext amContainer =
|
* initialize the settings
|
||||||
Records.newRecord(ContainerLaunchContext.class);
|
*
|
||||||
amContainer.setCommands(
|
* @param args
|
||||||
Collections.singletonList(
|
*/
|
||||||
"$JAVA_HOME/bin/java" +
|
private void initArgs(String[] args) {
|
||||||
" -Xmx256M" +
|
// directly pass all args except args0
|
||||||
" org.apache.hadoop.yarn.rabit.ApplicationMaster" + this.appArgs +
|
StringBuilder sargs = new StringBuilder("");
|
||||||
" 1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout" +
|
for (int i = 0; i < args.length; ++i) {
|
||||||
" 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr"
|
if (args[i] == "-file") {
|
||||||
)
|
String[] arr = args[i + 1].split("#");
|
||||||
);
|
Path path = new Path(arr[0]);
|
||||||
|
if (arr.length == 1) {
|
||||||
// setup cache files
|
cacheFiles.put(path.getName(), path);
|
||||||
amContainer.setLocalResources(this.getLocalResource());
|
} else {
|
||||||
amContainer.setEnvironment(this.getEnvironment());
|
cacheFiles.put(arr[1], path);
|
||||||
|
}
|
||||||
// Set up resource type requirements for ApplicationMaster
|
++i;
|
||||||
Resource capability = Records.newRecord(Resource.class);
|
} else {
|
||||||
capability.setMemory(256);
|
sargs.append(" ");
|
||||||
capability.setVirtualCores(1);
|
sargs.append(args[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
this.appArgs = sargs.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void run(String[] args) throws Exception {
|
||||||
ApplicationSubmissionContext appContext = app.getApplicationSubmissionContext();
|
if (args.length == 0) {
|
||||||
appContext.setApplicationName("Rabit-YARN");
|
System.out.println("Usage: [options] [commands..]");
|
||||||
appContext.setAMContainerSpec(amContainer);
|
System.out.println("options: [-file filename]");
|
||||||
appContext.setResource(capability);
|
return;
|
||||||
appContext.setQueue("default");
|
}
|
||||||
|
this.initArgs(args);
|
||||||
// Submit application
|
// Create yarnClient
|
||||||
ApplicationId appId = appContext.getApplicationId();
|
YarnConfiguration conf = new YarnConfiguration();
|
||||||
LOG.info("Submitting application " + appId);
|
YarnClient yarnClient = YarnClient.createYarnClient();
|
||||||
yarnClient.submitApplication(appContext);
|
yarnClient.init(conf);
|
||||||
|
yarnClient.start();
|
||||||
ApplicationReport appReport = yarnClient.getApplicationReport(appId);
|
|
||||||
YarnApplicationState appState = appReport.getYarnApplicationState();
|
// Create application via yarnClient
|
||||||
while (appState != YarnApplicationState.FINISHED &&
|
YarnClientApplication app = yarnClient.createApplication();
|
||||||
appState != YarnApplicationState.KILLED &&
|
|
||||||
appState != YarnApplicationState.FAILED) {
|
// Set up the container launch context for the application master
|
||||||
Thread.sleep(100);
|
ContainerLaunchContext amContainer = Records
|
||||||
appReport = yarnClient.getApplicationReport(appId);
|
.newRecord(ContainerLaunchContext.class);
|
||||||
appState = appReport.getYarnApplicationState();
|
amContainer.setCommands(Collections.singletonList("$JAVA_HOME/bin/java"
|
||||||
}
|
+ " -Xmx256M"
|
||||||
|
+ " org.apache.hadoop.yarn.rabit.ApplicationMaster"
|
||||||
System.out.println(
|
+ this.appArgs + " 1>"
|
||||||
"Application " + appId + " finished with" +
|
+ ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
|
||||||
" state " + appState +
|
+ " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
|
||||||
" at " + appReport.getFinishTime());
|
+ "/stderr"));
|
||||||
if (!appReport.getFinalApplicationStatus().equals(FinalApplicationStatus.SUCCEEDED)) {
|
|
||||||
System.err.println(appReport.getDiagnostics());
|
// setup cache files
|
||||||
}
|
amContainer.setLocalResources(this.getLocalResource());
|
||||||
}
|
amContainer.setEnvironment(this.getEnvironment());
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
new Client().run(args);
|
// Set up resource type requirements for ApplicationMaster
|
||||||
}
|
Resource capability = Records.newRecord(Resource.class);
|
||||||
|
capability.setMemory(256);
|
||||||
|
capability.setVirtualCores(1);
|
||||||
|
|
||||||
|
ApplicationSubmissionContext appContext = app
|
||||||
|
.getApplicationSubmissionContext();
|
||||||
|
appContext.setApplicationName("Rabit-YARN");
|
||||||
|
appContext.setAMContainerSpec(amContainer);
|
||||||
|
appContext.setResource(capability);
|
||||||
|
appContext.setQueue("default");
|
||||||
|
|
||||||
|
// Submit application
|
||||||
|
ApplicationId appId = appContext.getApplicationId();
|
||||||
|
LOG.info("Submitting application " + appId);
|
||||||
|
yarnClient.submitApplication(appContext);
|
||||||
|
|
||||||
|
ApplicationReport appReport = yarnClient.getApplicationReport(appId);
|
||||||
|
YarnApplicationState appState = appReport.getYarnApplicationState();
|
||||||
|
while (appState != YarnApplicationState.FINISHED
|
||||||
|
&& appState != YarnApplicationState.KILLED
|
||||||
|
&& appState != YarnApplicationState.FAILED) {
|
||||||
|
Thread.sleep(100);
|
||||||
|
appReport = yarnClient.getApplicationReport(appId);
|
||||||
|
appState = appReport.getYarnApplicationState();
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println("Application " + appId + " finished with"
|
||||||
|
+ " state " + appState + " at " + appReport.getFinishTime());
|
||||||
|
if (!appReport.getFinalApplicationStatus().equals(
|
||||||
|
FinalApplicationStatus.SUCCEEDED)) {
|
||||||
|
System.err.println(appReport.getDiagnostics());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
new Client().run(args);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user