Squashed 'subtree/rabit/' changes from d4ec037..28ca7be
28ca7be add linear readme
ca4b20f add linear readme
1133628 add linear readme
6a11676 update docs
a607047 Update build.sh
2c1cfd8 complete yarn
4f28e32 change formatter
2fbda81 fix stdin input
3258bcf check in yarn master
67ebf81 allow setup from env variables
9b6bf57 fix hdfs
395d5c2 add make system
88ce767 refactor io, initial hdfs file access, needs test
19be870 chgs
a1bd3c6 Merge branch 'master' of ssh://github.com/tqchen/rabit
1a573f9 introduce input split
29476f1 fix timer issue
git-subtree-dir: subtree/rabit
git-subtree-split: 28ca7becbd
This commit is contained in:
4
yarn/.gitignore
vendored
Normal file
4
yarn/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
bin
|
||||
.classpath
|
||||
.project
|
||||
*.jar
|
||||
5
yarn/README.md
Normal file
5
yarn/README.md
Normal file
@@ -0,0 +1,5 @@
|
||||
rabit-yarn
|
||||
=====
|
||||
* This folder contains the application code that allows rabit to run on YARN.
|
||||
* You can use [../tracker/rabit_yarn.py](../tracker/rabit_yarn.py) to submit the job
|
||||
- run ```./build.sh``` to build the jar, before using the script
|
||||
1
yarn/bin/README
Normal file
1
yarn/bin/README
Normal file
@@ -0,0 +1 @@
|
||||
folder used to hold generated class files
|
||||
4
yarn/build.sh
Executable file
4
yarn/build.sh
Executable file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
# Build rabit-yarn.jar: compile the YARN client / application-master sources
# against the Hadoop classpath and package the class files from bin/ into a jar.
# Requires HADOOP_PREFIX to point at the Hadoop installation root.
set -e  # stop on the first failing command instead of packaging a broken jar
CPATH=$("${HADOOP_PREFIX}/bin/hadoop" classpath)
# quote the classpath so entries survive word splitting; compile only .java files
javac -cp "${CPATH}" -d bin src/org/apache/hadoop/yarn/rabit/*.java
jar cf rabit-yarn.jar -C bin .
|
||||
508
yarn/src/org/apache/hadoop/yarn/rabit/ApplicationMaster.java
Normal file
508
yarn/src/org/apache/hadoop/yarn/rabit/ApplicationMaster.java
Normal file
@@ -0,0 +1,508 @@
|
||||
package org.apache.hadoop.yarn.rabit;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.yarn.util.ConverterUtils;
|
||||
import org.apache.hadoop.yarn.util.Records;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.api.ApplicationConstants;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
|
||||
import org.apache.hadoop.yarn.api.records.Container;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerState;
|
||||
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResource;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResourceType;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
|
||||
import org.apache.hadoop.yarn.api.records.Priority;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
||||
import org.apache.hadoop.yarn.api.records.NodeReport;
|
||||
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
|
||||
import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
|
||||
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
|
||||
|
||||
/**
|
||||
* application master for allocating resources of rabit client
|
||||
*
|
||||
* @author Tianqi Chen
|
||||
*/
|
||||
public class ApplicationMaster {
|
||||
// logger
|
||||
private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
|
||||
// configuration
|
||||
private Configuration conf = new YarnConfiguration();
|
||||
// hdfs handler
|
||||
private FileSystem dfs;
|
||||
|
||||
// number of cores allocated for each task
|
||||
private int numVCores = 1;
|
||||
// memory needed requested for the task
|
||||
private int numMemoryMB = 10;
|
||||
// priority of the app master
|
||||
private int appPriority = 0;
|
||||
// total number of tasks
|
||||
private int numTasks = 1;
|
||||
// maximum number of attempts to try in each task
|
||||
private int maxNumAttempt = 3;
|
||||
// command to launch
|
||||
private String command = "";
|
||||
|
||||
// application tracker hostname
|
||||
private String appHostName = "";
|
||||
// tracker URL to do
|
||||
private String appTrackerUrl = "";
|
||||
// tracker port
|
||||
private int appTrackerPort = 0;
|
||||
|
||||
// whether we start to abort the application, due to whatever fatal reasons
|
||||
private boolean startAbort = false;
|
||||
// worker resources
|
||||
private Map<String, LocalResource> workerResources = new java.util.HashMap<String, LocalResource>();
|
||||
// record the aborting reason
|
||||
private String abortDiagnosis = "";
|
||||
// resource manager
|
||||
private AMRMClientAsync<ContainerRequest> rmClient = null;
|
||||
// node manager
|
||||
private NMClientAsync nmClient = null;
|
||||
|
||||
// list of tasks that pending for resources to be allocated
|
||||
private final Queue<TaskRecord> pendingTasks = new java.util.LinkedList<TaskRecord>();
|
||||
// map containerId->task record of tasks that was running
|
||||
private final Map<ContainerId, TaskRecord> runningTasks = new java.util.HashMap<ContainerId, TaskRecord>();
|
||||
// collection of tasks
|
||||
private final Collection<TaskRecord> finishedTasks = new java.util.LinkedList<TaskRecord>();
|
||||
// collection of killed tasks
|
||||
private final Collection<TaskRecord> killedTasks = new java.util.LinkedList<TaskRecord>();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
new ApplicationMaster().run(args);
|
||||
}
|
||||
|
||||
private ApplicationMaster() throws IOException {
|
||||
dfs = FileSystem.get(conf);
|
||||
}
|
||||
|
||||
/**
|
||||
* get integer argument from environment variable
|
||||
*
|
||||
* @param name
|
||||
* name of key
|
||||
* @param required
|
||||
* whether this is required
|
||||
* @param defv
|
||||
* default value
|
||||
* @return the requested result
|
||||
*/
|
||||
private int getEnvInteger(String name, boolean required, int defv)
|
||||
throws IOException {
|
||||
String value = System.getenv(name);
|
||||
if (value == null) {
|
||||
if (required) {
|
||||
throw new IOException("environment variable " + name
|
||||
+ " not set");
|
||||
} else {
|
||||
return defv;
|
||||
}
|
||||
}
|
||||
return Integer.valueOf(value);
|
||||
}
|
||||
|
||||
/**
|
||||
* initialize from arguments and command lines
|
||||
*
|
||||
* @param args
|
||||
*/
|
||||
private void initArgs(String args[]) throws IOException {
|
||||
LOG.info("Invoke initArgs");
|
||||
// cached maps
|
||||
Map<String, Path> cacheFiles = new java.util.HashMap<String, Path>();
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
if (args[i].equals("-file")) {
|
||||
String[] arr = args[++i].split("#");
|
||||
Path path = new Path(arr[0]);
|
||||
if (arr.length == 1) {
|
||||
cacheFiles.put(path.getName(), path);
|
||||
} else {
|
||||
cacheFiles.put(arr[1], path);
|
||||
}
|
||||
} else {
|
||||
this.command += args[i] + " ";
|
||||
}
|
||||
}
|
||||
for (Map.Entry<String, Path> e : cacheFiles.entrySet()) {
|
||||
LocalResource r = Records.newRecord(LocalResource.class);
|
||||
FileStatus status = dfs.getFileStatus(e.getValue());
|
||||
r.setResource(ConverterUtils.getYarnUrlFromPath(e.getValue()));
|
||||
r.setSize(status.getLen());
|
||||
r.setTimestamp(status.getModificationTime());
|
||||
r.setType(LocalResourceType.FILE);
|
||||
r.setVisibility(LocalResourceVisibility.APPLICATION);
|
||||
workerResources.put(e.getKey(), r);
|
||||
}
|
||||
numVCores = this.getEnvInteger("rabit_cpu_vcores", true, numVCores);
|
||||
numMemoryMB = this.getEnvInteger("rabit_memory_mb", true, numMemoryMB);
|
||||
numTasks = this.getEnvInteger("rabit_world_size", true, numTasks);
|
||||
maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false, maxNumAttempt);
|
||||
}
|
||||
|
||||
/**
|
||||
* called to start the application
|
||||
*/
|
||||
private void run(String args[]) throws Exception {
|
||||
this.initArgs(args);
|
||||
this.rmClient = AMRMClientAsync.createAMRMClientAsync(1000,
|
||||
new RMCallbackHandler());
|
||||
this.nmClient = NMClientAsync
|
||||
.createNMClientAsync(new NMCallbackHandler());
|
||||
this.rmClient.init(conf);
|
||||
this.rmClient.start();
|
||||
this.nmClient.init(conf);
|
||||
this.nmClient.start();
|
||||
RegisterApplicationMasterResponse response = this.rmClient
|
||||
.registerApplicationMaster(this.appHostName,
|
||||
this.appTrackerPort, this.appTrackerUrl);
|
||||
|
||||
boolean success = false;
|
||||
String diagnostics = "";
|
||||
try {
|
||||
// list of tasks that waits to be submit
|
||||
java.util.Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
|
||||
// add waiting tasks
|
||||
for (int i = 0; i < this.numTasks; ++i) {
|
||||
tasks.add(new TaskRecord(i));
|
||||
}
|
||||
Resource maxResource = response.getMaximumResourceCapability();
|
||||
|
||||
if (maxResource.getMemory() < this.numMemoryMB) {
|
||||
LOG.warn("[Rabit] memory requested exceed bound "
|
||||
+ maxResource.getMemory());
|
||||
this.numMemoryMB = maxResource.getMemory();
|
||||
}
|
||||
if (maxResource.getVirtualCores() < this.numVCores) {
|
||||
LOG.warn("[Rabit] memory requested exceed bound "
|
||||
+ maxResource.getVirtualCores());
|
||||
this.numVCores = maxResource.getVirtualCores();
|
||||
}
|
||||
this.submitTasks(tasks);
|
||||
LOG.info("[Rabit] ApplicationMaster started");
|
||||
while (!this.doneAllJobs()) {
|
||||
try {
|
||||
Thread.sleep(100);
|
||||
} catch (InterruptedException e) {
|
||||
}
|
||||
}
|
||||
assert (killedTasks.size() + finishedTasks.size() == numTasks);
|
||||
success = finishedTasks.size() == numTasks;
|
||||
LOG.info("Application completed. Stopping running containers");
|
||||
nmClient.stop();
|
||||
diagnostics = "Diagnostics." + ", num_tasks" + this.numTasks
|
||||
+ ", finished=" + this.finishedTasks.size() + ", failed="
|
||||
+ this.killedTasks.size() + "\n" + this.abortDiagnosis;
|
||||
LOG.info(diagnostics);
|
||||
} catch (Exception e) {
|
||||
diagnostics = e.toString();
|
||||
}
|
||||
rmClient.unregisterApplicationMaster(
|
||||
success ? FinalApplicationStatus.SUCCEEDED
|
||||
: FinalApplicationStatus.FAILED, diagnostics,
|
||||
appTrackerUrl);
|
||||
if (!success) throw new Exception("Application not successful");
|
||||
}
|
||||
|
||||
/**
|
||||
* check if the job finishes
|
||||
*
|
||||
* @return whether we finished all the jobs
|
||||
*/
|
||||
private synchronized boolean doneAllJobs() {
|
||||
return pendingTasks.size() == 0 && runningTasks.size() == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* submit tasks to request containers for the tasks
|
||||
*
|
||||
* @param tasks
|
||||
* a collection of tasks we want to ask container for
|
||||
*/
|
||||
private synchronized void submitTasks(Collection<TaskRecord> tasks) {
|
||||
for (TaskRecord r : tasks) {
|
||||
Resource resource = Records.newRecord(Resource.class);
|
||||
resource.setMemory(numMemoryMB);
|
||||
resource.setVirtualCores(numVCores);
|
||||
Priority priority = Records.newRecord(Priority.class);
|
||||
priority.setPriority(this.appPriority);
|
||||
r.containerRequest = new ContainerRequest(resource, null, null,
|
||||
priority);
|
||||
rmClient.addContainerRequest(r.containerRequest);
|
||||
pendingTasks.add(r);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* launch the task on container
|
||||
*
|
||||
* @param container
|
||||
* container to run the task
|
||||
* @param task
|
||||
* the task
|
||||
*/
|
||||
private void launchTask(Container container, TaskRecord task) {
|
||||
task.container = container;
|
||||
task.containerRequest = null;
|
||||
ContainerLaunchContext ctx = Records
|
||||
.newRecord(ContainerLaunchContext.class);
|
||||
String cmd =
|
||||
// use this to setup CLASSPATH correctly for libhdfs
|
||||
"CLASSPATH=${CLASSPATH}:`${HADOOP_PREFIX}/bin/hadoop classpath --glob` "
|
||||
+ this.command + " 1>"
|
||||
+ ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
|
||||
+ " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
|
||||
+ "/stderr";
|
||||
LOG.info(cmd);
|
||||
ctx.setCommands(Collections.singletonList(cmd));
|
||||
LOG.info(workerResources);
|
||||
ctx.setLocalResources(this.workerResources);
|
||||
// setup environment variables
|
||||
Map<String, String> env = new java.util.HashMap<String, String>();
|
||||
|
||||
// setup class path, this is kind of duplicated, ignoring
|
||||
StringBuilder cpath = new StringBuilder("${CLASSPATH}:./*");
|
||||
for (String c : conf.getStrings(
|
||||
YarnConfiguration.YARN_APPLICATION_CLASSPATH,
|
||||
YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
|
||||
cpath.append(':');
|
||||
cpath.append(c.trim());
|
||||
}
|
||||
// already use hadoop command to get class path in worker, maybe a better solution in future
|
||||
// env.put("CLASSPATH", cpath.toString());
|
||||
// setup LD_LIBARY_PATH path for libhdfs
|
||||
env.put("LD_LIBRARY_PATH",
|
||||
"${LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server");
|
||||
env.put("PYTHONPATH", "${PYTHONPATH}:.");
|
||||
// inherit all rabit variables
|
||||
for (Map.Entry<String, String> e : System.getenv().entrySet()) {
|
||||
if (e.getKey().startsWith("rabit_")) {
|
||||
env.put(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
env.put("rabit_task_id", String.valueOf(task.taskId));
|
||||
env.put("rabit_num_trial", String.valueOf(task.attemptCounter));
|
||||
|
||||
ctx.setEnvironment(env);
|
||||
synchronized (this) {
|
||||
assert (!this.runningTasks.containsKey(container.getId()));
|
||||
this.runningTasks.put(container.getId(), task);
|
||||
this.nmClient.startContainerAsync(container, ctx);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* free the containers that have not yet been launched
|
||||
*
|
||||
* @param containers
|
||||
*/
|
||||
private synchronized void freeUnusedContainers(
|
||||
Collection<Container> containers) {
|
||||
}
|
||||
|
||||
/**
|
||||
* handle method for AMRMClientAsync.CallbackHandler container allocation
|
||||
*
|
||||
* @param containers
|
||||
*/
|
||||
private synchronized void onContainersAllocated(List<Container> containers) {
|
||||
if (this.startAbort) {
|
||||
this.freeUnusedContainers(containers);
|
||||
return;
|
||||
}
|
||||
Collection<Container> freelist = new java.util.LinkedList<Container>();
|
||||
for (Container c : containers) {
|
||||
TaskRecord task;
|
||||
task = pendingTasks.poll();
|
||||
if (task == null) {
|
||||
freelist.add(c);
|
||||
continue;
|
||||
}
|
||||
this.launchTask(c, task);
|
||||
}
|
||||
this.freeUnusedContainers(freelist);
|
||||
}
|
||||
|
||||
/**
|
||||
* start aborting the job
|
||||
*
|
||||
* @param msg
|
||||
* the fatal message
|
||||
*/
|
||||
private synchronized void abortJob(String msg) {
|
||||
if (!this.startAbort)
|
||||
this.abortDiagnosis = msg;
|
||||
this.startAbort = true;
|
||||
for (TaskRecord r : this.runningTasks.values()) {
|
||||
if (!r.abortRequested) {
|
||||
nmClient.stopContainerAsync(r.container.getId(),
|
||||
r.container.getNodeId());
|
||||
r.abortRequested = true;
|
||||
}
|
||||
}
|
||||
this.killedTasks.addAll(this.pendingTasks);
|
||||
for (TaskRecord r : this.pendingTasks) {
|
||||
rmClient.removeContainerRequest(r.containerRequest);
|
||||
}
|
||||
this.pendingTasks.clear();
|
||||
LOG.info(msg);
|
||||
}
|
||||
|
||||
/**
|
||||
* handle non fatal failures
|
||||
*
|
||||
* @param cid
|
||||
*/
|
||||
private synchronized void handleFailure(Collection<ContainerId> failed) {
|
||||
Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
|
||||
for (ContainerId cid : failed) {
|
||||
TaskRecord r = runningTasks.remove(cid);
|
||||
if (r == null)
|
||||
continue;
|
||||
r.attemptCounter += 1;
|
||||
r.container = null;
|
||||
tasks.add(r);
|
||||
if (r.attemptCounter >= this.maxNumAttempt) {
|
||||
this.abortJob("[Rabit] Task " + r.taskId + " failed more than "
|
||||
+ r.attemptCounter + "times");
|
||||
}
|
||||
}
|
||||
if (this.startAbort) {
|
||||
this.killedTasks.addAll(tasks);
|
||||
} else {
|
||||
this.submitTasks(tasks);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* handle method for AMRMClientAsync.CallbackHandler container allocation
|
||||
*
|
||||
* @param status
|
||||
* list of status
|
||||
*/
|
||||
private synchronized void onContainersCompleted(List<ContainerStatus> status) {
|
||||
Collection<ContainerId> failed = new java.util.LinkedList<ContainerId>();
|
||||
for (ContainerStatus s : status) {
|
||||
assert (s.getState().equals(ContainerState.COMPLETE));
|
||||
int exstatus = s.getExitStatus();
|
||||
TaskRecord r = runningTasks.get(s.getContainerId());
|
||||
if (r == null)
|
||||
continue;
|
||||
if (exstatus == ContainerExitStatus.SUCCESS) {
|
||||
finishedTasks.add(r);
|
||||
runningTasks.remove(s.getContainerId());
|
||||
} else {
|
||||
switch (exstatus) {
|
||||
case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
|
||||
this.abortJob("[Rabit] Task "
|
||||
+ r.taskId
|
||||
+ " killed because of exceeding allocated physical memory");
|
||||
break;
|
||||
case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
|
||||
this.abortJob("[Rabit] Task "
|
||||
+ r.taskId
|
||||
+ " killed because of exceeding allocated virtual memory");
|
||||
break;
|
||||
default:
|
||||
LOG.info("[Rabit] Task " + r.taskId
|
||||
+ " exited with status " + exstatus);
|
||||
failed.add(s.getContainerId());
|
||||
}
|
||||
}
|
||||
}
|
||||
this.handleFailure(failed);
|
||||
}
|
||||
|
||||
/**
|
||||
* callback handler for resource manager
|
||||
*/
|
||||
private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
|
||||
@Override
|
||||
public float getProgress() {
|
||||
return 1.0f - (float) (pendingTasks.size()) / numTasks;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onContainersAllocated(List<Container> containers) {
|
||||
ApplicationMaster.this.onContainersAllocated(containers);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onContainersCompleted(List<ContainerStatus> status) {
|
||||
ApplicationMaster.this.onContainersCompleted(status);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onError(Throwable ex) {
|
||||
ApplicationMaster.this.abortJob("[Rabit] Resource manager Error "
|
||||
+ ex.toString());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onNodesUpdated(List<NodeReport> nodereport) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onShutdownRequest() {
|
||||
ApplicationMaster.this
|
||||
.abortJob("[Rabit] Get shutdown request, start to shutdown...");
|
||||
}
|
||||
}
|
||||
|
||||
private class NMCallbackHandler implements NMClientAsync.CallbackHandler {
|
||||
@Override
|
||||
public void onContainerStarted(ContainerId cid,
|
||||
Map<String, ByteBuffer> services) {
|
||||
LOG.debug("onContainerStarted Invoked");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onContainerStatusReceived(ContainerId cid,
|
||||
ContainerStatus status) {
|
||||
LOG.debug("onContainerStatusReceived Invoked");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onContainerStopped(ContainerId cid) {
|
||||
LOG.debug("onContainerStopped Invoked");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onGetContainerStatusError(ContainerId cid, Throwable ex) {
|
||||
LOG.debug("onGetContainerStatusError Invoked: " + ex.toString());
|
||||
ApplicationMaster.this
|
||||
.handleFailure(Collections.singletonList(cid));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onStartContainerError(ContainerId cid, Throwable ex) {
|
||||
LOG.debug("onStartContainerError Invoked: " + ex.toString());
|
||||
ApplicationMaster.this
|
||||
.handleFailure(Collections.singletonList(cid));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onStopContainerError(ContainerId cid, Throwable ex) {
|
||||
LOG.info("onStopContainerError Invoked: " + ex.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
233
yarn/src/org/apache/hadoop/yarn/rabit/Client.java
Normal file
233
yarn/src/org/apache/hadoop/yarn/rabit/Client.java
Normal file
@@ -0,0 +1,233 @@
|
||||
package org.apache.hadoop.yarn.rabit;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.yarn.api.ApplicationConstants;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationReport;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
|
||||
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResource;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResourceType;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
|
||||
import org.apache.hadoop.yarn.client.api.YarnClient;
|
||||
import org.apache.hadoop.yarn.client.api.YarnClientApplication;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.util.ConverterUtils;
|
||||
import org.apache.hadoop.yarn.util.Records;
|
||||
|
||||
public class Client {
    // logger
    private static final Log LOG = LogFactory.getLog(Client.class);
    // permission for temp file
    private static final FsPermission permTemp = new FsPermission("777");
    // configuration
    private YarnConfiguration conf = new YarnConfiguration();
    // hdfs handler
    private FileSystem dfs;
    // cached maps: alias -> local or HDFS path, filled by initArgs from -file options
    private Map<String, String> cacheFiles = new java.util.HashMap<String, String>();
    // "-file path#alias ..." argument string forwarded to the application master
    private String cacheFileArg = "";
    // args to pass to application master
    private String appArgs = "";
    // HDFS Path to store temporal result
    private String tempdir = "/tmp";
    // job name
    private String jobName = "";
    /**
     * constructor
     * @throws IOException
     */
    private Client() throws IOException {
        dfs = FileSystem.get(conf);
    }

    /**
     * upload the cache files to a per-application HDFS temp directory and
     * build the LocalResource map for the application master container;
     * also records the -file arguments to forward in {@code cacheFileArg}.
     *
     * @param appId
     *            the application id, used to name the temp directory
     * @return the resource map (alias -> localized resource)
     * @throws IOException
     */
    private Map<String, LocalResource> setupCacheFiles(ApplicationId appId) throws IOException {
        // create temporary rabit directory
        Path tmpPath = new Path(this.tempdir);
        if (!dfs.exists(tmpPath)) {
            dfs.mkdirs(tmpPath, permTemp);
            LOG.info("HDFS temp directory do not exist, creating.. " + tmpPath);
        }
        tmpPath = new Path(tmpPath + "/temp-rabit-yarn-" + appId);
        if (dfs.exists(tmpPath)) {
            dfs.delete(tmpPath, true);
        }
        // create temporary directory
        FileSystem.mkdirs(dfs, tmpPath, permTemp);

        StringBuilder cstr = new StringBuilder();
        Map<String, LocalResource> rmap = new java.util.HashMap<String, LocalResource>();
        for (Map.Entry<String, String> e : cacheFiles.entrySet()) {
            LocalResource r = Records.newRecord(LocalResource.class);
            Path path = new Path(e.getValue());
            // copy local data to temporary folder in HDFS; files already on
            // hdfs:// are referenced in place
            if (!e.getValue().startsWith("hdfs://")) {
                Path dst = new Path("hdfs://" + tmpPath + "/"+ path.getName());
                dfs.copyFromLocalFile(false, true, path, dst);
                dfs.setPermission(dst, permTemp);
                dfs.deleteOnExit(dst);
                path = dst;
            }
            FileStatus status = dfs.getFileStatus(path);
            r.setResource(ConverterUtils.getYarnUrlFromPath(path));
            r.setSize(status.getLen());
            r.setTimestamp(status.getModificationTime());
            r.setType(LocalResourceType.FILE);
            r.setVisibility(LocalResourceVisibility.APPLICATION);
            rmap.put(e.getKey(), r);
            // rebuild the "-file path#alias" argument for the AM command line
            cstr.append(" -file \"");
            cstr.append(path.toString());
            cstr.append('#');
            cstr.append(e.getKey());
            cstr.append("\"");
        }

        // temp directory is cleaned up when this client JVM exits
        dfs.deleteOnExit(tmpPath);
        this.cacheFileArg = cstr.toString();
        return rmap;
    }

    /**
     * get the environment variables for container
     *
     * @return the env variable for child class
     */
    private Map<String, String> getEnvironment() {
        // Setup environment variables
        Map<String, String> env = new java.util.HashMap<String, String>();
        String cpath = "${CLASSPATH}:./*";
        for (String c : conf.getStrings(
                YarnConfiguration.YARN_APPLICATION_CLASSPATH,
                YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
            cpath += ':';
            cpath += c.trim();
        }
        env.put("CLASSPATH", cpath);
        // forward all rabit_* variables so the AM inherits the job settings
        for (Map.Entry<String, String> e : System.getenv().entrySet()) {
            if (e.getKey().startsWith("rabit_")) {
                env.put(e.getKey(), e.getValue());
            }
        }
        LOG.debug(env);
        return env;
    }

    /**
     * initialize the settings from command line arguments: -file, -jobname
     * and -tempdir are consumed here, everything else is forwarded to the
     * application master verbatim.
     *
     * @param args command line arguments
     */
    private void initArgs(String[] args) {
        // directly pass all arguments except args0
        StringBuilder sargs = new StringBuilder("");
        for (int i = 0; i < args.length; ++i) {
            if (args[i].equals("-file")) {
                // "-file path" or "-file path#alias"
                String[] arr = args[++i].split("#");
                if (arr.length == 1) {
                    cacheFiles.put(new Path(arr[0]).getName(), arr[0]);
                } else {
                    cacheFiles.put(arr[1], arr[0]);
                }
            } else if(args[i].equals("-jobname")) {
                this.jobName = args[++i];
            } else if(args[i].equals("-tempdir")) {
                this.tempdir = args[++i];
            } else {
                sargs.append(" ");
                sargs.append(args[i]);
            }
        }
        this.appArgs = sargs.toString();
    }

    /**
     * submit the application master to YARN and poll until it terminates,
     * printing the final state and any diagnostics on failure.
     *
     * @param args command line arguments
     */
    private void run(String[] args) throws Exception {
        if (args.length == 0) {
            System.out.println("Usage: [options] [commands..]");
            System.out.println("options: [-file filename]");
            return;
        }
        this.initArgs(args);
        // Create yarnClient
        // NOTE(review): this local shadows the field `conf`; both are fresh
        // YarnConfiguration instances, so behavior is the same
        YarnConfiguration conf = new YarnConfiguration();
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(conf);
        yarnClient.start();

        // Create application via yarnClient
        YarnClientApplication app = yarnClient.createApplication();

        // Set up the container launch context for the application master
        ContainerLaunchContext amContainer = Records
                .newRecord(ContainerLaunchContext.class);
        ApplicationSubmissionContext appContext = app
                .getApplicationSubmissionContext();
        // Submit application
        ApplicationId appId = appContext.getApplicationId();
        // setup cache-files and environment variables
        amContainer.setLocalResources(this.setupCacheFiles(appId));
        amContainer.setEnvironment(this.getEnvironment());
        String cmd = "$JAVA_HOME/bin/java"
                + " -Xmx256M"
                + " org.apache.hadoop.yarn.rabit.ApplicationMaster"
                + this.cacheFileArg + ' ' + this.appArgs + " 1>"
                + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
                + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr";
        LOG.debug(cmd);
        amContainer.setCommands(Collections.singletonList(cmd));

        // Set up resource type requirements for ApplicationMaster
        Resource capability = Records.newRecord(Resource.class);
        capability.setMemory(256);
        capability.setVirtualCores(1);
        LOG.info("jobname=" + this.jobName);

        appContext.setApplicationName(jobName + ":RABIT-YARN");
        appContext.setAMContainerSpec(amContainer);
        appContext.setResource(capability);
        appContext.setQueue("default");

        LOG.info("Submitting application " + appId);
        yarnClient.submitApplication(appContext);

        // poll the report until the application reaches a terminal state
        ApplicationReport appReport = yarnClient.getApplicationReport(appId);
        YarnApplicationState appState = appReport.getYarnApplicationState();
        while (appState != YarnApplicationState.FINISHED
                && appState != YarnApplicationState.KILLED
                && appState != YarnApplicationState.FAILED) {
            Thread.sleep(100);
            appReport = yarnClient.getApplicationReport(appId);
            appState = appReport.getYarnApplicationState();
        }

        System.out.println("Application " + appId + " finished with"
                + " state " + appState + " at " + appReport.getFinishTime());
        if (!appReport.getFinalApplicationStatus().equals(
                FinalApplicationStatus.SUCCEEDED)) {
            System.err.println(appReport.getDiagnostics());
        }
    }

    public static void main(String[] args) throws Exception {
        new Client().run(args);
    }
}
|
||||
24
yarn/src/org/apache/hadoop/yarn/rabit/TaskRecord.java
Normal file
24
yarn/src/org/apache/hadoop/yarn/rabit/TaskRecord.java
Normal file
@@ -0,0 +1,24 @@
|
||||
package org.apache.hadoop.yarn.rabit;
|
||||
|
||||
import org.apache.hadoop.yarn.api.records.Container;
|
||||
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
|
||||
|
||||
/**
|
||||
* data structure to hold the task information
|
||||
*/
|
||||
public class TaskRecord {
|
||||
// task id of the task
|
||||
public int taskId = 0;
|
||||
// number of failed attempts to run the task
|
||||
public int attemptCounter = 0;
|
||||
// container request, can be null if task is already running
|
||||
public ContainerRequest containerRequest = null;
|
||||
// running container, can be null if the task is not launched
|
||||
public Container container = null;
|
||||
// whether we have requested abortion of this task
|
||||
public boolean abortRequested = false;
|
||||
|
||||
public TaskRecord(int taskId) {
|
||||
this.taskId = taskId;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user