Squashed 'subtree/rabit/' changes from 091634b..59e63bc
59e63bcminor6233050ok14477f9add namenode75a6d34add libhdfs optse3c76bfminmum fix8b3c435chg2035799test code7751b2badd debug7690313okbd346b4okfaba1dcadd testload6f7783eadd testloade5f0340ok3ed9ec8chge552ac4ask for more ram in amb2505e3only stop nm when sucessbc696c9add queue infof3e867eadd option queue5dc843crefactor fileiocd9c81bquick fix1e23af2add virtual destructor to iseekstreamf165ffbfix hdfs8cc6508allow demo to pass in envfad4d69ok0fd6197fix more7423837fix mored25de54add temporal solution, run_yarn_prog.pye5a9e31final attempted3bee8add command back0774000add hdfs to resource9b66e7efix hadoop6812f14ok08e1c16change hadoop prefix back to hadoop homed6b6828Update build.sh146e069bugfix: logical boundary for ring buffer19cb685ok4cf3c13Merge branch 'master' of ssh://github.com/tqchen/rabit20daddbadd trackerc57dad8add ringbased passing and batch schedule295d8a1update994cb02add sge014c866OK git-subtree-dir: subtree/rabit git-subtree-split:59e63bc135
This commit is contained in:
@@ -1 +0,0 @@
|
||||
foler used to hold generated class files
|
||||
@@ -1,4 +1,8 @@
|
||||
#!/bin/bash
|
||||
CPATH=`${HADOOP_PREFIX}/bin/hadoop classpath`
|
||||
if [ ! -d bin ]; then
|
||||
mkdir bin
|
||||
fi
|
||||
|
||||
CPATH=`${HADOOP_HOME}/bin/hadoop classpath`
|
||||
javac -cp $CPATH -d bin src/org/apache/hadoop/yarn/rabit/*
|
||||
jar cf rabit-yarn.jar -C bin .
|
||||
|
||||
45
yarn/run_hdfs_prog.py
Executable file
45
yarn/run_hdfs_prog.py
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
this script helps setup classpath env for HDFS, before running program
|
||||
that links with libhdfs
|
||||
"""
|
||||
import glob
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print 'Usage: the command you want to run'
|
||||
|
||||
hadoop_home = os.getenv('HADOOP_HOME')
|
||||
hdfs_home = os.getenv('HADOOP_HDFS_HOME')
|
||||
java_home = os.getenv('JAVA_HOME')
|
||||
if hadoop_home is None:
|
||||
hadoop_home = os.getenv('HADOOP_PREFIX')
|
||||
assert hadoop_home is not None, 'need to set HADOOP_HOME'
|
||||
assert hdfs_home is not None, 'need to set HADOOP_HDFS_HOME'
|
||||
assert java_home is not None, 'need to set JAVA_HOME'
|
||||
|
||||
(classpath, err) = subprocess.Popen('%s/bin/hadoop classpath' % hadoop_home,
|
||||
stdout=subprocess.PIPE, shell = True,
|
||||
env = os.environ).communicate()
|
||||
cpath = []
|
||||
for f in classpath.split(':'):
|
||||
cpath += glob.glob(f)
|
||||
|
||||
lpath = []
|
||||
lpath.append('%s/lib/native' % hdfs_home)
|
||||
lpath.append('%s/jre/lib/amd64/server' % java_home)
|
||||
|
||||
env = os.environ.copy()
|
||||
env['CLASSPATH'] = '${CLASSPATH}:' + (':'.join(cpath))
|
||||
|
||||
# setup hdfs options
|
||||
if 'rabit_hdfs_opts' in env:
|
||||
env['LIBHDFS_OPTS'] = env['rabit_hdfs_opts']
|
||||
elif 'LIBHDFS_OPTS' not in env:
|
||||
env['LIBHDFS_OPTS'] = '--Xmx128m'
|
||||
|
||||
env['LD_LIBRARY_PATH'] = '${LD_LIBRARY_PATH}:' + (':'.join(lpath))
|
||||
ret = subprocess.call(args = sys.argv[1:], env = env)
|
||||
sys.exit(ret)
|
||||
@@ -1,5 +1,6 @@
|
||||
package org.apache.hadoop.yarn.rabit;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.List;
|
||||
@@ -7,6 +8,7 @@ import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
@@ -34,6 +36,7 @@ import org.apache.hadoop.yarn.api.records.NodeReport;
|
||||
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
|
||||
import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
|
||||
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
|
||||
/**
|
||||
* application master for allocating resources of rabit client
|
||||
@@ -61,6 +64,8 @@ public class ApplicationMaster {
|
||||
// command to launch
|
||||
private String command = "";
|
||||
|
||||
// username
|
||||
private String userName = "";
|
||||
// application tracker hostname
|
||||
private String appHostName = "";
|
||||
// tracker URL to do
|
||||
@@ -128,6 +133,8 @@ public class ApplicationMaster {
|
||||
*/
|
||||
private void initArgs(String args[]) throws IOException {
|
||||
LOG.info("Invoke initArgs");
|
||||
// get user name
|
||||
userName = UserGroupInformation.getCurrentUser().getShortUserName();
|
||||
// cached maps
|
||||
Map<String, Path> cacheFiles = new java.util.HashMap<String, Path>();
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
@@ -156,7 +163,8 @@ public class ApplicationMaster {
|
||||
numVCores = this.getEnvInteger("rabit_cpu_vcores", true, numVCores);
|
||||
numMemoryMB = this.getEnvInteger("rabit_memory_mb", true, numMemoryMB);
|
||||
numTasks = this.getEnvInteger("rabit_world_size", true, numTasks);
|
||||
maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false, maxNumAttempt);
|
||||
maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false,
|
||||
maxNumAttempt);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -175,7 +183,7 @@ public class ApplicationMaster {
|
||||
RegisterApplicationMasterResponse response = this.rmClient
|
||||
.registerApplicationMaster(this.appHostName,
|
||||
this.appTrackerPort, this.appTrackerUrl);
|
||||
|
||||
|
||||
boolean success = false;
|
||||
String diagnostics = "";
|
||||
try {
|
||||
@@ -208,19 +216,20 @@ public class ApplicationMaster {
|
||||
assert (killedTasks.size() + finishedTasks.size() == numTasks);
|
||||
success = finishedTasks.size() == numTasks;
|
||||
LOG.info("Application completed. Stopping running containers");
|
||||
nmClient.stop();
|
||||
diagnostics = "Diagnostics." + ", num_tasks" + this.numTasks
|
||||
+ ", finished=" + this.finishedTasks.size() + ", failed="
|
||||
+ this.killedTasks.size() + "\n" + this.abortDiagnosis;
|
||||
+ ", finished=" + this.finishedTasks.size() + ", failed="
|
||||
+ this.killedTasks.size() + "\n" + this.abortDiagnosis;
|
||||
nmClient.stop();
|
||||
LOG.info(diagnostics);
|
||||
} catch (Exception e) {
|
||||
diagnostics = e.toString();
|
||||
}
|
||||
}
|
||||
rmClient.unregisterApplicationMaster(
|
||||
success ? FinalApplicationStatus.SUCCEEDED
|
||||
: FinalApplicationStatus.FAILED, diagnostics,
|
||||
appTrackerUrl);
|
||||
if (!success) throw new Exception("Application not successful");
|
||||
if (!success)
|
||||
throw new Exception("Application not successful");
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -265,30 +274,63 @@ public class ApplicationMaster {
|
||||
task.containerRequest = null;
|
||||
ContainerLaunchContext ctx = Records
|
||||
.newRecord(ContainerLaunchContext.class);
|
||||
String cmd =
|
||||
// use this to setup CLASSPATH correctly for libhdfs
|
||||
"CLASSPATH=${CLASSPATH}:`${HADOOP_PREFIX}/bin/hadoop classpath --glob` "
|
||||
+ this.command + " 1>"
|
||||
+ ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
|
||||
+ " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
|
||||
+ "/stderr";
|
||||
LOG.info(cmd);
|
||||
String hadoop = "hadoop";
|
||||
if (System.getenv("HADOOP_HOME") != null) {
|
||||
hadoop = "${HADOOP_HOME}/bin/hadoop";
|
||||
} else if (System.getenv("HADOOP_PREFIX") != null) {
|
||||
hadoop = "${HADOOP_PREFIX}/bin/hadoop";
|
||||
}
|
||||
|
||||
String cmd =
|
||||
// use this to setup CLASSPATH correctly for libhdfs
|
||||
this.command + " 1>"
|
||||
+ ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
|
||||
+ " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
|
||||
+ "/stderr";
|
||||
ctx.setCommands(Collections.singletonList(cmd));
|
||||
LOG.info(workerResources);
|
||||
ctx.setLocalResources(this.workerResources);
|
||||
// setup environment variables
|
||||
Map<String, String> env = new java.util.HashMap<String, String>();
|
||||
|
||||
|
||||
// setup class path, this is kind of duplicated, ignoring
|
||||
StringBuilder cpath = new StringBuilder("${CLASSPATH}:./*");
|
||||
for (String c : conf.getStrings(
|
||||
YarnConfiguration.YARN_APPLICATION_CLASSPATH,
|
||||
YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
|
||||
cpath.append(':');
|
||||
cpath.append(c.trim());
|
||||
String[] arrPath = c.split(":");
|
||||
for (String ps : arrPath) {
|
||||
if (ps.endsWith("*.jar") || ps.endsWith("*")) {
|
||||
ps = ps.substring(0, ps.lastIndexOf('*'));
|
||||
String prefix = ps.substring(0, ps.lastIndexOf('/'));
|
||||
if (ps.startsWith("$")) {
|
||||
String[] arr =ps.split("/", 2);
|
||||
if (arr.length != 2) continue;
|
||||
try {
|
||||
ps = System.getenv(arr[0].substring(1)) + '/' + arr[1];
|
||||
} catch (Exception e){
|
||||
continue;
|
||||
}
|
||||
}
|
||||
File dir = new File(ps);
|
||||
if (dir.isDirectory()) {
|
||||
for (File f: dir.listFiles()) {
|
||||
if (f.isFile() && f.getPath().endsWith(".jar")) {
|
||||
cpath.append(":");
|
||||
cpath.append(prefix + '/' + f.getName());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
cpath.append(':');
|
||||
cpath.append(ps.trim());
|
||||
}
|
||||
}
|
||||
}
|
||||
// already use hadoop command to get class path in worker, maybe a better solution in future
|
||||
// env.put("CLASSPATH", cpath.toString());
|
||||
// already use hadoop command to get class path in worker, maybe a
|
||||
// better solution in future
|
||||
env.put("CLASSPATH", cpath.toString());
|
||||
//LOG.info("CLASSPATH =" + cpath.toString());
|
||||
// setup LD_LIBARY_PATH path for libhdfs
|
||||
env.put("LD_LIBRARY_PATH",
|
||||
"${LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server");
|
||||
@@ -298,10 +340,13 @@ public class ApplicationMaster {
|
||||
if (e.getKey().startsWith("rabit_")) {
|
||||
env.put(e.getKey(), e.getValue());
|
||||
}
|
||||
if (e.getKey() == "LIBHDFS_OPTS") {
|
||||
env.put(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
env.put("rabit_task_id", String.valueOf(task.taskId));
|
||||
env.put("rabit_num_trial", String.valueOf(task.attemptCounter));
|
||||
|
||||
// ctx.setUser(userName);
|
||||
ctx.setEnvironment(env);
|
||||
synchronized (this) {
|
||||
assert (!this.runningTasks.containsKey(container.getId()));
|
||||
@@ -376,8 +421,17 @@ public class ApplicationMaster {
|
||||
Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
|
||||
for (ContainerId cid : failed) {
|
||||
TaskRecord r = runningTasks.remove(cid);
|
||||
if (r == null)
|
||||
if (r == null) {
|
||||
continue;
|
||||
}
|
||||
LOG.info("Task "
|
||||
+ r.taskId
|
||||
+ "failed on "
|
||||
+ r.container.getId()
|
||||
+ ". See LOG at : "
|
||||
+ String.format("http://%s/node/containerlogs/%s/"
|
||||
+ userName, r.container.getNodeHttpAddress(),
|
||||
r.container.getId()));
|
||||
r.attemptCounter += 1;
|
||||
r.container = null;
|
||||
tasks.add(r);
|
||||
@@ -411,22 +465,26 @@ public class ApplicationMaster {
|
||||
finishedTasks.add(r);
|
||||
runningTasks.remove(s.getContainerId());
|
||||
} else {
|
||||
switch (exstatus) {
|
||||
case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
|
||||
this.abortJob("[Rabit] Task "
|
||||
+ r.taskId
|
||||
+ " killed because of exceeding allocated physical memory");
|
||||
break;
|
||||
case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
|
||||
this.abortJob("[Rabit] Task "
|
||||
+ r.taskId
|
||||
+ " killed because of exceeding allocated virtual memory");
|
||||
break;
|
||||
default:
|
||||
LOG.info("[Rabit] Task " + r.taskId
|
||||
+ " exited with status " + exstatus);
|
||||
failed.add(s.getContainerId());
|
||||
try {
|
||||
if (exstatus == ContainerExitStatus.class.getField(
|
||||
"KILLED_EXCEEDED_PMEM").getInt(null)) {
|
||||
this.abortJob("[Rabit] Task "
|
||||
+ r.taskId
|
||||
+ " killed because of exceeding allocated physical memory");
|
||||
continue;
|
||||
}
|
||||
if (exstatus == ContainerExitStatus.class.getField(
|
||||
"KILLED_EXCEEDED_VMEM").getInt(null)) {
|
||||
this.abortJob("[Rabit] Task "
|
||||
+ r.taskId
|
||||
+ " killed because of exceeding allocated virtual memory");
|
||||
continue;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
LOG.info("[Rabit] Task " + r.taskId + " exited with status "
|
||||
+ exstatus + " Diagnostics:"+ s.getDiagnostics());
|
||||
failed.add(s.getContainerId());
|
||||
}
|
||||
}
|
||||
this.handleFailure(failed);
|
||||
|
||||
@@ -9,6 +9,7 @@ import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.yarn.api.ApplicationConstants;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationReport;
|
||||
@@ -19,6 +20,7 @@ import org.apache.hadoop.yarn.api.records.LocalResource;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResourceType;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.QueueInfo;
|
||||
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
|
||||
import org.apache.hadoop.yarn.client.api.YarnClient;
|
||||
import org.apache.hadoop.yarn.client.api.YarnClientApplication;
|
||||
@@ -43,14 +45,21 @@ public class Client {
|
||||
private String appArgs = "";
|
||||
// HDFS Path to store temporal result
|
||||
private String tempdir = "/tmp";
|
||||
// user name
|
||||
private String userName = "";
|
||||
// job name
|
||||
private String jobName = "";
|
||||
// queue
|
||||
private String queue = "default";
|
||||
/**
|
||||
* constructor
|
||||
* @throws IOException
|
||||
*/
|
||||
private Client() throws IOException {
|
||||
conf.addResource(new Path(System.getenv("HADOOP_CONF_DIR") +"/core-site.xml"));
|
||||
conf.addResource(new Path(System.getenv("HADOOP_CONF_DIR") +"/hdfs-site.xml"));
|
||||
dfs = FileSystem.get(conf);
|
||||
userName = UserGroupInformation.getCurrentUser().getShortUserName();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -127,6 +136,9 @@ public class Client {
|
||||
if (e.getKey().startsWith("rabit_")) {
|
||||
env.put(e.getKey(), e.getValue());
|
||||
}
|
||||
if (e.getKey() == "LIBHDFS_OPTS") {
|
||||
env.put(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
LOG.debug(env);
|
||||
return env;
|
||||
@@ -152,6 +164,8 @@ public class Client {
|
||||
this.jobName = args[++i];
|
||||
} else if(args[i].equals("-tempdir")) {
|
||||
this.tempdir = args[++i];
|
||||
} else if(args[i].equals("-queue")) {
|
||||
this.queue = args[++i];
|
||||
} else {
|
||||
sargs.append(" ");
|
||||
sargs.append(args[i]);
|
||||
@@ -168,7 +182,6 @@ public class Client {
|
||||
}
|
||||
this.initArgs(args);
|
||||
// Create yarnClient
|
||||
YarnConfiguration conf = new YarnConfiguration();
|
||||
YarnClient yarnClient = YarnClient.createYarnClient();
|
||||
yarnClient.init(conf);
|
||||
yarnClient.start();
|
||||
@@ -181,13 +194,14 @@ public class Client {
|
||||
.newRecord(ContainerLaunchContext.class);
|
||||
ApplicationSubmissionContext appContext = app
|
||||
.getApplicationSubmissionContext();
|
||||
|
||||
// Submit application
|
||||
ApplicationId appId = appContext.getApplicationId();
|
||||
// setup cache-files and environment variables
|
||||
amContainer.setLocalResources(this.setupCacheFiles(appId));
|
||||
amContainer.setEnvironment(this.getEnvironment());
|
||||
String cmd = "$JAVA_HOME/bin/java"
|
||||
+ " -Xmx256M"
|
||||
+ " -Xmx900M"
|
||||
+ " org.apache.hadoop.yarn.rabit.ApplicationMaster"
|
||||
+ this.cacheFileArg + ' ' + this.appArgs + " 1>"
|
||||
+ ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
|
||||
@@ -197,15 +211,15 @@ public class Client {
|
||||
|
||||
// Set up resource type requirements for ApplicationMaster
|
||||
Resource capability = Records.newRecord(Resource.class);
|
||||
capability.setMemory(256);
|
||||
capability.setMemory(1024);
|
||||
capability.setVirtualCores(1);
|
||||
LOG.info("jobname=" + this.jobName);
|
||||
|
||||
LOG.info("jobname=" + this.jobName + ",username=" + this.userName);
|
||||
|
||||
appContext.setApplicationName(jobName + ":RABIT-YARN");
|
||||
appContext.setAMContainerSpec(amContainer);
|
||||
appContext.setResource(capability);
|
||||
appContext.setQueue("default");
|
||||
|
||||
appContext.setQueue(queue);
|
||||
//appContext.setUser(userName);
|
||||
LOG.info("Submitting application " + appId);
|
||||
yarnClient.submitApplication(appContext);
|
||||
|
||||
@@ -218,12 +232,16 @@ public class Client {
|
||||
appReport = yarnClient.getApplicationReport(appId);
|
||||
appState = appReport.getYarnApplicationState();
|
||||
}
|
||||
|
||||
|
||||
System.out.println("Application " + appId + " finished with"
|
||||
+ " state " + appState + " at " + appReport.getFinishTime());
|
||||
if (!appReport.getFinalApplicationStatus().equals(
|
||||
FinalApplicationStatus.SUCCEEDED)) {
|
||||
System.err.println(appReport.getDiagnostics());
|
||||
System.out.println("Available queues:");
|
||||
for (QueueInfo q : yarnClient.getAllQueues()) {
|
||||
System.out.println(q.getQueueName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user