Squashed 'subtree/rabit/' changes from 091634b..59e63bc

59e63bc minor
6233050 ok
14477f9 add namenode
75a6d34 add libhdfs opts
e3c76bf minmum fix
8b3c435 chg
2035799 test code
7751b2b add debug
7690313 ok
bd346b4 ok
faba1dc add testload
6f7783e add testload
e5f0340 ok
3ed9ec8 chg
e552ac4 ask for more ram in am
b2505e3 only stop nm when sucess
bc696c9 add queue info
f3e867e add option queue
5dc843c refactor fileio
cd9c81b quick fix
1e23af2 add virtual destructor to iseekstream
f165ffb fix hdfs
8cc6508 allow demo to pass in env
fad4d69 ok
0fd6197 fix more
7423837 fix more
d25de54 add temporal solution, run_yarn_prog.py
e5a9e31 final attempt
ed3bee8 add command back
0774000 add hdfs to resource
9b66e7e fix hadoop
6812f14 ok
08e1c16 change hadoop prefix back to hadoop home
d6b6828 Update build.sh
146e069 bugfix: logical boundary for ring buffer
19cb685 ok
4cf3c13 Merge branch 'master' of ssh://github.com/tqchen/rabit
20daddb add tracker
c57dad8 add ringbased passing and batch schedule
295d8a1 update
994cb02 add sge
014c866 OK

git-subtree-dir: subtree/rabit
git-subtree-split: 59e63bc135
Author: tqchen
Date:   2015-03-21 00:44:31 -07:00
parent 13a319ca01
commit 75bf97b575

34 changed files with 869 additions and 207 deletions

(deleted file)

@@ -1 +0,0 @@
-foler used to hold generated class files

yarn/build.sh

@@ -1,4 +1,8 @@
 #!/bin/bash
-CPATH=`${HADOOP_PREFIX}/bin/hadoop classpath`
+if [ ! -d bin ]; then
+    mkdir bin
+fi
+CPATH=`${HADOOP_HOME}/bin/hadoop classpath`
 javac -cp $CPATH -d bin src/org/apache/hadoop/yarn/rabit/*
 jar cf rabit-yarn.jar -C bin .

yarn/run_hdfs_prog.py (new executable file, +45 lines)

@@ -0,0 +1,45 @@
#!/usr/bin/env python
"""
this script helps setup classpath env for HDFS, before running a program
that links with libhdfs
"""
import glob
import sys
import os
import subprocess

if len(sys.argv) < 2:
    print 'Usage: run_hdfs_prog.py <command> [arguments ...]'
    sys.exit(1)

hadoop_home = os.getenv('HADOOP_HOME')
hdfs_home = os.getenv('HADOOP_HDFS_HOME')
java_home = os.getenv('JAVA_HOME')
if hadoop_home is None:
    hadoop_home = os.getenv('HADOOP_PREFIX')
assert hadoop_home is not None, 'need to set HADOOP_HOME'
assert hdfs_home is not None, 'need to set HADOOP_HDFS_HOME'
assert java_home is not None, 'need to set JAVA_HOME'

(classpath, err) = subprocess.Popen('%s/bin/hadoop classpath' % hadoop_home,
                                    stdout=subprocess.PIPE, shell=True,
                                    env=os.environ).communicate()
# expand the wildcard entries printed by `hadoop classpath`
cpath = []
for f in classpath.split(':'):
    cpath += glob.glob(f)

# native library directories needed by libhdfs and the JVM
lpath = []
lpath.append('%s/lib/native' % hdfs_home)
lpath.append('%s/jre/lib/amd64/server' % java_home)

env = os.environ.copy()
env['CLASSPATH'] = '${CLASSPATH}:' + (':'.join(cpath))
# setup hdfs options: rabit_hdfs_opts overrides, otherwise keep an
# inherited LIBHDFS_OPTS, otherwise default the JVM heap to 128 MB
if 'rabit_hdfs_opts' in env:
    env['LIBHDFS_OPTS'] = env['rabit_hdfs_opts']
elif 'LIBHDFS_OPTS' not in env:
    env['LIBHDFS_OPTS'] = '-Xmx128m'
env['LD_LIBRARY_PATH'] = '${LD_LIBRARY_PATH}:' + (':'.join(lpath))

ret = subprocess.call(args=sys.argv[1:], env=env)
sys.exit(ret)
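
As a usage note: the wrapper simply re-execs its arguments with CLASSPATH, LIBHDFS_OPTS, and LD_LIBRARY_PATH prepared, so any libhdfs-linked binary can be launched through it. A minimal sketch, where the program name and HDFS URI are hypothetical placeholders:

import subprocess

# './my_hdfs_program' and the hdfs:// URI are hypothetical placeholders
# for a binary linked against libhdfs and its arguments
ret = subprocess.call(['./run_hdfs_prog.py',
                       './my_hdfs_program', 'hdfs://namenode:9000/input'])
print 'wrapped program exited with code', ret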

yarn/src/org/apache/hadoop/yarn/rabit/ApplicationMaster.java

@@ -1,5 +1,6 @@
 package org.apache.hadoop.yarn.rabit;
+import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.List;
@@ -7,6 +8,7 @@ import java.util.Map;
import java.util.Queue;
import java.util.Collection;
import java.util.Collections;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@@ -34,6 +36,7 @@ import org.apache.hadoop.yarn.api.records.NodeReport;
 import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
 import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
 import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
+import org.apache.hadoop.security.UserGroupInformation;
/**
* application master for allocating resources of rabit client
@@ -61,6 +64,8 @@ public class ApplicationMaster {
     // command to launch
     private String command = "";
+    // username
+    private String userName = "";
     // application tracker hostname
     private String appHostName = "";
     // tracker URL to do
@@ -128,6 +133,8 @@ public class ApplicationMaster {
      */
     private void initArgs(String args[]) throws IOException {
         LOG.info("Invoke initArgs");
+        // get user name
+        userName = UserGroupInformation.getCurrentUser().getShortUserName();
         // cached maps
         Map<String, Path> cacheFiles = new java.util.HashMap<String, Path>();
         for (int i = 0; i < args.length; ++i) {
@@ -156,7 +163,8 @@ public class ApplicationMaster {
         numVCores = this.getEnvInteger("rabit_cpu_vcores", true, numVCores);
         numMemoryMB = this.getEnvInteger("rabit_memory_mb", true, numMemoryMB);
         numTasks = this.getEnvInteger("rabit_world_size", true, numTasks);
-        maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false, maxNumAttempt);
+        maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false,
+                maxNumAttempt);
     }
/**
@@ -175,7 +183,7 @@ public class ApplicationMaster {
         RegisterApplicationMasterResponse response = this.rmClient
                 .registerApplicationMaster(this.appHostName,
                         this.appTrackerPort, this.appTrackerUrl);
         boolean success = false;
         String diagnostics = "";
         try {
@@ -208,19 +216,20 @@ public class ApplicationMaster {
                 assert (killedTasks.size() + finishedTasks.size() == numTasks);
                 success = finishedTasks.size() == numTasks;
                 LOG.info("Application completed. Stopping running containers");
-                nmClient.stop();
                 diagnostics = "Diagnostics." + ", num_tasks" + this.numTasks
-                    + ", finished=" + this.finishedTasks.size() + ", failed="
-                    + this.killedTasks.size() + "\n" + this.abortDiagnosis;
+                        + ", finished=" + this.finishedTasks.size() + ", failed="
+                        + this.killedTasks.size() + "\n" + this.abortDiagnosis;
+                nmClient.stop();
                 LOG.info(diagnostics);
             } catch (Exception e) {
                 diagnostics = e.toString();
             }
         }
         rmClient.unregisterApplicationMaster(
                 success ? FinalApplicationStatus.SUCCEEDED
                         : FinalApplicationStatus.FAILED, diagnostics,
                 appTrackerUrl);
-        if (!success) throw new Exception("Application not successful");
+        if (!success)
+            throw new Exception("Application not successful");
     }
/**
@@ -265,30 +274,63 @@ public class ApplicationMaster {
         task.containerRequest = null;
         ContainerLaunchContext ctx = Records
                 .newRecord(ContainerLaunchContext.class);
-        String cmd =
-            // use this to setup CLASSPATH correctly for libhdfs
-            "CLASSPATH=${CLASSPATH}:`${HADOOP_PREFIX}/bin/hadoop classpath --glob` "
-            + this.command + " 1>"
-            + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
-            + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
-            + "/stderr";
-        LOG.info(cmd);
+        String hadoop = "hadoop";
+        if (System.getenv("HADOOP_HOME") != null) {
+            hadoop = "${HADOOP_HOME}/bin/hadoop";
+        } else if (System.getenv("HADOOP_PREFIX") != null) {
+            hadoop = "${HADOOP_PREFIX}/bin/hadoop";
+        }
+        String cmd =
+            // use this to setup CLASSPATH correctly for libhdfs
+            this.command + " 1>"
+            + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
+            + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
+            + "/stderr";
         ctx.setCommands(Collections.singletonList(cmd));
+        LOG.info(workerResources);
         ctx.setLocalResources(this.workerResources);
         // setup environment variables
         Map<String, String> env = new java.util.HashMap<String, String>();
         // setup class path, this is kind of duplicated, ignoring
         StringBuilder cpath = new StringBuilder("${CLASSPATH}:./*");
         for (String c : conf.getStrings(
                 YarnConfiguration.YARN_APPLICATION_CLASSPATH,
                 YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
-            cpath.append(':');
-            cpath.append(c.trim());
+            String[] arrPath = c.split(":");
+            for (String ps : arrPath) {
+                if (ps.endsWith("*.jar") || ps.endsWith("*")) {
+                    ps = ps.substring(0, ps.lastIndexOf('*'));
+                    String prefix = ps.substring(0, ps.lastIndexOf('/'));
+                    if (ps.startsWith("$")) {
+                        String[] arr = ps.split("/", 2);
+                        if (arr.length != 2) continue;
+                        try {
+                            ps = System.getenv(arr[0].substring(1)) + '/' + arr[1];
+                        } catch (Exception e) {
+                            continue;
+                        }
+                    }
+                    File dir = new File(ps);
+                    if (dir.isDirectory()) {
+                        for (File f : dir.listFiles()) {
+                            if (f.isFile() && f.getPath().endsWith(".jar")) {
+                                cpath.append(":");
+                                cpath.append(prefix + '/' + f.getName());
+                            }
+                        }
+                    }
+                } else {
+                    cpath.append(':');
+                    cpath.append(ps.trim());
+                }
+            }
         }
-        // already use hadoop command to get class path in worker, maybe a better solution in future
-        // env.put("CLASSPATH", cpath.toString());
+        // already use hadoop command to get class path in worker, maybe a
+        // better solution in future
+        env.put("CLASSPATH", cpath.toString());
+        //LOG.info("CLASSPATH =" + cpath.toString());
         // setup LD_LIBRARY_PATH for libhdfs
         env.put("LD_LIBRARY_PATH",
                 "${LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server");
@@ -298,10 +340,13 @@ public class ApplicationMaster {
             if (e.getKey().startsWith("rabit_")) {
                 env.put(e.getKey(), e.getValue());
             }
+            if (e.getKey().equals("LIBHDFS_OPTS")) {
+                env.put(e.getKey(), e.getValue());
+            }
         }
         env.put("rabit_task_id", String.valueOf(task.taskId));
         env.put("rabit_num_trial", String.valueOf(task.attemptCounter));
         // ctx.setUser(userName);
         ctx.setEnvironment(env);
         synchronized (this) {
             assert (!this.runningTasks.containsKey(container.getId()));
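
Note that only a small whitelist survives into each container's environment: variables prefixed rabit_, plus LIBHDFS_OPTS, plus the per-task values set just above. A compact sketch of the filter, with hypothetical sample values:

def container_env(am_env):
    # forward rabit_* variables and LIBHDFS_OPTS, as in the loop above
    return dict((k, v) for k, v in am_env.items()
                if k.startswith('rabit_') or k == 'LIBHDFS_OPTS')

env = container_env({'rabit_world_size': '4', 'LIBHDFS_OPTS': '-Xmx128m',
                     'PATH': '/usr/bin'})
assert 'PATH' not in env and env['rabit_world_size'] == '4'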
@@ -376,8 +421,17 @@ public class ApplicationMaster {
         Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
         for (ContainerId cid : failed) {
             TaskRecord r = runningTasks.remove(cid);
-            if (r == null)
+            if (r == null) {
                 continue;
+            }
+            LOG.info("Task "
+                    + r.taskId
+                    + " failed on "
+                    + r.container.getId()
+                    + ". See LOG at : "
+                    + String.format("http://%s/node/containerlogs/%s/"
+                            + userName, r.container.getNodeHttpAddress(),
+                            r.container.getId()));
             r.attemptCounter += 1;
             r.container = null;
             tasks.add(r);
@@ -411,22 +465,26 @@ public class ApplicationMaster {
                 finishedTasks.add(r);
                 runningTasks.remove(s.getContainerId());
             } else {
-                switch (exstatus) {
-                case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
-                    this.abortJob("[Rabit] Task "
-                            + r.taskId
-                            + " killed because of exceeding allocated physical memory");
-                    break;
-                case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
-                    this.abortJob("[Rabit] Task "
-                            + r.taskId
-                            + " killed because of exceeding allocated virtual memory");
-                    break;
-                default:
-                    LOG.info("[Rabit] Task " + r.taskId
-                            + " exited with status " + exstatus);
-                    failed.add(s.getContainerId());
-                }
+                try {
+                    if (exstatus == ContainerExitStatus.class.getField(
+                            "KILLED_EXCEEDED_PMEM").getInt(null)) {
+                        this.abortJob("[Rabit] Task "
+                                + r.taskId
+                                + " killed because of exceeding allocated physical memory");
+                        continue;
+                    }
+                    if (exstatus == ContainerExitStatus.class.getField(
+                            "KILLED_EXCEEDED_VMEM").getInt(null)) {
+                        this.abortJob("[Rabit] Task "
+                                + r.taskId
+                                + " killed because of exceeding allocated virtual memory");
+                        continue;
+                    }
+                } catch (Exception e) {
+                }
+                LOG.info("[Rabit] Task " + r.taskId + " exited with status "
+                        + exstatus + " Diagnostics:" + s.getDiagnostics());
+                failed.add(s.getContainerId());
             }
         }
         this.handleFailure(failed);
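
A note on the rewrite above: the old switch referenced ContainerExitStatus.KILLED_EXCEEDED_PMEM and KILLED_EXCEEDED_VMEM directly, which only compiles against Hadoop versions that define those constants. Looking them up reflectively with getField(...).getInt(null) keeps the source buildable against older YARN APIs; the empty catch lets missing constants fall through to the generic failure path. A rough Python analogue of that defensive lookup, using a hypothetical stand-in class:

def exit_constant(cls, name):
    # like ContainerExitStatus.class.getField(name).getInt(null): return
    # the constant if this API version defines it, otherwise None
    return getattr(cls, name, None)

class OldContainerExitStatus(object):  # hypothetical older-API stand-in
    ABORTED = -100                     # no KILLED_EXCEEDED_* constants

assert exit_constant(OldContainerExitStatus, 'ABORTED') == -100
assert exit_constant(OldContainerExitStatus, 'KILLED_EXCEEDED_PMEM') is None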

yarn/src/org/apache/hadoop/yarn/rabit/Client.java

@@ -9,6 +9,7 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.yarn.api.ApplicationConstants;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ApplicationReport;
@@ -19,6 +20,7 @@ import org.apache.hadoop.yarn.api.records.LocalResource;
 import org.apache.hadoop.yarn.api.records.LocalResourceType;
 import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
 import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.QueueInfo;
 import org.apache.hadoop.yarn.api.records.YarnApplicationState;
 import org.apache.hadoop.yarn.client.api.YarnClient;
 import org.apache.hadoop.yarn.client.api.YarnClientApplication;
@@ -43,14 +45,21 @@ public class Client {
     private String appArgs = "";
     // HDFS path to store temporary results
     private String tempdir = "/tmp";
+    // user name
+    private String userName = "";
     // job name
     private String jobName = "";
+    // queue
+    private String queue = "default";
     /**
      * constructor
      * @throws IOException
      */
     private Client() throws IOException {
+        conf.addResource(new Path(System.getenv("HADOOP_CONF_DIR") + "/core-site.xml"));
+        conf.addResource(new Path(System.getenv("HADOOP_CONF_DIR") + "/hdfs-site.xml"));
         dfs = FileSystem.get(conf);
+        userName = UserGroupInformation.getCurrentUser().getShortUserName();
     }
/**
@@ -127,6 +136,9 @@ public class Client {
             if (e.getKey().startsWith("rabit_")) {
                 env.put(e.getKey(), e.getValue());
             }
+            if (e.getKey().equals("LIBHDFS_OPTS")) {
+                env.put(e.getKey(), e.getValue());
+            }
         }
         LOG.debug(env);
         return env;
@@ -152,6 +164,8 @@ public class Client {
                 this.jobName = args[++i];
             } else if (args[i].equals("-tempdir")) {
                 this.tempdir = args[++i];
+            } else if (args[i].equals("-queue")) {
+                this.queue = args[++i];
             } else {
                 sargs.append(" ");
                 sargs.append(args[i]);
@@ -168,7 +182,6 @@ public class Client {
         }
         this.initArgs(args);
         // Create yarnClient
-        YarnConfiguration conf = new YarnConfiguration();
         YarnClient yarnClient = YarnClient.createYarnClient();
         yarnClient.init(conf);
         yarnClient.start();
@@ -181,13 +194,14 @@ public class Client {
                 .newRecord(ContainerLaunchContext.class);
         ApplicationSubmissionContext appContext = app
                 .getApplicationSubmissionContext();
         // Submit application
         ApplicationId appId = appContext.getApplicationId();
         // setup cache-files and environment variables
         amContainer.setLocalResources(this.setupCacheFiles(appId));
         amContainer.setEnvironment(this.getEnvironment());
         String cmd = "$JAVA_HOME/bin/java"
-                + " -Xmx256M"
+                + " -Xmx900M"
                 + " org.apache.hadoop.yarn.rabit.ApplicationMaster"
                 + this.cacheFileArg + ' ' + this.appArgs + " 1>"
                 + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
@@ -197,15 +211,15 @@ public class Client {
         // Set up resource type requirements for ApplicationMaster
         Resource capability = Records.newRecord(Resource.class);
-        capability.setMemory(256);
+        capability.setMemory(1024);
         capability.setVirtualCores(1);
-        LOG.info("jobname=" + this.jobName);
+        LOG.info("jobname=" + this.jobName + ",username=" + this.userName);
         appContext.setApplicationName(jobName + ":RABIT-YARN");
         appContext.setAMContainerSpec(amContainer);
         appContext.setResource(capability);
-        appContext.setQueue("default");
+        appContext.setQueue(queue);
         //appContext.setUser(userName);
         LOG.info("Submitting application " + appId);
         yarnClient.submitApplication(appContext);
@@ -218,12 +232,16 @@ public class Client {
             appReport = yarnClient.getApplicationReport(appId);
             appState = appReport.getYarnApplicationState();
         }
         System.out.println("Application " + appId + " finished with"
                 + " state " + appState + " at " + appReport.getFinishTime());
+        if (!appReport.getFinalApplicationStatus().equals(
+                FinalApplicationStatus.SUCCEEDED)) {
+            System.err.println(appReport.getDiagnostics());
+            System.out.println("Available queues:");
+            for (QueueInfo q : yarnClient.getAllQueues()) {
+                System.out.println(q.getQueueName());
+            }
+        }
     }
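
Putting the pieces together, a hedged sketch of a submission using the new -queue option; the queue name and wrapped program are hypothetical, the jar name comes from build.sh above, and in practice the output of 'hadoop classpath' would also be appended to -cp:

import subprocess

# hypothetical submission: '-queue myqueue' is consumed by the Client,
# and the trailing arguments presumably become the command run in each
# container, per the argument loop above
subprocess.call(['java', '-cp', 'rabit-yarn.jar',
                 'org.apache.hadoop.yarn.rabit.Client',
                 '-queue', 'myqueue',
                 './run_hdfs_prog.py', './my_program'])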