Merge commit '75bf97b57539e5572e7ae8eba72bac6562c63c07'
Conflicts: subtree/rabit/rabit-learn/io/line_split-inl.h subtree/rabit/yarn/build.sh
This commit is contained in:
@@ -1 +0,0 @@
|
||||
foler used to hold generated class files
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/bin/bash
|
||||
if [ -z "$HADOOP_PREFIX" ]; then
|
||||
echo "cannot found $HADOOP_PREFIX in the environment variable, please set it properly"
|
||||
exit 1
|
||||
if [ ! -d bin ]; then
|
||||
mkdir bin
|
||||
fi
|
||||
CPATH=`${HADOOP_PREFIX}/bin/hadoop classpath`
|
||||
|
||||
CPATH=`${HADOOP_HOME}/bin/hadoop classpath`
|
||||
javac -cp $CPATH -d bin src/org/apache/hadoop/yarn/rabit/*
|
||||
jar cf rabit-yarn.jar -C bin .
|
||||
|
||||
45
subtree/rabit/yarn/run_hdfs_prog.py
Executable file
45
subtree/rabit/yarn/run_hdfs_prog.py
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
this script helps setup classpath env for HDFS, before running program
|
||||
that links with libhdfs
|
||||
"""
|
||||
import glob
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print 'Usage: the command you want to run'
|
||||
|
||||
hadoop_home = os.getenv('HADOOP_HOME')
|
||||
hdfs_home = os.getenv('HADOOP_HDFS_HOME')
|
||||
java_home = os.getenv('JAVA_HOME')
|
||||
if hadoop_home is None:
|
||||
hadoop_home = os.getenv('HADOOP_PREFIX')
|
||||
assert hadoop_home is not None, 'need to set HADOOP_HOME'
|
||||
assert hdfs_home is not None, 'need to set HADOOP_HDFS_HOME'
|
||||
assert java_home is not None, 'need to set JAVA_HOME'
|
||||
|
||||
(classpath, err) = subprocess.Popen('%s/bin/hadoop classpath' % hadoop_home,
|
||||
stdout=subprocess.PIPE, shell = True,
|
||||
env = os.environ).communicate()
|
||||
cpath = []
|
||||
for f in classpath.split(':'):
|
||||
cpath += glob.glob(f)
|
||||
|
||||
lpath = []
|
||||
lpath.append('%s/lib/native' % hdfs_home)
|
||||
lpath.append('%s/jre/lib/amd64/server' % java_home)
|
||||
|
||||
env = os.environ.copy()
|
||||
env['CLASSPATH'] = '${CLASSPATH}:' + (':'.join(cpath))
|
||||
|
||||
# setup hdfs options
|
||||
if 'rabit_hdfs_opts' in env:
|
||||
env['LIBHDFS_OPTS'] = env['rabit_hdfs_opts']
|
||||
elif 'LIBHDFS_OPTS' not in env:
|
||||
env['LIBHDFS_OPTS'] = '--Xmx128m'
|
||||
|
||||
env['LD_LIBRARY_PATH'] = '${LD_LIBRARY_PATH}:' + (':'.join(lpath))
|
||||
ret = subprocess.call(args = sys.argv[1:], env = env)
|
||||
sys.exit(ret)
|
||||
@@ -1,5 +1,6 @@
|
||||
package org.apache.hadoop.yarn.rabit;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.List;
|
||||
@@ -7,6 +8,7 @@ import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
@@ -34,6 +36,7 @@ import org.apache.hadoop.yarn.api.records.NodeReport;
|
||||
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
|
||||
import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
|
||||
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
|
||||
/**
|
||||
* application master for allocating resources of rabit client
|
||||
@@ -61,6 +64,8 @@ public class ApplicationMaster {
|
||||
// command to launch
|
||||
private String command = "";
|
||||
|
||||
// username
|
||||
private String userName = "";
|
||||
// application tracker hostname
|
||||
private String appHostName = "";
|
||||
// tracker URL to do
|
||||
@@ -128,6 +133,8 @@ public class ApplicationMaster {
|
||||
*/
|
||||
private void initArgs(String args[]) throws IOException {
|
||||
LOG.info("Invoke initArgs");
|
||||
// get user name
|
||||
userName = UserGroupInformation.getCurrentUser().getShortUserName();
|
||||
// cached maps
|
||||
Map<String, Path> cacheFiles = new java.util.HashMap<String, Path>();
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
@@ -156,7 +163,8 @@ public class ApplicationMaster {
|
||||
numVCores = this.getEnvInteger("rabit_cpu_vcores", true, numVCores);
|
||||
numMemoryMB = this.getEnvInteger("rabit_memory_mb", true, numMemoryMB);
|
||||
numTasks = this.getEnvInteger("rabit_world_size", true, numTasks);
|
||||
maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false, maxNumAttempt);
|
||||
maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false,
|
||||
maxNumAttempt);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -175,7 +183,7 @@ public class ApplicationMaster {
|
||||
RegisterApplicationMasterResponse response = this.rmClient
|
||||
.registerApplicationMaster(this.appHostName,
|
||||
this.appTrackerPort, this.appTrackerUrl);
|
||||
|
||||
|
||||
boolean success = false;
|
||||
String diagnostics = "";
|
||||
try {
|
||||
@@ -208,19 +216,20 @@ public class ApplicationMaster {
|
||||
assert (killedTasks.size() + finishedTasks.size() == numTasks);
|
||||
success = finishedTasks.size() == numTasks;
|
||||
LOG.info("Application completed. Stopping running containers");
|
||||
nmClient.stop();
|
||||
diagnostics = "Diagnostics." + ", num_tasks" + this.numTasks
|
||||
+ ", finished=" + this.finishedTasks.size() + ", failed="
|
||||
+ this.killedTasks.size() + "\n" + this.abortDiagnosis;
|
||||
+ ", finished=" + this.finishedTasks.size() + ", failed="
|
||||
+ this.killedTasks.size() + "\n" + this.abortDiagnosis;
|
||||
nmClient.stop();
|
||||
LOG.info(diagnostics);
|
||||
} catch (Exception e) {
|
||||
diagnostics = e.toString();
|
||||
}
|
||||
}
|
||||
rmClient.unregisterApplicationMaster(
|
||||
success ? FinalApplicationStatus.SUCCEEDED
|
||||
: FinalApplicationStatus.FAILED, diagnostics,
|
||||
appTrackerUrl);
|
||||
if (!success) throw new Exception("Application not successful");
|
||||
if (!success)
|
||||
throw new Exception("Application not successful");
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -265,30 +274,63 @@ public class ApplicationMaster {
|
||||
task.containerRequest = null;
|
||||
ContainerLaunchContext ctx = Records
|
||||
.newRecord(ContainerLaunchContext.class);
|
||||
String cmd =
|
||||
// use this to setup CLASSPATH correctly for libhdfs
|
||||
"CLASSPATH=${CLASSPATH}:`${HADOOP_PREFIX}/bin/hadoop classpath --glob` "
|
||||
+ this.command + " 1>"
|
||||
+ ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
|
||||
+ " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
|
||||
+ "/stderr";
|
||||
LOG.info(cmd);
|
||||
String hadoop = "hadoop";
|
||||
if (System.getenv("HADOOP_HOME") != null) {
|
||||
hadoop = "${HADOOP_HOME}/bin/hadoop";
|
||||
} else if (System.getenv("HADOOP_PREFIX") != null) {
|
||||
hadoop = "${HADOOP_PREFIX}/bin/hadoop";
|
||||
}
|
||||
|
||||
String cmd =
|
||||
// use this to setup CLASSPATH correctly for libhdfs
|
||||
this.command + " 1>"
|
||||
+ ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
|
||||
+ " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
|
||||
+ "/stderr";
|
||||
ctx.setCommands(Collections.singletonList(cmd));
|
||||
LOG.info(workerResources);
|
||||
ctx.setLocalResources(this.workerResources);
|
||||
// setup environment variables
|
||||
Map<String, String> env = new java.util.HashMap<String, String>();
|
||||
|
||||
|
||||
// setup class path, this is kind of duplicated, ignoring
|
||||
StringBuilder cpath = new StringBuilder("${CLASSPATH}:./*");
|
||||
for (String c : conf.getStrings(
|
||||
YarnConfiguration.YARN_APPLICATION_CLASSPATH,
|
||||
YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
|
||||
cpath.append(':');
|
||||
cpath.append(c.trim());
|
||||
String[] arrPath = c.split(":");
|
||||
for (String ps : arrPath) {
|
||||
if (ps.endsWith("*.jar") || ps.endsWith("*")) {
|
||||
ps = ps.substring(0, ps.lastIndexOf('*'));
|
||||
String prefix = ps.substring(0, ps.lastIndexOf('/'));
|
||||
if (ps.startsWith("$")) {
|
||||
String[] arr =ps.split("/", 2);
|
||||
if (arr.length != 2) continue;
|
||||
try {
|
||||
ps = System.getenv(arr[0].substring(1)) + '/' + arr[1];
|
||||
} catch (Exception e){
|
||||
continue;
|
||||
}
|
||||
}
|
||||
File dir = new File(ps);
|
||||
if (dir.isDirectory()) {
|
||||
for (File f: dir.listFiles()) {
|
||||
if (f.isFile() && f.getPath().endsWith(".jar")) {
|
||||
cpath.append(":");
|
||||
cpath.append(prefix + '/' + f.getName());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
cpath.append(':');
|
||||
cpath.append(ps.trim());
|
||||
}
|
||||
}
|
||||
}
|
||||
// already use hadoop command to get class path in worker, maybe a better solution in future
|
||||
// env.put("CLASSPATH", cpath.toString());
|
||||
// already use hadoop command to get class path in worker, maybe a
|
||||
// better solution in future
|
||||
env.put("CLASSPATH", cpath.toString());
|
||||
//LOG.info("CLASSPATH =" + cpath.toString());
|
||||
// setup LD_LIBARY_PATH path for libhdfs
|
||||
env.put("LD_LIBRARY_PATH",
|
||||
"${LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server");
|
||||
@@ -298,10 +340,13 @@ public class ApplicationMaster {
|
||||
if (e.getKey().startsWith("rabit_")) {
|
||||
env.put(e.getKey(), e.getValue());
|
||||
}
|
||||
if (e.getKey() == "LIBHDFS_OPTS") {
|
||||
env.put(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
env.put("rabit_task_id", String.valueOf(task.taskId));
|
||||
env.put("rabit_num_trial", String.valueOf(task.attemptCounter));
|
||||
|
||||
// ctx.setUser(userName);
|
||||
ctx.setEnvironment(env);
|
||||
synchronized (this) {
|
||||
assert (!this.runningTasks.containsKey(container.getId()));
|
||||
@@ -376,8 +421,17 @@ public class ApplicationMaster {
|
||||
Collection<TaskRecord> tasks = new java.util.LinkedList<TaskRecord>();
|
||||
for (ContainerId cid : failed) {
|
||||
TaskRecord r = runningTasks.remove(cid);
|
||||
if (r == null)
|
||||
if (r == null) {
|
||||
continue;
|
||||
}
|
||||
LOG.info("Task "
|
||||
+ r.taskId
|
||||
+ "failed on "
|
||||
+ r.container.getId()
|
||||
+ ". See LOG at : "
|
||||
+ String.format("http://%s/node/containerlogs/%s/"
|
||||
+ userName, r.container.getNodeHttpAddress(),
|
||||
r.container.getId()));
|
||||
r.attemptCounter += 1;
|
||||
r.container = null;
|
||||
tasks.add(r);
|
||||
@@ -411,22 +465,26 @@ public class ApplicationMaster {
|
||||
finishedTasks.add(r);
|
||||
runningTasks.remove(s.getContainerId());
|
||||
} else {
|
||||
switch (exstatus) {
|
||||
case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
|
||||
this.abortJob("[Rabit] Task "
|
||||
+ r.taskId
|
||||
+ " killed because of exceeding allocated physical memory");
|
||||
break;
|
||||
case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
|
||||
this.abortJob("[Rabit] Task "
|
||||
+ r.taskId
|
||||
+ " killed because of exceeding allocated virtual memory");
|
||||
break;
|
||||
default:
|
||||
LOG.info("[Rabit] Task " + r.taskId
|
||||
+ " exited with status " + exstatus);
|
||||
failed.add(s.getContainerId());
|
||||
try {
|
||||
if (exstatus == ContainerExitStatus.class.getField(
|
||||
"KILLED_EXCEEDED_PMEM").getInt(null)) {
|
||||
this.abortJob("[Rabit] Task "
|
||||
+ r.taskId
|
||||
+ " killed because of exceeding allocated physical memory");
|
||||
continue;
|
||||
}
|
||||
if (exstatus == ContainerExitStatus.class.getField(
|
||||
"KILLED_EXCEEDED_VMEM").getInt(null)) {
|
||||
this.abortJob("[Rabit] Task "
|
||||
+ r.taskId
|
||||
+ " killed because of exceeding allocated virtual memory");
|
||||
continue;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
LOG.info("[Rabit] Task " + r.taskId + " exited with status "
|
||||
+ exstatus + " Diagnostics:"+ s.getDiagnostics());
|
||||
failed.add(s.getContainerId());
|
||||
}
|
||||
}
|
||||
this.handleFailure(failed);
|
||||
|
||||
@@ -9,6 +9,7 @@ import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.yarn.api.ApplicationConstants;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationReport;
|
||||
@@ -19,6 +20,7 @@ import org.apache.hadoop.yarn.api.records.LocalResource;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResourceType;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.QueueInfo;
|
||||
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
|
||||
import org.apache.hadoop.yarn.client.api.YarnClient;
|
||||
import org.apache.hadoop.yarn.client.api.YarnClientApplication;
|
||||
@@ -43,14 +45,21 @@ public class Client {
|
||||
private String appArgs = "";
|
||||
// HDFS Path to store temporal result
|
||||
private String tempdir = "/tmp";
|
||||
// user name
|
||||
private String userName = "";
|
||||
// job name
|
||||
private String jobName = "";
|
||||
// queue
|
||||
private String queue = "default";
|
||||
/**
|
||||
* constructor
|
||||
* @throws IOException
|
||||
*/
|
||||
private Client() throws IOException {
|
||||
conf.addResource(new Path(System.getenv("HADOOP_CONF_DIR") +"/core-site.xml"));
|
||||
conf.addResource(new Path(System.getenv("HADOOP_CONF_DIR") +"/hdfs-site.xml"));
|
||||
dfs = FileSystem.get(conf);
|
||||
userName = UserGroupInformation.getCurrentUser().getShortUserName();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -127,6 +136,9 @@ public class Client {
|
||||
if (e.getKey().startsWith("rabit_")) {
|
||||
env.put(e.getKey(), e.getValue());
|
||||
}
|
||||
if (e.getKey() == "LIBHDFS_OPTS") {
|
||||
env.put(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
LOG.debug(env);
|
||||
return env;
|
||||
@@ -152,6 +164,8 @@ public class Client {
|
||||
this.jobName = args[++i];
|
||||
} else if(args[i].equals("-tempdir")) {
|
||||
this.tempdir = args[++i];
|
||||
} else if(args[i].equals("-queue")) {
|
||||
this.queue = args[++i];
|
||||
} else {
|
||||
sargs.append(" ");
|
||||
sargs.append(args[i]);
|
||||
@@ -168,7 +182,6 @@ public class Client {
|
||||
}
|
||||
this.initArgs(args);
|
||||
// Create yarnClient
|
||||
YarnConfiguration conf = new YarnConfiguration();
|
||||
YarnClient yarnClient = YarnClient.createYarnClient();
|
||||
yarnClient.init(conf);
|
||||
yarnClient.start();
|
||||
@@ -181,13 +194,14 @@ public class Client {
|
||||
.newRecord(ContainerLaunchContext.class);
|
||||
ApplicationSubmissionContext appContext = app
|
||||
.getApplicationSubmissionContext();
|
||||
|
||||
// Submit application
|
||||
ApplicationId appId = appContext.getApplicationId();
|
||||
// setup cache-files and environment variables
|
||||
amContainer.setLocalResources(this.setupCacheFiles(appId));
|
||||
amContainer.setEnvironment(this.getEnvironment());
|
||||
String cmd = "$JAVA_HOME/bin/java"
|
||||
+ " -Xmx256M"
|
||||
+ " -Xmx900M"
|
||||
+ " org.apache.hadoop.yarn.rabit.ApplicationMaster"
|
||||
+ this.cacheFileArg + ' ' + this.appArgs + " 1>"
|
||||
+ ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
|
||||
@@ -197,15 +211,15 @@ public class Client {
|
||||
|
||||
// Set up resource type requirements for ApplicationMaster
|
||||
Resource capability = Records.newRecord(Resource.class);
|
||||
capability.setMemory(256);
|
||||
capability.setMemory(1024);
|
||||
capability.setVirtualCores(1);
|
||||
LOG.info("jobname=" + this.jobName);
|
||||
|
||||
LOG.info("jobname=" + this.jobName + ",username=" + this.userName);
|
||||
|
||||
appContext.setApplicationName(jobName + ":RABIT-YARN");
|
||||
appContext.setAMContainerSpec(amContainer);
|
||||
appContext.setResource(capability);
|
||||
appContext.setQueue("default");
|
||||
|
||||
appContext.setQueue(queue);
|
||||
//appContext.setUser(userName);
|
||||
LOG.info("Submitting application " + appId);
|
||||
yarnClient.submitApplication(appContext);
|
||||
|
||||
@@ -218,12 +232,16 @@ public class Client {
|
||||
appReport = yarnClient.getApplicationReport(appId);
|
||||
appState = appReport.getYarnApplicationState();
|
||||
}
|
||||
|
||||
|
||||
System.out.println("Application " + appId + " finished with"
|
||||
+ " state " + appState + " at " + appReport.getFinishTime());
|
||||
if (!appReport.getFinalApplicationStatus().equals(
|
||||
FinalApplicationStatus.SUCCEEDED)) {
|
||||
System.err.println(appReport.getDiagnostics());
|
||||
System.out.println("Available queues:");
|
||||
for (QueueInfo q : yarnClient.getAllQueues()) {
|
||||
System.out.println(q.getQueueName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user