fix hadoop

commit 9b66e7edf2
parent 6812f14886
@@ -97,15 +97,18 @@ class HDFSSplit : public LineSplitBase {
 public:
  explicit HDFSSplit(const char *uri, unsigned rank, unsigned nsplit) {
    fs_ = hdfsConnect("default", 0);
    utils::Check(fs_ != NULL, "error when connecting to default HDFS");
    std::vector<std::string> paths;
    LineSplitBase::SplitNames(&paths, uri, "#");
    // get the files
    std::vector<size_t> fsize;
    for (size_t i = 0; i < paths.size(); ++i) {
      hdfsFileInfo *info = hdfsGetPathInfo(fs_, paths[i].c_str());
      utils::Check(info != NULL, "path %s do not exist", paths[i].c_str());
      if (info->mKind == 'D') {
        int nentry;
        hdfsFileInfo *files = hdfsListDirectory(fs_, info->mName, &nentry);
        utils::Check(files != NULL, "error when ListDirectory %s", info->mName);
        for (int i = 0; i < nentry; ++i) {
          if (files[i].mKind == 'F') {
            fsize.push_back(files[i].mSize);
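The libhdfs calls above (hdfsConnect, hdfsGetPathInfo, hdfsListDirectory) enumerate the input files and record each file size so the splitter can assign byte ranges per worker. For readers more familiar with the Java side of HDFS, a minimal sketch of the same walk using org.apache.hadoop.fs is shown below; the class and method names are illustrative and not part of this commit.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListSplitSizes {
    // Collect the size of every regular file under the given paths,
    // descending one level into directories, mirroring the libhdfs loop above.
    public static List<Long> fileSizes(String[] paths) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);                // the "default" filesystem
        List<Long> sizes = new ArrayList<Long>();
        for (String p : paths) {
            FileStatus info = fs.getFileStatus(new Path(p)); // throws if the path does not exist
            if (info.isDirectory()) {
                for (FileStatus f : fs.listStatus(info.getPath())) {
                    if (f.isFile()) {
                        sizes.add(f.getLen());
                    }
                }
            } else {
                sizes.add(info.getLen());
            }
        }
        return sizes;
    }
}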
@@ -6,9 +6,10 @@ then
fi

# put the local training file to HDFS
hadoop fs -rm -r -f $2/data
#hadoop fs -rm -r -f $2/data
hadoop fs -rm -r -f $2/mushroom.linear.model
hadoop fs -mkdir $2/data
#hadoop fs -mkdir $2/data
hadoop fs -put ../data/agaricus.txt.train $2/data

# submit to hadoop
../../tracker/rabit_yarn.py -n $1 --vcores 1 linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}"
@@ -21,7 +21,7 @@ if not os.path.exists(YARN_JAR_PATH):
    subprocess.check_call(cmd, shell = True, env = os.environ)
    assert os.path.exists(YARN_JAR_PATH), "failed to build rabit-yarn.jar, try it manually"

hadoop_binary = 'hadoop'
hadoop_binary = None
# code
hadoop_home = os.getenv('HADOOP_HOME')
@@ -1,5 +1,6 @@
package org.apache.hadoop.yarn.rabit;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
@@ -7,6 +8,7 @@ import java.util.Map;
import java.util.Queue;
import java.util.Collection;
import java.util.Collections;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@@ -161,7 +163,8 @@ public class ApplicationMaster {
        numVCores = this.getEnvInteger("rabit_cpu_vcores", true, numVCores);
        numMemoryMB = this.getEnvInteger("rabit_memory_mb", true, numMemoryMB);
        numTasks = this.getEnvInteger("rabit_world_size", true, numTasks);
        maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false, maxNumAttempt);
        maxNumAttempt = this.getEnvInteger("rabit_max_attempt", false,
                maxNumAttempt);
    }

    /**
@@ -180,7 +183,7 @@ public class ApplicationMaster {
        RegisterApplicationMasterResponse response = this.rmClient
                .registerApplicationMaster(this.appHostName,
                        this.appTrackerPort, this.appTrackerUrl);

        boolean success = false;
        String diagnostics = "";
        try {
@@ -225,7 +228,8 @@ public class ApplicationMaster {
                success ? FinalApplicationStatus.SUCCEEDED
                        : FinalApplicationStatus.FAILED, diagnostics,
                appTrackerUrl);
        if (!success) throw new Exception("Application not successful");
        if (!success)
            throw new Exception("Application not successful");
    }

    /**
@@ -270,17 +274,10 @@ public class ApplicationMaster {
        task.containerRequest = null;
        ContainerLaunchContext ctx = Records
                .newRecord(ContainerLaunchContext.class);
        String hadoop = "hadoop";
        if (System.getenv("HADOOP_HOME") != null) {
            hadoop = "${HADOOP_HOME}/bin/hadoop";
        } else if (System.getenv("HADOOP_PREFIX") != null) {
            hadoop = "${HADOOP_PREFIX}/bin/hadoop";
        }

        String cmd =
                // use this to setup CLASSPATH correctly for libhdfs
                "CLASSPATH=${CLASSPATH}:`" + hadoop + " classpath --glob` "
                + this.command + " 1>"
        String cmd =
                // use this to setup CLASSPATH correctly for libhdfs
                // "CLASSPATH=${CLASSPATH}:`" + hadoop + " classpath --glob` "
                this.command + " 1>"
                + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
                + " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR
                + "/stderr";
@@ -289,17 +286,45 @@ public class ApplicationMaster {
        ctx.setLocalResources(this.workerResources);
        // setup environment variables
        Map<String, String> env = new java.util.HashMap<String, String>();

        // setup class path, this is kind of duplicated, ignoring
        StringBuilder cpath = new StringBuilder("${CLASSPATH}:./*");
        for (String c : conf.getStrings(
                YarnConfiguration.YARN_APPLICATION_CLASSPATH,
                YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
            cpath.append(':');
            cpath.append(c.trim());
            String[] arrPath = c.split(":");
            for (String ps : arrPath) {
                if (ps.endsWith("*.jar") || ps.endsWith("*")) {
                    ps = ps.substring(0, ps.lastIndexOf('*'));
                    String prefix = ps.substring(0, ps.lastIndexOf('/'));
                    if (ps.startsWith("$")) {
                        String[] arr = ps.split("/", 2);
                        if (arr.length != 2) continue;
                        try {
                            ps = System.getenv(arr[0].substring(1)) + '/' + arr[1];
                        } catch (Exception e) {
                            continue;
                        }
                    }
                    File dir = new File(ps);
                    if (dir.isDirectory()) {
                        for (File f : dir.listFiles()) {
                            if (f.isFile() && f.getPath().endsWith(".jar")) {
                                cpath.append(":");
                                cpath.append(prefix + '/' + f.getName());
                            }
                        }
                    }
                } else {
                    cpath.append(':');
                    cpath.append(ps.trim());
                }
            }
        }
        // already use hadoop command to get class path in worker, maybe a better solution in future
        // env.put("CLASSPATH", cpath.toString());
        // already use hadoop command to get class path in worker, maybe a
        // better solution in future
        env.put("CLASSPATH", cpath.toString());
        //LOG.info("CLASSPATH =" + cpath.toString());
        // setup LD_LIBARY_PATH path for libhdfs
        env.put("LD_LIBRARY_PATH",
                "${LD_LIBRARY_PATH}:$HADOOP_HDFS_HOME/lib/native:$JAVA_HOME/jre/lib/amd64/server");
@@ -312,7 +337,7 @@ public class ApplicationMaster {
        }
        env.put("rabit_task_id", String.valueOf(task.taskId));
        env.put("rabit_num_trial", String.valueOf(task.attemptCounter));

        ctx.setEnvironment(env);
        synchronized (this) {
            assert (!this.runningTasks.containsKey(container.getId()));
@@ -390,9 +415,14 @@ public class ApplicationMaster {
            if (r == null) {
                continue;
            }
            LOG.info("Task " + r.taskId + "failed on " + r.container.getId() + ". See LOG at : " +
                    String.format("http://%s/node/containerlogs/%s/" + userName,
                            r.container.getNodeHttpAddress(), r.container.getId()));
            LOG.info("Task "
                    + r.taskId
                    + "failed on "
                    + r.container.getId()
                    + ". See LOG at : "
                    + String.format("http://%s/node/containerlogs/%s/"
                            + userName, r.container.getNodeHttpAddress(),
                            r.container.getId()));
            r.attemptCounter += 1;
            r.container = null;
            tasks.add(r);
@@ -426,22 +456,26 @@ public class ApplicationMaster {
                finishedTasks.add(r);
                runningTasks.remove(s.getContainerId());
            } else {
                switch (exstatus) {
                case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
                    this.abortJob("[Rabit] Task "
                            + r.taskId
                            + " killed because of exceeding allocated physical memory");
                    break;
                case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
                    this.abortJob("[Rabit] Task "
                            + r.taskId
                            + " killed because of exceeding allocated virtual memory");
                    break;
                default:
                    LOG.info("[Rabit] Task " + r.taskId
                            + " exited with status " + exstatus);
                    failed.add(s.getContainerId());
                try {
                    if (exstatus == ContainerExitStatus.class.getField(
                            "KILLED_EXCEEDED_PMEM").getInt(null)) {
                        this.abortJob("[Rabit] Task "
                                + r.taskId
                                + " killed because of exceeding allocated physical memory");
                        continue;
                    }
                    if (exstatus == ContainerExitStatus.class.getField(
                            "KILLED_EXCEEDED_VMEM").getInt(null)) {
                        this.abortJob("[Rabit] Task "
                                + r.taskId
                                + " killed because of exceeding allocated virtual memory");
                        continue;
                    }
                } catch (Exception e) {
                }
                LOG.info("[Rabit] Task " + r.taskId + " exited with status "
                        + exstatus);
                failed.add(s.getContainerId());
            }
        }
        this.handleFailure(failed);
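Replacing the direct references to ContainerExitStatus.KILLED_EXCEEDED_PMEM / KILLED_EXCEEDED_VMEM with a reflective getField(...).getInt(null) lookup presumably lets the ApplicationMaster compile and run against Hadoop versions where those constants are not defined. A minimal sketch of that pattern as a standalone helper follows; the class name and sentinel value are illustrative, not part of the commit.

import org.apache.hadoop.yarn.api.records.ContainerExitStatus;

final class ExitStatusLookup {
    // Resolve a ContainerExitStatus constant by name at runtime.
    // Returns Integer.MIN_VALUE (an otherwise unused sentinel) when the
    // running Hadoop version does not define that field.
    static int valueOf(String fieldName) {
        try {
            return ContainerExitStatus.class.getField(fieldName).getInt(null);
        } catch (NoSuchFieldException e) {
            return Integer.MIN_VALUE;
        } catch (IllegalAccessException e) {
            return Integer.MIN_VALUE;
        }
    }
}

A caller can then compare exstatus against ExitStatusLookup.valueOf("KILLED_EXCEEDED_PMEM") and simply fall through to the generic logging path on clusters that lack the constant.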
@@ -50,6 +50,7 @@ public class Client {
     * @throws IOException
     */
    private Client() throws IOException {
        conf.addResource(new Path(System.getenv("HADOOP_CONF_DIR") + "/core-site.xml"));
        dfs = FileSystem.get(conf);
    }
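The added addResource call loads the cluster's core-site.xml before FileSystem.get, so dfs resolves the configured default filesystem (normally HDFS) rather than falling back to the local filesystem. A minimal standalone sketch of that behavior, with a null guard added for illustration (the class and method names below are not from the commit):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DfsConnect {
    // Load core-site.xml from HADOOP_CONF_DIR (if set) so that
    // FileSystem.get() resolves fs.defaultFS from the cluster config
    // rather than defaulting to the local filesystem.
    static FileSystem connect() throws IOException {
        Configuration conf = new Configuration();
        String confDir = System.getenv("HADOOP_CONF_DIR");
        if (confDir != null) {
            conf.addResource(new Path(confDir + "/core-site.xml"));
        }
        return FileSystem.get(conf);
    }
}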