diff --git a/toolkit/Makefile b/toolkit/Makefile
index 69819246a..646558a74 100644
--- a/toolkit/Makefile
+++ b/toolkit/Makefile
@@ -7,12 +7,11 @@ export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include
 # specify tensor path
 BIN = kmeans.rabit
 MOCKBIN= kmeans.mock
+MPIBIN = kmeans.mpi
 # objectives that makes up rabit library
 OBJ = kmeans.o
-MPIBIN = kmeans.mpi
-.PHONY: clean all lib libmpi
-
-all: $(BIN) $(MOCKBIN)
+.PHONY: clean all lib
+all: $(BIN)
 
 lib:
 	cd ..;make lib/librabit.a lib/librabit_mock.a; cd -
@@ -38,4 +37,4 @@ $(MPIBIN) :
 	$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) -lrabit_mpi
 
 clean:
-	$(RM) $(OBJ) $(BIN) $(MPIBIN) *~ ../src/*~
+	$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MOCKBIN) *~ ../src/*~
diff --git a/toolkit/kmeans.cpp b/toolkit/kmeans.cpp
index e6be48fc0..0a8171f9f 100644
--- a/toolkit/kmeans.cpp
+++ b/toolkit/kmeans.cpp
@@ -83,7 +83,9 @@ inline size_t GetCluster(const Matrix &centroids,
 
 int main(int argc, char *argv[]) {
   if (argc < 5) {
-    printf("Usage: <data_in> num_cluster max_iter <out_model>\n");
+    if (rabit::GetRank() == 0) {
+      rabit::TrackerPrintf("Usage: <data_in> num_cluster max_iter <out_model>\n");
+    }
     return 0;
   }
   clock_t tStart = clock();
diff --git a/toolkit/kmeans_hadoop.sh b/toolkit/kmeans_hadoop.sh
index 9e7b3b832..fb8d1d5a2 100755
--- a/toolkit/kmeans_hadoop.sh
+++ b/toolkit/kmeans_hadoop.sh
@@ -6,4 +6,4 @@ then
 fi
 #set path to hadoop streaming jar here
 STREAMING_JAR=
-python ../tracker/rabit_hadoop.py -hs $STREAMING_JAR -s $1 -i $2 -m kmeans.rabit --args "stdin "$3" "$4" stdout" -o $5 --file kmeans.rabit
+python ../tracker/rabit_hadoop.py -hs $STREAMING_JAR -n $1 -i $2 -o $5 kmeans.rabit stdin $3 $4 stdout
diff --git a/toolkit/toolkit_util.h b/toolkit/toolkit_util.h
index a2f8f56ac..061d3e97f 100644
--- a/toolkit/toolkit_util.h
+++ b/toolkit/toolkit_util.h
@@ -29,7 +29,7 @@ struct SparseMat {
     v.length = static_cast<unsigned>(row_ptr[i + 1]-row_ptr[i]);
     return v;
   }
-  // load data from file
+  // load data from LibSVM format
   inline void Load(const char *fname) {
     FILE *fi;
     if (!strcmp(fname, "stdin")) {
@@ -41,17 +41,25 @@ struct SparseMat {
     row_ptr.push_back(0);
     data.clear();
     feat_dim = 0;
-    unsigned num_feat;
-    while (fscanf(fi, "%u", &num_feat) == 1) {
+    float label; bool init = true;
+    char tmp[1024];
+    while (fscanf(fi, "%s", tmp) == 1) {
       Entry e;
-      for (unsigned i = 0; i < num_feat; ++i) {
-        utils::Check(fscanf(fi, "%u:%f", &e.findex, &e.fvalue) == 2,
-                     "invalid format");
+      if (sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) {
         data.push_back(e);
         feat_dim = std::max(e.findex, feat_dim);
+      } else {
+        if (!init) {
+          labels.push_back(label);
+          row_ptr.push_back(data.size());
+        }
+        utils::Check(sscanf(tmp, "%f", &label) == 1, "invalid LibSVM format");
+        init = false;
       }
-      row_ptr.push_back(data.size());
     }
+    // last row
+    labels.push_back(label);
+    row_ptr.push_back(data.size());
     feat_dim += 1;
     // close the filed
     if (fi != stdin) fclose(fi);
@@ -63,6 +71,7 @@ struct SparseMat {
   unsigned feat_dim;
   std::vector<size_t> row_ptr;
   std::vector<Entry> data;
+  std::vector<float> labels;
 };
 // dense matrix
 struct Matrix {
@@ -85,7 +94,6 @@ struct Matrix {
     } else {
       fo = utils::FopenCheck(fname, "w");
     }
-    fprintf(fo, "%lu %lu\n", nrow, ncol);
     for (size_t i = 0; i < data.size(); ++i) {
       fprintf(fo, "%g", data[i]);
       if ((i+1) % ncol == 0) {
diff --git a/tracker/rabit_hadoop.py b/tracker/rabit_hadoop.py
index fbed6ad8f..0d2a33b90 100755
--- a/tracker/rabit_hadoop.py
+++ b/tracker/rabit_hadoop.py
@@ -11,7 +11,7 @@ import subprocess
 import rabit_tracker as tracker
 
 #!!! Set path to hadoop and hadoop streaming jar here
-hadoop_binary = None
+hadoop_binary = 'hadoop'
 hadoop_streaming_jar = None
 
 # code
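Note on the new input format: SparseMat::Load now reads whitespace-separated tokens instead of the old count-prefixed rows. A token of the form index:value is appended as a feature of the current row, while a bare number is taken as the label that starts the next row, so each LibSVM line "label index:value index:value ..." becomes one row and the labels are collected into the new labels vector. The standalone sketch below mirrors that tokenizing logic under the same assumptions; ParseLibSVM, ParsedData and Entry are illustrative names only, not part of the patch.

// Minimal sketch of the token-driven LibSVM parsing introduced above.
// ParseLibSVM, ParsedData and Entry are illustrative names, not patch code.
#include <cstddef>
#include <cstdio>
#include <vector>
#include <algorithm>

struct Entry { unsigned findex; float fvalue; };

struct ParsedData {
  std::vector<float> labels;        // one label per row
  std::vector<std::size_t> row_ptr; // CSR-style row offsets into data
  std::vector<Entry> data;          // all feature entries
  unsigned feat_dim;                // number of feature columns seen
};

inline ParsedData ParseLibSVM(FILE *fi) {
  ParsedData m;
  m.feat_dim = 0;
  m.row_ptr.push_back(0);
  float label = 0.0f;
  bool init = true;
  char tmp[1024];
  while (std::fscanf(fi, "%1023s", tmp) == 1) {
    Entry e;
    if (std::sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) {
      // "index:value" token: a feature of the current row
      m.data.push_back(e);
      m.feat_dim = std::max(e.findex, m.feat_dim);
    } else {
      // bare number: label starting a new row; close the previous row first
      if (!init) {
        m.labels.push_back(label);
        m.row_ptr.push_back(m.data.size());
      }
      if (std::sscanf(tmp, "%f", &label) != 1) break;  // invalid token
      init = false;
    }
  }
  if (!init) {  // close the last row
    m.labels.push_back(label);
    m.row_ptr.push_back(m.data.size());
  }
  m.feat_dim += 1;  // feature indices are 0-based
  return m;
}

int main() {
  // e.g. echo "1 0:1.5 3:2.0" | ./parse_demo
  ParsedData m = ParseLibSVM(stdin);
  std::printf("%zu rows, %u feature columns\n", m.labels.size(), m.feat_dim);
  return 0;
}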