add kmeans example

This commit is contained in:
tqchen 2014-12-29 18:32:56 -08:00
parent 76abd80cb7
commit 39504825d8
5 changed files with 25 additions and 16 deletions

View File

@ -7,12 +7,11 @@ export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include
# specify tensor path # specify tensor path
BIN = kmeans.rabit BIN = kmeans.rabit
MOCKBIN= kmeans.mock MOCKBIN= kmeans.mock
MPIBIN = kmeans.mpi
# objectives that makes up rabit library # objectives that makes up rabit library
OBJ = kmeans.o OBJ = kmeans.o
MPIBIN = kmeans.mpi .PHONY: clean all lib
.PHONY: clean all lib libmpi all: $(BIN)
all: $(BIN) $(MOCKBIN)
lib: lib:
cd ..;make lib/librabit.a lib/librabit_mock.a; cd - cd ..;make lib/librabit.a lib/librabit_mock.a; cd -
@ -38,4 +37,4 @@ $(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) -lrabit_mpi $(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) -lrabit_mpi
clean: clean:
$(RM) $(OBJ) $(BIN) $(MPIBIN) *~ ../src/*~ $(RM) $(OBJ) $(BIN) $(MPIBIN) $(MOCKBIN) *~ ../src/*~

View File

@ -83,7 +83,9 @@ inline size_t GetCluster(const Matrix &centroids,
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
if (argc < 5) { if (argc < 5) {
printf("Usage: <data_dir> num_cluster max_iter <out_model>\n"); if (rabit::GetRank() == 0) {
rabit::TrackerPrintf("Usage: <data_dir> num_cluster max_iter <out_model>\n");
}
return 0; return 0;
} }
clock_t tStart = clock(); clock_t tStart = clock();

View File

@ -6,4 +6,4 @@ then
fi fi
#set path to hadoop streaming jar here #set path to hadoop streaming jar here
STREAMING_JAR= STREAMING_JAR=
python ../tracker/rabit_hadoop.py -hs $STREAMING_JAR -s $1 -i $2 -m kmeans.rabit --args "stdin "$3" "$4" stdout" -o $5 --file kmeans.rabit python ../tracker/rabit_hadoop.py -hs $STREAMING_JAR -n $1 -i $2 -o $5 kmeans.rabit stdin $3 $4 stdout

View File

@ -29,7 +29,7 @@ struct SparseMat {
v.length = static_cast<unsigned>(row_ptr[i + 1]-row_ptr[i]); v.length = static_cast<unsigned>(row_ptr[i + 1]-row_ptr[i]);
return v; return v;
} }
// load data from file // load data from LibSVM format
inline void Load(const char *fname) { inline void Load(const char *fname) {
FILE *fi; FILE *fi;
if (!strcmp(fname, "stdin")) { if (!strcmp(fname, "stdin")) {
@ -41,17 +41,25 @@ struct SparseMat {
row_ptr.push_back(0); row_ptr.push_back(0);
data.clear(); data.clear();
feat_dim = 0; feat_dim = 0;
unsigned num_feat; float label; bool init = true;
while (fscanf(fi, "%u", &num_feat) == 1) { char tmp[1024];
while (fscanf(file, "%s", tmp) == 1) {
Entry e; Entry e;
for (unsigned i = 0; i < num_feat; ++i) { if (sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) {
utils::Check(fscanf(fi, "%u:%f", &e.findex, &e.fvalue) == 2,
"invalid format");
data.push_back(e); data.push_back(e);
feat_dim = std::max(e.findex, feat_dim); feat_dim = std::max(e.findex, feat_dim);
} else {
if (!init) {
labels.push_back(label);
row_ptr.push_back(data.size());
}
utils::Check(sscanf(tmp, "%f", &label) == 1, "invalid LibSVM format");
init = false;
} }
row_ptr.push_back(data.size());
} }
// last row
labels.push_back(label);
row_ptr.push_back(data.size());
feat_dim += 1; feat_dim += 1;
// close the file // close the file
if (fi != stdin) fclose(fi); if (fi != stdin) fclose(fi);
@ -63,6 +71,7 @@ struct SparseMat {
unsigned feat_dim; unsigned feat_dim;
std::vector<size_t> row_ptr; std::vector<size_t> row_ptr;
std::vector<Entry> data; std::vector<Entry> data;
std::vector<float> labels;
}; };
// dense matrix // dense matrix
struct Matrix { struct Matrix {
@ -85,7 +94,6 @@ struct Matrix {
} else { } else {
fo = utils::FopenCheck(fname, "w"); fo = utils::FopenCheck(fname, "w");
} }
fprintf(fo, "%lu %lu\n", nrow, ncol);
for (size_t i = 0; i < data.size(); ++i) { for (size_t i = 0; i < data.size(); ++i) {
fprintf(fo, "%g", data[i]); fprintf(fo, "%g", data[i]);
if ((i+1) % ncol == 0) { if ((i+1) % ncol == 0) {

View File

@ -11,7 +11,7 @@ import subprocess
import rabit_tracker as tracker import rabit_tracker as tracker
#!!! Set path to hadoop and hadoop streaming jar here #!!! Set path to hadoop and hadoop streaming jar here
hadoop_binary = None hadoop_binary = 'hadoop'
hadoop_streaming_jar = None hadoop_streaming_jar = None
# code # code