add kmeans example

This commit is contained in:
tqchen 2014-12-29 18:32:56 -08:00
parent 76abd80cb7
commit 39504825d8
5 changed files with 25 additions and 16 deletions

View File

@ -7,12 +7,11 @@ export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include
# specify tensor path # specify tensor path
BIN = kmeans.rabit BIN = kmeans.rabit
MOCKBIN= kmeans.mock MOCKBIN= kmeans.mock
MPIBIN = kmeans.mpi
# objectives that makes up rabit library # objectives that makes up rabit library
OBJ = kmeans.o OBJ = kmeans.o
MPIBIN = kmeans.mpi .PHONY: clean all lib
.PHONY: clean all lib libmpi all: $(BIN)
all: $(BIN) $(MOCKBIN)
lib: lib:
cd ..;make lib/librabit.a lib/librabit_mock.a; cd - cd ..;make lib/librabit.a lib/librabit_mock.a; cd -
@ -38,4 +37,4 @@ $(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) -lrabit_mpi $(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) -lrabit_mpi
clean: clean:
$(RM) $(OBJ) $(BIN) $(MPIBIN) *~ ../src/*~ $(RM) $(OBJ) $(BIN) $(MPIBIN) $(MOCKBIN) *~ ../src/*~

View File

@ -83,7 +83,9 @@ inline size_t GetCluster(const Matrix &centroids,
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
if (argc < 5) { if (argc < 5) {
printf("Usage: <data_dir> num_cluster max_iter <out_model>\n"); if (rabit::GetRank() == 0) {
rabit::TrackerPrintf("Usage: <data_dir> num_cluster max_iter <out_model>\n");
}
return 0; return 0;
} }
clock_t tStart = clock(); clock_t tStart = clock();

View File

@ -6,4 +6,4 @@ then
fi fi
#set path to hadoop streaming jar here #set path to hadoop streaming jar here
STREAMING_JAR= STREAMING_JAR=
python ../tracker/rabit_hadoop.py -hs $STREAMING_JAR -s $1 -i $2 -m kmeans.rabit --args "stdin "$3" "$4" stdout" -o $5 --file kmeans.rabit python ../tracker/rabit_hadoop.py -hs $STREAMING_JAR -n $1 -i $2 -o $5 kmeans.rabit stdin $3 $4 stdout

View File

@ -29,7 +29,7 @@ struct SparseMat {
v.length = static_cast<unsigned>(row_ptr[i + 1]-row_ptr[i]); v.length = static_cast<unsigned>(row_ptr[i + 1]-row_ptr[i]);
return v; return v;
} }
// load data from file // load data from LibSVM format
inline void Load(const char *fname) { inline void Load(const char *fname) {
FILE *fi; FILE *fi;
if (!strcmp(fname, "stdin")) { if (!strcmp(fname, "stdin")) {
@ -41,17 +41,25 @@ struct SparseMat {
row_ptr.push_back(0); row_ptr.push_back(0);
data.clear(); data.clear();
feat_dim = 0; feat_dim = 0;
unsigned num_feat; float label; bool init = true;
while (fscanf(fi, "%u", &num_feat) == 1) { char tmp[1024];
while (fscanf(file, "%s", tmp) == 1) {
Entry e; Entry e;
for (unsigned i = 0; i < num_feat; ++i) { if (sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) {
utils::Check(fscanf(fi, "%u:%f", &e.findex, &e.fvalue) == 2,
"invalid format");
data.push_back(e); data.push_back(e);
feat_dim = std::max(e.findex, feat_dim); feat_dim = std::max(e.findex, feat_dim);
} else {
if (!init) {
labels.push_back(label);
row_ptr.push_back(data.size());
}
utils::Check(sscanf(tmp, "%f", &label) == 1, "invalid LibSVM format");
init = false;
} }
row_ptr.push_back(data.size());
} }
// last row
labels.push_back(label);
row_ptr.push_back(data.size());
feat_dim += 1; feat_dim += 1;
// close the file // close the file
if (fi != stdin) fclose(fi); if (fi != stdin) fclose(fi);
@ -63,6 +71,7 @@ struct SparseMat {
unsigned feat_dim; unsigned feat_dim;
std::vector<size_t> row_ptr; std::vector<size_t> row_ptr;
std::vector<Entry> data; std::vector<Entry> data;
std::vector<float> labels;
}; };
// dense matrix // dense matrix
struct Matrix { struct Matrix {
@ -85,7 +94,6 @@ struct Matrix {
} else { } else {
fo = utils::FopenCheck(fname, "w"); fo = utils::FopenCheck(fname, "w");
} }
fprintf(fo, "%lu %lu\n", nrow, ncol);
for (size_t i = 0; i < data.size(); ++i) { for (size_t i = 0; i < data.size(); ++i) {
fprintf(fo, "%g", data[i]); fprintf(fo, "%g", data[i]);
if ((i+1) % ncol == 0) { if ((i+1) % ncol == 0) {

View File

@ -11,7 +11,7 @@ import subprocess
import rabit_tracker as tracker import rabit_tracker as tracker
#!!! Set path to hadoop and hadoop streaming jar here #!!! Set path to hadoop and hadoop streaming jar here
hadoop_binary = None hadoop_binary = 'hadoop'
hadoop_streaming_jar = None hadoop_streaming_jar = None
# code # code