add kmeans example

This commit is contained in:
tqchen 2014-12-29 18:32:56 -08:00
parent 76abd80cb7
commit 39504825d8
5 changed files with 25 additions and 16 deletions

View File

@ -7,12 +7,11 @@ export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include
# specify tensor path
BIN = kmeans.rabit
MOCKBIN= kmeans.mock
MPIBIN = kmeans.mpi
# object files that make up the rabit library
OBJ = kmeans.o
MPIBIN = kmeans.mpi
.PHONY: clean all lib libmpi
all: $(BIN) $(MOCKBIN)
.PHONY: clean all lib
all: $(BIN)
lib:
cd ..;make lib/librabit.a lib/librabit_mock.a; cd -
@ -38,4 +37,4 @@ $(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) -lrabit_mpi
clean:
$(RM) $(OBJ) $(BIN) $(MPIBIN) *~ ../src/*~
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MOCKBIN) *~ ../src/*~

View File

@ -83,7 +83,9 @@ inline size_t GetCluster(const Matrix &centroids,
int main(int argc, char *argv[]) {
if (argc < 5) {
printf("Usage: <data_dir> num_cluster max_iter <out_model>\n");
if (rabit::GetRank() == 0) {
rabit::TrackerPrintf("Usage: <data_dir> num_cluster max_iter <out_model>\n");
}
return 0;
}
clock_t tStart = clock();

View File

@ -6,4 +6,4 @@ then
fi
#set path to hadoop streaming jar here
STREAMING_JAR=
python ../tracker/rabit_hadoop.py -hs $STREAMING_JAR -s $1 -i $2 -m kmeans.rabit --args "stdin "$3" "$4" stdout" -o $5 --file kmeans.rabit
python ../tracker/rabit_hadoop.py -hs $STREAMING_JAR -n $1 -i $2 -o $5 kmeans.rabit stdin $3 $4 stdout

View File

@ -29,7 +29,7 @@ struct SparseMat {
v.length = static_cast<unsigned>(row_ptr[i + 1]-row_ptr[i]);
return v;
}
// load data from file
// load data from LibSVM format
inline void Load(const char *fname) {
FILE *fi;
if (!strcmp(fname, "stdin")) {
@ -41,17 +41,25 @@ struct SparseMat {
row_ptr.push_back(0);
data.clear();
feat_dim = 0;
unsigned num_feat;
while (fscanf(fi, "%u", &num_feat) == 1) {
float label; bool init = true;
char tmp[1024];
while (fscanf(file, "%s", tmp) == 1) {
Entry e;
for (unsigned i = 0; i < num_feat; ++i) {
utils::Check(fscanf(fi, "%u:%f", &e.findex, &e.fvalue) == 2,
"invalid format");
if (sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) {
data.push_back(e);
feat_dim = std::max(e.findex, feat_dim);
} else {
if (!init) {
labels.push_back(label);
row_ptr.push_back(data.size());
}
utils::Check(sscanf(tmp, "%f", &label) == 1, "invalid LibSVM format");
init = false;
}
row_ptr.push_back(data.size());
}
// last row
labels.push_back(label);
row_ptr.push_back(data.size());
feat_dim += 1;
// close the file
if (fi != stdin) fclose(fi);
@ -63,6 +71,7 @@ struct SparseMat {
unsigned feat_dim;
std::vector<size_t> row_ptr;
std::vector<Entry> data;
std::vector<float> labels;
};
// dense matrix
struct Matrix {
@ -85,7 +94,6 @@ struct Matrix {
} else {
fo = utils::FopenCheck(fname, "w");
}
fprintf(fo, "%lu %lu\n", nrow, ncol);
for (size_t i = 0; i < data.size(); ++i) {
fprintf(fo, "%g", data[i]);
if ((i+1) % ncol == 0) {

View File

@ -11,7 +11,7 @@ import subprocess
import rabit_tracker as tracker
#!!! Set path to hadoop and hadoop streaming jar here
hadoop_binary = None
hadoop_binary = 'hadoop'
hadoop_streaming_jar = None
# code