finish mushroom example

2014-10-16 18:06:47 -07:00
parent 0cf2dd39ea
commit f512f08437
13 changed files with 167 additions and 10 deletions
--- a/demo/mpi/README.md
+++ b/demo/mpi/README.md
@@ -0,0 +1,3 @@
+This folder contains toy example script to run xgboost-mpi. 
+
+This is an experimental distributed version of xgboost
--- a/demo/mpi/mpi.conf
+++ b/demo/mpi/mpi.conf
@@ -0,0 +1,36 @@
+# General Parameters, see comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0 
+# minimum loss reduction required to make a further partition
+gamma = 1.0 
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1 
+# maximum depth of a tree
+max_depth = 3 
+
+# Task Parameters
+# the number of round to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0 
+use_buffer = 0
+
+
+# The path of training data %d is the wildcard for the rank of the data
+# The idea is each process take a feature matrix with subset of columns
+#
+data = "train.col%d" 
+
+# The path of validation data, used to monitor training process, here [test] sets name of the validation set
+eval[test] = "../data/agaricus.txt.test" 
+# evaluate on training data as well each round
+eval_train = 1
+
+# The path of test data, need to use full data of test, try not use it, or keep an subsampled version
+test:data = "agaricus.txt.test"      
--- a/demo/mpi/runexp-mpi.sh
+++ b/demo/mpi/runexp-mpi.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+rm -rf train.col*
+k=$1
+
+# split the lib svm file into k subfiles
+python splitsvm.py ../data/agaricus.txt.train train $k
+
+# run xgboost mpi
+mpirun -n $k ../../xgboost-mpi  mpi.conf 
+
+# the model can be directly loaded by single machine xgboost solver, as usuall
+../../xgboost mpi.conf task=dump model_in=0002.model fmap=../data/featmap.txt name_dump=dump.nice.$k.txt
+cat dump.nice.$k.txt
--- a/demo/mpi/splitsvm.py
+++ b/demo/mpi/splitsvm.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+import sys
+import random
+
+# split libsvm file into different subcolumns
+if len(sys.argv) < 4:
+    print ('Usage:<fin> <fo> k')
+    exit(0)
+
+random.seed(10)
+fmap = {}
+
+k = int(sys.argv[3])
+fi = open( sys.argv[1], 'r' )
+fos = []
+
+for i in range(k):
+    fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
+    
+for l in open(sys.argv[1]):
+    arr = l.split()
+    for f in fos:
+        f.write(arr[0])
+    for it in arr[1:]:
+        fid = int(it.split(':')[0])
+        if fid not in fmap:
+            fmap[fid] = random.randint(0, k-1)
+        fos[fmap[fid]].write(' '+it)
+    for f in fos:
+        f.write('\n')
+for f in fos:    
+    f.close()