recheck column mode

This commit is contained in:
tqchen
2014-11-19 11:21:07 -08:00
parent dffcbc838b
commit 54e2ed90d7
8 changed files with 51 additions and 93 deletions

View File

@@ -1,3 +0,0 @@
This folder contains toy example script to run xgboost-mpi.
This is an experimental distributed version of xgboost

View File

@@ -1,36 +0,0 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task Parameters
# the number of round to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
use_buffer = 0
# The path of training data %d is the wildcard for the rank of the data
# The idea is each process take a feature matrix with subset of columns
#
data = "train.col%d"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "../data/agaricus.txt.test"
# evaluate on training data as well each round
eval_train = 1
# The path of test data, need to use full data of test, try not use it, or keep an subsampled version
test:data = "agaricus.txt.test"

View File

@@ -1,19 +0,0 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf train.col*
k=$1
# split the lib svm file into k subfiles
python splitsvm.py ../data/agaricus.txt.train train $k
# run xgboost mpi
mpirun -n $k ../../xgboost-mpi mpi.conf
# the model can be directly loaded by single machine xgboost solver, as usuall
../../xgboost mpi.conf task=dump model_in=0002.model fmap=../data/featmap.txt name_dump=dump.nice.$k.txt
cat dump.nice.$k.txt

View File

@@ -1,32 +0,0 @@
#!/usr/bin/python
import sys
import random
# split libsvm file into different subcolumns
if len(sys.argv) < 4:
print ('Usage:<fin> <fo> k')
exit(0)
random.seed(10)
fmap = {}
k = int(sys.argv[3])
fi = open( sys.argv[1], 'r' )
fos = []
for i in range(k):
fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
for l in open(sys.argv[1]):
arr = l.split()
for f in fos:
f.write(arr[0])
for it in arr[1:]:
fid = int(it.split(':')[0])
if fid not in fmap:
fmap[fid] = random.randint(0, k-1)
fos[fmap[fid]].write(' '+it)
for f in fos:
f.write('\n')
for f in fos:
f.close()