add subtree folder

This commit is contained in:
tqchen 2015-01-18 21:07:31 -08:00
parent 9695c51ce1
commit 07da390575
6 changed files with 1 additions and 377 deletions

1
subtree/README.md Normal file
View File

@ -0,0 +1 @@
This folder contains git subtree projects of xgboost

View File

@ -1,35 +0,0 @@
# Build script for the standalone test programs in this folder.
#   make            build every program listed in BIN
#   make no_omp=1   build with -DDISABLE_OPENMP instead of -fopenmp
#   make clean      remove binaries, object files and editor backups
export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../src

ifeq ($(no_omp),1)
CFLAGS += -DDISABLE_OPENMP
else
CFLAGS += -fopenmp
endif

# test executables and the object files they link against
BIN = test_group_data test_quantile test_allreduce
OBJ = sync_tcp.o
# NOTE: MPIBIN is not assigned anywhere in this file, so the MPI rule below
# only applies when MPIBIN is supplied from the command line or environment.
.PHONY: clean all

all: $(BIN) $(MPIBIN)

sync_tcp.o: ../src/sync/sync_tcp.cpp ../src/utils/*.h
test_group_data: test_group_data.cpp ../src/utils/*.h
test_quantile: test_quantile.cpp ../src/utils/*.h
test_allreduce: test_allreduce.cpp ../src/utils/*.h ../src/sync/sync.h sync_tcp.o

# link rule: headers in the prerequisite list are filtered out of the command
$(BIN) :
	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)

# compile rule: only the first source prerequisite is compiled
$(OBJ) :
	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

$(MPIBIN) :
	$(MPICXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)

# also remove $(OBJ) so a stale sync_tcp.o is never linked after a clean
clean:
	$(RM) $(BIN) $(MPIBIN) $(OBJ) *~

View File

@ -1,42 +0,0 @@
#!/usr/bin/python
# Generate an input stream for the test_quantile program.
#
# Output (stdout): a header line "<maxn>\t<eps>" followed by one "<x>\t<w>"
# line per generated sample.  The sample order is shuffled with a fixed seed
# so repeated runs produce the same stream.
#
# The script only uses constructs that behave the same on Python 2 and 3:
# sys.stdout.write instead of the print statement, "//" for the integer
# divisions, and sys.exit instead of the site-provided exit().
import math
import sys
import random
import subprocess

# Generators are kept as source strings so the usage text can show their
# definition; the selected one is turned into a callable with eval() below.
# Each maps n to a shuffled list of (value, weight) pairs.
funcs = {
    'seq': 'lambda n: sorted([(x,1) for x in range(1,n+1)], key = lambda x:random.random())',
    'seqlogw': 'lambda n: sorted([(x, math.log(x)) for x in range(1,n+1)], key = lambda x:random.random())',
    'lots0': 'lambda n: sorted([(max(x - n*3//4,0), 1) for x in range(1,n+1)], key = lambda x:random.random())',
    'lots9': 'lambda n: sorted([(9 if x > n // 4 else x, 1) for x in range(1,n+1)], key = lambda x:random.random())',
    'lotsm': 'lambda n: sorted([(n//8 if x > n // 4 else x, 1) for x in range(1,n+1)], key = lambda x:random.random())',
    'lotsmr': 'lambda n: sorted([( x * 4 // n + n // 20 if x > n // 10 else x, 1) for x in range(1,n+1)], key = lambda x:random.random())',
    'lotsmr2': 'lambda n: sorted([( x * 10 // n + n // 20 if x > n // 10 else x, 1) for x in range(1,n+1)], key = lambda x:random.random())'
}

if len(sys.argv) < 3:
    sys.stdout.write('Usage: python mkquantest.py <maxn> <eps> [generate-type] [ndata]|./test_quantile [solver]\n')
    sys.stdout.write('test_quantile need to be compiled, solver can be gk(GK nonweight version), wq(weighted version), wx(weighthed version, with prune optimized for heavy hitter)\n')
    sys.stdout.write('Possible generate-types:\n')
    for k, v in funcs.items():
        sys.stdout.write('\t%s: %s\n' % (k, v))
    sys.stdout.write('Example: ./mkquantest.py 50000 0.3 lotsmr |./test_quantile wq\n')
    sys.exit(-1)

# fixed seed: the shuffle order is reproducible between runs
random.seed(0)
maxn = int(sys.argv[1])
eps = float(sys.argv[2])

if len(sys.argv) > 3:
    method = sys.argv[3]
    assert method in funcs, ('cannot find method %s' % method)
else:
    method = 'seq'

if len(sys.argv) > 4:
    ndata = int(sys.argv[4])
    assert ndata <= maxn, 'ndata must be smaller than maxn'
else:
    ndata = maxn

fo = sys.stdout
fo.write('%d\t%g\n' % (maxn, eps))
# eval() only ever sees one of the constant strings in funcs: method was
# checked against the table above (or is the built-in default 'seq').
for x, w in eval(funcs[method])(ndata):
    fo.write(str(x)+'\t'+str(w)+'\n')

View File

@ -1,124 +0,0 @@
#include <sync/sync.h>
#include <utils/utils.h>
#include <cstdio>
#include <cstdlib>
#include <cmath>
using namespace xgboost;
/*!
 * \brief check sync::AllReduce with sync::kMax on a float buffer
 *
 *  Entry i is filled with (i * (rank + 1)) % 111 before the call. Afterwards
 *  every entry must equal the largest of (i * (r + 1)) % 111 over all
 *  r in [0, world size).
 * \param n number of entries in the buffer
 */
inline void TestMax(size_t n) {
  const int rank = sync::GetRank();
  const int nproc = sync::GetWorldSize();
  std::vector<float> buf(n);
  for (size_t idx = 0; idx < buf.size(); ++idx) {
    buf[idx] = (idx * (rank + 1)) % 111;
  }
  sync::AllReduce(&buf[0], buf.size(), sync::kMax);
  for (size_t idx = 0; idx < buf.size(); ++idx) {
    // recompute locally what each rank contributed and take the maximum
    float expected = (idx * 1) % 111;
    for (int r = 0; r < nproc; ++r) {
      const float contrib = (float)((idx * (r + 1)) % 111);
      expected = std::max(expected, contrib);
    }
    utils::Check(expected == buf[idx], "[%d] TestMax check failure", rank);
  }
}
/*!
 * \brief check sync::AllReduce with sync::kSum on a float buffer
 *
 *  Entry i is filled with (i * (rank + 1)) % 131 before the call. Afterwards
 *  every entry must match, within 1e-5, the sum of (i * (r + 1)) % 131 over
 *  all r in [0, world size).
 * \param n number of entries in the buffer
 */
inline void TestSum(size_t n) {
  const int rank = sync::GetRank();
  const int nproc = sync::GetWorldSize();
  const int z = 131;
  std::vector<float> buf(n);
  for (size_t idx = 0; idx < buf.size(); ++idx) {
    buf[idx] = (idx * (rank + 1)) % z;
  }
  sync::AllReduce(&buf[0], buf.size(), sync::kSum);
  for (size_t idx = 0; idx < buf.size(); ++idx) {
    // recompute the expected total from every rank's contribution
    float expected = 0.0f;
    for (int r = 0; r < nproc; ++r) {
      expected += (float)((idx * (r + 1)) % z);
    }
    const bool close_enough = fabsf(expected - buf[idx]) < 1e-5;
    utils::Check(close_enough,
                 "[%d] TestSum check failure, local=%g, allreduce=%g", rank, expected, buf[idx]);
  }
}
/*!
 * \brief element type used by TestReducer with sync::Reducer
 *
 *  Each record carries a running maximum, minimum and sum, so a single
 *  reduction exercises three different combine rules at once.
 */
struct Rec {
  /*! \brief largest value folded in so far */
  double rmax;
  /*! \brief smallest value folded in so far */
  double rmin;
  /*! \brief sum of all values folded in so far */
  double rsum;
  /*! \brief default constructor, leaves the fields unset */
  Rec() {}
  /*! \brief start a record from a single value r */
  Rec(double r) {
    rmax = rmin = rsum = r;
  }
  /*!
   * \brief fold another record into this one
   * \param b the record to merge in
   */
  inline void Reduce(const Rec &b) {
    rmax = std::max(b.rmax, rmax);
    // the minimum has to be combined with min; the original used std::max
    // here, which made rmin a second copy of rmax
    rmin = std::min(b.rmin, rmin);
    rsum += b.rsum;
  }
  /*!
   * \brief report an error through utils::Error unless b matches this record
   *  (max and min compared exactly, sums within 1e-6)
   * \param b the record to compare against
   */
  inline void CheckSameAs(const Rec &b) const {
    if (rmax != b.rmax || rmin != b.rmin || fabs(rsum - b.rsum) > 1e-6) {
      utils::Error("[%d] TestReducer check failure", sync::GetRank());
    }
  }
};
inline void TestReducer(int n) {
int rank = sync::GetRank();
int nproc = sync::GetWorldSize();
const int z = 131;
sync::Reducer<Rec> red;
std::vector<Rec> ndata(n);
for (size_t i = 0; i < ndata.size(); ++i) {
ndata[i] = Rec((i * (rank+1)) % z);
}
red.AllReduce(&ndata[0], ndata.size());
for (size_t i = 0; i < ndata.size(); ++i) {
Rec rec((i * 1) % z);
for (int r = 1; r < nproc; ++r) {
rec.Reduce(Rec((i * (r+1)) % z));
}
rec.CheckSameAs(ndata[i]);
}
}
/*!
 * \brief check sync::Bcast of a std::string
 *
 *  Every process builds the same reference string of length n. Only the root
 *  copies it into the buffer handed to sync::Bcast; after the call the buffer
 *  must equal the reference on every process.
 * \param n length of the string
 * \param root rank whose buffer holds the payload before the call
 */
inline void TestBcast(size_t n, int root) {
  const int rank = sync::GetRank();
  std::string expected(n, '\0');
  for (size_t pos = 0; pos < n; ++pos) {
    // cycles through the character codes 1..126
    expected[pos] = char(pos % 126 + 1);
  }
  std::string received;
  if (root == rank) {
    received = expected;
  }
  // both root and non-root make the same call
  sync::Bcast(&received, root);
  utils::Check(received == expected, "[%d] TestBcast fail", rank);
}
/*!
 * \brief entry point of the allreduce test
 *
 *  Takes the buffer size as the first argument, then runs TestMax, TestSum and
 *  TestReducer between sync::Init and sync::Finalize, printing one line per
 *  passed stage. TestBcast is not invoked here.
 */
int main(int argc, char *argv[]) {
  if (argc < 2) {
    std::printf("Usage: <ndata>\n");
    return 0;
  }
  const int n = std::atoi(argv[1]);

  sync::Init(argc, argv);
  const int rank = sync::GetRank();
  const std::string name = sync::GetProcessorName();
  std::printf("[%d] start at %s\n", rank, name.c_str());

  TestMax(n);
  std::printf("[%d] TestMax pass\n", rank);

  TestSum(n);
  std::printf("[%d] TestSum pass\n", rank);

  TestReducer(n);
  std::printf("[%d] TestReducer pass\n", rank);

  sync::Finalize();
  std::printf("[%d] all check pass\n", rank);
  return 0;
}

View File

@ -1,84 +0,0 @@
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <utility>
#include <ctime>
#include <utils/group_data.h>
#include <utils/random.h>
#include <utils/omp.h>
#include <utils/utils.h>
using namespace xgboost::utils;
using namespace xgboost;
/*!
 * \brief entry point of the ParallelGroupBuilder test
 *
 *  Arguments: <nkey> <ndata> [nthread]. Draws ndata random keys below nkey,
 *  groups the sample indices by key with ParallelGroupBuilder in two parallel
 *  passes (AddBudget, then Push), and finally verifies that every index stored
 *  in group i really belongs to a sample whose key is i.
 */
int main(int argc, char *argv[]) {
  if (argc < 3) {
    // fixed: the optional argument used to be printed as "pnthread]"
    printf("Usage: <nkey> <ndata> [nthread]\n");
    return 0;
  }
  if (argc > 3) {
    omp_set_num_threads(atoi(argv[3]));
  }
  random::Seed(0);
  unsigned nkey = static_cast<unsigned>(atoi(argv[1]));
  size_t ndata = static_cast<size_t>(atol(argv[2]));
  // keys[i] is the key of sample i; raw holds the (key, sample index) pairs
  std::vector<unsigned> keys;
  std::vector< std::pair<unsigned, unsigned> > raw;
  raw.reserve(ndata); keys.reserve(ndata);
  for (size_t i = 0; i < ndata; ++i) {
    unsigned key = random::NextUInt32(nkey);
    utils::Check(key < nkey, "key exceed bound\n");
    raw.push_back(std::make_pair(key, static_cast<unsigned>(i)));
    keys.push_back(key);
  }
  printf("loading finish, start working\n");
  time_t start_t = time(NULL);
  // query the team size from one thread only; letting every thread store to
  // the shared variable is a data race even though the values are equal
  int nthread = 1;
  #pragma omp parallel
  {
    #pragma omp master
    {
      nthread = omp_get_num_threads();
    }
  }
  std::vector<size_t> rptr;
  std::vector<unsigned> data;
  ParallelGroupBuilder<unsigned> builder(&rptr, &data);
  builder.InitBudget(0, nthread);
  // each thread handles one contiguous slice of nstep samples
  size_t nstep = (raw.size() +nthread-1)/ nthread;
  // pass 1: register one slot per sample under its key
  #pragma omp parallel
  {
    int tid = omp_get_thread_num();
    size_t begin = tid * nstep;
    size_t end = std::min((tid + 1) * nstep, raw.size());
    for (size_t i = begin; i < end; ++i) {
      builder.AddBudget(raw[i].first, tid);
    }
  }
  double first_cost = time(NULL) - start_t;
  builder.InitStorage();
  // pass 2: store the sample indices, using the same slicing as pass 1
  #pragma omp parallel
  {
    int tid = omp_get_thread_num();
    size_t begin = tid * nstep;
    size_t end = std::min((tid + 1)* nstep, raw.size());
    for (size_t i = begin; i < end; ++i) {
      builder.Push(raw[i].first, raw[i].second, tid);
    }
  }
  // NOTE: measured from start_t, so this value also contains the phase 1 time
  double second_cost = time(NULL) - start_t;
  printf("all finish, phase1=%g sec, phase2=%g sec\n", first_cost, second_cost);
  // rptr.back() and rptr.size()-1 below are only valid on a non-empty vector
  Check(!rptr.empty(), "rptr is empty");
  Check(rptr.size() <= nkey+1, "nkey exceed bound");
  Check(rptr.back() == ndata, "data shape inconsistent");
  for (size_t i = 0; i < rptr.size()-1; ++ i) {
    Check(rptr[i] <= rptr[i+1], "rptr error");
    for (size_t j = rptr[i]; j < rptr[i+1]; ++j) {
      unsigned pos = data[j];
      Check(pos < keys.size(), "invalid pos");
      Check(keys[pos] == i, "invalid key entry");
    }
  }
  printf("all check pass\n");
  return 0;
}

View File

@ -1,92 +0,0 @@
#include <vector>
#include <utils/quantile.h>
#include <ctime>
using namespace xgboost;
/*!
 * \brief one input sample of the quantile test
 *  Entries are ordered by their value only.
 */
struct Entry {
  /*! \brief the sample value */
  double x;
  /*! \brief the sample weight */
  double w;
  /*! \brief filled in by MakeQuantile: total weight of all smaller values */
  double rmin;
  /*! \brief compare by value, weight and rmin are ignored */
  bool operator<(const Entry &other) const {
    return other.x > x;
  }
};
inline void MakeQuantile(std::vector<Entry> &dat) {
std::sort(dat.begin(), dat.end());
size_t top = 0;
double wsum = 0.0;
for (size_t i = 0; i < dat.size();) {
size_t j = i + 1;
for (;j < dat.size() && dat[i].x == dat[j].x; ++j) {
dat[i].w += dat[j].w;
}
dat[top] = dat[i];
dat[top].rmin = wsum;
wsum += dat[top].w;
++top;
i = j;
}
dat.resize(top);
}
/*!
 * \brief compare a sketch summary against the exact ranks of the data
 *
 *  dat is first converted with MakeQuantile, so dat[j].rmin is the total
 *  weight below value dat[j].x and dat[j].rmin + dat[j].w is the total weight
 *  up to and including it. For every summary entry the matching exact entry
 *  is located and three bounds are checked:
 *    summary rmin <= exact rmin,
 *    summary rmax >= exact rmin + w,
 *    summary wmin <= exact w.
 *  The largest amount by which any bound is broken is printed.
 * \param dat raw samples, replaced by their deduplicated rank table
 * \param out summary to verify; must provide size and data[i] with the fields
 *            value, rmin, rmax and wmin
 * \tparam Summary type of the summary
 */
template<typename Summary>
inline void verifyWQ(std::vector<Entry> &dat, Summary out) {
  MakeQuantile(dat);
  size_t j = 0;
  // most negative slack seen so far; stays 0 when no bound is broken
  double err = 0.0;
  const double eps = 1e-4;
  for (size_t i = 0; i < out.size; ++i) {
    // both sequences are scanned in increasing value order, so j only advances
    while (j < dat.size() && dat[j].x < out.data[i].value) ++j;
    utils::Assert(j < dat.size() && fabs(dat[j].x - out.data[i].value) < eps, "bug");
    err = std::min(dat[j].rmin - out.data[i].rmin, err);
    // the exact upper rank is rmin + w; the original subtracted rmin and then
    // added w, which let an rmax that is too small by up to 2*w pass
    err = std::min(out.data[i].rmax - (dat[j].rmin + dat[j].w), err);
    err = std::min(dat[j].w - out.data[i].wmin, err);
  }
  if (err < 0.0) err = -err;
  printf("verify correctness, max-constraint-violation=%g (0 means perfect, coubld be nonzero due to floating point)\n", err);
}
/*!
 * \brief read a data stream from stdin, feed it to a sketch and report on it
 *
 *  Expected input: a header "n eps" followed by "x w" pairs until the input
 *  ends or stops parsing. Every pair is appended to dat and then pushed into
 *  the sketch; the time spent pushing, the summary, its MaxError and the
 *  sketch level counters are printed.
 * \param dat receives the samples that were read
 * \return the summary obtained from the sketch
 * \tparam Sketch sketch type under test
 * \tparam RType not used inside this function
 */
template<typename Sketch, typename RType>
inline typename Sketch::SummaryContainer test(std::vector<Entry> &dat) {
  Sketch sketch;
  // "%lu" requires an unsigned long; size_t is a different type on some
  // platforms, so read into unsigned long and convert afterwards
  unsigned long n;
  double wsum = 0.0;
  float eps;
  utils::Check(scanf("%lu%f", &n, &eps) == 2, "needs to start with n eps");
  sketch.Init(static_cast<size_t>(n), eps);
  Entry e;
  while (scanf("%lf%lf", &e.x, &e.w) == 2) {
    dat.push_back(e);
    wsum += e.w;
  }
  // only the Push calls are timed, reading the input is excluded
  clock_t start = clock();
  for (size_t i = 0; i < dat.size(); ++i) {
    sketch.Push(dat[i].x, dat[i].w);
  }
  double tcost = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
  typename Sketch::SummaryContainer out;
  sketch.GetSummary(&out);
  double maxerr = static_cast<double>(out.MaxError());
  out.Print();
  printf("-------------------------\n");
  printf("timecost=%g sec\n", tcost);
  printf("MaxError=%g/%g = %g\n", maxerr, wsum, maxerr / wsum);
  // cast so that the arguments match "%lu" whatever the counter types are
  printf("maxlevel = %lu, usedlevel=%lu, limit_size=%lu\n",
         static_cast<unsigned long>(sketch.nlevel),
         static_cast<unsigned long>(sketch.level.size()),
         static_cast<unsigned long>(sketch.limit_size));
  return out;
}
/*!
 * \brief entry point of the quantile sketch test
 *
 *  The optional first argument selects the sketch: "wq" (default) and "wx"
 *  run the sketch and verify its summary with verifyWQ, "gk" only runs the
 *  sketch. Any other name is reported on stderr and yields exit code 1
 *  instead of silently doing nothing.
 */
int main(int argc, char *argv[]) {
  const char *method = "wq";
  if (argc > 1) method = argv[1];
  std::vector<Entry> dat;
  if (!strcmp(method, "wq")) {
    verifyWQ(dat, test<utils::WQuantileSketch<float, float>, float>(dat));
  } else if (!strcmp(method, "wx")) {
    verifyWQ(dat, test<utils::WXQuantileSketch<float, float>, float>(dat));
  } else if (!strcmp(method, "gk")) {
    test<utils::GKQuantileSketch<float, unsigned>, unsigned>(dat);
  } else {
    fprintf(stderr, "unknown solver '%s', expected one of: wq, wx, gk\n", method);
    return 1;
  }
  return 0;
}