Merge remote-tracking branch 'upstream/master'

El Potaeto 2015-02-12 09:51:42 +01:00
commit f1f346713a
23 changed files with 9714 additions and 35 deletions

View File

@@ -10,6 +10,7 @@
#include "./updater_sync-inl.hpp"
#include "./updater_distcol-inl.hpp"
#include "./updater_histmaker-inl.hpp"
#include "./updater_skmaker-inl.hpp"
#endif
namespace xgboost {
@@ -22,6 +23,7 @@ IUpdater* CreateUpdater(const char *name) {
#ifndef XGBOOST_STRICT_CXX98_
if (!strcmp(name, "sync")) return new TreeSyncher();
if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
if (!strcmp(name, "grow_skmaker")) return new SketchMaker();
if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
#endif
utils::Error("unknown updater:%s", name);

View File

@@ -8,7 +8,7 @@
*/
#include <vector>
#include <algorithm>
#include <rabit.h>
#include "../sync/sync.h"
#include "../utils/quantile.h"
#include "./updater_basemaker-inl.hpp"
@@ -123,8 +123,8 @@ class SketchMaker: public BaseMaker {
sum_hess += b.sum_hess;
}
/*! \brief same as add, reduce is used in All Reduce */
inline void Reduce(const SKStats &b) {
this->Add(b);
inline static void Reduce(SKStats &a, const SKStats &b) {
a.Add(b);
}
/*! \brief set leaf vector value based on statistics */
inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
@@ -156,18 +156,19 @@ class SketchMaker: public BaseMaker {
batch[i].length == nrows,
&thread_sketch[omp_get_thread_num()]);
}
}
}
// setup maximum size
unsigned max_size = param.max_sketch_size();
// synchronize sketch
summary_array.Init(sketchs.size(), max_size);
summary_array.resize(sketchs.size());
for (size_t i = 0; i < sketchs.size(); ++i) {
utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
sketchs[i].GetSummary(&out);
summary_array.Set(i, out);
summary_array[i].Reserve(max_size);
summary_array[i].SetPrune(out, max_size);
}
size_t nbytes = summary_array.MemSize();;
sketch_reducer.Allreduce(&summary_array, nbytes);
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
sketch_reducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
}
// update sketch information in column fid
inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
@@ -186,7 +187,7 @@ class SketchMaker: public BaseMaker {
const unsigned wid = this->node2workindex[nid];
for (int k = 0; k < 3; ++k) {
sbuilder[3 * nid + k].sum_total = 0.0f;
sbuilder[3 * nid + k].sketch = &sketchs[(wid * tree.param.num_feature + fid) * 3 + k];
sbuilder[3 * nid + k].sketch = &sketchs[(wid * tree.param.num_feature + fid) * 3 + k];
}
}
if (!col_full) {
@@ -367,7 +368,7 @@ class SketchMaker: public BaseMaker {
c.sum_hess >= param.min_child_weight) {
bst_float cpt = fsplits.back();
double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
best->Update(loss_chg, fid, cpt + fabsf(cpt) + 1.0f, true);
best->Update(loss_chg, fid, cpt + fabsf(cpt) + 1.0f, false);
}
}
}
@@ -380,11 +381,11 @@ class SketchMaker: public BaseMaker {
// node statistics
std::vector<SKStats> node_stats;
// summary array
WXQSketch::SummaryArray summary_array;
std::vector<WXQSketch::SummaryContainer> summary_array;
// reducer for summary
rabit::Reducer<SKStats> stats_reducer;
rabit::Reducer<SKStats, SKStats::Reduce> stats_reducer;
// reducer for summary
rabit::SerializeReducer<WXQSketch::SummaryArray> sketch_reducer;
rabit::SerializeReducer<WXQSketch::SummaryContainer> sketch_reducer;
// per node, per feature sketch
std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
};
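The Reduce signature change above is what lets rabit::Reducer bind the reduction at compile time as a plain function pointer (as in the new rabit::Reducer<SKStats, SKStats::Reduce> member). A minimal sketch with a hypothetical Stats type, mirroring the SKStats pattern rather than the original code:

#include <rabit.h>
struct Stats {
  double sum_grad, sum_hess;
  inline void Add(const Stats &b) {
    sum_grad += b.sum_grad; sum_hess += b.sum_hess;
  }
  // must be static: the reducer takes it as a template function pointer
  inline static void Reduce(Stats &a, const Stats &b) { a.Add(b); }
};
// declare once, then reduce an array of Stats across all workers:
//   rabit::Reducer<Stats, Stats::Reduce> reducer;
//   reducer.Allreduce(stats_array, num_stats);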

View File

@@ -1,13 +1,28 @@
Copyright (c) 2014 by Contributors
All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of rabit nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -8,6 +8,8 @@ It also contains links to the Machine Learning packages that use rabit.
Toolkits
====
* [KMeans Clustering](kmeans)
* [Linear and Logistic Regression](linear)
* [XGBoost: eXtreme Gradient Boosting](https://github.com/tqchen/xgboost/tree/master/multi-node)
- xgboost is a very fast boosted tree (also known as GBDT) library that can run more than
10 times faster than existing packages

View File

@@ -4,7 +4,7 @@ export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -pthread -lm -L../../lib
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../../include -I../common
export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fPIC -I../../include
.PHONY: clean all lib mpi
all: $(BIN) $(MOCKBIN)

View File

@@ -0,0 +1,2 @@
This folder contains the processed example dataset used by the demos.
Copyright of the dataset belongs to the original copyright holder.

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,126 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i

View File

@@ -2,8 +2,8 @@
// facing an exception
#include <rabit.h>
#include <rabit/utils.h>
#include "./toolkit_util.h"
#include <time.h>
#include "../utils/data.h"
using namespace rabit;
@@ -83,9 +83,12 @@ inline size_t GetCluster(const Matrix &centroids,
int main(int argc, char *argv[]) {
if (argc < 5) {
// initialize rabit engine
rabit::Init(argc, argv);
if (rabit::GetRank() == 0) {
rabit::TrackerPrintf("Usage: <data_dir> num_cluster max_iter <out_model>\n");
}
rabit::Finalize();
return 0;
}
clock_t tStart = clock();

View File

@@ -0,0 +1,14 @@
# specify tensor path
BIN = linear.rabit
MOCKBIN= linear.mock
MPIBIN =
# object files that make up the program
OBJ = linear.o
# common build script for programs
include ../common.mk
CFLAGS+=-fopenmp
linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
# dependencies here
linear.rabit: linear.o lib
linear.mock: linear.o lib

View File

@@ -0,0 +1,33 @@
Linear and Logistic Regression
====
* input format: LibSVM
* Example: [run-linear.sh](run-linear.sh)
Parameters
===
All parameters can be set on the command line in the form param=value
#### Important Parameters
* objective [default = logistic]
- can be linear or logistic
* base_score [default = 0.5]
- global bias; recommended to be set to the mean value of the labels
* reg_L1 [default = 0]
- L1 regularization coefficient
* reg_L2 [default = 1]
- L2 regularization coefficient
* lbfgs_stop_tol [default = 1e-5]
- relative tolerance level of loss reduction with respect to initial loss
* max_lbfgs_iter [default = 500]
- maximum number of lbfgs iterations
### Optimization Related parameters
* min_lbfgs_iter [default = 5]
- minimum number of lbfgs iterations
* max_linesearch_iter [default = 100]
- maximum number of iterations in line search
* linesearch_c1 [default = 1e-4]
- c1 coefficient in backtracking line search
* linesearch_backoff [default = 0.5]
- backoff ratio in line search
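For example, a typical training run sets parameters directly on the command line (hypothetical input file name; all parameter names are from the list above):

    ./linear.rabit train.libsvm objective=logistic reg_L2=1 max_lbfgs_iter=200 model_out=final.model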

View File

@@ -0,0 +1,239 @@
#include "./linear.h"
#include "../utils/io.h"
#include "../utils/base64.h"
namespace rabit {
namespace linear {
class LinearObjFunction : public solver::IObjFunction<float> {
public:
// training threads
int nthread;
// L2 regularization
float reg_L2;
// model
LinearModel model;
// training data
SparseMat dtrain;
// solver
solver::LBFGSSolver<float> lbfgs;
// constructor
LinearObjFunction(void) {
lbfgs.SetObjFunction(this);
nthread = 1;
reg_L2 = 0.0f;
model.weight = NULL;
task = "train";
model_in = "NULL";
name_pred = "pred.txt";
model_out = "final.model";
}
virtual ~LinearObjFunction(void) {
}
// set parameters
inline void SetParam(const char *name, const char *val) {
model.param.SetParam(name, val);
lbfgs.SetParam(name, val);
if (!strcmp(name, "num_feature")) {
char ndigit[30];
sprintf(ndigit, "%lu", model.param.num_feature + 1);
lbfgs.SetParam("num_dim", ndigit);
}
if (!strcmp(name, "reg_L2")) {
reg_L2 = static_cast<float>(atof(val));
}
if (!strcmp(name, "nthread")) {
nthread = atoi(val);
}
if (!strcmp(name, "task")) task = val;
if (!strcmp(name, "model_in")) model_in = val;
if (!strcmp(name, "model_out")) model_out = val;
if (!strcmp(name, "name_pred")) name_pred = val;
}
inline void Run(void) {
if (model_in != "NULL") {
this->LoadModel(model_in.c_str());
}
if (task == "train") {
lbfgs.Run();
this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
} else if (task == "pred") {
this->TaskPred();
} else {
utils::Error("unknown task=%s", task.c_str());
}
}
inline void TaskPred(void) {
utils::Check(model_in != "NULL",
"must set model_in for task=pred");
FILE *fp = utils::FopenCheck(name_pred.c_str(), "w");
for (size_t i = 0; i < dtrain.NumRow(); ++i) {
float pred = model.Predict(dtrain[i]);
fprintf(fp, "%g\n", pred);
}
fclose(fp);
printf("Finishing writing to %s\n", name_pred.c_str());
}
inline void LoadModel(const char *fname) {
FILE *fp = utils::FopenCheck(fname, "rb");
std::string header; header.resize(4);
// check the header to determine the binary encoding,
// which can be base64 or raw binary
utils::FileStream fi(fp);
utils::Check(fi.Read(&header[0], 4) != 0, "invalid model");
// base64 format
if (header == "bs64") {
utils::Base64InStream bsin(fp);
bsin.InitPosition();
model.Load(bsin);
fclose(fp);
return;
} else if (header == "binf") {
model.Load(fi);
fclose(fp);
return;
} else {
utils::Error("invalid model file");
}
}
inline void SaveModel(const char *fname,
const float *wptr,
bool save_base64 = false) {
FILE *fp;
bool use_stdout = false;
if (!strcmp(fname, "stdout")) {
fp = stdout;
use_stdout = true;
} else {
fp = utils::FopenCheck(fname, "wb");
}
utils::FileStream fo(fp);
if (save_base64 || use_stdout) {
fo.Write("bs64\t", 5);
utils::Base64OutStream bout(fp);
model.Save(bout, wptr);
bout.Finish('\n');
} else {
fo.Write("binf", 4);
model.Save(fo, wptr);
}
if (!use_stdout) {
fclose(fp);
}
}
inline void LoadData(const char *fname) {
dtrain.Load(fname);
}
virtual size_t InitNumDim(void) {
if (model_in == "NULL") {
size_t ndim = dtrain.feat_dim;
rabit::Allreduce<rabit::op::Max>(&ndim, 1);
model.param.num_feature = std::max(ndim, model.param.num_feature);
}
return model.param.num_feature + 1;
}
virtual void InitModel(float *weight, size_t size) {
if (model_in == "NULL") {
memset(weight, 0, size * sizeof(float));
model.param.InitBaseScore();
} else {
rabit::Broadcast(model.weight, size * sizeof(float), 0);
memcpy(weight, model.weight, size * sizeof(float));
}
}
// load model
virtual void Load(rabit::IStream &fi) {
fi.Read(&model.param, sizeof(model.param));
}
virtual void Save(rabit::IStream &fo) const {
fo.Write(&model.param, sizeof(model.param));
}
virtual double Eval(const float *weight, size_t size) {
if (nthread != 0) omp_set_num_threads(nthread);
utils::Check(size == model.param.num_feature + 1,
"size consistency check");
double sum_val = 0.0;
#pragma omp parallel for schedule(static) reduction(+:sum_val)
for (size_t i = 0; i < dtrain.NumRow(); ++i) {
float py = model.param.PredictMargin(weight, dtrain[i]);
float fv = model.param.MarginToLoss(dtrain.labels[i], py);
sum_val += fv;
}
if (rabit::GetRank() == 0) {
// only add regularization once
if (reg_L2 != 0.0f) {
double sum_sqr = 0.0;
for (size_t i = 0; i < model.param.num_feature; ++i) {
sum_sqr += weight[i] * weight[i];
}
sum_val += 0.5 * reg_L2 * sum_sqr;
}
}
utils::Check(!std::isnan(sum_val), "nan occurs");
return sum_val;
}
virtual void CalcGrad(float *out_grad,
const float *weight,
size_t size) {
if (nthread != 0) omp_set_num_threads(nthread);
utils::Check(size == model.param.num_feature + 1,
"size consistency check");
memset(out_grad, 0, sizeof(float) * size);
double sum_gbias = 0.0;
#pragma omp parallel for schedule(static) reduction(+:sum_gbias)
for (size_t i = 0; i < dtrain.NumRow(); ++i) {
SparseMat::Vector v = dtrain[i];
float py = model.param.Predict(weight, v);
float grad = model.param.PredToGrad(dtrain.labels[i], py);
for (index_t j = 0; j < v.length; ++j) {
out_grad[v[j].findex] += v[j].fvalue * grad;
}
sum_gbias += grad;
}
out_grad[model.param.num_feature] = static_cast<float>(sum_gbias);
if (rabit::GetRank() == 0) {
// only add regularization once
if (reg_L2 != 0.0f) {
for (size_t i = 0; i < model.param.num_feature; ++i) {
out_grad[i] += reg_L2 * weight[i];
}
}
}
}
private:
std::string task;
std::string model_in;
std::string model_out;
std::string name_pred;
};
} // namespace linear
} // namespace rabit
int main(int argc, char *argv[]) {
if (argc < 2) {
// initialize rabit engine
rabit::Init(argc, argv);
if (rabit::GetRank() == 0) {
rabit::TrackerPrintf("Usage: <data_in> param=val\n");
}
rabit::Finalize();
return 0;
}
rabit::linear::LinearObjFunction linear;
if (!strcmp(argv[1], "stdin")) {
linear.LoadData(argv[1]);
rabit::Init(argc, argv);
} else {
rabit::Init(argc, argv);
linear.LoadData(argv[1]);
}
for (int i = 2; i < argc; ++i) {
char name[256], val[256];
if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
linear.SetParam(name, val);
}
}
linear.Run();
rabit::Finalize();
return 0;
}

View File

@@ -0,0 +1,133 @@
/*!
* Copyright (c) 2015 by Contributors
* \file linear.h
* \brief Linear and Logistic regression
*
* \author Tianqi Chen
*/
#ifndef RABIT_LINEAR_H_
#define RABIT_LINEAR_H_
#include <omp.h>
#include "../utils/data.h"
#include "../solver/lbfgs.h"
namespace rabit {
namespace linear {
/*! \brief simple linear model */
struct LinearModel {
struct ModelParam {
/*! \brief global bias */
float base_score;
/*! \brief number of features */
size_t num_feature;
/*! \brief loss type*/
int loss_type;
// reserved field
int reserved[16];
// constructor
ModelParam(void) {
base_score = 0.5f;
num_feature = 0;
loss_type = 1;
std::memset(reserved, 0, sizeof(reserved));
}
// initialize base score
inline void InitBaseScore(void) {
utils::Check(base_score > 0.0f && base_score < 1.0f,
"base_score must be in (0,1) for logistic loss");
base_score = -std::log(1.0f / base_score - 1.0f);
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp("base_score", name)) {
base_score = static_cast<float>(atof(val));
}
if (!strcmp("num_feature", name)) {
num_feature = static_cast<size_t>(atol(val));
}
if (!strcmp("objective", name)) {
if (!strcmp("linear", val)) {
loss_type = 0;
} else if (!strcmp("logistic", val)) {
loss_type = 1;
} else {
utils::Error("unknown objective type %s\n", val);
}
}
}
// transform margin to prediction
inline float MarginToPred(float margin) const {
if (loss_type == 1) {
return 1.0f / (1.0f + std::exp(-margin));
} else {
return margin;
}
}
// margin to loss
inline float MarginToLoss(float label, float margin) const {
if (loss_type == 1) {
float nlogprob;
if (margin > 0.0f) {
nlogprob = std::log(1.0f + std::exp(-margin));
} else {
nlogprob = -margin + std::log(1.0f + std::exp(margin));
}
return label * nlogprob +
(1.0f - label) * (margin + nlogprob);
} else {
float diff = margin - label;
return 0.5f * diff * diff;
}
}
inline float PredToGrad(float label, float pred) const {
return pred - label;
}
inline float PredictMargin(const float *weight,
const SparseMat::Vector &v) const {
// weight[num_feature] is bias
float sum = base_score + weight[num_feature];
for (unsigned i = 0; i < v.length; ++i) {
if (v[i].findex >= num_feature) continue;
sum += weight[v[i].findex] * v[i].fvalue;
}
return sum;
}
inline float Predict(const float *weight,
const SparseMat::Vector &v) const {
return MarginToPred(PredictMargin(weight, v));
}
};
// model parameter
ModelParam param;
// weight corresponding to the model
float *weight;
LinearModel(void) : weight(NULL) {
}
~LinearModel(void) {
if (weight != NULL) delete [] weight;
}
// load model
inline void Load(rabit::IStream &fi) {
fi.Read(&param, sizeof(param));
if (weight == NULL) {
weight = new float[param.num_feature + 1];
}
fi.Read(weight, sizeof(float) * (param.num_feature + 1));
}
inline void Save(rabit::IStream &fo, const float *wptr = NULL) const {
fo.Write(&param, sizeof(param));
if (wptr == NULL) wptr = weight;
fo.Write(wptr, sizeof(float) * (param.num_feature + 1));
}
inline float Predict(const SparseMat::Vector &v) const {
return param.Predict(weight, v);
}
};
} // namespace linear
} // namespace rabit
#endif // RABIT_LINEAR_H_
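The branching in MarginToLoss above is the numerically stable form of the logistic loss: for label $y \in \{0, 1\}$ and margin $m$,

$$\ell(y, m) = y \log(1 + e^{-m}) + (1 - y)\left(m + \log(1 + e^{-m})\right),$$

where $\log(1 + e^{-m})$ is computed directly when $m > 0$ and as $-m + \log(1 + e^{m})$ otherwise, so the exponent is never positive and the exp cannot overflow. The two terms are exactly $-\log\sigma(m)$ and $-\log(1 - \sigma(m))$ for the sigmoid $\sigma(m) = 1 / (1 + e^{-m})$.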

View File

@@ -0,0 +1,15 @@
#!/bin/bash
if [[ $# -lt 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf mushroom.row* *.model
k=$1
# split the LibSVM file into k row-wise subfiles
python splitrows.py ../data/agaricus.txt.train mushroom $k
# run the linear demo through the rabit tracker with mock workers that inject failures
../../tracker/rabit_demo.py -n $k linear.mock mushroom.row\%d "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1

View File

@@ -0,0 +1,17 @@
#!/bin/bash
if [[ $# -lt 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf mushroom.row* *.model
k=$1
# split the LibSVM file into k row-wise subfiles
python splitrows.py ../data/agaricus.txt.train mushroom $k
# run the linear demo through the rabit tracker
../../tracker/rabit_demo.py -n $k linear.rabit mushroom.row\%d "${*:2}" reg_L1=1
./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model

View File

@@ -0,0 +1,24 @@
#!/usr/bin/python
import sys
import random
# randomly split a LibSVM file into k row-wise subfiles
if len(sys.argv) < 4:
    print('Usage: <fin> <fout> k')
    exit(0)
random.seed(10)
k = int(sys.argv[3])
fos = []
for i in range(k):
    fos.append(open(sys.argv[2] + '.row%d' % i, 'w'))
for l in open(sys.argv[1]):
    i = random.randint(0, k - 1)
    fos[i].write(l)
for f in fos:
    f.close()

View File

@@ -0,0 +1,653 @@
/*!
* Copyright (c) 2015 by Contributors
* \file lbfgs.h
* \brief L-BFGS solver for general optimization problem
*
* \author Tianqi Chen
*/
#ifndef RABIT_LEARN_LBFGS_H_
#define RABIT_LEARN_LBFGS_H_
#include <cmath>
#include <rabit.h>
namespace rabit {
/*! \brief namespace of solver for general problems */
namespace solver {
/*!
* \brief objective function for optimizers
* the objective function can also implement save/load
* to persist state parameters that need to survive a checkpoint restart
*/
template<typename DType>
class IObjFunction : public rabit::ISerializable {
public:
// destructor
virtual ~IObjFunction(void){}
/*!
* \brief evaluate function values for a given weight
* \param weight weight of the function
* \param size size of the weight
*/
virtual double Eval(const DType *weight, size_t size) = 0;
/*!
* \return number of feature dimension to be allocated
* only called once during initialization
*/
virtual size_t InitNumDim(void) = 0;
/*!
* \brief initialize the weight before starting the solver
* only called once for initialization
*/
virtual void InitModel(DType *weight, size_t size) = 0;
/*!
* \brief calculate gradient for a given weight
* \param out_grad used to store the gradient value of the function
* \param weight weight of the function
* \param size size of the weight
*/
virtual void CalcGrad(DType *out_grad,
const DType *weight,
size_t size) = 0;
};
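// a minimal sketch (hypothetical, illustration only): a toy objective
// f(w) = 0.5 * ||w||^2 implementing the interface above; its gradient
// is simply w, and it carries no extra state beyond what the solver
// checkpoints (a real objective would evaluate only its local data shard)
template<typename DType>
class QuadraticObjExample : public IObjFunction<DType> {
 public:
  virtual ~QuadraticObjExample(void) {}
  virtual double Eval(const DType *weight, size_t size) {
    double sum = 0.0;
    for (size_t i = 0; i < size; ++i) sum += 0.5 * weight[i] * weight[i];
    return sum;
  }
  virtual size_t InitNumDim(void) { return 16; }  // fixed toy dimension
  virtual void InitModel(DType *weight, size_t size) {
    for (size_t i = 0; i < size; ++i) weight[i] = 1;  // start away from optimum
  }
  virtual void CalcGrad(DType *out_grad, const DType *weight, size_t size) {
    for (size_t i = 0; i < size; ++i) out_grad[i] = weight[i];
  }
  // nothing extra to persist
  virtual void Load(rabit::IStream &fi) {}
  virtual void Save(rabit::IStream &fo) const {}
};
// would be used as: LBFGSSolver<float> s; s.SetObjFunction(&obj); s.Run();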
/*! \brief a basic version L-BFGS solver */
template<typename DType>
class LBFGSSolver {
public:
LBFGSSolver(void) {
// set default values
reg_L1 = 0.0f;
max_linesearch_iter = 100;
linesearch_backoff = 0.5f;
linesearch_c1 = 1e-4;
min_lbfgs_iter = 5;
max_lbfgs_iter = 500;
lbfgs_stop_tol = 1e-5f;
silent = 0;
}
virtual ~LBFGSSolver(void) {}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
virtual void SetParam(const char *name, const char *val) {
if (!strcmp("num_dim", name)) {
gstate.num_dim = static_cast<size_t>(atol(val));
}
if (!strcmp("size_memory", name)) {
gstate.size_memory = static_cast<size_t>(atol(val));
}
if (!strcmp("reg_L1", name)) {
reg_L1 = static_cast<float>(atof(val));
}
if (!strcmp("lbfgs_stop_tol", name)) {
lbfgs_stop_tol = static_cast<float>(atof(val));
}
if (!strcmp("linesearch_backoff", name)) {
linesearch_backoff = static_cast<float>(atof(val));
}
if (!strcmp("max_linesearch_iter", name)) {
max_linesearch_iter = atoi(val);
}
if (!strcmp("max_lbfgs_iter", name)) {
max_lbfgs_iter = atoi(val);
}
if (!strcmp("min_lbfgs_iter", name)) {
min_lbfgs_iter = atoi(val);
}
if (!strcmp("linesearch_c1", name)) {
linesearch_c1 = static_cast<float>(atof(val));
}
}
/*!
* \brief set objective function to optimize
* the objective function only need to evaluate and calculate
* gradient with respect to current subset of data
* \param obj the objective function we are looking for
*/
virtual void SetObjFunction(IObjFunction<DType> *obj) {
gstate.obj = obj;
}
/*!
* \brief initialize the LBFGS solver
* user must already set the objective function
*/
virtual void Init(void) {
utils::Check(gstate.obj != NULL,
"LBFGSSolver.Init must SetObjFunction first");
int version = rabit::LoadCheckPoint(&gstate, &hist);
if (version == 0) {
gstate.num_dim = gstate.obj->InitNumDim();
} else {
printf("restart from version=%d\n", version);
}
{
// decide parameter partition
size_t nproc = rabit::GetWorldSize();
size_t rank = rabit::GetRank();
size_t step = (gstate.num_dim + nproc - 1) / nproc;
// upper align
step = (step + 7) / 8 * 8;
utils::Assert(step * nproc >= gstate.num_dim, "BUG");
range_begin_ = std::min(rank * step, gstate.num_dim);
range_end_ = std::min((rank + 1) * step, gstate.num_dim);
}
if (version == 0) {
gstate.Init();
hist.Init(range_end_ - range_begin_, gstate.size_memory);
gstate.obj->InitModel(gstate.weight, gstate.num_dim);
// broadcast initialize model
rabit::Broadcast(gstate.weight,
sizeof(DType) * gstate.num_dim, 0);
gstate.old_objval = this->Eval(gstate.weight);
gstate.init_objval = gstate.old_objval;
if (silent == 0 && rabit::GetRank() == 0) {
rabit::TrackerPrintf
("L-BFGS solver starts, num_dim=%lu, init_objval=%g, size_memory=%lu\n",
gstate.num_dim, gstate.init_objval, gstate.size_memory);
}
}
}
/*!
* \brief get the current weight vector
* note that if update function is called
* the content of weight vector is no longer valid
* \return weight vector
*/
virtual DType *GetWeight(void) {
return gstate.weight;
}
/*!
* \brief update the weight for one L-BFGS iteration
* \return whether stopping condition is met
*/
virtual bool UpdateOneIter(void) {
bool stop = false;
GlobalState &g = gstate;
g.obj->CalcGrad(g.grad, g.weight, g.num_dim);
rabit::Allreduce<rabit::op::Sum>(g.grad, g.num_dim);
// find change direction
double vdot = FindChangeDirection(g.tempw, g.grad, g.weight);
// line-search, g.grad is now new weight
int iter = BacktrackLineSearch(g.grad, g.tempw, g.weight, vdot);
utils::Check(iter < max_linesearch_iter, "line search failed");
// swap new weight
std::swap(g.weight, g.grad);
// check stop condition
if (gstate.num_iteration > min_lbfgs_iter) {
if (g.old_objval - g.new_objval < lbfgs_stop_tol * g.init_objval) {
return true;
}
}
if (silent == 0 && rabit::GetRank() == 0) {
rabit::TrackerPrintf
("[%d] L-BFGS: linesearch finishes in %d rounds, new_objval=%g, improvment=%g\n",
gstate.num_iteration, iter,
gstate.new_objval,
gstate.old_objval - gstate.new_objval);
}
gstate.old_objval = gstate.new_objval;
rabit::CheckPoint(&gstate, &hist);
return stop;
}
/*! \brief run optimization */
virtual void Run(void) {
this->Init();
while (gstate.num_iteration < max_lbfgs_iter) {
if (this->UpdateOneIter()) break;
}
if (silent == 0 && rabit::GetRank() == 0) {
size_t nonzero = 0;
for (size_t i = 0; i < gstate.num_dim; ++i) {
if (gstate.weight[i] != 0.0f) nonzero += 1;
}
rabit::TrackerPrintf
("L-BFGS: finishes at iteration %d, %lu/%lu active weights\n",
gstate.num_iteration, nonzero, gstate.num_dim);
}
}
protected:
// find the delta value, given gradient
// return dot(dir, l1grad)
virtual double FindChangeDirection(DType *dir,
const DType *grad,
const DType *weight) {
int m = static_cast<int>(gstate.size_memory);
int n = static_cast<int>(hist.num_useful());
if (n < m) {
utils::Assert(hist.num_useful() == gstate.num_iteration,
"BUG2, n=%d, it=%d", n, gstate.num_iteration);
} else {
utils::Assert(n == m, "BUG3");
}
const size_t num_dim = gstate.num_dim;
const DType *gsub = grad + range_begin_;
const size_t nsub = range_end_ - range_begin_;
double vdot;
if (n != 0) {
// hist[m + n - 1] stores old gradient
Minus(hist[m + n - 1], gsub, hist[m + n - 1], nsub);
SetL1Dir(hist[2 * m], gsub, weight + range_begin_, nsub);
// index set for calculating results
std::vector<std::pair<size_t, size_t> > idxset;
for (int j = 0; j < n; ++j) {
idxset.push_back(std::make_pair(j, 2 * m));
idxset.push_back(std::make_pair(j, n - 1));
idxset.push_back(std::make_pair(j, m + n - 1));
}
for (int j = 0; j < n; ++j) {
idxset.push_back(std::make_pair(m + j, 2 * m));
idxset.push_back(std::make_pair(m + j, m + n - 1));
}
// calculate dot products
std::vector<double> tmp(idxset.size());
for (size_t i = 0; i < tmp.size(); ++i) {
tmp[i] = hist.CalcDot(idxset[i].first, idxset[i].second);
}
rabit::Allreduce<rabit::op::Sum>(BeginPtr(tmp), tmp.size());
for (size_t i = 0; i < tmp.size(); ++i) {
gstate.DotBuf(idxset[i].first, idxset[i].second) = tmp[i];
}
// BFGS steps, use vector-free update
// parameterize vector using basis in hist
std::vector<double> alpha(n);
std::vector<double> delta(2 * m + 1, 0.0);
delta[2 * m] = 1.0;
// backward step
for (int j = n - 1; j >= 0; --j) {
double vsum = 0.0;
for (size_t k = 0; k < delta.size(); ++k) {
vsum += delta[k] * gstate.DotBuf(k, j);
}
alpha[j] = vsum / gstate.DotBuf(j, m + j);
delta[m + j] = delta[m + j] - alpha[j];
}
// scale
double scale = gstate.DotBuf(n - 1, m + n - 1) /
gstate.DotBuf(m + n - 1, m + n - 1);
for (size_t k = 0; k < delta.size(); ++k) {
delta[k] *= scale;
}
// forward step
for (int j = 0; j < n; ++j) {
double vsum = 0.0;
for (size_t k = 0; k < delta.size(); ++k) {
vsum += delta[k] * gstate.DotBuf(k, m + j);
}
double beta = vsum / gstate.DotBuf(j, m + j);
delta[j] = delta[j] + (alpha[j] - beta);
}
// set all to zero
std::fill(dir, dir + num_dim, 0.0f);
DType *dirsub = dir + range_begin_;
for (int i = 0; i < n; ++i) {
AddScale(dirsub, dirsub, hist[m + i], delta[m + i], nsub);
}
AddScale(dirsub, dirsub, hist[2 * m], delta[2 * m], nsub);
for (int i = 0; i < n; ++i) {
AddScale(dirsub, dirsub, hist[i], delta[i], nsub);
}
FixDirL1Sign(dirsub, hist[2 * m], nsub);
vdot = -Dot(dirsub, hist[2 * m], nsub);
// allreduce to get full direction
rabit::Allreduce<rabit::op::Sum>(dir, num_dim);
rabit::Allreduce<rabit::op::Sum>(&vdot, 1);
} else {
SetL1Dir(dir, grad, weight, num_dim);
vdot = -Dot(dir, dir, num_dim);
}
// shift the history record
if (n < m) {
n += 1;
} else {
gstate.Shift(); hist.Shift();
}
hist.set_num_useful(n);
// copy gradient to hist[m + n - 1]
memcpy(hist[m + n - 1], gsub, nsub * sizeof(DType));
return vdot;
}
// line search for given direction
// return whether there is a descent
inline int BacktrackLineSearch(DType *new_weight,
const DType *dir,
const DType *weight,
double dot_dir_l1grad) {
utils::Assert(dot_dir_l1grad < 0.0f,
"gradient error, dotv=%g", dot_dir_l1grad);
double alpha = 1.0;
double backoff = linesearch_backoff;
// unit descent direction in first iter
if (gstate.num_iteration == 0) {
utils::Assert(hist.num_useful() == 1, "hist.nuseful");
alpha = 1.0f / std::sqrt(-dot_dir_l1grad);
backoff = 0.1f;
}
int iter = 0;
double old_val = gstate.old_objval;
double c1 = this->linesearch_c1;
while (true) {
const size_t num_dim = gstate.num_dim;
if (++iter >= max_linesearch_iter) return iter;
AddScale(new_weight, weight, dir, alpha, num_dim);
this->FixWeightL1Sign(new_weight, weight, num_dim);
double new_val = this->Eval(new_weight);
if (new_val - old_val <= c1 * dot_dir_l1grad * alpha) {
gstate.new_objval = new_val; break;
}
alpha *= backoff;
}
// hist[n - 1] = new_weight - weight
Minus(hist[hist.num_useful() - 1],
new_weight + range_begin_,
weight + range_begin_,
range_end_ - range_begin_);
gstate.num_iteration += 1;
return iter;
}
// OWL-QN step for L1 regularization
inline void SetL1Dir(DType *dst,
const DType *grad,
const DType *weight,
size_t size) {
if (reg_L1 == 0.0) {
for (size_t i = 0; i < size; ++i) {
dst[i] = -grad[i];
}
} else {
for (size_t i = 0; i < size; ++i) {
if (weight[i] > 0.0f) {
dst[i] = -grad[i] - reg_L1;
} else if (weight[i] < 0.0f) {
dst[i] = -grad[i] + reg_L1;
} else {
if (grad[i] < -reg_L1) {
dst[i] = -grad[i] - reg_L1;
} else if (grad[i] > reg_L1) {
dst[i] = -grad[i] + reg_L1;
} else {
dst[i] = 0.0;
}
}
}
}
}
// OWL-QN step: fix direction sign to be consistent with proposal
inline void FixDirL1Sign(DType *dir,
const DType *steepdir,
size_t size) {
if (reg_L1 != 0.0f) {
for (size_t i = 0; i < size; ++i) {
if (dir[i] * steepdir[i] <= 0.0f) {
dir[i] = 0.0f;
}
}
}
}
// OWL-QN step: project the new weight back onto the orthant of the old weight
inline void FixWeightL1Sign(DType *new_weight,
const DType *weight,
size_t size) {
if (reg_L1 != 0.0f) {
for (size_t i = 0; i < size; ++i) {
if (new_weight[i] * weight[i] < 0.0f) {
new_weight[i] = 0.0f;
}
}
}
}
inline double Eval(const DType *weight) {
double val = gstate.obj->Eval(weight, gstate.num_dim);
rabit::Allreduce<rabit::op::Sum>(&val, 1);
if (reg_L1 != 0.0f) {
double l1norm = 0.0;
for (size_t i = 0; i < gstate.num_dim; ++i) {
l1norm += std::abs(weight[i]);
}
val += l1norm * reg_L1;
}
return val;
}
private:
// helper functions
// dst = lhs + rhs * scale
inline static void AddScale(DType *dst,
const DType *lhs,
const DType *rhs,
DType scale,
size_t size) {
for (size_t i = 0; i < size; ++i) {
dst[i] = lhs[i] + rhs[i] * scale;
}
}
// dst = lhs - rhs
inline static void Minus(DType *dst,
const DType *lhs,
const DType *rhs,
size_t size) {
for (size_t i = 0; i < size; ++i) {
dst[i] = lhs[i] - rhs[i];
}
}
// return dot(lhs, rhs)
inline static double Dot(const DType *lhs,
const DType *rhs,
size_t size) {
double res = 0.0;
for (size_t i = 0; i < size; ++i) {
res += lhs[i] * rhs[i];
}
return res;
}
// map rolling array index
inline static size_t MapIndex(size_t i, size_t offset,
size_t size_memory) {
if (i == 2 * size_memory) return i;
if (i < size_memory) {
return (i + offset) % size_memory;
} else {
utils::Assert(i < 2 * size_memory,
"MapIndex: index exceed bound, i=%lu", i);
return (i + offset) % size_memory + size_memory;
}
}
// global solver state
struct GlobalState : public rabit::ISerializable {
public:
// memory size of L-BFGS
size_t size_memory;
// number of iterations passed
size_t num_iteration;
// number of features in the solver
size_t num_dim;
// initialize objective value
double init_objval;
// history objective value
double old_objval;
// new objective value
double new_objval;
// objective function
IObjFunction<DType> *obj;
// temporal storage
DType *grad, *weight, *tempw;
// constructor
GlobalState(void)
: obj(NULL), grad(NULL),
weight(NULL), tempw(NULL) {
size_memory = 10;
num_iteration = 0;
num_dim = 0;
old_objval = 0.0;
}
~GlobalState(void) {
if (grad != NULL) {
delete [] grad;
delete [] weight;
delete [] tempw;
}
}
// initialize the space of the rolling array
inline void Init(void) {
size_t n = size_memory * 2 + 1;
data.resize(n * n, 0.0);
this->AllocSpace();
}
inline double &DotBuf(size_t i, size_t j) {
if (i > j) std::swap(i, j);
return data[MapIndex(i, offset_, size_memory) * (size_memory * 2 + 1) +
MapIndex(j, offset_, size_memory)];
}
// load the shift array
virtual void Load(rabit::IStream &fi) {
fi.Read(&size_memory, sizeof(size_memory));
fi.Read(&num_iteration, sizeof(num_iteration));
fi.Read(&num_dim, sizeof(num_dim));
fi.Read(&init_objval, sizeof(init_objval));
fi.Read(&old_objval, sizeof(old_objval));
fi.Read(&offset_, sizeof(offset_));
fi.Read(&data);
this->AllocSpace();
fi.Read(weight, sizeof(DType) * num_dim);
obj->Load(fi);
}
// save the shift array
virtual void Save(rabit::IStream &fo) const {
fo.Write(&size_memory, sizeof(size_memory));
fo.Write(&num_iteration, sizeof(num_iteration));
fo.Write(&num_dim, sizeof(num_dim));
fo.Write(&init_objval, sizeof(init_objval));
fo.Write(&old_objval, sizeof(old_objval));
fo.Write(&offset_, sizeof(offset_));
fo.Write(data);
fo.Write(weight, sizeof(DType) * num_dim);
obj->Save(fo);
}
inline void Shift(void) {
offset_ = (offset_ + 1) % size_memory;
}
private:
// rolling offset in the current memory
size_t offset_;
std::vector<double> data;
// allocate space
inline void AllocSpace(void) {
if (grad == NULL) {
grad = new DType[num_dim];
weight = new DType[num_dim];
tempw = new DType[num_dim];
}
}
};
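// a concrete check of MapIndex above (hypothetical values): with
// size_memory m = 3 and offset = 1, slot 2*m = 6 maps to itself (the
// steepest-descent slot), slot i in [0, 3) maps to (i + 1) % 3, and
// slot i in [3, 6) maps to (i + 1) % 3 + 3 -- so the s-history and
// y-history roll in lockstep each time Shift() advances the offset,
// and DotBuf entries keep addressing the same logical vectors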
/*! \brief rolling array that carries history information */
struct HistoryArray : public rabit::ISerializable {
public:
HistoryArray(void) : dptr_(NULL) {
num_useful_ = 0;
}
~HistoryArray(void) {
if (dptr_ != NULL) delete [] dptr_;
}
// initialize the space of the rolling array
inline void Init(size_t num_col, size_t size_memory) {
if (dptr_ != NULL &&
(num_col_ != num_col || size_memory_ != size_memory)) {
delete [] dptr_;
}
num_col_ = num_col;
size_memory_ = size_memory;
stride_ = num_col_;
offset_ = 0;
size_t n = size_memory * 2 + 1;
dptr_ = new DType[n * stride_];
}
// fetch element from rolling array
inline const DType *operator[](size_t i) const {
return dptr_ + MapIndex(i, offset_, size_memory_) * stride_;
}
inline DType *operator[](size_t i) {
return dptr_ + MapIndex(i, offset_, size_memory_) * stride_;
}
// shift array: arr_old -> arr_new
// for i in [0, size_memory - 1), arr_new[i] = arr_old[i + 1]
// for i in [size_memory, 2 * size_memory - 1), arr_new[i] = arr_old[i + 1]
// arr_old[0] and arr_old[size_memory] will be discarded
inline void Shift(void) {
offset_ = (offset_ + 1) % size_memory_;
}
inline double CalcDot(size_t i, size_t j) const {
return Dot((*this)[i], (*this)[j], num_col_);
}
// set number of useful memory
inline const size_t &num_useful(void) const {
return num_useful_;
}
// set number of useful memory
inline void set_num_useful(size_t num_useful) {
utils::Assert(num_useful <= size_memory_,
"num_useful exceed bound");
num_useful_ = num_useful;
}
// load the shift array
virtual void Load(rabit::IStream &fi) {
fi.Read(&num_col_, sizeof(num_col_));
fi.Read(&stride_, sizeof(stride_));
fi.Read(&size_memory_, sizeof(size_memory_));
fi.Read(&num_useful_, sizeof(num_useful_));
this->Init(num_col_, size_memory_);
for (size_t i = 0; i < num_useful_; ++i) {
fi.Read((*this)[i], num_col_ * sizeof(DType));
fi.Read((*this)[i + size_memory_], num_col_ * sizeof(DType));
}
}
// save the shift array
virtual void Save(rabit::IStream &fi) const {
fi.Write(&num_col_, sizeof(num_col_));
fi.Write(&stride_, sizeof(stride_));
fi.Write(&size_memory_, sizeof(size_memory_));
fi.Write(&num_useful_, sizeof(num_useful_));
for (size_t i = 0; i < num_useful_; ++i) {
fi.Write((*this)[i], num_col_ * sizeof(DType));
fi.Write((*this)[i + size_memory_], num_col_ * sizeof(DType));
}
}
private:
// number of columns in each of array
size_t num_col_;
// stride for each of column for alignment
size_t stride_;
// memory size of L-BFGS
size_t size_memory_;
// number of useful memory that will be used
size_t num_useful_;
// rolling offset in the current memory
size_t offset_;
// data pointer
DType *dptr_;
};
// data structure for LBFGS
GlobalState gstate;
HistoryArray hist;
// silent
int silent;
// the subrange of current node
size_t range_begin_;
size_t range_end_;
// L1 regularization co-efficient
float reg_L1;
// c1 ratio for line search
float linesearch_c1;
float linesearch_backoff;
int max_linesearch_iter;
int max_lbfgs_iter;
int min_lbfgs_iter;
float lbfgs_stop_tol;
};
} // namespace solver
} // namespace rabit
#endif // RABIT_LEARN_LBFGS_H_
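For reference, FindChangeDirection above is a vector-free form of the standard L-BFGS two-loop recursion. Writing $s_i$ for the stored weight deltas (history slots $[0, m)$), $y_i$ for the gradient deltas (slots $[m, 2m)$), $\rho_i = 1 / (y_i^\top s_i)$, and initializing $q$ to the (L1-adjusted) steepest-descent direction kept in slot $2m$:

$$\alpha_i = \rho_i\, s_i^\top q, \qquad q \leftarrow q - \alpha_i y_i \qquad (i = n-1, \dots, 0)$$
$$q \leftarrow \frac{s_{n-1}^\top y_{n-1}}{y_{n-1}^\top y_{n-1}}\; q$$
$$\beta = \rho_i\, y_i^\top q, \qquad q \leftarrow q + (\alpha_i - \beta)\, s_i \qquad (i = 0, \dots, n-1)$$

Rather than updating $q$ directly, the code tracks its coefficients (the delta array) over the basis $\{s_i, y_i, \text{steepest-descent dir}\}$, so only the $O(m^2)$ pairwise dot products ever need an Allreduce.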

View File

@@ -0,0 +1,204 @@
#ifndef RABIT_LEARN_UTILS_BASE64_H_
#define RABIT_LEARN_UTILS_BASE64_H_
/*!
* \file base64.h
* \brief data stream support to input and output from/to base64 stream
* base64 is easier to store and pass as text format in mapreduce
* \author Tianqi Chen
*/
#include <cctype>
#include <cstdio>
#include <rabit/io.h>
namespace rabit {
namespace utils {
/*! \brief namespace of base64 decoding and encoding table */
namespace base64 {
const char DecodeTable[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62, // '+'
0, 0, 0,
63, // '/'
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9'
0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z'
0, 0, 0, 0, 0, 0,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z'
};
static const char EncodeTable[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
} // namespace base64
/*! \brief the stream that reads from base64, note we take from file pointers */
class Base64InStream: public IStream {
public:
explicit Base64InStream(FILE *fp) : fp(fp) {
num_prev = 0; tmp_ch = 0;
}
/*!
* \brief initialize the stream position to beginning of next base64 stream
* call this function before actually start read
*/
inline void InitPosition(void) {
// get a character
do {
tmp_ch = fgetc(fp);
} while (isspace(tmp_ch));
}
/*! \brief whether current position is end of a base64 stream */
inline bool IsEOF(void) const {
return num_prev == 0 && (tmp_ch == EOF || isspace(tmp_ch));
}
virtual size_t Read(void *ptr, size_t size) {
using base64::DecodeTable;
if (size == 0) return 0;
// use tlen to record left size
size_t tlen = size;
unsigned char *cptr = static_cast<unsigned char*>(ptr);
// if anything left, load from previous buffered result
if (num_prev != 0) {
if (num_prev == 2) {
if (tlen >= 2) {
*cptr++ = buf_prev[0];
*cptr++ = buf_prev[1];
tlen -= 2;
num_prev = 0;
} else {
// assert tlen == 1
*cptr++ = buf_prev[0]; --tlen;
buf_prev[0] = buf_prev[1];
num_prev = 1;
}
} else {
// assert num_prev == 1
*cptr++ = buf_prev[0]; --tlen; num_prev = 0;
}
}
if (tlen == 0) return size;
int nvalue;
// note: everything goes with 4 bytes in Base64
// so we process 4 bytes a unit
while (tlen && tmp_ch != EOF && !isspace(tmp_ch)) {
// first byte
nvalue = DecodeTable[tmp_ch] << 18;
{
// second byte
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
nvalue |= DecodeTable[tmp_ch] << 12;
*cptr++ = (nvalue >> 16) & 0xFF; --tlen;
}
{
// third byte
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
// handle termination
if (tmp_ch == '=') {
Check((tmp_ch = fgetc(fp), tmp_ch == '='), "invalid base64 format");
Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
"invalid base64 format");
break;
}
nvalue |= DecodeTable[tmp_ch] << 6;
if (tlen) {
*cptr++ = (nvalue >> 8) & 0xFF; --tlen;
} else {
buf_prev[num_prev++] = (nvalue >> 8) & 0xFF;
}
}
{
// fourth byte
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
if (tmp_ch == '=') {
Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
"invalid base64 format");
break;
}
nvalue |= DecodeTable[tmp_ch];
if (tlen) {
*cptr++ = nvalue & 0xFF; --tlen;
} else {
buf_prev[num_prev ++] = nvalue & 0xFF;
}
}
// get next char
tmp_ch = fgetc(fp);
}
if (kStrictCheck) {
Check(tlen == 0, "Base64InStream: read incomplete");
}
return size - tlen;
}
virtual void Write(const void *ptr, size_t size) {
utils::Error("Base64InStream do not support write");
}
private:
FILE *fp;
int tmp_ch;
int num_prev;
unsigned char buf_prev[2];
// whether we need to do strict check
static const bool kStrictCheck = false;
};
/*! \brief the stream that write to base64, note we take from file pointers */
class Base64OutStream: public IStream {
public:
explicit Base64OutStream(FILE *fp) : fp(fp) {
buf_top = 0;
}
virtual void Write(const void *ptr, size_t size) {
using base64::EncodeTable;
size_t tlen = size;
const unsigned char *cptr = static_cast<const unsigned char*>(ptr);
while (tlen) {
while (buf_top < 3 && tlen != 0) {
buf[++buf_top] = *cptr++; --tlen;
}
if (buf_top == 3) {
// flush 4 bytes out
fputc(EncodeTable[buf[1] >> 2], fp);
fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
fputc(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F], fp);
fputc(EncodeTable[buf[3] & 0x3F], fp);
buf_top = 0;
}
}
}
virtual size_t Read(void *ptr, size_t size) {
Error("Base64OutStream do not support read");
return 0;
}
/*!
* \brief finish writing of all current base64 stream, do some post processing
* \param endch character to put at the end of stream; if it is EOF, nothing will be done
*/
inline void Finish(char endch = EOF) {
using base64::EncodeTable;
if (buf_top == 1) {
fputc(EncodeTable[buf[1] >> 2], fp);
fputc(EncodeTable[(buf[1] << 4) & 0x3F], fp);
fputc('=', fp);
fputc('=', fp);
}
if (buf_top == 2) {
fputc(EncodeTable[buf[1] >> 2], fp);
fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
fputc(EncodeTable[(buf[2] << 2) & 0x3F], fp);
fputc('=', fp);
}
buf_top = 0;
if (endch != EOF) fputc(endch, fp);
}
private:
FILE *fp;
int buf_top;
unsigned char buf[4];
};
} // namespace utils
} // namespace rabit
#endif // RABIT_LEARN_UTILS_BASE64_H_
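A minimal round-trip sketch of the two streams (hypothetical file name; only APIs defined in this header are assumed):

#include <cstdio>
#include "base64.h"  // this header
int main(void) {
  int x = 42;
  FILE *fo = std::fopen("demo.b64", "w");
  rabit::utils::Base64OutStream bout(fo);
  bout.Write(&x, sizeof(x));
  bout.Finish('\n');   // flush the '=' padding and terminate the stream
  std::fclose(fo);
  FILE *fi = std::fopen("demo.b64", "r");
  rabit::utils::Base64InStream bin(fi);
  bin.InitPosition();  // skip whitespace to the first base64 character
  int y = 0;
  bin.Read(&y, sizeof(y));
  std::fclose(fi);
  return y == x ? 0 : 1;  // the value should survive the round trip
}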

View File

@@ -1,24 +1,38 @@
#include <rabit.h>
/*!
* Copyright (c) 2015 by Contributors
* \file data.h
* \brief simple data structure that could be used by model
*
* \author Tianqi Chen
*/
#ifndef RABIT_LEARN_DATA_H_
#define RABIT_LEARN_DATA_H_
#include <vector>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <limits>
#include <cmath>
#include <rabit.h>
namespace rabit {
// typedef index type
typedef unsigned index_t;
/*! \brief sparse matrix, CSR format */
struct SparseMat {
// sparse matrix entry
struct Entry {
// feature index
unsigned findex;
index_t findex;
// feature value
float fvalue;
};
// sparse vector
struct Vector {
const Entry *data;
unsigned length;
index_t length;
inline const Entry &operator[](size_t i) const {
return data[i];
}
@@ -26,7 +40,7 @@ struct SparseMat {
inline Vector operator[](size_t i) const {
Vector v;
v.data = &data[0] + row_ptr[i];
v.length = static_cast<unsigned>(row_ptr[i + 1]-row_ptr[i]);
v.length = static_cast<index_t>(row_ptr[i + 1]-row_ptr[i]);
return v;
}
// load data from LibSVM format
@@ -35,7 +49,13 @@ struct SparseMat {
if (!strcmp(fname, "stdin")) {
fi = stdin;
} else {
fi = utils::FopenCheck(fname, "r");
if (strchr(fname, '%') != NULL) {
char s_tmp[256];
snprintf(s_tmp, sizeof(s_tmp), fname, rabit::GetRank());
fi = utils::FopenCheck(s_tmp, "r");
} else {
fi = utils::FopenCheck(fname, "r");
}
}
row_ptr.clear();
row_ptr.push_back(0);
@@ -45,9 +65,11 @@ struct SparseMat {
char tmp[1024];
while (fscanf(fi, "%s", tmp) == 1) {
Entry e;
if (sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) {
unsigned long fidx;
if (sscanf(tmp, "%lu:%f", &fidx, &e.fvalue) == 2) {
e.findex = static_cast<index_t>(fidx);
data.push_back(e);
feat_dim = std::max(e.findex, feat_dim);
feat_dim = std::max(fidx, feat_dim);
} else {
if (!init) {
labels.push_back(label);
@@ -61,6 +83,9 @@ struct SparseMat {
labels.push_back(label);
row_ptr.push_back(data.size());
feat_dim += 1;
utils::Check(feat_dim < std::numeric_limits<index_t>::max(),
"feature dimension exceeds the limit of index_t, "\
"consider changing index_t to unsigned long");
// close the file
if (fi != stdin) fclose(fi);
}
@@ -68,7 +93,7 @@
return row_ptr.size() - 1;
}
// maximum feature dimension
unsigned feat_dim;
size_t feat_dim;
std::vector<size_t> row_ptr;
std::vector<Entry> data;
std::vector<float> labels;
@@ -115,3 +140,4 @@ inline int Random(int value) {
return rand() % value;
}
} // namespace rabit
#endif // RABIT_LEARN_DATA_H_
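A small illustration of the CSR fields above (hypothetical two-row matrix):

// row 0 = {1:0.5, 3:2.0}, row 1 = {2:1.5}
// data    = [(1, 0.5), (3, 2.0), (2, 1.5)]   // Entry pairs findex:fvalue
// row_ptr = [0, 2, 3]
// mat[1] yields a Vector with data = &data[0] + 2 and length = 3 - 2 = 1
// Load("mushroom.row%d") substitutes rabit::GetRank() for the %d, so
// worker 3 reads "mushroom.row3" -- matching the shards splitrows.py writes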

View File

@@ -0,0 +1,40 @@
#ifndef RABIT_LEARN_UTILS_IO_H_
#define RABIT_LEARN_UTILS_IO_H_
/*!
* \file io.h
* \brief additional stream interface
* \author Tianqi Chen
*/
#include <cstdio>
#include <rabit/io.h>
namespace rabit {
namespace utils {
/*! \brief implementation of file i/o stream */
class FileStream : public ISeekStream {
public:
explicit FileStream(FILE *fp) : fp(fp) {}
explicit FileStream(void) {
this->fp = NULL;
}
virtual size_t Read(void *ptr, size_t size) {
return std::fread(ptr, 1, size, fp);  // return the number of bytes read
}
virtual void Write(const void *ptr, size_t size) {
std::fwrite(ptr, size, 1, fp);
}
virtual void Seek(size_t pos) {
std::fseek(fp, static_cast<long>(pos), SEEK_SET);
}
virtual size_t Tell(void) {
return std::ftell(fp);
}
inline void Close(void) {
if (fp != NULL){
std::fclose(fp); fp = NULL;
}
}
private:
FILE *fp;
};
} // namespace utils
} // namespace rabit
#endif // RABIT_LEARN_UTILS_IO_H_

View File

@@ -77,7 +77,10 @@ void AllreduceRobust::Allreduce(void *sendrecvbuf_,
PreprocFunction prepare_fun,
void *prepare_arg) {
// skip action in single node
if (world_size == 1) return;
if (world_size == 1) {
if (prepare_fun != NULL) prepare_fun(prepare_arg);
return;
}
bool recovered = RecoverExec(sendrecvbuf_, type_nbytes * count, 0, seq_counter);
// now we are free to remove the last result, if any
if (resbuf.LastSeqNo() != -1 &&

View File

@@ -92,6 +92,7 @@ void Allreduce_(void *sendrecvbuf,
mpi::OpType op,
IEngine::PreprocFunction prepare_fun,
void *prepare_arg) {
if (prepare_fun != NULL) prepare_fun(prepare_arg);
}
// code for reduce handle
@@ -106,6 +107,8 @@ void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {}
void ReduceHandle::Allreduce(void *sendrecvbuf,
size_t type_nbytes, size_t count,
IEngine::PreprocFunction prepare_fun,
void *prepare_arg) {}
void *prepare_arg) {
if (prepare_fun != NULL) prepare_fun(prepare_arg);
}
} // namespace engine
} // namespace rabit
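Both engine changes close the same gap: a caller may pass a lazy prepare_fun to Allreduce that fills the buffer only when the result cannot be recovered from a checkpoint, and that callback must still fire when no real reduction happens (single node, or the empty/MPI engine stubs). A minimal usage sketch (hypothetical data; the prepare_fun/prepare_arg parameters are those shown in the engine code above):

#include <rabit.h>
static float buf[4];
static void Prepare(void *arg) {
  // fill the buffer lazily, immediately before the reduction runs
  for (int i = 0; i < 4; ++i) buf[i] = 1.0f;
}
int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  rabit::Allreduce<rabit::op::Sum>(buf, 4, Prepare, NULL);
  rabit::Finalize();
  return 0;
}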