Merge commit '13776a006a4e572720ec4c5b029b54771cf2b35c' into unity

This commit is contained in:
tqchen
2015-02-11 20:33:35 -08:00
21 changed files with 9698 additions and 22 deletions

View File

@@ -0,0 +1,14 @@
# names of the binaries produced from this directory
# (these variables are presumably consumed by ../common.mk -- confirm there)
BIN = linear.rabit
MOCKBIN= linear.mock
MPIBIN =
# object files that make up the linear program
OBJ = linear.o
# common build script for programs
include ../common.mk
# linear.h includes <omp.h> and linear.cc uses OpenMP pragmas
CFLAGS+=-fopenmp
# linear.cc and linear.h include headers from ../utils and ../solver,
# so changes to either directory must trigger a rebuild
linear.o: linear.cc ../../src/*.h linear.h ../utils/*.h ../solver/*.h
# dependencies here
linear.rabit: linear.o lib
linear.mock: linear.o lib

View File

@@ -0,0 +1,33 @@
Linear and Logistic Regression
====
* input format: LibSVM
* Example: [run-linear.sh](run-linear.sh)
Parameters
===
All the parameters can be set by param=value
#### Important Parameters
* objective [default = logistic]
- can be linear or logistic
* base_score [default = 0.5]
- global bias; it is recommended to set this to the mean value of the label
* reg_L1 [default = 0]
- l1 regularization co-efficient
* reg_L2 [default = 0]
- l2 regularization co-efficient
* lbfgs_stop_tol [default = 1e-5]
- relative tolerance level of loss reduction with respect to initial loss
* max_lbfgs_iter [default = 500]
- maximum number of lbfgs iterations
#### Optimization Related Parameters
* min_lbfgs_iter [default = 5]
- minimum number of lbfgs iterations
* max_linesearch_iter [default = 100]
- maximum number of iterations in linesearch
* linesearch_c1 [default = 1e-4]
- c1 co-efficient in backoff linesearch
* linesearch_backoff [default = 0.5]
- backoff ratio in linesearch

View File

@@ -0,0 +1,239 @@
#include "./linear.h"
#include "../utils/io.h"
#include "../utils/base64.h"
namespace rabit {
namespace linear {
/*!
 * \brief objective function of linear and logistic regression
 *
 *  The object owns the model, the training data and the L-BFGS solver.
 *  It registers itself with the solver through SetObjFunction and provides
 *  the virtual callbacks InitNumDim, InitModel, Load, Save, Eval and
 *  CalcGrad (presumably declared by solver::IObjFunction -- confirm there).
 */
class LinearObjFunction : public solver::IObjFunction<float> {
 public:
  // training threads; 0 leaves the OpenMP thread count untouched
  int nthread;
  // L2 regularization
  float reg_L2;
  // model
  LinearModel model;
  // training data
  SparseMat dtrain;
  // solver
  solver::LBFGSSolver<float> lbfgs;
  // constructor
  LinearObjFunction(void) {
    lbfgs.SetObjFunction(this);
    nthread = 1;
    reg_L2 = 0.0f;
    model.weight = NULL;
    task = "train";
    // the literal string "NULL" means "no input model was given"
    model_in = "NULL";
    name_pred = "pred.txt";
    model_out = "final.model";
  }
  virtual ~LinearObjFunction(void) {
  }
  /*!
   * \brief set parameters; every pair is forwarded to both the model
   *  parameters and the solver before the local options are checked
   * \param name name of the parameter
   * \param val value of the parameter
   */
  inline void SetParam(const char *name, const char *val) {
    model.param.SetParam(name, val);
    lbfgs.SetParam(name, val);
    if (!strcmp(name, "num_feature")) {
      // the solver dimension has one extra slot that holds the bias
      char ndigit[30];
      // size_t is not guaranteed to be unsigned long, cast to match %lu
      sprintf(ndigit, "%lu",
              static_cast<unsigned long>(model.param.num_feature + 1));
      lbfgs.SetParam("num_dim", ndigit);
    }
    if (!strcmp(name, "reg_L2")) {
      reg_L2 = static_cast<float>(atof(val));
    }
    if (!strcmp(name, "nthread")) {
      nthread = atoi(val);
    }
    if (!strcmp(name, "task")) task = val;
    if (!strcmp(name, "model_in")) model_in = val;
    if (!strcmp(name, "model_out")) model_out = val;
    if (!strcmp(name, "name_pred")) name_pred = val;
  }
  /*!
   * \brief run the configured task: load model_in when one was given,
   *  then either train and save to model_out, or write predictions
   */
  inline void Run(void) {
    if (model_in != "NULL") {
      this->LoadModel(model_in.c_str());
    }
    if (task == "train") {
      lbfgs.Run();
      this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
    } else if (task == "pred") {
      this->TaskPred();
    } else {
      utils::Error("unknown task=%s", task.c_str());
    }
  }
  /*!
   * \brief predict every row of the loaded data with the loaded model and
   *  write one prediction per line to name_pred
   */
  inline void TaskPred(void) {
    utils::Check(model_in != "NULL",
                 "must set model_in for task=pred");
    FILE *fp = utils::FopenCheck(name_pred.c_str(), "w");
    for (size_t i = 0; i < dtrain.NumRow(); ++i) {
      float pred = model.Predict(dtrain[i]);
      fprintf(fp, "%g\n", pred);
    }
    fclose(fp);
    printf("Finishing writing to %s\n", name_pred.c_str());
  }
  /*!
   * \brief load the model from a file written by SaveModel
   * \param fname name of the model file; its first four bytes select the
   *  encoding, "bs64" for base64 or "binf" for raw binary
   */
  inline void LoadModel(const char *fname) {
    FILE *fp = utils::FopenCheck(fname, "rb");
    std::string header; header.resize(4);
    // check header for different binary encode
    // can be base64 or binary
    utils::FileStream fi(fp);
    utils::Check(fi.Read(&header[0], 4) != 0, "invalid model");
    if (header == "bs64") {
      // base64 format
      utils::Base64InStream bsin(fp);
      bsin.InitPosition();
      model.Load(bsin);
      fclose(fp);
    } else if (header == "binf") {
      // raw binary format
      model.Load(fi);
      fclose(fp);
    } else {
      // release the handle before reporting the error
      fclose(fp);
      utils::Error("invalid model file");
    }
  }
  /*!
   * \brief save the model parameters together with a weight vector
   * \param fname output file name, "stdout" writes to standard output
   * \param wptr weight vector to be saved
   * \param save_base64 whether to use the base64 encoding;
   *  base64 is always used when writing to stdout
   */
  inline void SaveModel(const char *fname,
                        const float *wptr,
                        bool save_base64 = false) {
    FILE *fp;
    bool use_stdout = false;
    if (!strcmp(fname, "stdout")) {
      fp = stdout;
      use_stdout = true;
    } else {
      fp = utils::FopenCheck(fname, "wb");
    }
    utils::FileStream fo(fp);
    if (save_base64 || use_stdout) {
      fo.Write("bs64\t", 5);
      utils::Base64OutStream bout(fp);
      model.Save(bout, wptr);
      bout.Finish('\n');
    } else {
      fo.Write("binf", 4);
      model.Save(fo, wptr);
    }
    // stdout is not owned by this function
    if (!use_stdout) {
      fclose(fp);
    }
  }
  /*!
   * \brief load the data set into dtrain
   * \param fname name of the data file
   */
  inline void LoadData(const char *fname) {
    dtrain.Load(fname);
  }
  /*!
   * \brief get the dimension of the weight vector
   * \return number of features plus one slot for the bias
   */
  virtual size_t InitNumDim(void) {
    if (model_in == "NULL") {
      // no input model: take the largest feature dimension over all workers
      size_t ndim = dtrain.feat_dim;
      rabit::Allreduce<rabit::op::Max>(&ndim, 1);
      model.param.num_feature = std::max(ndim, model.param.num_feature);
    }
    return model.param.num_feature + 1;
  }
  /*!
   * \brief initialize the weight vector
   * \param weight the weight vector to be filled
   * \param size number of entries in weight
   */
  virtual void InitModel(float *weight, size_t size) {
    if (model_in == "NULL") {
      // all-zero bytes are 0.0f for IEEE floats
      memset(weight, 0, size * sizeof(float));
      model.param.InitBaseScore();
    } else {
      // start from the loaded weights, broadcast from rank 0
      rabit::Broadcast(model.weight, size * sizeof(float), 0);
      memcpy(weight, model.weight, size * sizeof(float));
    }
  }
  // load the model parameters (the weights are not part of this stream)
  virtual void Load(rabit::IStream &fi) {
    fi.Read(&model.param, sizeof(model.param));
  }
  // save the model parameters (the weights are not part of this stream)
  virtual void Save(rabit::IStream &fo) const {
    fo.Write(&model.param, sizeof(model.param));
  }
  /*!
   * \brief evaluate the objective on the local data
   * \param weight weight vector, the last entry is the bias
   * \param size number of entries in weight, must be num_feature + 1
   * \return sum of the loss over the local rows; rank 0 additionally
   *  adds the L2 penalty, which does not include the bias
   */
  virtual double Eval(const float *weight, size_t size) {
    if (nthread != 0) omp_set_num_threads(nthread);
    utils::Check(size == model.param.num_feature + 1,
                 "size consistency check");
    double sum_val = 0.0;
    #pragma omp parallel for schedule(static) reduction(+:sum_val)
    for (size_t i = 0; i < dtrain.NumRow(); ++i) {
      float py = model.param.PredictMargin(weight, dtrain[i]);
      float fv = model.param.MarginToLoss(dtrain.labels[i], py);
      sum_val += fv;
    }
    if (rabit::GetRank() == 0) {
      // only add regularization once
      if (reg_L2 != 0.0f) {
        double sum_sqr = 0.0;
        for (size_t i = 0; i < model.param.num_feature; ++i) {
          sum_sqr += weight[i] * weight[i];
        }
        sum_val += 0.5 * reg_L2 * sum_sqr;
      }
    }
    utils::Check(!std::isnan(sum_val), "nan occurs");
    return sum_val;
  }
  /*!
   * \brief compute the gradient of the objective on the local data
   * \param out_grad output gradient, the last entry is the bias gradient
   * \param weight weight vector, the last entry is the bias
   * \param size number of entries in both vectors, must be num_feature + 1
   */
  virtual void CalcGrad(float *out_grad,
                        const float *weight,
                        size_t size) {
    if (nthread != 0) omp_set_num_threads(nthread);
    utils::Check(size == model.param.num_feature + 1,
                 "size consistency check");
    memset(out_grad, 0, sizeof(float) * size);
    double sum_gbias = 0.0;
    #pragma omp parallel for schedule(static) reduction(+:sum_gbias)
    for (size_t i = 0; i < dtrain.NumRow(); ++i) {
      SparseMat::Vector v = dtrain[i];
      float py = model.param.Predict(weight, v);
      float grad = model.param.PredToGrad(dtrain.labels[i], py);
      for (index_t j = 0; j < v.length; ++j) {
        // skip features outside of the model, as PredictMargin does;
        // otherwise the write lands in the bias slot or past the buffer
        if (v[j].findex >= model.param.num_feature) continue;
        const float delta = v[j].fvalue * grad;
        // rows handled by different threads can share a feature index,
        // so the shared accumulator must be updated atomically
        #pragma omp atomic
        out_grad[v[j].findex] += delta;
      }
      sum_gbias += grad;
    }
    out_grad[model.param.num_feature] = static_cast<float>(sum_gbias);
    if (rabit::GetRank() == 0) {
      // only add regularization once
      if (reg_L2 != 0.0f) {
        for (size_t i = 0; i < model.param.num_feature; ++i) {
          out_grad[i] += reg_L2 * weight[i];
        }
      }
    }
  }

 private:
  // task to run, "train" or "pred"
  std::string task;
  // input model file, the string "NULL" when not set
  std::string model_in;
  // output model file of task=train
  std::string model_out;
  // output prediction file of task=pred
  std::string name_pred;
};
} // namespace linear
} // namespace rabit
/*!
 * \brief program entry: <data_in> followed by any number of param=val pairs
 *  argv[1] is the data file name, "stdin" is handled specially (see below)
 */
int main(int argc, char *argv[]) {
  if (argc < 2) {
    // initialize rabit engine, so that only rank 0 prints the usage
    rabit::Init(argc, argv);
    if (rabit::GetRank() == 0) {
      rabit::TrackerPrintf("Usage: <data_in> param=val\n");
    }
    rabit::Finalize();
    return 0;
  }
  rabit::linear::LinearObjFunction linear;
  if (!strcmp(argv[1], "stdin")) {
    // for "stdin" the data is loaded before the engine is initialized
    // NOTE(review): presumably so that the input is fully consumed before
    // the engine starts -- confirm
    linear.LoadData(argv[1]);
    rabit::Init(argc, argv);
  } else {
    rabit::Init(argc, argv);
    linear.LoadData(argv[1]);
  }
  for (int i = 2; i < argc; ++i) {
    char name[256], val[256];
    // field widths keep over-long arguments from overflowing the buffers;
    // arguments that are not of the form name=value are ignored
    if (sscanf(argv[i], "%255[^=]=%255s", name, val) == 2) {
      linear.SetParam(name, val);
    }
  }
  linear.Run();
  rabit::Finalize();
  return 0;
}

View File

@@ -0,0 +1,133 @@
/*!
* Copyright (c) 2015 by Contributors
* \file linear.h
* \brief Linear and Logistic regression
*
* \author Tianqi Chen
*/
#ifndef RABIT_LINEAR_H_
#define RABIT_LINEAR_H_
#include <omp.h>
#include "../utils/data.h"
#include "../solver/lbfgs.h"
namespace rabit {
namespace linear {
/*! \brief simple linear model */
struct LinearModel {
  /*!
   * \brief parameters and loss definition of the model
   *  Load and Save of LinearModel transfer this struct as raw bytes,
   *  so the field layout is part of the model file format
   */
  struct ModelParam {
    /*! \brief global bias */
    float base_score;
    /*! \brief number of features */
    size_t num_feature;
    /*! \brief loss type, 0 is linear (squared error), 1 is logistic */
    int loss_type;
    // reserved field
    int reserved[16];
    // constructor
    ModelParam(void) {
      base_score = 0.5f;
      num_feature = 0;
      loss_type = 1;
      std::memset(reserved, 0, sizeof(reserved));
    }
    /*!
     * \brief initialize base score
     *  for logistic loss the score is given as a probability and is
     *  converted to the margin (logit) space used by PredictMargin;
     *  for linear loss the score already is a margin and is kept as is
     */
    inline void InitBaseScore(void) {
      if (loss_type != 1) return;
      utils::Check(base_score > 0.0f && base_score < 1.0f,
                   "base_score must be in (0,1) for logistic loss");
      base_score = -std::log(1.0f / base_score - 1.0f);
    }
    /*!
     * \brief set parameters from outside
     * \param name name of the parameter
     * \param val value of the parameter
     */
    inline void SetParam(const char *name, const char *val) {
      using namespace std;
      if (!strcmp("base_score", name)) {
        base_score = static_cast<float>(atof(val));
      }
      if (!strcmp("num_feature", name)) {
        num_feature = static_cast<size_t>(atol(val));
      }
      if (!strcmp("objective", name)) {
        if (!strcmp("linear", val)) {
          loss_type = 0;
        } else if (!strcmp("logistic", val)) {
          loss_type = 1;
        } else {
          utils::Error("unknown objective type %s\n", val);
        }
      }
    }
    // transform margin to prediction: sigmoid for logistic, identity otherwise
    inline float MarginToPred(float margin) const {
      if (loss_type == 1) {
        return 1.0f / (1.0f + std::exp(-margin));
      } else {
        return margin;
      }
    }
    /*!
     * \brief margin to loss
     * \param label label of the instance
     * \param margin margin computed by PredictMargin
     * \return cross entropy for logistic loss, half squared error otherwise
     */
    inline float MarginToLoss(float label, float margin) const {
      if (loss_type == 1) {
        // nlogprob = log(1 + exp(-margin)); the branch keeps the argument
        // of exp non-positive so that it cannot overflow
        float nlogprob;
        if (margin > 0.0f) {
          nlogprob = std::log(1.0f + std::exp(-margin));
        } else {
          nlogprob = -margin + std::log(1.0f + std::exp(margin));
        }
        return label * nlogprob +
            (1.0f - label) * (margin + nlogprob);
      } else {
        float diff = margin - label;
        return 0.5f * diff * diff;
      }
    }
    // gradient of the loss with respect to the margin, given the prediction
    inline float PredToGrad(float label, float pred) const {
      return pred - label;
    }
    /*!
     * \brief compute the margin of one instance
     * \param weight weight vector with num_feature + 1 entries
     * \param v sparse feature vector of the instance
     * \return base_score + bias + dot product over the known features
     */
    inline float PredictMargin(const float *weight,
                               const SparseMat::Vector &v) const {
      // weight[num_feature] is bias
      float sum = base_score + weight[num_feature];
      for (unsigned i = 0; i < v.length; ++i) {
        // features that the model does not know are ignored
        if (v[i].findex >= num_feature) continue;
        sum += weight[v[i].findex] * v[i].fvalue;
      }
      return sum;
    }
    // predict one instance: margin followed by MarginToPred
    inline float Predict(const float *weight,
                         const SparseMat::Vector &v) const {
      return MarginToPred(PredictMargin(weight, v));
    }
  };
  // model parameter
  ModelParam param;
  // weight corresponding to the model, num_feature + 1 entries once loaded;
  // owned by this object and released with delete [] in the destructor
  // NOTE(review): there is no copy constructor or assignment operator, so
  // copying a model that holds weights makes two objects delete one buffer
  float *weight;
  LinearModel(void) : weight(NULL) {
  }
  ~LinearModel(void) {
    if (weight != NULL) delete [] weight;
  }
  /*!
   * \brief load the parameters followed by num_feature + 1 weights
   * \param fi input stream
   */
  inline void Load(rabit::IStream &fi) {
    utils::Check(fi.Read(&param, sizeof(param)) != 0, "invalid model");
    // always reallocate: a buffer left over from an earlier load may be
    // too small for the num_feature that was just read
    delete [] weight;
    weight = new float[param.num_feature + 1];
    utils::Check(fi.Read(weight,
                         sizeof(float) * (param.num_feature + 1)) != 0,
                 "invalid model");
  }
  /*!
   * \brief save the parameters followed by num_feature + 1 weights
   * \param fo output stream
   * \param wptr weights to be saved, the stored weight is used when NULL
   */
  inline void Save(rabit::IStream &fo, const float *wptr = NULL) const {
    fo.Write(&param, sizeof(param));
    if (wptr == NULL) wptr = weight;
    fo.Write(wptr, sizeof(float) * (param.num_feature + 1));
  }
  // predict one instance with the stored weight
  inline float Predict(const SparseMat::Vector &v) const {
    return param.Predict(weight, v);
  }
};
} // namespace linear
} // namespace rabit
#endif // RABIT_LINEAR_H_

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Split the mushroom training set into <nprocess> row shards and train the
# linear model with the mock binary on <nprocess> workers.
# Arguments: nprocess [param=value ...]
if [[ $# -lt 1 ]]
then
    echo "Usage: nprocess"
    # exit status must be within 0-255
    exit 1
fi
rm -rf mushroom.row* *.model
k=$1
# split the libsvm file into k subfiles
python splitrows.py ../data/agaricus.txt.train mushroom "$k"
# run the mock linear solver; "${@:2}" forwards every extra parameter as its
# own argument ("${*:2}" would join them into a single word)
# NOTE(review): the mock=... arguments are passed through to linear.mock,
# presumably as failure-injection points -- confirm against the mock engine
../../tracker/rabit_demo.py -n "$k" linear.mock mushroom.row\%d "${@:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1

View File

@@ -0,0 +1,17 @@
#!/bin/bash
# Split the mushroom training set into <nprocess> row shards, train the
# linear model on <nprocess> workers, then predict on the test set.
# Arguments: nprocess [param=value ...]
if [[ $# -lt 1 ]]
then
    echo "Usage: nprocess"
    # exit status must be within 0-255
    exit 1
fi
rm -rf mushroom.row* *.model
k=$1
# split the libsvm file into k subfiles
python splitrows.py ../data/agaricus.txt.train mushroom "$k"
# run the distributed linear solver; "${@:2}" forwards every extra parameter
# as its own argument ("${*:2}" would join them into a single word)
../../tracker/rabit_demo.py -n "$k" linear.rabit mushroom.row\%d "${@:2}" reg_L1=1
# predict on the test set with the model written by the training run
./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model

View File

@@ -0,0 +1,24 @@
#!/usr/bin/python
"""Randomly split the rows of a libsvm file into k shard files."""
import sys
import random


def split_rows(fin, fo, k, seed=10):
    """Distribute the lines of ``fin`` over ``k`` files named ``fo.row<i>``.

    Each line is written unchanged to one shard chosen uniformly at random.
    The generator is seeded with ``seed`` so the split is reproducible.

    :param fin: path of the input file
    :param fo: prefix of the output files
    :param k: number of shards
    :param seed: seed of the random generator
    """
    random.seed(seed)
    # the input is opened first so that a missing input creates no shards
    with open(fin, 'r') as src:
        fos = []
        try:
            for i in range(k):
                fos.append(open(fo + '.row%d' % i, 'w'))
            for l in src:
                fos[random.randint(0, k - 1)].write(l)
        finally:
            # close every shard that was opened, even when a step failed
            for f in fos:
                f.close()


if __name__ == '__main__':
    # split libsvm file into different rows
    if len(sys.argv) < 4:
        print ('Usage:<fin> <fo> k')
        sys.exit(0)
    split_rows(sys.argv[1], sys.argv[2], int(sys.argv[3]))