remake the wrapper

This commit is contained in:
tqchen
2014-08-17 17:43:46 -07:00
parent 2c969ecf14
commit af100dd869
18 changed files with 520 additions and 572 deletions

25
src/README.md Normal file
View File

@@ -0,0 +1,25 @@
Coding Guide
======
Project Logical Layout
=======
* Dependency order: io->learner->gbm->tree
- All module depends on data.h
* tree are implementations of tree construction algorithms.
* gbm is the gradient boosting interface; it takes trees and other base learners to do boosting.
- gbm only takes gradient as sufficient statistics, it does not compute the gradient.
* learner is the learning module that computes the gradient for a specific objective and passes it to the GBM
File Naming Convention
=======
* The project is templatized, to make it easy to adjust input data structure.
* .h files are data structures and interface, which are needed to use functions in that layer.
* -inl.hpp files are implementations of the interface, like cpp files in most projects.
- You only need to understand the interface file to understand the usage of that layer
How to Hack the Code
======
* Add objective function: add to learner/objective-inl.hpp and register it in learner/objective.h ```CreateObjFunction```
- You can also directly do it in python
* Add new evaluation metric: add to learner/evaluation-inl.hpp and register it in learner/evaluation.h ```CreateEvaluator```
* Add wrapper for a new language: most likely you can do it by taking the functions in python/xgboost_wrapper.h, which is purely C based, and calling these C functions to use xgboost

View File

@@ -226,8 +226,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
if (this->HaveColAccess()) return;
this->InitColData(max_nrow);
}
/*!
 * \brief get the row iterator associated with FMatrix
 * this function is not threadsafe: it returns the single iterator
 * stored inside FMatrixS, after resetting it with BeforeFirst(),
 * so concurrent callers would share and reset the same iterator
 * \return the shared row iterator, positioned before the first batch
 */
inline utils::IIterator<SparseBatch>* RowIterator(void) const {
iter_->BeforeFirst();
return iter_;
}
/*! \brief set iterator */

View File

@@ -2,6 +2,7 @@
#define _CRT_SECURE_NO_DEPRECATE
#include <string>
#include "./io.h"
#include "../utils/utils.h"
#include "simple_dmatrix-inl.hpp"
// implements data loads using dmatrix simple for now
@@ -12,5 +13,10 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
dmat->CacheLoad(fname, silent, savebuffer);
return dmat;
}
// Placeholder implementation: saving a DataMatrix to a buffer file is not
// yet supported, so this always aborts via utils::Error; the parameters
// dmat, fname and silent are currently unused.
// TODO: implement the binary serialization declared in io.h
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
utils::Error("not implemented");
}
} // namespace io
} // namespace xgboost

View File

@@ -28,8 +28,9 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent = false, bool savebuff
* SaveDataMatrix will choose the best way to materialize the dmatrix.
* \param dmat the dmatrix to be saved
* \param fname file name to be saved
* \param silent whether print message during saving
*/
void SaveDMatrix(const DataMatrix &dmat, const char *fname);
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent = false);
} // namespace io
} // namespace xgboost

View File

@@ -23,7 +23,7 @@ namespace io {
class DMatrixSimple : public DataMatrix {
public:
// constructor
DMatrixSimple(void) {
DMatrixSimple(void) : DataMatrix(kMagic) {
this->fmat.set_iter(new OneBatchIter(this));
this->Clear();
}
@@ -36,6 +36,24 @@ class DMatrixSimple : public DataMatrix {
row_data_.clear();
info.Clear();
}
/*!
 * \brief copy the content (meta information and row data) from a source matrix
 * \param src the source data matrix to copy from
 */
inline void CopyFrom(const DataMatrix &src) {
// NOTE: Clear() must run BEFORE copying the meta info — Clear() resets the
// row storage but also calls info.Clear(), which would wipe the info we
// just copied if the order were reversed (this was a real bug: the old
// order copied src.info first and then immediately cleared it).
this->Clear();
this->info = src.info;
// clone the row content of the source matrix, one sparse batch at a time
utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
const SparseBatch &batch = iter->Value();
for (size_t i = 0; i < batch.size; ++i) {
SparseBatch::Inst inst = batch[i];
// skip empty instances: indexing row_data_ at its current size
// (e.g. [0] on an empty vector) would be undefined behavior
if (inst.length != 0) {
row_data_.resize(row_data_.size() + inst.length);
memcpy(&row_data_[row_ptr_.back()], inst.data,
sizeof(SparseBatch::Entry) * inst.length);
}
// record the row boundary (assumes Clear() leaves row_ptr_ == {0},
// consistent with how AddRow appends rows — TODO confirm in Clear())
row_ptr_.push_back(row_ptr_.back() + inst.length);
}
}
}
/*!
* \brief add a row to the matrix
* \param feats features
@@ -183,7 +201,7 @@ class DMatrixSimple : public DataMatrix {
protected:
// one batch iterator that return content in the matrix
struct OneBatchIter: utils::IIterator<SparseBatch> {
OneBatchIter(DMatrixSimple *parent)
explicit OneBatchIter(DMatrixSimple *parent)
: at_first_(true), parent_(parent) {}
virtual ~OneBatchIter(void) {}
virtual void BeforeFirst(void) {

View File

@@ -6,6 +6,7 @@
* used for regression/classification/ranking
* \author Tianqi Chen
*/
#include <vector>
#include "../data.h"
namespace xgboost {
@@ -43,7 +44,7 @@ struct MetaInfo {
}
/*! \brief get weight of each instances */
inline float GetWeight(size_t i) const {
if(weights.size() != 0) {
if (weights.size() != 0) {
return weights[i];
} else {
return 1.0f;
@@ -51,7 +52,7 @@ struct MetaInfo {
}
/*! \brief get root index of i-th instance */
inline float GetRoot(size_t i) const {
if(root_index.size() != 0) {
if (root_index.size() != 0) {
return static_cast<float>(root_index[i]);
} else {
return 0;
@@ -76,7 +77,7 @@ struct MetaInfo {
// try to load group information from file, if exists
inline bool TryLoadGroup(const char* fname, bool silent = false) {
FILE *fi = fopen64(fname, "r");
if (fi == NULL) return false;
if (fi == NULL) return false;
group_ptr.push_back(0);
unsigned nline;
while (fscanf(fi, "%u", &nline) == 1) {
@@ -110,6 +111,11 @@ struct MetaInfo {
*/
template<typename FMatrix>
struct DMatrix {
/*!
* \brief magic number associated with this object
* used to check if it is specific instance
*/
const int magic;
/*! \brief meta information about the dataset */
MetaInfo info;
/*! \brief feature matrix about data content */
@@ -120,7 +126,7 @@ struct DMatrix {
*/
void *cache_learner_ptr_;
/*! \brief default constructor */
DMatrix(void) : cache_learner_ptr_(NULL) {}
explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
// virtual destructor
virtual ~DMatrix(void){}
};

View File

@@ -39,7 +39,7 @@ inline IEvaluator* CreateEvaluator(const char *name) {
if (!strcmp(name, "merror")) return new EvalMatchError();
if (!strcmp(name, "logloss")) return new EvalLogLoss();
if (!strcmp(name, "auc")) return new EvalAuc();
if (!strncmp(name, "ams@",4)) return new EvalAMS(name);
if (!strncmp(name, "ams@", 4)) return new EvalAMS(name);
if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
if (!strncmp(name, "map", 3)) return new EvalMAP(name);
if (!strncmp(name, "ndcg", 3)) return new EvalNDCG(name);

View File

@@ -78,6 +78,7 @@ class BoostLearner {
inline void SetParam(const char *name, const char *val) {
if (!strcmp(name, "silent")) silent = atoi(val);
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
if (!strcmp("seed", name)) random::Seed(atoi(val));
if (gbm_ == NULL) {
if (!strcmp(name, "objective")) name_obj_ = val;
if (!strcmp(name, "booster")) name_gbm_ = val;
@@ -132,16 +133,24 @@ class BoostLearner {
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
this->SaveModel(fo);
fo.Close();
}
}
/*!
 * \brief check if the data matrix is ready to be used by training,
 *        and if not, initialize it
 * currently this only ensures column access is initialized on the
 * feature matrix (presumably required by the boosting updaters — see
 * FMatrix::InitColAccess, which is a no-op when access already exists)
 * \param p_train pointer to the matrix used by training
 */
inline void CheckInit(DMatrix<FMatrix> *p_train) const {
p_train->fmat.InitColAccess();
}
/*!
* \brief update the model for one iteration
* \param iter current iteration number
* \param p_train pointer to the data matrix
*/
inline void UpdateOneIter(int iter, DMatrix<FMatrix> *p_train) {
this->PredictRaw(*p_train, &preds_);
obj_->GetGradient(preds_, p_train->info, iter, &gpair_);
gbm_->DoBoost(gpair_, p_train->fmat, p_train->info.root_index);
inline void UpdateOneIter(int iter, const DMatrix<FMatrix> &train) {
this->PredictRaw(train, &preds_);
obj_->GetGradient(preds_, train.info, iter, &gpair_);
gbm_->DoBoost(gpair_, train.fmat, train.info.root_index);
}
/*!
* \brief evaluate the model for specific iteration

View File

@@ -48,7 +48,6 @@ class BoostLearnTask{
inline void SetParam(const char *name, const char *val) {
if (!strcmp("silent", name)) silent = atoi(val);
if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
if (!strcmp("seed", name)) random::Seed(atoi(val));
if (!strcmp("num_round", name)) num_round = atoi(val);
if (!strcmp("save_period", name)) save_period = atoi(val);
if (!strcmp("eval_train", name)) eval_train = atoi(val);
@@ -103,9 +102,6 @@ class BoostLearnTask{
} else {
// training
data = io::LoadDataMatrix(train_path.c_str(), silent != 0, use_buffer != 0);
{// intialize column access
data->fmat.InitColAccess();
}
utils::Assert(eval_data_names.size() == eval_data_paths.size(), "BUG");
for (size_t i = 0; i < eval_data_names.size(); ++i) {
deval.push_back(io::LoadDataMatrix(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0));
@@ -139,10 +135,11 @@ class BoostLearnTask{
inline void TaskTrain(void) {
const time_t start = time(NULL);
unsigned long elapsed = 0;
learner.CheckInit(data);
for (int i = 0; i < num_round; ++i) {
elapsed = (unsigned long)(time(NULL) - start);
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
learner.UpdateOneIter(i,data);
learner.UpdateOneIter(i, *data);
std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
fprintf(stderr, "%s\n", res.c_str());
if (save_period != 0 && (i + 1) % save_period == 0) {