remake the wrapper
This commit is contained in:
25
src/README.md
Normal file
25
src/README.md
Normal file
@@ -0,0 +1,25 @@
|
||||
Coding Guide
|
||||
======
|
||||
|
||||
Project Logical Layout
|
||||
=======
|
||||
* Dependency order: io->learner->gbm->tree
|
||||
- All modules depend on data.h
|
||||
* tree contains implementations of tree construction algorithms.
|
||||
* gbm is the gradient boosting interface, which takes trees and other base learners to do boosting.
|
||||
- gbm only takes gradient as sufficient statistics, it does not compute the gradient.
|
||||
* learner is the learning module that computes gradients for a specific objective and passes them to the GBM
|
||||
|
||||
File Naming Convention
|
||||
=======
|
||||
* The project is templatized, to make it easy to adjust input data structure.
|
||||
* .h files are data structures and interface, which are needed to use functions in that layer.
|
||||
* -inl.hpp files are implementations of the interface, like cpp files in most projects.
|
||||
- You only need to understand the interface file to understand the usage of that layer
|
||||
|
||||
How to Hack the Code
|
||||
======
|
||||
* Add objective function: add to learner/objective-inl.hpp and register it in learner/objective.h ```CreateObjFunction```
|
||||
- You can also directly do it in python
|
||||
* Add new evaluation metric: add to learner/evaluation-inl.hpp and register it in learner/evaluation.h ```CreateEvaluator```
|
||||
* Add wrapper for a new language, most likely you can do it by taking the functions in python/xgboost_wrapper.h, which is purely C based, and call these C functions to use xgboost
|
||||
@@ -226,8 +226,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
|
||||
if (this->HaveColAccess()) return;
|
||||
this->InitColData(max_nrow);
|
||||
}
|
||||
/*! \brief get the row iterator associated with FMatrix */
|
||||
/*!
|
||||
* \brief get the row iterator associated with FMatrix
|
||||
* this function is not threadsafe, returns iterator stored in FMatrixS
|
||||
*/
|
||||
inline utils::IIterator<SparseBatch>* RowIterator(void) const {
  // rewind the shared iterator before handing it out; every call returns
  // the same stored iterator object, which is why this is not threadsafe
  iter_->BeforeFirst();
  return iter_;
}
|
||||
/*! \brief set iterator */
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#define _CRT_SECURE_NO_DEPRECATE
|
||||
#include <string>
|
||||
#include "./io.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "simple_dmatrix-inl.hpp"
|
||||
// implements data loads using dmatrix simple for now
|
||||
|
||||
@@ -12,5 +13,10 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
|
||||
dmat->CacheLoad(fname, silent, savebuffer);
|
||||
return dmat;
|
||||
}
|
||||
|
||||
// Saving a DataMatrix back to disk is not supported yet: this stub always
// aborts with an error via utils::Error, ignoring all of its arguments.
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
  utils::Error("not implemented");
}
|
||||
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -28,8 +28,9 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent = false, bool savebuff
|
||||
* SaveDMatrix will choose the best way to materialize the dmatrix.
|
||||
* \param dmat the dmatrix to be saved
|
||||
* \param fname file name to be saved
|
||||
* \param silent whether print message during saving
|
||||
*/
|
||||
void SaveDMatrix(const DataMatrix &dmat, const char *fname);
|
||||
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent = false);
|
||||
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -23,7 +23,7 @@ namespace io {
|
||||
class DMatrixSimple : public DataMatrix {
|
||||
public:
|
||||
// constructor
|
||||
DMatrixSimple(void) {
|
||||
DMatrixSimple(void) : DataMatrix(kMagic) {
|
||||
this->fmat.set_iter(new OneBatchIter(this));
|
||||
this->Clear();
|
||||
}
|
||||
@@ -36,6 +36,24 @@ class DMatrixSimple : public DataMatrix {
|
||||
row_data_.clear();
|
||||
info.Clear();
|
||||
}
|
||||
/*! \brief copy content data from source matrix */
|
||||
inline void CopyFrom(const DataMatrix &src) {
|
||||
this->info = src.info;
|
||||
this->Clear();
|
||||
// clone data content in thos matrix
|
||||
utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const SparseBatch &batch = iter->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
SparseBatch::Inst inst = batch[i];
|
||||
row_data_.resize(row_data_.size() + inst.length);
|
||||
memcpy(&row_data_[row_ptr_.back()], inst.data,
|
||||
sizeof(SparseBatch::Entry) * inst.length);
|
||||
row_ptr_.push_back(row_ptr_.back() + inst.length);
|
||||
}
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief add a row to the matrix
|
||||
* \param feats features
|
||||
@@ -183,7 +201,7 @@ class DMatrixSimple : public DataMatrix {
|
||||
protected:
|
||||
// one batch iterator that return content in the matrix
|
||||
struct OneBatchIter: utils::IIterator<SparseBatch> {
|
||||
OneBatchIter(DMatrixSimple *parent)
|
||||
explicit OneBatchIter(DMatrixSimple *parent)
|
||||
: at_first_(true), parent_(parent) {}
|
||||
virtual ~OneBatchIter(void) {}
|
||||
virtual void BeforeFirst(void) {
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
* used for regression/classification/ranking
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include "../data.h"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -43,7 +44,7 @@ struct MetaInfo {
|
||||
}
|
||||
/*! \brief get weight of each instances */
|
||||
inline float GetWeight(size_t i) const {
|
||||
if(weights.size() != 0) {
|
||||
if (weights.size() != 0) {
|
||||
return weights[i];
|
||||
} else {
|
||||
return 1.0f;
|
||||
@@ -51,7 +52,7 @@ struct MetaInfo {
|
||||
}
|
||||
/*! \brief get root index of i-th instance */
|
||||
inline float GetRoot(size_t i) const {
|
||||
if(root_index.size() != 0) {
|
||||
if (root_index.size() != 0) {
|
||||
return static_cast<float>(root_index[i]);
|
||||
} else {
|
||||
return 0;
|
||||
@@ -76,7 +77,7 @@ struct MetaInfo {
|
||||
// try to load group information from file, if exists
|
||||
inline bool TryLoadGroup(const char* fname, bool silent = false) {
|
||||
FILE *fi = fopen64(fname, "r");
|
||||
if (fi == NULL) return false;
|
||||
if (fi == NULL) return false;
|
||||
group_ptr.push_back(0);
|
||||
unsigned nline;
|
||||
while (fscanf(fi, "%u", &nline) == 1) {
|
||||
@@ -110,6 +111,11 @@ struct MetaInfo {
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
struct DMatrix {
|
||||
/*!
|
||||
* \brief magic number associated with this object
|
||||
* used to check if it is specific instance
|
||||
*/
|
||||
const int magic;
|
||||
/*! \brief meta information about the dataset */
|
||||
MetaInfo info;
|
||||
/*! \brief feature matrix about data content */
|
||||
@@ -120,7 +126,7 @@ struct DMatrix {
|
||||
*/
|
||||
void *cache_learner_ptr_;
|
||||
/*! \brief default constructor */
|
||||
DMatrix(void) : cache_learner_ptr_(NULL) {}
|
||||
explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
|
||||
// virtual destructor
|
||||
virtual ~DMatrix(void){}
|
||||
};
|
||||
|
||||
@@ -39,7 +39,7 @@ inline IEvaluator* CreateEvaluator(const char *name) {
|
||||
if (!strcmp(name, "merror")) return new EvalMatchError();
|
||||
if (!strcmp(name, "logloss")) return new EvalLogLoss();
|
||||
if (!strcmp(name, "auc")) return new EvalAuc();
|
||||
if (!strncmp(name, "ams@",4)) return new EvalAMS(name);
|
||||
if (!strncmp(name, "ams@", 4)) return new EvalAMS(name);
|
||||
if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
|
||||
if (!strncmp(name, "map", 3)) return new EvalMAP(name);
|
||||
if (!strncmp(name, "ndcg", 3)) return new EvalNDCG(name);
|
||||
|
||||
@@ -78,6 +78,7 @@ class BoostLearner {
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp(name, "silent")) silent = atoi(val);
|
||||
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
|
||||
if (!strcmp("seed", name)) random::Seed(atoi(val));
|
||||
if (gbm_ == NULL) {
|
||||
if (!strcmp(name, "objective")) name_obj_ = val;
|
||||
if (!strcmp(name, "booster")) name_gbm_ = val;
|
||||
@@ -132,16 +133,24 @@ class BoostLearner {
|
||||
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
|
||||
this->SaveModel(fo);
|
||||
fo.Close();
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief check if data matrix is ready to be used by training,
|
||||
* if not, initialize it
|
||||
* \param p_train pointer to the matrix used by training
|
||||
*/
|
||||
inline void CheckInit(DMatrix<FMatrix> *p_train) const {
|
||||
p_train->fmat.InitColAccess();
|
||||
}
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iter current iteration number
|
||||
* \param p_train pointer to the data matrix
|
||||
*/
|
||||
inline void UpdateOneIter(int iter, DMatrix<FMatrix> *p_train) {
|
||||
this->PredictRaw(*p_train, &preds_);
|
||||
obj_->GetGradient(preds_, p_train->info, iter, &gpair_);
|
||||
gbm_->DoBoost(gpair_, p_train->fmat, p_train->info.root_index);
|
||||
inline void UpdateOneIter(int iter, const DMatrix<FMatrix> &train) {
|
||||
this->PredictRaw(train, &preds_);
|
||||
obj_->GetGradient(preds_, train.info, iter, &gpair_);
|
||||
gbm_->DoBoost(gpair_, train.fmat, train.info.root_index);
|
||||
}
|
||||
/*!
|
||||
* \brief evaluate the model for specific iteration
|
||||
|
||||
@@ -48,7 +48,6 @@ class BoostLearnTask{
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp("silent", name)) silent = atoi(val);
|
||||
if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
|
||||
if (!strcmp("seed", name)) random::Seed(atoi(val));
|
||||
if (!strcmp("num_round", name)) num_round = atoi(val);
|
||||
if (!strcmp("save_period", name)) save_period = atoi(val);
|
||||
if (!strcmp("eval_train", name)) eval_train = atoi(val);
|
||||
@@ -103,9 +102,6 @@ class BoostLearnTask{
|
||||
} else {
|
||||
// training
|
||||
data = io::LoadDataMatrix(train_path.c_str(), silent != 0, use_buffer != 0);
|
||||
{// initialize column access
|
||||
data->fmat.InitColAccess();
|
||||
}
|
||||
utils::Assert(eval_data_names.size() == eval_data_paths.size(), "BUG");
|
||||
for (size_t i = 0; i < eval_data_names.size(); ++i) {
|
||||
deval.push_back(io::LoadDataMatrix(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0));
|
||||
@@ -139,10 +135,11 @@ class BoostLearnTask{
|
||||
inline void TaskTrain(void) {
|
||||
const time_t start = time(NULL);
|
||||
unsigned long elapsed = 0;
|
||||
learner.CheckInit(data);
|
||||
for (int i = 0; i < num_round; ++i) {
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
|
||||
learner.UpdateOneIter(i,data);
|
||||
learner.UpdateOneIter(i, *data);
|
||||
std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
|
||||
fprintf(stderr, "%s\n", res.c_str());
|
||||
if (save_period != 0 && (i + 1) % save_period == 0) {
|
||||
|
||||
Reference in New Issue
Block a user