complete refactor of data.h, now relies on iterators to access columns

This commit is contained in:
tqchen@graphlab.com
2014-08-27 17:00:21 -07:00
parent a59f8945dc
commit 605269133e
15 changed files with 216 additions and 492 deletions

View File

@@ -8,7 +8,7 @@
*/
#include <vector>
#include "../data.h"
#include "../utils/io.h"
namespace xgboost {
namespace learner {
/*!
@@ -142,7 +142,6 @@ struct MetaInfo {
* \brief data object used for learning,
* \tparam FMatrix type of feature data source
*/
template<typename FMatrix>
struct DMatrix {
/*!
* \brief magic number associated with this object
@@ -152,7 +151,7 @@ struct DMatrix {
/*! \brief meta information about the dataset */
MetaInfo info;
/*! \brief feature matrix about data content */
FMatrix fmat;
IFMatrix *fmat;
/*!
* \brief cache pointer to verify if the data structure is cached in some learner
* used to verify if DMatrix is cached
@@ -161,7 +160,9 @@ struct DMatrix {
/*! \brief default constructor */
explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
// virtual destructor
virtual ~DMatrix(void){}
virtual ~DMatrix(void){
delete fmat;
}
};
} // namespace learner

View File

@@ -21,7 +21,6 @@ namespace learner {
 * \brief learner that does gradient boosting on specific objective functions
 *  and performs training and prediction
*/
template<typename FMatrix>
class BoostLearner {
public:
BoostLearner(void) {
@@ -44,7 +43,7 @@ class BoostLearner {
* data matrices to continue training otherwise it will cause error
* \param mats array of pointers to matrix whose prediction result need to be cached
*/
inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) {
inline void SetCacheData(const std::vector<DMatrix*>& mats) {
// estimate feature bound
unsigned num_feature = 0;
// assign buffer index
@@ -158,15 +157,15 @@ class BoostLearner {
 * if not, initialize it
* \param p_train pointer to the matrix used by training
*/
inline void CheckInit(DMatrix<FMatrix> *p_train) {
p_train->fmat.InitColAccess(prob_buffer_row);
inline void CheckInit(DMatrix *p_train) {
p_train->fmat->InitColAccess(prob_buffer_row);
}
/*!
* \brief update the model for one iteration
* \param iter current iteration number
* \param p_train pointer to the data matrix
*/
inline void UpdateOneIter(int iter, const DMatrix<FMatrix> &train) {
inline void UpdateOneIter(int iter, const DMatrix &train) {
this->PredictRaw(train, &preds_);
obj_->GetGradient(preds_, train.info, iter, &gpair_);
gbm_->DoBoost(train.fmat, train.info.info, &gpair_);
@@ -179,7 +178,7 @@ class BoostLearner {
* \return a string corresponding to the evaluation result
*/
inline std::string EvalOneIter(int iter,
const std::vector<const DMatrix<FMatrix>*> &evals,
const std::vector<const DMatrix*> &evals,
const std::vector<std::string> &evname) {
std::string res;
char tmp[256];
@@ -198,7 +197,7 @@ class BoostLearner {
* \param metric name of metric
* \return a pair of <evaluation name, result>
*/
std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) {
std::pair<std::string, float> Evaluate(const DMatrix &data, std::string metric) {
if (metric == "auto") metric = obj_->DefaultEvalMetric();
IEvaluator *ev = CreateEvaluator(metric.c_str());
this->PredictRaw(data, &preds_);
@@ -213,7 +212,7 @@ class BoostLearner {
* \param output_margin whether to only predict margin value instead of transformed prediction
* \param out_preds output vector that stores the prediction
*/
inline void Predict(const DMatrix<FMatrix> &data,
inline void Predict(const DMatrix &data,
bool output_margin,
std::vector<float> *out_preds) const {
this->PredictRaw(data, out_preds);
@@ -235,7 +234,7 @@ class BoostLearner {
if (obj_ != NULL) return;
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
obj_ = CreateObjFunction(name_obj_.c_str());
gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str());
gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
for (size_t i = 0; i < cfg_.size(); ++i) {
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
@@ -247,7 +246,7 @@ class BoostLearner {
* \param data training data matrix
* \param out_preds output vector that stores the prediction
*/
inline void PredictRaw(const DMatrix<FMatrix> &data,
inline void PredictRaw(const DMatrix &data,
std::vector<float> *out_preds) const {
gbm_->Predict(data.fmat, this->FindBufferOffset(data),
data.info.info, out_preds);
@@ -307,7 +306,7 @@ class BoostLearner {
// model parameter
ModelParam mparam;
// gbm model that back everything
gbm::IGradBooster<FMatrix> *gbm_;
gbm::IGradBooster *gbm_;
// name of gbm model used for training
std::string name_gbm_;
// objective function
@@ -324,14 +323,14 @@ class BoostLearner {
private:
// cache entry object that helps handle feature caching
struct CacheEntry {
const DMatrix<FMatrix> *mat_;
const DMatrix *mat_;
size_t buffer_offset_;
size_t num_row_;
CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row)
CacheEntry(const DMatrix *mat, size_t buffer_offset, size_t num_row)
:mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
};
// find internal buffer offset for a certain matrix; if it does not exist, return -1
inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const {
inline int64_t FindBufferOffset(const DMatrix &mat) const {
for (size_t i = 0; i < cache_.size(); ++i) {
if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
if (cache_[i].num_row_ == mat.info.num_row()) {