diff --git a/.gitignore b/.gitignore index 620d3dc8a..f800ecd23 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ *.lai *.la *.a +*~ diff --git a/Makefile b/Makefile index 6a2365ca7..5d3f9d527 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ OBJ = xgboost.o all: $(BIN) $(OBJ) export LDFLAGS= -pthread -lm -xgboost.o: booster/xgboost.cpp +xgboost.o: booster/*.h booster/*.cpp $(BIN) : $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) diff --git a/README.md b/README.md index 1c6464162..a0accc96c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,20 @@ xgboost ======= +Creator: Tianqi Chen: tianqi.tchen@gmail.com General Purpose Gradient Boosting Library + +Intention: A stand-alone efficient library to do machine learning in functional space + +Planned key components (TODO): + +(1) Gradient boosting models: + - regression tree + - linear model/lasso +(2) Objectives to support tasks: + - regression + - classification + - ranking + - matrix factorization + - structured prediction +(3) OpenMP support for parallelization (optional) diff --git a/booster/xgboost.h b/booster/xgboost.h index 405901aa1..3646ecc98 100644 --- a/booster/xgboost.h +++ b/booster/xgboost.h @@ -12,6 +12,7 @@ /*! \brief namespace for xboost package */ namespace xgboost{ + /*! \brief namespace for boosters */ namespace booster{ /*! 
\brief interface of a gradient boosting learner */ class IBooster{ @@ -19,11 +20,12 @@ namespace xgboost{ // interface for model setting and loading // calling procedure: // (1) booster->SetParam to setting necessary parameters - // (2) if it is first time usage of the model: call booster-> - // if new model to be trained, trainer->init_trainer - // elseif just to load from file, trainer->load_model - // trainer->do_boost - // trainer->save_model + // (2) if it is first time usage of the model: + // call booster->InitModel + // else: + // call booster->LoadModel + // (3) booster->DoBoost to update the model + // (4) booster->Predict to get new prediction /*! * \brief set parameters from outside * \param name name of the parameter @@ -59,7 +61,7 @@ namespace xgboost{ const FMatrixS::Image &feats, const std::vector &root_index ) = 0; /*! - * \brief predict values for given sparse feature + * \brief predict values for given sparse feature vector * NOTE: in tree implementation, this is not threadsafe * \param feat vector in sparse format * \param rid root id of current instance, default = 0 @@ -70,7 +72,7 @@ namespace xgboost{ return 0.0f; } /*! - * \brief predict values for given dense feature + * \brief predict values for given dense feature vector * \param feat feature vector in dense format * \param funknown indicator that the feature is missing * \param rid root id of current instance, default = 0 @@ -88,6 +90,7 @@ namespace xgboost{ */ virtual void PrintInfo( FILE *fo ){} public: + /*! \brief virtual destructor */ virtual ~IBooster( void ){} }; }; diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h index fc9e800ff..289f2e056 100644 --- a/booster/xgboost_data.h +++ b/booster/xgboost_data.h @@ -1,5 +1,6 @@ #ifndef _XGBOOST_DATA_H_ #define _XGBOOST_DATA_H_ + /*! * \file xgboost_data.h * \brief the input data structure for gradient boosting @@ -24,7 +25,7 @@ namespace xgboost{ namespace xgboost{ namespace booster{ /*! 
- * \brief auxlilary feature matrix to store training instance, in sparse CSR format + * \brief feature matrix to store training instance, in sparse CSR format */ class FMatrixS{ public: @@ -35,7 +36,7 @@ namespace xgboost{ /*! \brief array of feature value */ const bst_float *fvalue; /*! \brief size of the data */ - bst_int len; + bst_uint len; }; /*! * \brief remapped image of sparse matrix, @@ -89,12 +90,12 @@ namespace xgboost{ * \param feat sparse feature * \param fstart start bound of feature * \param fend end bound range of feature - * \return the row id addted + * \return the row id of the added line */ inline size_t AddRow( const Line &feat, unsigned fstart = 0, unsigned fend = UINT_MAX ){ utils::Assert( feat.len >= 0, "sparse feature length can not be negative" ); unsigned cnt = 0; - for( int i = 0; i < feat.len; i ++ ){ + for( unsigned i = 0; i < feat.len; i ++ ){ if( feat.findex[i] < fstart || feat.findex[i] >= fend ) continue; findex.push_back( feat.findex[i] ); fvalue.push_back( feat.fvalue[i] ); @@ -103,11 +104,27 @@ namespace xgboost{ row_ptr.push_back( row_ptr.back() + cnt ); return row_ptr.size() - 2; } + + /*! + * \brief add a row to the matrix, with data stored in STL containers + * \param findex feature index + * \param fvalue feature value + * \return the row id of the added line + */ + inline size_t AddRow( const std::vector &findex, + const std::vector &fvalue ){ + FMatrixS::Line l; + utils::Assert( findex.size() == fvalue.size() ); + l.findex = &findex[0]; + l.fvalue = &fvalue[0]; + l.len = static_cast( findex.size() ); + return this->AddRow( l ); + } /*! \brief get sparse part of current row */ inline Line operator[]( size_t sidx ) const{ Line sp; utils::Assert( !bst_debug || sidx < this->NumRow(), "row id exceed bound" ); - sp.len = row_ptr[ sidx + 1 ] - row_ptr[ sidx ]; + sp.len = static_cast( row_ptr[ sidx + 1 ] - row_ptr[ sidx ] ); sp.findex = &findex[ row_ptr[ sidx ] ]; sp.fvalue = &fvalue[ row_ptr[ sidx ] ]; return sp;