diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..6a2365ca7 --- /dev/null +++ b/Makefile @@ -0,0 +1,25 @@ +export CC = gcc +export CXX = g++ +export CFLAGS = -Wall -O3 -msse2 + +# specify tensor path +BIN = +OBJ = xgboost.o +.PHONY: clean all + +all: $(BIN) $(OBJ) +export LDFLAGS= -pthread -lm + +xgboost.o: booster/xgboost.cpp + +$(BIN) : + $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) + +$(OBJ) : + $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) + +install: + cp -f -r $(BIN) $(INSTALL_PATH) + +clean: + $(RM) $(OBJ) $(BIN) *~ diff --git a/booster/xgboost.cpp b/booster/xgboost.cpp new file mode 100644 index 000000000..f3fb3f3ee --- /dev/null +++ b/booster/xgboost.cpp @@ -0,0 +1,13 @@ +/*! + * \file xgboost.cpp + * \brief booster implementations + * \author Tianqi Chen: tianqi.tchen@gmail.com + */ +// implementations of boosters go here +#define _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_DEPRECATE +#include +#include "xgboost.h" +#include "../utils/xgboost_utils.h" + + diff --git a/booster/xgboost.h b/booster/xgboost.h new file mode 100644 index 000000000..405901aa1 --- /dev/null +++ b/booster/xgboost.h @@ -0,0 +1,96 @@ +#ifndef _XGBOOST_H_ +#define _XGBOOST_H_ +/*! + * \file xgboost.h + * \brief the general gradient boosting interface + * \author Tianqi Chen: tianqi.tchen@gmail.com + */ +#include +#include "../utils/xgboost_utils.h" +#include "../utils/xgboost_stream.h" +#include "xgboost_data.h" + +/*! \brief namespace for xgboost package */ +namespace xgboost{ + namespace booster{ + /*! 
\brief interface of a gradient boosting learner */ + class IBooster{ + public: + // interface for model setting and loading (method names below are the ones declared in this class) + // calling procedure: + // (1) booster->SetParam to set the necessary parameters + // (2) prepare the model: + // if a new model is to be trained, booster->InitModel + // else if the model is just loaded from a stream, booster->LoadModel + // (3) booster->DoBoost + // (4) booster->SaveModel + /*! + * \brief set parameters from outside + * \param name name of the parameter + * \param val value of the parameter + */ + virtual void SetParam( const char *name, const char *val ) = 0; + /*! + * \brief load model from stream + * \param fi input stream + */ + virtual void LoadModel( utils::IStream &fi ) = 0; + /*! + * \brief save model to stream + * \param fo output stream + */ + virtual void SaveModel( utils::IStream &fo ) const = 0; + /*! + * \brief initialize solver before training, called before training; + * this function is reserved for the solver to allocate necessary space and do other preparations + */ + virtual void InitModel( void ) = 0; + public: + /*! + * \brief do gradient boost training for one step, using the information given + * \param grad first order gradient of each instance + * \param hess second order gradient of each instance + * \param feats features of each instance + * \param root_index pre-partitioned root index of each instance, + * root_index.size() can be 0, which indicates that no pre-partition is involved + */ + virtual void DoBoost( std::vector &grad, + std::vector &hess, + const FMatrixS::Image &feats, + const std::vector &root_index ) = 0; + /*! + * \brief predict values for given sparse feature + * NOTE: in tree implementation, this is not threadsafe + * \param feat vector in sparse format + * \param rid root id of current instance, default = 0 + * \return prediction; this default implementation only reports not implemented through utils::Error + */ + virtual float Predict( const FMatrixS::Line &feat, unsigned rid = 0 ){ + utils::Error( "not implemented" ); + return 0.0f; + } + /*! 
+ * \brief predict values for given dense feature + * \param feat feature vector in dense format + * \param funknown indicator that the feature is missing + * \param rid root id of current instance, default = 0 + * \return prediction + */ + virtual float Predict( const std::vector &feat, + const std::vector &funknown, + unsigned rid = 0 ){ + utils::Error( "not implemented" ); + return 0.0f; + } + /*! + * \brief print information; this default implementation prints nothing + * \param fo output stream + */ + virtual void PrintInfo( FILE *fo ){} + public: + virtual ~IBooster( void ){} + }; + }; +}; + +#endif diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h new file mode 100644 index 000000000..fc9e800ff --- /dev/null +++ b/booster/xgboost_data.h @@ -0,0 +1,118 @@ +#ifndef _XGBOOST_DATA_H_ +#define _XGBOOST_DATA_H_ +/*! + * \file xgboost_data.h + * \brief the input data structure for gradient boosting + * \author Tianqi Chen: tianqi.tchen@gmail.com + */ + +#include +#include "../utils/xgboost_utils.h" + +namespace xgboost{ + namespace booster{ + /*! \brief integer type used in boost */ + typedef int bst_int; + /*! \brief unsigned integer type used in boost */ + typedef unsigned bst_uint; + /*! \brief float type used in boost */ + typedef float bst_float; + /*! \brief debug option for booster; when true, FMatrixS::operator[] asserts that the row index is in range */ + const bool bst_debug = false; + }; +}; +namespace xgboost{ + namespace booster{ + /*! + * \brief auxiliary feature matrix to store training instances, in sparse CSR format + */ + class FMatrixS{ + public: + /*! \brief one row of sparse feature matrix */ + struct Line{ + /*! \brief array of feature index */ + const bst_uint *findex; + /*! \brief array of feature value */ + const bst_float *fvalue; + /*! \brief number of entries in findex and fvalue */ + bst_int len; + }; + /*! 
+ * \brief remapped image of sparse matrix, + * allows use a subset of sparse matrix, by specifying a rowmap + */ + struct Image{ + public: + Image( const FMatrixS &smat ):smat(smat), row_map( tmp_rowmap ){ + } + Image( const FMatrixS &smat, const std::vector &row_map ) + :smat(smat), row_map(row_map){ + } + /*! \brief get sparse part of row sidx; sidx is translated through row_map unless row_map is empty */ + inline Line operator[]( size_t sidx ) const{ + if( row_map.size() == 0 ) return smat[ sidx ]; + else return smat[ row_map[ sidx ] ]; + } + private: + // empty row map that row_map refers to when the one-argument constructor is used; NOTE(review): row_map is a reference member, so a copy of such an Image would still refer to the tmp_rowmap of the source object - confirm that Images are never copied + std::vector tmp_rowmap; + const FMatrixS &smat; + const std::vector &row_map; + }; + public: + // -----Note: unless needed for hacking, these fields should not be accessed directly ----- + /*! \brief row pointer of CSR sparse storage; row i occupies entries row_ptr[i] up to but excluding row_ptr[i+1] of findex and fvalue */ + std::vector row_ptr; + /*! \brief index of CSR format */ + std::vector findex; + /*! \brief value of CSR format */ + std::vector fvalue; + public: + /*! \brief constructor, starts from the cleared state */ + FMatrixS( void ){ this->Clear(); } + /*! + * \brief get number of rows + * \return number of rows + */ + inline size_t NumRow( void ) const{ + return row_ptr.size() - 1; + } + /*! \brief clear the storage; row_ptr is left with a single 0 entry, so NumRow() returns 0 afterwards */ + inline void Clear( void ){ + row_ptr.resize( 0 ); + findex.resize( 0 ); + fvalue.resize( 0 ); + row_ptr.push_back( 0 ); + } + /*! + * \brief add a row to the matrix, but only accept features whose index satisfies fstart <= index < fend + * \param feat sparse feature + * \param fstart inclusive lower bound of the accepted feature index + * \param fend exclusive upper bound of the accepted feature index + * \return the row id added + */ + inline size_t AddRow( const Line &feat, unsigned fstart = 0, unsigned fend = UINT_MAX ){ + utils::Assert( feat.len >= 0, "sparse feature length can not be negative" ); + unsigned cnt = 0; + for( int i = 0; i < feat.len; i ++ ){ + if( feat.findex[i] < fstart || feat.findex[i] >= fend ) continue; + findex.push_back( feat.findex[i] ); + fvalue.push_back( feat.fvalue[i] ); + cnt ++; + } + row_ptr.push_back( row_ptr.back() + cnt ); + return row_ptr.size() - 2; + } + /*! 
\brief get sparse part of current row */ + inline Line operator[]( size_t sidx ) const{ + Line sp; + utils::Assert( !bst_debug || sidx < this->NumRow(), "row id exceed bound" ); + sp.len = row_ptr[ sidx + 1 ] - row_ptr[ sidx ]; + sp.findex = &findex[ row_ptr[ sidx ] ]; + sp.fvalue = &fvalue[ row_ptr[ sidx ] ]; + return sp; + } + }; + }; +}; +#endif diff --git a/utils/xgboost_stream.h b/utils/xgboost_stream.h new file mode 100644 index 000000000..e32725d75 --- /dev/null +++ b/utils/xgboost_stream.h @@ -0,0 +1,52 @@ +#ifndef _XGBOOST_STREAM_H_ +#define _XGBOOST_STREAM_H_ + +#include +/*! + * \file xgboost_stream.h + * \brief general stream interface + * \author Tianqi Chen: tianqi.tchen@gmail.com + */ +namespace xgboost{ + namespace utils{ + /*! \brief interface of stream I/O, used for serialization */ + class IStream{ + public: + /*! + * \brief read data from stream + * \param ptr pointer to memory buffer + * \param size size of block + * \return usually the size of the data read; NOTE(review): FileStream::Read returns the result of fread( ptr, size, 1, fp ), which is a count of size-byte items (0 or 1) rather than a byte count - confirm what callers expect + */ + virtual size_t Read( void *ptr, size_t size ) = 0; + /*! + * \brief write data to stream + * \param ptr pointer to memory buffer + * \param size size of block + */ + virtual void Write( const void *ptr, size_t size ) = 0; + /*! \brief virtual destructor */ + virtual ~IStream( void ){} + }; + + /*! \brief implementation of file i/o stream on top of a FILE pointer given to the constructor; the file is closed only by an explicit call to Close, and Write does not check the result of fwrite */ + class FileStream: public IStream{ + private: + FILE *fp; + public: + FileStream( FILE *fp ){ + this->fp = fp; + } + virtual size_t Read( void *ptr, size_t size ){ + return fread( ptr, size, 1, fp ); + } + virtual void Write( const void *ptr, size_t size ){ + fwrite( ptr, size, 1, fp ); + } + inline void Close( void ){ + fclose( fp ); + } + }; + }; +}; +#endif diff --git a/utils/xgboost_utils.h b/utils/xgboost_utils.h new file mode 100644 index 000000000..658e9bf84 --- /dev/null +++ b/utils/xgboost_utils.h @@ -0,0 +1,67 @@ +#ifndef _XGBOOST_UTILS_H_ +#define _XGBOOST_UTILS_H_ + +/*! 
+ * \file xgboost_utils.h + * \brief simple utils to support the code + * \author Tianqi Chen: tianqi.tchen@gmail.com + */ + +#ifdef _MSC_VER +#define fopen64 fopen +#else + +// use 64 bit offset: include this header in the beginning so that the _FILE_OFFSET_BITS definition below precedes the system headers (NOTE(review): the original comment ended with an unfinished alternative) +#ifdef _FILE_OFFSET_BITS +#if _FILE_OFFSET_BITS == 32 +#warning "FILE OFFSET BITS defined to be 32 bit" +#endif +#endif + +#ifdef __APPLE__ +#define off64_t off_t +#define fopen64 fopen +#endif + +#define _FILE_OFFSET_BITS 64 +extern "C"{ +#include +}; +#include +#endif + +#include +#include + +namespace xgboost{ + /*! \brief namespace for helper utils of the project; Error prints the message to stderr and calls exit( -1 ), Assert calls Error when its condition is false, Warning only prints to stderr */ + namespace utils{ + inline void Error( const char *msg ){ + fprintf( stderr, "Error:%s\n",msg ); + exit( -1 ); + } + + inline void Assert( bool exp ){ + if( !exp ) Error( "AssertError" ); + } + + inline void Assert( bool exp, const char *msg ){ + if( !exp ) Error( msg ); + } + + inline void Warning( const char *msg ){ + fprintf( stderr, "warning:%s\n",msg ); + } + /*! \brief replace fopen: opens the file through fopen64 and, when the open fails, prints the file name to stderr and calls exit( -1 ) */ + inline FILE *FopenCheck( const char *fname , const char *flag ){ + FILE *fp = fopen64( fname , flag ); + if( fp == NULL ){ + fprintf( stderr, "can not open file \"%s\"\n",fname ); + exit( -1 ); + } + return fp; + } + }; +}; + +#endif