spelling, wording, and doc fixes in C++ code
I was reading through the code and fixed some things in the comments. Only a few trivial changes to the actual code were made, to make things more readable.
parent c56c1b9482
commit c70022e6c4
src/data.h (18 changed lines)
@@ -14,7 +14,7 @@
 namespace xgboost {
 /*!
- * \brief unsigned interger type used in boost,
+ * \brief unsigned integer type used in boost,
  *        used for feature index and row index
  */
 typedef unsigned bst_uint;
@@ -35,8 +35,8 @@ struct bst_gpair {
 };

 /*!
- * \brief extra information that might needed by gbm and tree module
- *        these information are not necessarily presented, and can be empty
+ * \brief extra information that might be needed by gbm and tree module
+ *        this information is not necessarily present, and can be empty
  */
 struct BoosterInfo {
   /*! \brief number of rows in the data */
@@ -53,7 +53,7 @@ struct BoosterInfo {
   /*! \brief number of rows, number of columns */
   BoosterInfo(void) : num_row(0), num_col(0) {
   }
-  /*! \brief get root of ith instance */
+  /*! \brief get root of i-th instance */
   inline unsigned GetRoot(size_t i) const {
     return root_index.size() == 0 ? 0 : root_index[i];
   }
@@ -120,13 +120,13 @@ struct ColBatch : public SparseBatch {
 };
 /**
  * \brief interface of feature matrix, needed for tree construction
- *        this interface defines two way to access features,
- *        row access is defined by iterator of RowBatch
- *        col access is optional, checked by HaveColAccess, and defined by iterator of ColBatch
+ *        this interface defines two ways to access features:
+ *        row access is defined by iterator of RowBatch
+ *        col access is optional, checked by HaveColAccess, and defined by iterator of ColBatch
  */
 class IFMatrix {
  public:
-  // the interface only need to ganrantee row iter
+  // the interface only need to guarantee row iter
   // column iter is active, when ColIterator is called, row_iter can be disabled
   /*! \brief get the row iterator associated with FMatrix */
   virtual utils::IIterator<RowBatch> *RowIterator(void) = 0;
@@ -142,7 +142,7 @@ class IFMatrix {
    * \brief check if column access is supported, if not, initialize column access
    * \param enabled whether certain feature should be included in column access
    * \param subsample subsample ratio when generating column access
-   * \param max_row_perbatch auxilary information, maximum row used in each column batch
+   * \param max_row_perbatch auxiliary information, maximum row used in each column batch
    *        this is a hint information that can be ignored by the implementation
    */
   virtual void InitColAccess(const std::vector<bool> &enabled,
@@ -58,7 +58,7 @@ class IGradBooster {
     return false;
   }
   /*!
-   * \brief peform update to the model(boosting)
+   * \brief perform update to the model(boosting)
    * \param p_fmat feature matrix that provide access to features
    * \param buffer_offset buffer index offset of these instances, if equals -1
    *        this means we do not have buffer index allocated to the gbm
@@ -88,7 +88,7 @@ class IGradBooster {
                        std::vector<float> *out_preds,
                        unsigned ntree_limit = 0) = 0;
   /*!
-   * \brief online prediction funciton, predict score for one instance at a time
+   * \brief online prediction function, predict score for one instance at a time
    *  NOTE: use the batch prediction interface if possible, batch prediction is usually
    *        more efficient than online prediction
    *  This function is NOT threadsafe, make sure you only call from one thread
@@ -119,7 +119,7 @@ class IGradBooster {
   /*!
    * \brief dump the model in text format
    * \param fmap feature map that may help give interpretations of feature
-   * \param option extra option of the dumo model
+   * \param option extra option of the dump model
    * \return a vector of dump for boosters
    */
   virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) = 0;
@@ -31,7 +31,7 @@ class GBTree : public IGradBooster {
     using namespace std;
     if (!strncmp(name, "bst:", 4)) {
       cfg.push_back(std::make_pair(std::string(name+4), std::string(val)));
-      // set into updaters, if already intialized
+      // set into updaters, if already initialized
       for (size_t i = 0; i < updaters.size(); ++i) {
         updaters[i]->SetParam(name+4, val);
       }
@@ -85,7 +85,7 @@ class GBTree : public IGradBooster {
       fo.Write(BeginPtr(pred_counter), pred_counter.size() * sizeof(unsigned));
     }
   }
-  // initialize the predic buffer
+  // initialize the predict buffer
   virtual void InitModel(void) {
     pred_buffer.clear(); pred_counter.clear();
     pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
@@ -446,7 +446,7 @@ class GBTree : public IGradBooster {
     int num_roots;
     /*! \brief number of features to be used by trees */
     int num_feature;
-    /*! \brief size of predicton buffer allocated used for buffering */
+    /*! \brief size of prediction buffer allocated used for buffering */
     int64_t num_pbuffer;
     /*!
      * \brief how many output group a single instance can produce
@@ -22,7 +22,7 @@ typedef learner::DMatrix DataMatrix;
  * \param silent whether print message during loading
  * \param savebuffer whether temporal buffer the file if the file is in text format
  * \param loadsplit whether we only load a split of input files
- *        such that each worker node get a split of the data
+ *        such that each worker node get a split of the data
  * \param cache_file name of cache_file, used by external memory version
  *        can be NULL, if cache_file is specified, this will be the temporal
  *        space that can be re-used to store intermediate data
@@ -38,7 +38,7 @@ DataMatrix* LoadDataMatrix(const char *fname,
  * note: the saved dmatrix format may not be in exactly same as input
  *       SaveDMatrix will choose the best way to materialize the dmatrix.
  * \param dmat the dmatrix to be saved
- * \param fname file name to be savd
+ * \param fname file name to be saved
  * \param silent whether print message during saving
  */
 void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent = false);
@@ -31,7 +31,7 @@ struct LibSVMPage : public SparsePage {
 /*!
  * \brief libsvm parser that parses the input lines
  *        and returns rows in input data
- *        factry that was used by threadbuffer template
+ *        factory that was used by threadbuffer template
  */
 class LibSVMPageFactory {
  public:
@@ -200,7 +200,7 @@ class FMatrixPage : public IFMatrix {
   virtual bool HaveColAccess(void) const {
     return col_size_.size() != 0;
   }
-  /*! \brief get number of colmuns */
+  /*! \brief get number of columns */
   virtual size_t NumCol(void) const {
     utils::Check(this->HaveColAccess(), "NumCol:need column access");
     return col_size_.size();
@@ -246,7 +246,7 @@ class FMatrixPage : public IFMatrix {
     return &col_iter_;
   }
   /*!
-   * \brief colmun based iterator
+   * \brief column based iterator
    */
   virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
     size_t ncol = this->NumCol();
@@ -290,8 +290,10 @@ class FMatrixPage : public IFMatrix {
     fo->Write(col_size_);
   }
   /*!
-   * \brief intialize column data
+   * \brief initialize column data
+   * \param enabled the list of enabled columns
+   * \param pkeep probability to keep a row
+   * \param max_row_perbatch maximum row per batch
    */
   inline void InitColData(const std::vector<bool> &enabled,
                           float pkeep, size_t max_row_perbatch) {
@@ -319,7 +321,7 @@ class FMatrixPage : public IFMatrix {
       bytes_write += spage;
       double tnow = rabit::utils::GetTime();
       double tdiff = tnow - tstart;
-      utils::Printf("Writting to %s in %g MB/s, %lu MB written\n",
+      utils::Printf("Writing to %s in %g MB/s, %lu MB written\n",
                     col_data_name_.c_str(),
                     (bytes_write >> 20UL) / tdiff,
                     (bytes_write >> 20UL));
@@ -51,7 +51,7 @@ class DMatrixSimple : public DataMatrix {
   inline void CopyFrom(const DataMatrix &src) {
     this->Clear();
     this->info = src.info;
-    // clone data content in thos matrix
+    // clone data contents from src matrix
     utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
@@ -313,7 +313,7 @@ class DMatrixSimple : public DataMatrix {
    private:
     // whether is at first
     bool at_first_;
-    // pointer to parient
+    // pointer to parent
     DMatrixSimple *parent_;
     // temporal space for batch
     RowBatch batch_;
@@ -40,7 +40,7 @@ class FMatrixS : public IFMatrix {
   virtual bool HaveColAccess(void) const {
     return col_size_.size() != 0;
   }
-  /*! \brief get number of colmuns */
+  /*! \brief get number of columns */
   virtual size_t NumCol(void) const {
     utils::Check(this->HaveColAccess(), "NumCol:need column access");
     return col_size_.size();
@@ -83,7 +83,7 @@ class FMatrixS : public IFMatrix {
     return &col_iter_;
   }
   /*!
-   * \brief colmun based iterator
+   * \brief column based iterator
    */
   virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
     size_t ncol = this->NumCol();
@@ -112,7 +112,7 @@ class FMatrixS : public IFMatrix {

  protected:
   /*!
-   * \brief intialize column data
+   * \brief initialize column data
    * \param enabled the list of enabled columns
    * \param pkeep probability to keep a row
    * \param max_row_perbatch maximum row per batch
@@ -33,8 +33,7 @@ class SparsePage {
     return offset.size() - 1;
   }
   /*!
-   * \brief load the by providing a list of interested segments
-   *        only the interested segments are loaded
+   * \brief load only the segments we are interested in
    * \param fi the input stream of the file
    * \param sorted_index_set sorted index of segments we are interested in
    * \return true of the loading as successful, false if end of file was reached
@@ -35,7 +35,7 @@ struct MetaInfo {
   std::vector<float> weights;
   /*!
    * \brief initialized margins,
-   *        if specified, xgboost will start from this init margin
+   *        if specified, xgboost will start from this initial margin
    *        can be used to specify initial prediction to boost from
    */
   std::vector<float> base_margin;
@@ -21,7 +21,7 @@
 namespace xgboost {
 namespace learner {
 /*!
- * \brief base class of elementwise evaluation
+ * \brief base class of element-wise evaluation
  * \tparam Derived the name of subclass
  */
 template<typename Derived>
@@ -57,7 +57,7 @@ struct EvalEWiseBase : public IEvaluator {
    */
   inline static float EvalRow(float label, float pred);
   /*!
-   * \brief to be overide by subclas, final trasnformation
+   * \brief to be overridden by subclass, final transformation
    * \param esum the sum statistics returned by EvalRow
    * \param wsum sum of weight
    */
@@ -109,7 +109,7 @@ struct EvalError : public EvalEWiseBase<EvalError> {
   }
 };

-/*! \brief loglikelihood of poission distribution */
+/*! \brief log-likelihood of Poission distribution */
 struct EvalPoissionNegLogLik : public EvalEWiseBase<EvalPoissionNegLogLik> {
   virtual const char *Name(void) const {
     return "poisson-nloglik";
@@ -174,7 +174,7 @@ struct EvalMClassBase : public IEvaluator {
                              const float *pred,
                              size_t nclass);
   /*!
-   * \brief to be overide by subclas, final trasnformation
+   * \brief to be overridden by subclass, final transformation
    * \param esum the sum statistics returned by EvalRow
    * \param wsum sum of weight
    */
@@ -367,7 +367,7 @@ struct EvalPrecisionRatio : public IEvaluator{
   std::string name_;
 };

-/*! \brief Area under curve, for both classification and rank */
+/*! \brief Area Under Curve, for both classification and rank */
 struct EvalAuc : public IEvaluator {
   virtual float Eval(const std::vector<float> &preds,
                      const MetaInfo &info,
@@ -382,7 +382,7 @@ struct EvalAuc : public IEvaluator {
     utils::Check(gptr.back() == info.labels.size(),
                  "EvalAuc: group structure must match number of prediction");
     const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
-    // sum statictis
+    // sum statistics
     double sum_auc = 0.0f;
     #pragma omp parallel reduction(+:sum_auc)
     {
@@ -404,13 +404,16 @@ struct EvalAuc : public IEvaluator {
         // keep bucketing predictions in same bucket
         if (j != 0 && rec[j].first != rec[j - 1].first) {
           sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
-          sum_npos += buf_pos; sum_nneg += buf_neg;
+          sum_npos += buf_pos;
+          sum_nneg += buf_neg;
           buf_neg = buf_pos = 0.0f;
         }
-        buf_pos += ctr * wt; buf_neg += (1.0f - ctr) * wt;
+        buf_pos += ctr * wt;
+        buf_neg += (1.0f - ctr) * wt;
       }
       sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
-      sum_npos += buf_pos; sum_nneg += buf_neg;
+      sum_npos += buf_pos;
+      sum_nneg += buf_neg;
       // check weird conditions
       utils::Check(sum_npos > 0.0 && sum_nneg > 0.0,
                    "AUC: the dataset only contains pos or neg samples");
@@ -443,7 +446,8 @@ struct EvalRankList : public IEvaluator {
     utils::Check(preds.size() == info.labels.size(),
                  "label size predict size not match");
     // quick consistency when group is not available
-    std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(preds.size());
+    std::vector<unsigned> tgptr(2, 0);
+    tgptr[1] = static_cast<unsigned>(preds.size());
     const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
     utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
     utils::Assert(gptr.back() == preds.size(),
@@ -468,7 +472,7 @@ struct EvalRankList : public IEvaluator {
       float dat[2];
       dat[0] = static_cast<float>(sum_metric);
       dat[1] = static_cast<float>(ngroup);
-      // approximately estimate auc using mean
+      // approximately estimate the metric using mean
       rabit::Allreduce<rabit::op::Sum>(dat, 2);
       return dat[0] / dat[1];
     } else {
@@ -500,14 +504,14 @@ struct EvalRankList : public IEvaluator {
   bool minus_;
 };

-/*! \brief Precison at N, for both classification and rank */
+/*! \brief Precision at N, for both classification and rank */
 struct EvalPrecision : public EvalRankList{
  public:
   explicit EvalPrecision(const char *name) : EvalRankList(name) {}

  protected:
   virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
-    // calculate Preicsion
+    // calculate Precision
     std::sort(rec.begin(), rec.end(), CmpFirst);
     unsigned nhit = 0;
     for (size_t j = 0; j < rec.size() && j < this->topn_; ++j) {
@@ -517,7 +521,7 @@ struct EvalPrecision : public EvalRankList{
   }
 };

-/*! \brief NDCG */
+/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
 struct EvalNDCG : public EvalRankList{
  public:
   explicit EvalNDCG(const char *name) : EvalRankList(name) {}
@@ -549,7 +553,7 @@ struct EvalNDCG : public EvalRankList{
   }
 };

-/*! \brief Precison at N, for both classification and rank */
+/*! \brief Mean Average Precision at N, for both classification and rank */
 struct EvalMAP : public EvalRankList {
  public:
   explicit EvalMAP(const char *name) : EvalRankList(name) {}
@@ -45,7 +45,7 @@ inline static int FindMaxIndex(const std::vector<float>& rec) {
   return FindMaxIndex(BeginPtr(rec), rec.size());
 }

-// perform numerical safe logsum
+// perform numerically safe logsum
 inline float LogSum(float x, float y) {
   if (x < y) {
     return y + std::log(std::exp(x - y) + 1.0f);
@@ -53,7 +53,7 @@ inline float LogSum(float x, float y) {
     return x + std::log(std::exp(y - x) + 1.0f);
   }
 }
-// numerical safe logsum
+// numerically safe logsum
 inline float LogSum(const float *rec, size_t size) {
   float mx = rec[0];
   for (size_t i = 1; i < size; ++i) {
@@ -66,11 +66,11 @@ inline float LogSum(const float *rec, size_t size) {
   return mx + std::log(sum);
 }

 // comparator functions for sorting pairs in descending order
 inline static bool CmpFirst(const std::pair<float, unsigned> &a,
                             const std::pair<float, unsigned> &b) {
   return a.first > b.first;
 }
 inline static bool CmpSecond(const std::pair<float, unsigned> &a,
                              const std::pair<float, unsigned> &b) {
   return a.second > b.second;
@@ -22,8 +22,8 @@ namespace xgboost {
 /*! \brief namespace for learning algorithm */
 namespace learner {
 /*!
- * \brief learner that takes do gradient boosting on specific objective functions
- *        and do training and prediction
+ * \brief learner that performs gradient boosting for a specific objective function.
+ *        It does training and prediction.
  */
 class BoostLearner : public rabit::Serializable {
  public:
@@ -258,7 +258,7 @@ class BoostLearner : public rabit::Serializable {
   }
   /*!
    * \brief check if data matrix is ready to be used by training,
-   *        if not intialize it
+   *        if not initialize it
    * \param p_train pointer to the matrix used by training
    */
   inline void CheckInit(DMatrix *p_train) {
@@ -283,7 +283,7 @@ class BoostLearner : public rabit::Serializable {
   /*!
    * \brief update the model for one iteration
    * \param iter current iteration number
-   * \param p_train pointer to the data matrix
+   * \param train reference to the data matrix
    */
   inline void UpdateOneIter(int iter, const DMatrix &train) {
     if (seed_per_iteration != 0 || rabit::IsDistributed()) {
@@ -342,6 +342,7 @@ class BoostLearner : public rabit::Serializable {
    * \param out_preds output vector that stores the prediction
    * \param ntree_limit limit number of trees used for boosted tree
    *        predictor, when it equals 0, this means we are using all the trees
+   * \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor
    */
   inline void Predict(const DMatrix &data,
                       bool output_margin,
@@ -358,7 +359,7 @@ class BoostLearner : public rabit::Serializable {
     }
   }
   /*!
-   * \brief online prediction funciton, predict score for one instance at a time
+   * \brief online prediction function, predict score for one instance at a time
    *  NOTE: use the batch prediction interface if possible, batch prediction is usually
    *        more efficient than online prediction
    *  This function is NOT threadsafe, make sure you only call from one thread
@@ -367,7 +368,6 @@ class BoostLearner : public rabit::Serializable {
    * \param output_margin whether to only predict margin value instead of transformed prediction
    * \param out_preds output vector to hold the predictions
    * \param ntree_limit limit the number of trees used in prediction
-   * \param root_index the root index
    * \sa Predict
    */
   inline void Predict(const SparseBatch::Inst &inst,
@@ -452,7 +452,7 @@ class BoostLearner : public rabit::Serializable {
     float base_score;
     /* \brief number of features */
     unsigned num_feature;
-    /* \brief number of class, if it is multi-class classification */
+    /* \brief number of classes, if it is multi-class classification */
     int num_class;
     /*! \brief whether the model itself is saved with pbuffer */
     int saved_with_pbuffer;
@@ -495,7 +495,7 @@ class BoostLearner : public rabit::Serializable {
   int updater_mode;
   // cached size of predict buffer
   size_t pred_buffer_size;
-  // maximum buffred row value
+  // maximum buffered row value
   float prob_buffer_row;
   // evaluation set
   EvalSet evaluator_;
@@ -505,13 +505,13 @@ class BoostLearner : public rabit::Serializable {
   gbm::IGradBooster *gbm_;
   // name of gbm model used for training
   std::string name_gbm_;
-  // objective fnction
+  // objective function
   IObjFunction *obj_;
   // name of objective function
   std::string name_obj_;
   // configurations
   std::vector< std::pair<std::string, std::string> > cfg_;
-  // temporal storages for prediciton
+  // temporal storages for prediction
   std::vector<float> preds_;
   // gradient pairs
   std::vector<bst_gpair> gpair_;
@@ -527,7 +527,7 @@ class BoostLearner : public rabit::Serializable {
     CacheEntry(const DMatrix *mat, size_t buffer_offset, size_t num_row)
         :mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
   };
-  // find internal bufer offset for certain matrix, if not exist, return -1
+  // find internal buffer offset for certain matrix, if not exist, return -1
   inline int64_t FindBufferOffset(const DMatrix &mat) const {
     for (size_t i = 0; i < cache_.size(); ++i) {
       if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
@@ -84,7 +84,7 @@ struct LossType {
    * \return second order gradient
    */
   inline float SecondOrderGradient(float predt, float label) const {
-    // cap second order gradient to postive value
+    // cap second order gradient to positive value
     const float eps = 1e-16f;
     switch (loss_type) {
       case kLinearSquare: return 1.0f;
@@ -68,7 +68,7 @@ class IObjFunction{
 // factory function
 namespace xgboost {
 namespace learner {
-/*! \brief factory funciton to create objective function by name */
+/*! \brief factory function to create objective function by name */
 inline IObjFunction* CreateObjFunction(const char *name) {
   using namespace std;
   if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare);
@@ -321,9 +321,9 @@ class TreeModel {
    */
   inline void SaveModel(utils::IStream &fo) const { // NOLINT(*)
     utils::Assert(param.num_nodes == static_cast<int>(nodes.size()),
-                  "Tree::SaveModel");
+                  "TreeModel::SaveModel");
     utils::Assert(param.num_nodes == static_cast<int>(stats.size()),
-                  "Tree::SaveModel");
+                  "TreeModel::SaveModel");
     fo.Write(&param, sizeof(Param));
     utils::Assert(param.num_nodes != 0, "invalid model");
     fo.Write(BeginPtr(nodes), sizeof(Node) * nodes.size());
@@ -462,7 +462,7 @@ class TreeModel {

 /*! \brief node statistics used in regression tree */
 struct RTreeNodeStat {
-  /*! \brief loss chg caused by current split */
+  /*! \brief loss change caused by current split */
   float loss_chg;
   /*! \brief sum of hessian values, used to measure coverage of data */
   float sum_hess;
@@ -485,7 +485,7 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
  public:
   /*!
    * \brief dense feature vector that can be taken by RegTree
-   *        to do tranverse efficiently
+   *        to do traverse efficiently
    *        and can be construct from sparse feature vector
    */
   struct FVec {
@@ -498,7 +498,7 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
       int flag;
     };
     std::vector<Entry> data;
-    /*! \brief intialize the vector with size vector */
+    /*! \brief initialize the vector with size vector */
     inline void Init(size_t size) {
       Entry e; e.flag = -1;
       data.resize(size);
@@ -529,14 +529,14 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
   };
   /*!
    * \brief get the leaf index
-   * \param feats dense feature vector, if the feature is missing the field is set to NaN
-   * \param root_gid starting root index of the instance
+   * \param feat dense feature vector, if the feature is missing the field is set to NaN
+   * \param root_id starting root index of the instance
    * \return the leaf index of the given feature
    */
-  inline int GetLeafIndex(const FVec&feat, unsigned root_id = 0) const {
+  inline int GetLeafIndex(const FVec &feat, unsigned root_id = 0) const {
     // start from groups that belongs to current data
     int pid = static_cast<int>(root_id);
-    // tranverse tree
+    // traverse tree
     while (!(*this)[ pid ].is_leaf()) {
       unsigned split_index = (*this)[pid].split_index();
       pid = this->GetNext(pid, feat.fvalue(split_index), feat.is_missing(split_index));
@@ -546,7 +546,7 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
   /*!
    * \brief get the prediction of regression tree, only accepts dense feature vector
    * \param feats dense feature vector, if the feature is missing the field is set to NaN
-   * \param root_gid starting root index of the instance
+   * \param root_id starting root index of the instance
    * \return the leaf index of the given feature
    */
   inline float Predict(const FVec &feat, unsigned root_id = 0) const {
@@ -32,7 +32,7 @@ struct TrainParam{
   // default direction choice
   int default_direction;
   // maximum delta update we can add in weight estimation
-  // this parameter can be used to stablize update
+  // this parameter can be used to stabilize update
   // default=0 means no constraint on weight delta
   float max_delta_step;
   // whether we want to do subsample
@@ -51,7 +51,7 @@ struct TrainParam{
   int size_leaf_vector;
   // option for parallelization
   int parallel_option;
-  // option to open cacheline optimizaton
+  // option to open cacheline optimization
   int cache_opt;
   // number of threads to be used for tree construction,
   // if OpenMP is enabled, if equals 0, use system default
@@ -132,7 +132,7 @@ struct TrainParam{
       }
     }
   }
-  // calculate cost of loss function with four stati
+  // calculate cost of loss function with four statistics
   inline double CalcGain(double sum_grad, double sum_hess,
                          double test_grad, double test_hess) const {
     double w = CalcWeight(sum_grad, sum_hess);
@@ -167,7 +167,7 @@ struct TrainParam{
   inline bool need_backward_search(float col_density, bool indicator) const {
     return this->default_direction != 2;
   }
-  /*! \brief given the loss change, whether we need to invode prunning */
+  /*! \brief given the loss change, whether we need to invoke pruning */
   inline bool need_prune(double loss_chg, int depth) const {
     return loss_chg < this->min_split_loss;
   }
@@ -235,7 +235,7 @@ struct GradStats {
     const bst_gpair &b = gpair[ridx];
     this->Add(b.grad, b.hess);
   }
-  /*! \brief caculate leaf weight */
+  /*! \brief calculate leaf weight */
   inline double CalcWeight(const TrainParam &param) const {
     return param.CalcWeight(sum_grad, sum_hess);
   }
@@ -362,10 +362,10 @@ struct SplitEntry{
   /*! \brief constructor */
   SplitEntry(void) : loss_chg(0.0f), sindex(0), split_value(0.0f) {}
   /*!
-   * \brief decides whether a we can replace current entry with the statistics given
-   *        This function gives better priority to lower index when loss_chg equals
-   *        not the best way, but helps to give consistent result during multi-thread execution
-   * \param loss_chg the loss reduction get through the split
+   * \brief decides whether we can replace current entry with the given statistics
+   *        This function gives better priority to lower index when loss_chg == new_loss_chg.
+   *        Not the best way, but helps to give consistent result during multi-thread execution.
+   * \param new_loss_chg the loss reduction get through the split
    * \param split_index the feature index where the split is on
    */
   inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
@@ -392,9 +392,9 @@ struct SplitEntry{
   }
   /*!
    * \brief update the split entry, replace it if e is better
-   * \param loss_chg loss reduction of new candidate
+   * \param new_loss_chg loss reduction of new candidate
    * \param split_index feature index to split on
-   * \param split_value the split point
+   * \param new_split_value the split point
    * \param default_left whether the missing value goes to left
    * \return whether the proposed split is better and can replace current split
    */
@@ -26,11 +26,11 @@ class IUpdater {
    */
   virtual void SetParam(const char *name, const char *val) = 0;
   /*!
-   * \brief peform update to the tree models
+   * \brief perform update to the tree models
    * \param gpair the gradient pair statistics of the data
    * \param p_fmat feature matrix that provide access to features
    * \param info extra side information that may be need, such as root index
-   * \param trees pointer to the trees to be updated, upater will change the content of the tree
+   * \param trees references the trees to be updated, updater will change the content of trees
    *   note: all the trees in the vector are updated, with the same statistics,
    *         but maybe different random seeds, usually one tree is passed in at a time,
    *         there can be multiple trees when we train random forest style model
@@ -53,7 +53,7 @@ class IUpdater {
   virtual ~IUpdater(void) {}
 };
 /*!
- * \brief create a updater based on name
+ * \brief create an updater based on name
  * \param name name of updater
  * \return return the updater instance
  */
@@ -17,7 +17,7 @@

 namespace xgboost {
 namespace tree {
-/*! \brief colunwise update to construct a tree */
+/*! \brief column-wise update to construct a tree */
 template<typename TStats>
 class ColMaker: public IUpdater {
  public:
@@ -14,7 +14,7 @@

 namespace xgboost {
 namespace tree {
-/*! \brief pruner that prunes a tree after growing finishs */
+/*! \brief pruner that prunes a tree after growing finishes */
 class TreePruner: public IUpdater {
  public:
   virtual ~TreePruner(void) {}
@@ -56,7 +56,7 @@ class TreePruner: public IUpdater {
       return npruned;
     }
   }
-  /*! \brief do prunning of a tree */
+  /*! \brief do pruning of a tree */
   inline void DoPrune(RegTree &tree) { // NOLINT(*)
     int npruned = 0;
     // initialize auxiliary statistics
@@ -69,7 +69,7 @@ class TreePruner: public IUpdater {
       }
     }
     if (silent == 0) {
-      utils::Printf("tree prunning end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
+      utils::Printf("tree pruning end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
                     tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth());
     }
   }
@@ -91,7 +91,7 @@ class Base64InStream: public IStream {
    * call this function before actually start read
    */
   inline void InitPosition(void) {
-    // get a charater
+    // get a character
     do {
       tmp_ch = reader_.GetChar();
     } while (isspace(tmp_ch));
@@ -223,7 +223,7 @@ class Base64OutStream: public IStream {
   }
   /*!
    * \brief finish writing of all current base64 stream, do some post processing
-   * \param endch charater to put to end of stream, if it is EOF, then nothing will be done
+   * \param endch character to put to end of stream, if it is EOF, then nothing will be done
    */
   inline void Finish(char endch = EOF) {
     using base64::EncodeTable;
@@ -58,7 +58,7 @@ class FeatMap {
   }
   /*! \brief return type of specific feature */
   const Type& type(size_t idx) const {
-    utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
+    utils::Assert(idx < names_.size(), "utils::FMap::type feature index exceed bound");
     return types_[idx];
   }

@@ -23,7 +23,7 @@ class IIterator {
    * \param val value of parameter
    */
   virtual void SetParam(const char *name, const char *val) {}
-  /*! \brief initalize the iterator so that we can use the iterator */
+  /*! \brief initialize the iterator so that we can use the iterator */
   virtual void Init(void) {}
   /*! \brief set before first of the item */
   virtual void BeforeFirst(void) = 0;
@@ -214,7 +214,7 @@ struct WQSummary {
   /*!
    * \brief set current summary to be merged summary of sa and sb
    * \param sa first input summary to be merged
-   * \param sb second input summar to be merged
+   * \param sb second input summary to be merged
    */
   inline void SetCombine(const WQSummary &sa,
                          const WQSummary &sb) {
@@ -329,7 +329,7 @@ struct WQSummary {
   }
 };

-/*! \brief try to do efficient prunning */
+/*! \brief try to do efficient pruning */
 template<typename DType, typename RType>
 struct WXQSummary : public WQSummary<DType, RType> {
   // redefine entry type
@@ -364,7 +364,7 @@ struct WXQSummary : public WQSummary<DType, RType> {
     RType mrange = 0;
     {
       // first scan, grab all the big chunk
-      // moviing block index
+      // moving block index
       size_t bid = 0;
       for (size_t i = 1; i < src.size; ++i) {
         if (CheckLarge(src.data[i], chunk)) {
@@ -574,7 +574,7 @@ struct GKSummary {
 };

 /*!
- * \brief template for all quantle sketch algorithm
+ * \brief template for all quantile sketch algorithm
  *        that uses merge/prune scheme
  * \tparam DType type of data content
  * \tparam RType type of rank
@@ -605,7 +605,7 @@ class QuantileSketchTemplate {
   }
   /*!
    * \brief set the space to be merge of all Summary arrays
-   * \param begin begining position in th summary array
+   * \param begin beginning position in the summary array
    * \param end ending position in the Summary array
    */
   inline void SetMerge(const Summary *begin,
@@ -664,7 +664,7 @@ class QuantileSketchTemplate {
   }
 };
 /*!
- * \brief intialize the quantile sketch, given the performance specification
+ * \brief initialize the quantile sketch, given the performance specification
  * \param maxn maximum number of data points can be feed into sketch
  * \param eps accuracy level of summary
  */
@@ -688,7 +688,7 @@ class QuantileSketchTemplate {
   }
   /*!
    * \brief add an element to a sketch
-   * \param x the elemented added to the sketch
+   * \param x the element added to the sketch
    */
   inline void Push(DType x, RType w = 1) {
     if (w == static_cast<RType>(0)) return;
@@ -27,7 +27,7 @@ inline void Seed(unsigned seed) {
 inline double Uniform(void) {
   return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX)+1.0); // NOLINT(*)
 }
-/*! \brief return a real numer uniform in (0,1) */
+/*! \brief return a real number uniform in (0,1) */
 inline double NextDouble2(void) {
   return (static_cast<double>(rand()) + 1.0) / (static_cast<double>(RAND_MAX)+2.0); // NOLINT(*)
 }
@@ -21,8 +21,8 @@ namespace utils {
 #if !defined(XGBOOST_STRICT_CXX98_)
 /*!
  * \brief buffered loading iterator that uses multithread
- * this template method will assume the following paramters
- * \tparam Elem elememt type to be buffered
+ * this template method will assume the following parameters
+ * \tparam Elem element type to be buffered
  * \tparam ElemFactory factory type to implement in order to use thread buffer
  */
 template<typename Elem, typename ElemFactory>
@@ -45,7 +45,7 @@ class ThreadBuffer {
   /*!
    * \brief initalize the buffered iterator
    * \param param a initialize parameter that will pass to factory, ignore it if not necessary
-   * \return false if the initlization can't be done, e.g. buffer file hasn't been created
+   * \return false if the initialization can't be done, e.g. buffer file hasn't been created
    */
   inline bool Init(void) {
     if (!factory.Init()) return false;
@@ -61,7 +61,7 @@ class ThreadBuffer {
   inline void BeforeFirst(void) {
     // wait till last loader end
     loading_end.Wait();
-    // critcal zone
+    // critical zone
     current_buf = 1;
     factory.BeforeFirst();
     // reset terminate limit
@@ -62,7 +62,7 @@ const int kPrintBuffer = 1 << 12;

 #ifndef XGBOOST_CUSTOMIZE_MSG_
 /*!
- * \brief handling of Assert error, caused by in-apropriate input
+ * \brief handling of Assert error, caused by inappropriate input
  * \param msg error message
  */
 inline void HandleAssertError(const char *msg) {
@@ -70,7 +70,7 @@ inline void HandleAssertError(const char *msg) {
   exit(-1);
 }
 /*!
- * \brief handling of Check error, caused by in-apropriate input
+ * \brief handling of Check error, caused by inappropriate input
  * \param msg error message
  */
 inline void HandleCheckError(const char *msg) {
@@ -157,7 +157,7 @@ inline std::FILE *FopenCheck(const char *fname, const char *flag) {
   return fp;
 }
 }  // namespace utils
-// easy utils that can be directly acessed in xgboost
+// easy utils that can be directly accessed in xgboost
 /*! \brief get the beginning address of a vector */
 template<typename T>
 inline T *BeginPtr(std::vector<T> &vec) { // NOLINT(*)