start unity refactor
This commit is contained in:
80
utils/fmap.h
Normal file
80
utils/fmap.h
Normal file
@@ -0,0 +1,80 @@
|
||||
#ifndef XGBOOST_UTILS_FMAP_H_
|
||||
#define XGBOOST_UTILS_FMAP_H_
|
||||
/*!
|
||||
* \file fmap.h
|
||||
* \brief helper class that holds the feature names and interpretations
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include "./utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace utils {
|
||||
/*! \brief helper class that holds the feature names and interpretations */
|
||||
class FeatMap {
|
||||
public:
|
||||
enum Type {
|
||||
kIndicator = 0,
|
||||
kQuantitive = 1,
|
||||
kInteger = 2,
|
||||
kFloat = 3
|
||||
};
|
||||
// function definitions
|
||||
/*! \brief load feature map from text format */
|
||||
inline void LoadText(const char *fname) {
|
||||
FILE *fi = utils::FopenCheck(fname, "r");
|
||||
this->LoadText(fi);
|
||||
fclose(fi);
|
||||
}
|
||||
/*! \brief load feature map from text format */
|
||||
inline void LoadText(FILE *fi) {
|
||||
int fid;
|
||||
char fname[1256], ftype[1256];
|
||||
while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) {
|
||||
this->PushBack(fid, fname, ftype);
|
||||
}
|
||||
}
|
||||
/*!\brief push back feature map */
|
||||
inline void PushBack(int fid, const char *fname, const char *ftype) {
|
||||
utils::Check(fid == static_cast<int>(names_.size()), "invalid fmap format");
|
||||
names_.push_back(std::string(fname));
|
||||
types_.push_back(GetType(ftype));
|
||||
}
|
||||
inline void Clear(void) {
|
||||
names_.clear(); types_.clear();
|
||||
}
|
||||
/*! \brief number of known features */
|
||||
size_t size(void) const {
|
||||
return names_.size();
|
||||
}
|
||||
/*! \brief return name of specific feature */
|
||||
const char* name(size_t idx) const {
|
||||
utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
|
||||
return names_[idx].c_str();
|
||||
}
|
||||
/*! \brief return type of specific feature */
|
||||
const Type& type(size_t idx) const {
|
||||
utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
|
||||
return types_[idx];
|
||||
}
|
||||
|
||||
private:
|
||||
inline static Type GetType(const char *tname) {
|
||||
if (!strcmp("i", tname)) return kIndicator;
|
||||
if (!strcmp("q", tname)) return kQuantitive;
|
||||
if (!strcmp("int", tname)) return kInteger;
|
||||
if (!strcmp("float", tname)) return kFloat;
|
||||
utils::Error("unknown feature type, use i for indicator and q for quantity");
|
||||
return kIndicator;
|
||||
}
|
||||
/*! \brief name of the feature */
|
||||
std::vector<std::string> names_;
|
||||
/*! \brief type of the feature */
|
||||
std::vector<Type> types_;
|
||||
};
|
||||
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
#endif  // XGBOOST_UTILS_FMAP_H_
|
||||
104
utils/io.h
Normal file
104
utils/io.h
Normal file
@@ -0,0 +1,104 @@
|
||||
#ifndef XGBOOST_UTILS_IO_H
|
||||
#define XGBOOST_UTILS_IO_H
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "./utils.h"
|
||||
/*!
|
||||
* \file io.h
|
||||
* \brief general stream interface for serialization, I/O
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
namespace xgboost {
|
||||
namespace utils {
|
||||
/*!
 * \brief interface of stream I/O, used to serialize model
 *
 *  Implementations provide the two raw byte primitives; the helper overloads
 *  add a length-prefixed binary format for vectors and strings on top of them.
 */
class IStream {
 public:
  /*!
   * \brief read data from stream
   * \param ptr pointer to memory buffer
   * \param size size of block
   * \return usually the size of data read; the helpers below treat 0 as failure
   */
  virtual size_t Read(void *ptr, size_t size) = 0;
  /*!
   * \brief write data to stream
   * \param ptr pointer to memory buffer
   * \param size size of block
   */
  virtual void Write(const void *ptr, size_t size) = 0;
  /*! \brief virtual destructor */
  virtual ~IStream(void) {}

 public:
  // helper functions to write various of data structures
  /*!
   * \brief binary serialize a vector: a 64-bit element count followed by the raw elements
   * \param vec vector to be serialized
   */
  template<typename T>
  inline void Write(const std::vector<T> &vec) {
    uint64_t sz = static_cast<uint64_t>(vec.size());
    this->Write(&sz, sizeof(sz));
    // an empty vector has no element 0 to take the address of
    if (sz != 0) {
      this->Write(&vec[0], sizeof(T) * vec.size());
    }
  }
  /*!
   * \brief binary load a vector written by the matching Write
   * \param out_vec vector to be loaded
   * \return whether load is successful
   */
  template<typename T>
  inline bool Read(std::vector<T> *out_vec) {
    uint64_t sz;
    if (this->Read(&sz, sizeof(sz)) == 0) return false;
    out_vec->resize(sz);
    // a zero-length payload is a valid record: skip the raw read, which
    // would otherwise report 0 and be mistaken for a failure
    if (sz != 0) {
      if (this->Read(&(*out_vec)[0], sizeof(T) * sz) == 0) return false;
    }
    return true;
  }
  /*!
   * \brief binary serialize a string: a 64-bit length followed by the characters
   * \param str the string to be serialized
   */
  inline void Write(const std::string &str) {
    uint64_t sz = static_cast<uint64_t>(str.length());
    this->Write(&sz, sizeof(sz));
    if (sz != 0) {
      this->Write(&str[0], sizeof(char) * str.length());
    }
  }
  /*!
   * \brief binary load a string written by the matching Write
   * \param out_str string to be loaded
   * \return whether load is successful
   */
  inline bool Read(std::string *out_str) {
    uint64_t sz;
    if (this->Read(&sz, sizeof(sz)) == 0) return false;
    out_str->resize(sz);
    // same reasoning as the vector loader: empty strings are valid records
    if (sz != 0) {
      if (this->Read(&(*out_str)[0], sizeof(char) * sz) == 0) return false;
    }
    return true;
  }
};
|
||||
|
||||
/*! \brief implementation of file i/o stream */
|
||||
class FileStream : public IStream {
|
||||
private:
|
||||
FILE *fp;
|
||||
public:
|
||||
explicit FileStream(FILE *fp) {
|
||||
this->fp = fp;
|
||||
}
|
||||
virtual size_t Read(void *ptr, size_t size) {
|
||||
return fread(ptr, size, 1, fp);
|
||||
}
|
||||
virtual void Write(const void *ptr, size_t size) {
|
||||
fwrite(ptr, size, 1, fp);
|
||||
}
|
||||
inline void Close(void) {
|
||||
fclose(fp);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
#endif
|
||||
40
utils/iterator.h
Normal file
40
utils/iterator.h
Normal file
@@ -0,0 +1,40 @@
|
||||
#ifndef XGBOOST_UTILS_ITERATOR_H
|
||||
#define XGBOOST_UTILS_ITERATOR_H
|
||||
#include <cstdio>
|
||||
/*!
|
||||
* \file iterator.h
|
||||
 * \brief iterator interface
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
namespace xgboost {
|
||||
namespace utils {
|
||||
/*!
 * \brief abstract iterator interface; every operation is pure virtual
 * \tparam DType type of the item returned by Value()
 */
template<typename DType>
class IIterator {
 public:
  /*!
   * \brief pass a configuration entry to the iterator
   * \param name name of parameter
   * \param val value of parameter
   */
  virtual void SetParam(const char *name, const char *val) = 0;
  /*! \brief initialize the iterator so that it can be used */
  virtual void Init() = 0;
  /*! \brief reposition before the first item */
  virtual void BeforeFirst() = 0;
  /*! \brief advance to the next item */
  virtual bool Next() = 0;
  /*! \brief access the current item */
  virtual const DType &Value() const = 0;
  /*! \brief virtual destructor, so implementations can be deleted through the interface */
  virtual ~IIterator() {}
};
|
||||
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
#endif
|
||||
|
||||
123
utils/matrix_csr.h
Normal file
123
utils/matrix_csr.h
Normal file
@@ -0,0 +1,123 @@
|
||||
#ifndef XGBOOST_UTILS_MATRIX_CSR_H_
|
||||
#define XGBOOST_UTILS_MATRIX_CSR_H_
|
||||
/*!
|
||||
* \file matrix_csr.h
|
||||
* \brief this file defines some easy to use STL based class for in memory sparse CSR matrix
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "./utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace utils {
|
||||
/*!
|
||||
* \brief a class used to help construct CSR format matrix,
|
||||
* can be used to convert row major CSR to column major CSR
|
||||
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t
|
||||
* \tparam whether enabling the usage of aclist, this option must be enabled manually
|
||||
*/
|
||||
template<typename IndexType, bool UseAcList = false>
|
||||
struct SparseCSRMBuilder {
|
||||
private:
|
||||
/*! \brief dummy variable used in the indicator matrix construction */
|
||||
std::vector<size_t> dummy_aclist;
|
||||
/*! \brief pointer to each of the row */
|
||||
std::vector<size_t> &rptr;
|
||||
/*! \brief index of nonzero entries in each row */
|
||||
std::vector<IndexType> &findex;
|
||||
/*! \brief a list of active rows, used when many rows are empty */
|
||||
std::vector<size_t> &aclist;
|
||||
|
||||
public:
|
||||
SparseCSRMBuilder(std::vector<size_t> &p_rptr,
|
||||
std::vector<IndexType> &p_findex)
|
||||
:rptr(p_rptr), findex(p_findex), aclist(dummy_aclist) {
|
||||
Assert(!UseAcList, "enabling bug");
|
||||
}
|
||||
/*! \brief use with caution! rptr must be cleaned before use */
|
||||
SparseCSRMBuilder(std::vector<size_t> &p_rptr,
|
||||
std::vector<IndexType> &p_findex,
|
||||
std::vector<size_t> &p_aclist)
|
||||
:rptr(p_rptr), findex(p_findex), aclist(p_aclist) {
|
||||
Assert(UseAcList, "must manually enable the option use aclist");
|
||||
}
|
||||
|
||||
public:
|
||||
/*!
|
||||
* \brief step 1: initialize the number of rows in the data, not necessary exact
|
||||
* \nrows number of rows in the matrix, can be smaller than expected
|
||||
*/
|
||||
inline void InitBudget(size_t nrows = 0) {
|
||||
if (!UseAcList) {
|
||||
rptr.clear();
|
||||
rptr.resize(nrows + 1, 0);
|
||||
} else {
|
||||
Assert(nrows + 1 == rptr.size(), "rptr must be initialized already");
|
||||
this->Cleanup();
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief step 2: add budget to each rows, this function is called when aclist is used
|
||||
* \param row_id the id of the row
|
||||
* \param nelem number of element budget add to this row
|
||||
*/
|
||||
inline void AddBudget(size_t row_id, size_t nelem = 1) {
|
||||
if (rptr.size() < row_id + 2) {
|
||||
rptr.resize(row_id + 2, 0);
|
||||
}
|
||||
if (UseAcList) {
|
||||
if (rptr[row_id + 1] == 0) aclist.push_back(row_id);
|
||||
}
|
||||
rptr[row_id + 1] += nelem;
|
||||
}
|
||||
/*! \brief step 3: initialize the necessary storage */
|
||||
inline void InitStorage(void) {
|
||||
// initialize rptr to be beginning of each segment
|
||||
size_t start = 0;
|
||||
if (!UseAcList) {
|
||||
for (size_t i = 1; i < rptr.size(); i++) {
|
||||
size_t rlen = rptr[i];
|
||||
rptr[i] = start;
|
||||
start += rlen;
|
||||
}
|
||||
} else {
|
||||
// case with active list
|
||||
std::sort(aclist.begin(), aclist.end());
|
||||
for (size_t i = 0; i < aclist.size(); i++) {
|
||||
size_t ridx = aclist[i];
|
||||
size_t rlen = rptr[ridx + 1];
|
||||
rptr[ridx + 1] = start;
|
||||
// set previous rptr to right position if previous feature is not active
|
||||
if (i == 0 || ridx != aclist[i - 1] + 1) rptr[ridx] = start;
|
||||
start += rlen;
|
||||
}
|
||||
}
|
||||
findex.resize(start);
|
||||
}
|
||||
/*!
|
||||
* \brief step 4:
|
||||
* used in indicator matrix construction, add new
|
||||
* element to each row, the number of calls shall be exactly same as add_budget
|
||||
*/
|
||||
inline void PushElem(size_t row_id, IndexType col_id) {
|
||||
size_t &rp = rptr[row_id + 1];
|
||||
findex[rp++] = col_id;
|
||||
}
|
||||
/*!
|
||||
* \brief step 5: only needed when aclist is used
|
||||
* clean up the rptr for next usage
|
||||
*/
|
||||
inline void Cleanup(void) {
|
||||
Assert(UseAcList, "this function can only be called use AcList");
|
||||
for (size_t i = 0; i < aclist.size(); i++) {
|
||||
const size_t ridx = aclist[i];
|
||||
rptr[ridx] = 0; rptr[ridx + 1] = 0;
|
||||
}
|
||||
aclist.clear();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
#endif
|
||||
@@ -1,12 +1,10 @@
|
||||
#ifndef XGBOOST_OMP_H
|
||||
#define XGBOOST_OMP_H
|
||||
#ifndef XGBOOST_UTILS_OMP_H_
|
||||
#define XGBOOST_UTILS_OMP_H_
|
||||
/*!
|
||||
* \file xgboost_omp.h
|
||||
* \file omp.h
|
||||
* \brief header to handle OpenMP compatibility issues
|
||||
*
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#include <omp.h>
|
||||
#else
|
||||
@@ -15,4 +13,4 @@ inline int omp_get_thread_num() { return 0; }
|
||||
/*! \brief stub: always reports a single thread */
inline int omp_get_num_threads() { return 1; }
|
||||
/*! \brief stub: accepts a thread count and does nothing with it */
inline void omp_set_num_threads(int nthread) {}
|
||||
#endif
|
||||
#endif
|
||||
#endif // XGBOOST_UTILS_OMP_H_
|
||||
102
utils/random.h
Normal file
102
utils/random.h
Normal file
@@ -0,0 +1,102 @@
|
||||
#ifndef XGBOOST_UTILS_RANDOM_H_
|
||||
#define XGBOOST_UTILS_RANDOM_H_
|
||||
/*!
|
||||
 * \file random.h
|
||||
* \brief PRNG to support random number generation
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*
|
||||
* Use standard PRNG from stdlib
|
||||
*/
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "./utils.h"
|
||||
|
||||
/*! namespace of PRNG */
|
||||
namespace xgboost {
|
||||
namespace random {
|
||||
|
||||
/*!
 * \brief seed the PRNG
 * \param seed value handed to srand
 */
inline void Seed(uint32_t seed) {
  const unsigned int value = seed;
  srand(value);
}
|
||||
/*! \brief return a real number uniform in [0,1), drawn from rand() */
inline double NextDouble(void) {
  const double numer = static_cast<double>(rand());
  // RAND_MAX + 1 keeps the quotient strictly below one
  const double denom = static_cast<double>(RAND_MAX) + 1.0;
  return numer / denom;
}
|
||||
/*! \brief return a real number uniform in (0,1), drawn from rand() */
inline double NextDouble2(void) {
  // shifting by one and widening the range by two excludes both end points
  const double numer = static_cast<double>(rand()) + 1.0;
  const double denom = static_cast<double>(RAND_MAX) + 2.0;
  return numer / denom;
}
|
||||
|
||||
/*! \brief return the next value of rand() as an unsigned 32-bit number */
inline uint32_t NextUInt32(void) {
  const int draw = rand();
  return static_cast<uint32_t>(draw);
}
|
||||
/*! \brief return a random number in n */
|
||||
inline uint32_t NextUInt32(uint32_t n) {
|
||||
return (uint32_t)floor(NextDouble() * n);
|
||||
}
|
||||
/*! \brief return x~N(0,1) */
|
||||
inline double SampleNormal() {
|
||||
double x, y, s;
|
||||
do {
|
||||
x = 2 * NextDouble2() - 1.0;
|
||||
y = 2 * NextDouble2() - 1.0;
|
||||
s = x*x + y*y;
|
||||
} while (s >= 1.0 || s == 0.0);
|
||||
|
||||
return x * sqrt(-2.0 * log(s) / s);
|
||||
}
|
||||
|
||||
/*! \brief return iid x,y ~N(0,1) */
|
||||
inline void SampleNormal2D(double &xx, double &yy) {
|
||||
double x, y, s;
|
||||
do {
|
||||
x = 2 * NextDouble2() - 1.0;
|
||||
y = 2 * NextDouble2() - 1.0;
|
||||
s = x*x + y*y;
|
||||
} while (s >= 1.0 || s == 0.0);
|
||||
double t = sqrt(-2.0 * log(s) / s);
|
||||
xx = x * t;
|
||||
yy = y * t;
|
||||
}
|
||||
/*! \brief return x~N(mu,sigma^2) */
|
||||
inline double SampleNormal(double mu, double sigma) {
|
||||
return SampleNormal() * sigma + mu;
|
||||
}
|
||||
/*! \brief return 1 with probability p, coin flip */
|
||||
inline int SampleBinary(double p) {
|
||||
return NextDouble() < p;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline void Shuffle(T *data, size_t sz) {
|
||||
if (sz == 0) return;
|
||||
for (uint32_t i = (uint32_t)sz - 1; i > 0; i--){
|
||||
std::swap(data[i], data[NextUInt32(i + 1)]);
|
||||
}
|
||||
}
|
||||
/*!
 * \brief random shuffle the data inside, require PRNG
 * \param data vector shuffled in place; an empty vector is left untouched
 */
template<typename T>
inline void Shuffle(std::vector<T> &data) {
  // an empty vector has no element 0, so &data[0] must not be evaluated
  if (data.empty()) return;
  Shuffle(&data[0], data.size());
}
|
||||
|
||||
/*!
 * \brief random number generator with independent random number seed
 *  the state lives in rseed and is advanced through rand_r; Seed() must be
 *  called before the first draw because rseed has no initial value
 */
struct Random {
  /*! \brief set random number seed */
  inline void Seed(unsigned sd) {
    rseed = sd;
  }
  /*! \brief return a real number uniform in [0,1) */
  inline double RandDouble(void) {
    const int draw = rand_r(&rseed);
    const double denom = static_cast<double>(RAND_MAX) + 1.0;
    return static_cast<double>(draw) / denom;
  }
  // random number seed
  unsigned rseed;
};
|
||||
} // namespace random
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_UTILS_RANDOM_H_
|
||||
94
utils/utils.h
Normal file
94
utils/utils.h
Normal file
@@ -0,0 +1,94 @@
|
||||
#ifndef XGBOOST_UTILS_UTILS_H_
|
||||
#define XGBOOST_UTILS_UTILS_H_
|
||||
/*!
|
||||
* \file utils.h
|
||||
* \brief simple utils to support the code
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
// silence the MSVC warnings about the C stdio functions used in this project
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif
#ifdef _MSC_VER
// MSVC has no fopen64, map it to fopen
#define fopen64 fopen
#else
#ifdef _FILE_OFFSET_BITS
#if _FILE_OFFSET_BITS == 32
#warning "FILE OFFSET BITS defined to be 32 bit"
#endif
#endif

#ifdef __APPLE__
// no separate 64-bit variants on Apple platforms, map them to the plain names
#define off64_t off_t
#define fopen64 fopen
#endif

// force 64-bit file offsets; drop any earlier value first so the macro is
// not redefined with a different replacement
#ifdef _FILE_OFFSET_BITS
#undef _FILE_OFFSET_BITS
#endif
#define _FILE_OFFSET_BITS 64
extern "C" {
#include <sys/types.h>
}
#endif
|
||||
|
||||
// fixed-width integer names: MSVC branch declares them by hand, every other
// compiler takes them from <inttypes.h>
#ifdef _MSC_VER
typedef unsigned char uint8_t;
typedef unsigned short int uint16_t;
typedef unsigned int uint32_t;
// long is only 32 bits wide on MSVC, so the 64-bit names must use __int64
typedef unsigned __int64 uint64_t;
typedef __int64 int64_t;
#else
#include <inttypes.h>
#endif
|
||||
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdarg>
|
||||
#include <cstdlib>
|
||||
|
||||
namespace xgboost {
|
||||
/*! \brief namespace for helper utils of the project */
|
||||
namespace utils {
|
||||
|
||||
/*!
 * \brief assert a condition is true, use this to handle debug information
 *  on failure prints "AssertError:" plus the formatted message to stderr
 *  and terminates the process with exit(-1)
 * \param exp condition that must hold
 * \param fmt printf-style format of the message, followed by its arguments
 */
inline void Assert(bool exp, const char *fmt, ...) {
  if (exp) return;
  va_list args;
  va_start(args, fmt);
  fputs("AssertError:", stderr);
  vfprintf(stderr, fmt, args);
  va_end(args);
  fputc('\n', stderr);
  exit(-1);
}
|
||||
|
||||
/*!
 * \brief same as Assert, but this is intended to be used as message for user
 *  on failure prints the formatted message to stderr and terminates the
 *  process with exit(-1)
 * \param exp condition that must hold
 * \param fmt printf-style format of the message, followed by its arguments
 */
inline void Check(bool exp, const char *fmt, ...) {
  if (exp) return;
  va_list args;
  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  va_end(args);
  fputc('\n', stderr);
  exit(-1);
}
|
||||
|
||||
/*!
 * \brief report error message, same as a Check that always fails
 *  prints the formatted message to stderr and terminates the process with exit(-1)
 * \param fmt printf-style format of the message, followed by its arguments
 */
inline void Error(const char *fmt, ...) {
  va_list args;
  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  va_end(args);
  fputc('\n', stderr);
  exit(-1);
}
|
||||
|
||||
/*! \brief replace fopen, report error when the file open fails */
|
||||
inline FILE *FopenCheck(const char *fname, const char *flag) {
|
||||
FILE *fp = fopen64(fname, flag);
|
||||
Check(fp != NULL, "can not open file \"%s\"\n", fname);
|
||||
return fp;
|
||||
}
|
||||
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_UTILS_UTILS_H_
|
||||
@@ -1,123 +0,0 @@
|
||||
#ifndef XGBOOST_FMAP_H
|
||||
#define XGBOOST_FMAP_H
|
||||
/*!
|
||||
* \file xgboost_fmap.h
|
||||
* \brief helper class that holds the feature names and interpretations
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include "xgboost_utils.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace utils{
|
||||
/*! \brief helper class that holds the feature names and interpretations */
|
||||
class FeatMap{
|
||||
public:
|
||||
enum Type{
|
||||
kIndicator = 0,
|
||||
kQuantitive = 1,
|
||||
kInteger = 2,
|
||||
kFloat = 3
|
||||
};
|
||||
public:
|
||||
/*! \brief load feature map from text format */
|
||||
inline void LoadText(const char *fname){
|
||||
FILE *fi = utils::FopenCheck(fname, "r");
|
||||
this->LoadText(fi);
|
||||
fclose(fi);
|
||||
}
|
||||
/*! \brief load feature map from text format */
|
||||
inline void LoadText(FILE *fi){
|
||||
int fid;
|
||||
char fname[1256], ftype[1256];
|
||||
while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3){
|
||||
utils::Assert(fid == (int)names_.size(), "invalid fmap format");
|
||||
names_.push_back(std::string(fname));
|
||||
types_.push_back(GetType(ftype));
|
||||
}
|
||||
}
|
||||
/*! \brief number of known features */
|
||||
size_t size(void) const{
|
||||
return names_.size();
|
||||
}
|
||||
/*! \brief return name of specific feature */
|
||||
const char* name(size_t idx) const{
|
||||
utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
|
||||
return names_[idx].c_str();
|
||||
}
|
||||
/*! \brief return type of specific feature */
|
||||
const Type& type(size_t idx) const{
|
||||
utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
|
||||
return types_[idx];
|
||||
}
|
||||
private:
|
||||
inline static Type GetType(const char *tname){
|
||||
if (!strcmp("i", tname)) return kIndicator;
|
||||
if (!strcmp("q", tname)) return kQuantitive;
|
||||
if (!strcmp("int", tname)) return kInteger;
|
||||
if (!strcmp("float", tname)) return kFloat;
|
||||
utils::Error("unknown feature type, use i for indicator and q for quantity");
|
||||
return kIndicator;
|
||||
}
|
||||
private:
|
||||
/*! \brief name of the feature */
|
||||
std::vector<std::string> names_;
|
||||
/*! \brief type of the feature */
|
||||
std::vector<Type> types_;
|
||||
};
|
||||
}; // namespace utils
|
||||
|
||||
namespace utils{
|
||||
/*! \brief feature constraint, allow or disallow some feature during training */
|
||||
class FeatConstrain{
|
||||
public:
|
||||
FeatConstrain(void){
|
||||
default_state_ = +1;
|
||||
}
|
||||
/*!\brief set parameters */
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
int a, b;
|
||||
if (!strcmp(name, "fban")){
|
||||
this->ParseRange(val, a, b);
|
||||
this->SetRange(a, b, -1);
|
||||
}
|
||||
if (!strcmp(name, "fpass")){
|
||||
this->ParseRange(val, a, b);
|
||||
this->SetRange(a, b, +1);
|
||||
}
|
||||
if (!strcmp(name, "fdefault")){
|
||||
default_state_ = atoi(val);
|
||||
}
|
||||
}
|
||||
/*! \brief whether constrain is specified */
|
||||
inline bool HasConstrain(void) const {
|
||||
return state_.size() != 0 && default_state_ == 1;
|
||||
}
|
||||
/*! \brief whether a feature index is banned or not */
|
||||
inline bool NotBanned(unsigned index) const{
|
||||
int rt = index < state_.size() ? state_[index] : default_state_;
|
||||
if (rt == 0) rt = default_state_;
|
||||
return rt == 1;
|
||||
}
|
||||
private:
|
||||
inline void SetRange(int a, int b, int st){
|
||||
if (b >(int)state_.size()) state_.resize(b, 0);
|
||||
for (int i = a; i < b; ++i){
|
||||
state_[i] = st;
|
||||
}
|
||||
}
|
||||
inline void ParseRange(const char *val, int &a, int &b){
|
||||
if (sscanf(val, "%d-%d", &a, &b) == 2) return;
|
||||
utils::Assert(sscanf(val, "%d", &a) == 1);
|
||||
b = a + 1;
|
||||
}
|
||||
/*! \brief default state */
|
||||
int default_state_;
|
||||
/*! \brief whether the state here is, +1:pass, -1: ban, 0:default */
|
||||
std::vector<int> state_;
|
||||
};
|
||||
}; // namespace utils
|
||||
}; // namespace xgboost
|
||||
#endif // XGBOOST_FMAP_H
|
||||
@@ -1,157 +0,0 @@
|
||||
/*!
|
||||
* \file xgboost_matrix_csr.h
|
||||
* \brief this file defines some easy to use STL based class for in memory sparse CSR matrix
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#ifndef XGBOOST_MATRIX_CSR_H
|
||||
#define XGBOOST_MATRIX_CSR_H
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "xgboost_utils.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace utils{
|
||||
/*!
|
||||
* \brief a class used to help construct CSR format matrix,
|
||||
* can be used to convert row major CSR to column major CSR
|
||||
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t
|
||||
* \tparam whether enabling the usage of aclist, this option must be enabled manually
|
||||
*/
|
||||
template<typename IndexType, bool UseAcList = false>
|
||||
struct SparseCSRMBuilder{
|
||||
private:
|
||||
/*! \brief dummy variable used in the indicator matrix construction */
|
||||
std::vector<size_t> dummy_aclist;
|
||||
/*! \brief pointer to each of the row */
|
||||
std::vector<size_t> &rptr;
|
||||
/*! \brief index of nonzero entries in each row */
|
||||
std::vector<IndexType> &findex;
|
||||
/*! \brief a list of active rows, used when many rows are empty */
|
||||
std::vector<size_t> &aclist;
|
||||
public:
|
||||
SparseCSRMBuilder(std::vector<size_t> &p_rptr,
|
||||
std::vector<IndexType> &p_findex)
|
||||
:rptr(p_rptr), findex(p_findex), aclist(dummy_aclist){
|
||||
Assert(!UseAcList, "enabling bug");
|
||||
}
|
||||
/*! \brief use with caution! rptr must be cleaned before use */
|
||||
SparseCSRMBuilder(std::vector<size_t> &p_rptr,
|
||||
std::vector<IndexType> &p_findex,
|
||||
std::vector<size_t> &p_aclist)
|
||||
:rptr(p_rptr), findex(p_findex), aclist(p_aclist){
|
||||
Assert(UseAcList, "must manually enable the option use aclist");
|
||||
}
|
||||
public:
|
||||
/*!
|
||||
* \brief step 1: initialize the number of rows in the data, not necessary exact
|
||||
* \nrows number of rows in the matrix, can be smaller than expected
|
||||
*/
|
||||
inline void InitBudget(size_t nrows = 0){
|
||||
if (!UseAcList){
|
||||
rptr.clear();
|
||||
rptr.resize(nrows + 1, 0);
|
||||
}
|
||||
else{
|
||||
Assert(nrows + 1 == rptr.size(), "rptr must be initialized already");
|
||||
this->Cleanup();
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief step 2: add budget to each rows, this function is called when aclist is used
|
||||
* \param row_id the id of the row
|
||||
* \param nelem number of element budget add to this row
|
||||
*/
|
||||
inline void AddBudget(size_t row_id, size_t nelem = 1){
|
||||
if (rptr.size() < row_id + 2){
|
||||
rptr.resize(row_id + 2, 0);
|
||||
}
|
||||
if (UseAcList){
|
||||
if (rptr[row_id + 1] == 0) aclist.push_back(row_id);
|
||||
}
|
||||
rptr[row_id + 1] += nelem;
|
||||
}
|
||||
/*! \brief step 3: initialize the necessary storage */
|
||||
inline void InitStorage(void){
|
||||
// initialize rptr to be beginning of each segment
|
||||
size_t start = 0;
|
||||
if (!UseAcList){
|
||||
for (size_t i = 1; i < rptr.size(); i++){
|
||||
size_t rlen = rptr[i];
|
||||
rptr[i] = start;
|
||||
start += rlen;
|
||||
}
|
||||
}
|
||||
else{
|
||||
// case with active list
|
||||
std::sort(aclist.begin(), aclist.end());
|
||||
|
||||
for (size_t i = 0; i < aclist.size(); i++){
|
||||
size_t ridx = aclist[i];
|
||||
size_t rlen = rptr[ridx + 1];
|
||||
rptr[ridx + 1] = start;
|
||||
// set previous rptr to right position if previous feature is not active
|
||||
if (i == 0 || ridx != aclist[i - 1] + 1) rptr[ridx] = start;
|
||||
start += rlen;
|
||||
}
|
||||
}
|
||||
findex.resize(start);
|
||||
}
|
||||
/*!
|
||||
* \brief step 4:
|
||||
* used in indicator matrix construction, add new
|
||||
* element to each row, the number of calls shall be exactly same as add_budget
|
||||
*/
|
||||
inline void PushElem(size_t row_id, IndexType col_id){
|
||||
size_t &rp = rptr[row_id + 1];
|
||||
findex[rp++] = col_id;
|
||||
}
|
||||
/*!
|
||||
* \brief step 5: only needed when aclist is used
|
||||
* clean up the rptr for next usage
|
||||
*/
|
||||
inline void Cleanup(void){
|
||||
Assert(UseAcList, "this function can only be called use AcList");
|
||||
for (size_t i = 0; i < aclist.size(); i++){
|
||||
const size_t ridx = aclist[i];
|
||||
rptr[ridx] = 0; rptr[ridx + 1] = 0;
|
||||
}
|
||||
aclist.clear();
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
namespace utils{
|
||||
/*!
|
||||
* \brief simple sparse matrix container
|
||||
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t
|
||||
*/
|
||||
template<typename IndexType>
|
||||
struct SparseCSRMat{
|
||||
private:
|
||||
/*! \brief pointer to each of the row */
|
||||
std::vector<size_t> rptr;
|
||||
/*! \brief index of nonzero entries in each row */
|
||||
std::vector<IndexType> findex;
|
||||
public:
|
||||
/*! \brief matrix builder*/
|
||||
SparseCSRMBuilder<IndexType> builder;
|
||||
public:
|
||||
SparseCSRMat(void) :builder(rptr, findex){
|
||||
}
|
||||
public:
|
||||
/*! \return number of rows in the matrx */
|
||||
inline size_t NumRow(void) const{
|
||||
return rptr.size() - 1;
|
||||
}
|
||||
/*! \return number of elements r-th row */
|
||||
inline size_t NumElem(size_t r) const{
|
||||
return rptr[r + 1] - rptr[r];
|
||||
}
|
||||
/*! \return r-th row */
|
||||
inline const IndexType *operator[](size_t r) const{
|
||||
return &findex[rptr[r]];
|
||||
}
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
@@ -1,148 +0,0 @@
|
||||
#ifndef XGBOOST_RANDOM_H
|
||||
#define XGBOOST_RANDOM_H
|
||||
/*!
|
||||
* \file xgboost_random.h
|
||||
* \brief PRNG to support random number generation
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*
|
||||
* Use standard PRNG from stdlib
|
||||
*/
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short int uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
#else
|
||||
#include <inttypes.h>
|
||||
#endif
|
||||
|
||||
/*! namespace of PRNG */
|
||||
namespace xgboost{
|
||||
namespace random{
|
||||
/*!
 * \brief seed the PRNG
 * \param seed value handed to srand
 */
inline void Seed(uint32_t seed){
    const unsigned int value = seed;
    srand(value);
}
|
||||
|
||||
/*! \brief return a real number uniform in [0,1), drawn from rand() */
inline double NextDouble(){
    const double numer = static_cast<double>(rand());
    // RAND_MAX + 1 keeps the quotient strictly below one
    const double denom = static_cast<double>(RAND_MAX) + 1.0;
    return numer / denom;
}
|
||||
/*! \brief return a real number uniform in (0,1), drawn from rand() */
inline double NextDouble2(){
    // shifting by one and widening the range by two excludes both end points
    const double numer = static_cast<double>(rand()) + 1.0;
    const double denom = static_cast<double>(RAND_MAX) + 2.0;
    return numer / denom;
}
|
||||
};
|
||||
|
||||
namespace random{
|
||||
/*! \brief return the next value of rand() as an unsigned 32-bit number */
inline uint32_t NextUInt32(void){
    const int draw = rand();
    return static_cast<uint32_t>(draw);
}
|
||||
/*! \brief return a random number in n */
|
||||
inline uint32_t NextUInt32(uint32_t n){
|
||||
return (uint32_t)floor(NextDouble() * n);
|
||||
}
|
||||
/*! \brief return x~N(0,1) */
|
||||
inline double SampleNormal(){
|
||||
double x, y, s;
|
||||
do{
|
||||
x = 2 * NextDouble2() - 1.0;
|
||||
y = 2 * NextDouble2() - 1.0;
|
||||
s = x*x + y*y;
|
||||
} while (s >= 1.0 || s == 0.0);
|
||||
|
||||
return x * sqrt(-2.0 * log(s) / s);
|
||||
}
|
||||
|
||||
/*! \brief return iid x,y ~N(0,1) */
|
||||
inline void SampleNormal2D(double &xx, double &yy){
|
||||
double x, y, s;
|
||||
do{
|
||||
x = 2 * NextDouble2() - 1.0;
|
||||
y = 2 * NextDouble2() - 1.0;
|
||||
s = x*x + y*y;
|
||||
} while (s >= 1.0 || s == 0.0);
|
||||
double t = sqrt(-2.0 * log(s) / s);
|
||||
xx = x * t;
|
||||
yy = y * t;
|
||||
}
|
||||
/*! \brief return x~N(mu,sigma^2) */
|
||||
inline double SampleNormal(double mu, double sigma){
|
||||
return SampleNormal() * sigma + mu;
|
||||
}
|
||||
|
||||
/*! \brief return 1 with probability p, coin flip */
|
||||
inline int SampleBinary(double p){
|
||||
return NextDouble() < p;
|
||||
}
|
||||
|
||||
/*! \brief return distribution from Gamma( alpha, beta ) */
|
||||
inline double SampleGamma(double alpha, double beta) {
|
||||
if (alpha < 1.0) {
|
||||
double u;
|
||||
do {
|
||||
u = NextDouble();
|
||||
} while (u == 0.0);
|
||||
return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha);
|
||||
}
|
||||
else {
|
||||
double d, c, x, v, u;
|
||||
d = alpha - 1.0 / 3.0;
|
||||
c = 1.0 / sqrt(9.0 * d);
|
||||
do {
|
||||
do {
|
||||
x = SampleNormal();
|
||||
v = 1.0 + c*x;
|
||||
} while (v <= 0.0);
|
||||
v = v * v * v;
|
||||
u = NextDouble();
|
||||
} while ((u >= (1.0 - 0.0331 * (x*x) * (x*x)))
|
||||
&& (log(u) >= (0.5 * x * x + d * (1.0 - v + log(v)))));
|
||||
return d * v / beta;
|
||||
}
|
||||
}
|
||||
|
||||
/*!
 * \brief exchange the values of a and b through a default-constructed temporary
 * \param a first value
 * \param b second value
 */
template<typename T>
inline void Exchange(T &a, T &b){
    T held;
    held = a;
    a = b;
    b = held;
}
|
||||
|
||||
template<typename T>
|
||||
inline void Shuffle(T *data, size_t sz){
|
||||
if (sz == 0) return;
|
||||
for (uint32_t i = (uint32_t)sz - 1; i > 0; i--){
|
||||
Exchange(data[i], data[NextUInt32(i + 1)]);
|
||||
}
|
||||
}
|
||||
/*!
 * \brief random shuffle the data inside, require PRNG
 * \param data vector shuffled in place; an empty vector is left untouched
 */
template<typename T>
inline void Shuffle(std::vector<T> &data){
    // an empty vector has no element 0, so &data[0] must not be evaluated
    if (data.empty()) return;
    Shuffle(&data[0], data.size());
}
|
||||
};
|
||||
|
||||
namespace random{
|
||||
/*! \brief random number generator that carries its own seed, independent of other instances */
struct Random {
  /*! \brief seed state handed to rand_r; it is not initialized here, call Seed() first */
  unsigned rseed;
  /*!
   * \brief set random number seed
   * \param sd the new seed value
   */
  inline void Seed(unsigned sd) {
    rseed = sd;
  }
  /*! \brief return a real number uniform in [0,1): rand_r output divided by RAND_MAX + 1 */
  inline double RandDouble(void) {
    const double draw = static_cast<double>(rand_r(&rseed));
    const double span = static_cast<double>(RAND_MAX) + 1.0;
    return draw / span;
  }
};
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -1,54 +0,0 @@
|
||||
#ifndef XGBOOST_STREAM_H
|
||||
#define XGBOOST_STREAM_H
|
||||
|
||||
#include <cstdio>
|
||||
/*!
|
||||
* \file xgboost_stream.h
|
||||
* \brief general stream interface for serialization
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
namespace xgboost{
|
||||
namespace utils{
|
||||
/*!
 * \brief abstract stream I/O interface, used to serialize model
 */
class IStream {
 public:
  /*!
   * \brief read a block of data from the stream
   * \param ptr destination memory buffer
   * \param size size of the block
   * \return usually the size of the data that was read; the exact meaning is
   *  decided by the implementing class
   */
  virtual size_t Read(void *ptr, size_t size) = 0;
  /*!
   * \brief write a block of data to the stream
   * \param ptr source memory buffer
   * \param size size of the block
   */
  virtual void Write(const void *ptr, size_t size) = 0;
  /*! \brief virtual destructor, so implementations can be destroyed through IStream */
  virtual ~IStream() {}
};
|
||||
|
||||
/*! \brief implementation of file i/o stream */
|
||||
class FileStream : public IStream{
|
||||
private:
|
||||
FILE *fp;
|
||||
public:
|
||||
FileStream(FILE *fp){
|
||||
this->fp = fp;
|
||||
}
|
||||
virtual size_t Read(void *ptr, size_t size){
|
||||
return fread(ptr, size, 1, fp);
|
||||
}
|
||||
virtual void Write(const void *ptr, size_t size){
|
||||
fwrite(ptr, size, 1, fp);
|
||||
}
|
||||
inline void Close(void){
|
||||
fclose(fp);
|
||||
}
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
@@ -1,70 +0,0 @@
|
||||
#ifndef XGBOOST_UTILS_H
|
||||
#define XGBOOST_UTILS_H
|
||||
/*!
|
||||
* \file xgboost_utils.h
|
||||
* \brief simple utils to support the code
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#ifdef _MSC_VER
|
||||
#define fopen64 fopen
|
||||
#else
|
||||
|
||||
// use 64 bit file offsets: either include this header before any other header, or define _FILE_OFFSET_BITS as 64 for the whole build
|
||||
#ifdef _FILE_OFFSET_BITS
|
||||
#if _FILE_OFFSET_BITS == 32
|
||||
#warning "FILE OFFSET BITS defined to be 32 bit"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define off64_t off_t
|
||||
#define fopen64 fopen
|
||||
#endif
|
||||
|
||||
#define _FILE_OFFSET_BITS 64
|
||||
extern "C"{
|
||||
#include <sys/types.h>
|
||||
};
|
||||
#include <cstdio>
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
namespace xgboost{
|
||||
/*! \brief namespace for helper utils of the project */
|
||||
namespace utils{
|
||||
/*!
 * \brief print msg to stderr with an "Error:" prefix, flush, and terminate the process via exit(-1)
 * \param msg text of the error message
 */
inline void Error(const char *msg) {
  std::fprintf(stderr, "Error:%s\n", msg);
  std::fflush(stderr);
  std::exit(-1);
}
|
||||
|
||||
inline void Assert(bool exp){
|
||||
if (!exp) Error("AssertError");
|
||||
}
|
||||
|
||||
inline void Assert(bool exp, const char *msg){
|
||||
if (!exp) Error(msg);
|
||||
}
|
||||
|
||||
/*!
 * \brief print msg to stderr with a "warning:" prefix; execution continues afterwards
 * \param msg text of the warning
 */
inline void Warning(const char *msg) {
  std::fprintf(stderr, "warning:%s\n", msg);
}
|
||||
|
||||
/*!
 * \brief replacement for fopen that never returns NULL
 *
 *  Opens the file with fopen64; when that fails, a message naming the file is
 *  written to stderr and the process terminates via exit(-1).
 * \param fname path of the file to open
 * \param flag mode string forwarded to fopen64
 * \return the opened file handle
 */
inline FILE *FopenCheck(const char *fname, const char *flag) {
  FILE *handle = fopen64(fname, flag);
  if (handle != NULL) return handle;
  // failure path: report which file could not be opened, then abort the program
  std::fprintf(stderr, "can not open file \"%s\" \n", fname);
  std::fflush(stderr);
  std::exit(-1);
}
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user