Refactor parts of fast histogram utilities (#3564)
* Refactor parts of fast histogram utilities
* Removed byte packing from column matrix
This commit is contained in:
parent
3c72654e3b
commit
bbb771f32e
@ -8,47 +8,14 @@
|
|||||||
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
|
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||||
#define XGBOOST_COMMON_COLUMN_MATRIX_H_
|
#define XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||||
|
|
||||||
#define XGBOOST_TYPE_SWITCH(dtype, OP) \
|
|
||||||
\
|
|
||||||
switch(dtype) { \
|
|
||||||
case xgboost::common::uint32: { \
|
|
||||||
using DType = uint32_t; \
|
|
||||||
OP; \
|
|
||||||
break; \
|
|
||||||
} \
|
|
||||||
case xgboost::common::uint16: { \
|
|
||||||
using DType = uint16_t; \
|
|
||||||
OP; \
|
|
||||||
break; \
|
|
||||||
} \
|
|
||||||
case xgboost::common::uint8: { \
|
|
||||||
using DType = uint8_t; \
|
|
||||||
OP; \
|
|
||||||
break; \
|
|
||||||
default: \
|
|
||||||
LOG(FATAL) << "don't recognize type flag" << dtype; \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
}
|
|
||||||
|
|
||||||
#include <type_traits>
|
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include "hist_util.h"
|
#include "hist_util.h"
|
||||||
#include "../tree/fast_hist_param.h"
|
|
||||||
|
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace common {
|
namespace common {
|
||||||
|
|
||||||
using tree::FastHistParam;
|
|
||||||
|
|
||||||
/*! \brief indicator of data type used for storing bin id's in a column. */
|
|
||||||
enum DataType {
|
|
||||||
uint8 = 1,
|
|
||||||
uint16 = 2,
|
|
||||||
uint32 = 4
|
|
||||||
};
|
|
||||||
|
|
||||||
/*! \brief column type */
|
/*! \brief column type */
|
||||||
enum ColumnType {
|
enum ColumnType {
|
||||||
@ -58,14 +25,36 @@ enum ColumnType {
|
|||||||
|
|
||||||
/*! \brief a column storage, to be used with ApplySplit. Note that each
|
/*! \brief a column storage, to be used with ApplySplit. Note that each
|
||||||
bin id is stored as index[i] + index_base. */
|
bin id is stored as index[i] + index_base. */
|
||||||
template<typename T>
|
|
||||||
class Column {
|
class Column {
|
||||||
public:
|
public:
|
||||||
ColumnType type;
|
Column(ColumnType type, const uint32_t* index, uint32_t index_base,
|
||||||
const T* index;
|
const size_t* row_ind, size_t len)
|
||||||
uint32_t index_base;
|
: type_(type),
|
||||||
const size_t* row_ind;
|
index_(index),
|
||||||
size_t len;
|
index_base_(index_base),
|
||||||
|
row_ind_(row_ind),
|
||||||
|
len_(len) {}
|
||||||
|
size_t Size() const { return len_; }
|
||||||
|
uint32_t GetGlobalBinIdx(size_t idx) const { return index_base_ + index_[idx]; }
|
||||||
|
uint32_t GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
|
||||||
|
// column.GetFeatureBinIdx(idx) + column.GetBaseIdx() ==
|
||||||
|
// column.GetGlobalBinIdx(idx)
|
||||||
|
uint32_t GetBaseIdx() const { return index_base_; }
|
||||||
|
ColumnType GetType() const { return type_; }
|
||||||
|
size_t GetRowIdx(size_t idx) const {
|
||||||
|
return type_ == ColumnType::kDenseColumn ? idx : row_ind_[idx];
|
||||||
|
}
|
||||||
|
bool IsMissing(size_t idx) const {
|
||||||
|
return index_[idx] == std::numeric_limits<uint32_t>::max();
|
||||||
|
}
|
||||||
|
const size_t* GetRowData() const { return row_ind_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
ColumnType type_;
|
||||||
|
const uint32_t* index_;
|
||||||
|
uint32_t index_base_;
|
||||||
|
const size_t* row_ind_;
|
||||||
|
const size_t len_;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*! \brief a collection of columns, with support for construction from
|
/*! \brief a collection of columns, with support for construction from
|
||||||
@ -79,13 +68,8 @@ class ColumnMatrix {
|
|||||||
|
|
||||||
// construct column matrix from GHistIndexMatrix
|
// construct column matrix from GHistIndexMatrix
|
||||||
inline void Init(const GHistIndexMatrix& gmat,
|
inline void Init(const GHistIndexMatrix& gmat,
|
||||||
const FastHistParam& param) {
|
double sparse_threshold) {
|
||||||
this->dtype = static_cast<DataType>(param.colmat_dtype);
|
const auto nfeature = static_cast<bst_uint>(gmat.cut.row_ptr.size() - 1);
|
||||||
/* if dtype is smaller than uint32_t, multiple bin_id's will be stored in each
|
|
||||||
slot of internal buffer. */
|
|
||||||
packing_factor_ = sizeof(uint32_t) / static_cast<size_t>(this->dtype);
|
|
||||||
|
|
||||||
const auto nfeature = static_cast<bst_uint>(gmat.cut->row_ptr.size() - 1);
|
|
||||||
const size_t nrow = gmat.row_ptr.size() - 1;
|
const size_t nrow = gmat.row_ptr.size() - 1;
|
||||||
|
|
||||||
// identify type of each column
|
// identify type of each column
|
||||||
@ -93,19 +77,16 @@ class ColumnMatrix {
|
|||||||
type_.resize(nfeature);
|
type_.resize(nfeature);
|
||||||
std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
|
std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
|
||||||
|
|
||||||
uint32_t max_val = 0;
|
uint32_t max_val = std::numeric_limits<uint32_t>::max();
|
||||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
|
||||||
max_val = static_cast<uint32_t>(std::numeric_limits<DType>::max());
|
|
||||||
});
|
|
||||||
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
||||||
CHECK_LE(gmat.cut->row_ptr[fid + 1] - gmat.cut->row_ptr[fid], max_val);
|
CHECK_LE(gmat.cut.row_ptr[fid + 1] - gmat.cut.row_ptr[fid], max_val);
|
||||||
}
|
}
|
||||||
|
|
||||||
gmat.GetFeatureCounts(&feature_counts_[0]);
|
gmat.GetFeatureCounts(&feature_counts_[0]);
|
||||||
// classify features
|
// classify features
|
||||||
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
||||||
if (static_cast<double>(feature_counts_[fid])
|
if (static_cast<double>(feature_counts_[fid])
|
||||||
< param.sparse_threshold * nrow) {
|
< sparse_threshold * nrow) {
|
||||||
type_[fid] = kSparseColumn;
|
type_[fid] = kSparseColumn;
|
||||||
} else {
|
} else {
|
||||||
type_[fid] = kDenseColumn;
|
type_[fid] = kDenseColumn;
|
||||||
@ -131,28 +112,23 @@ class ColumnMatrix {
|
|||||||
boundary_[fid].row_ind_end = accum_row_ind_;
|
boundary_[fid].row_ind_end = accum_row_ind_;
|
||||||
}
|
}
|
||||||
|
|
||||||
index_.resize((boundary_[nfeature - 1].index_end
|
index_.resize(boundary_[nfeature - 1].index_end);
|
||||||
+ (packing_factor_ - 1)) / packing_factor_);
|
|
||||||
row_ind_.resize(boundary_[nfeature - 1].row_ind_end);
|
row_ind_.resize(boundary_[nfeature - 1].row_ind_end);
|
||||||
|
|
||||||
// store least bin id for each feature
|
// store least bin id for each feature
|
||||||
index_base_.resize(nfeature);
|
index_base_.resize(nfeature);
|
||||||
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
||||||
index_base_[fid] = gmat.cut->row_ptr[fid];
|
index_base_[fid] = gmat.cut.row_ptr[fid];
|
||||||
}
|
}
|
||||||
|
|
||||||
// pre-fill index_ for dense columns
|
// pre-fill index_ for dense columns
|
||||||
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
for (bst_uint fid = 0; fid < nfeature; ++fid) {
|
||||||
if (type_[fid] == kDenseColumn) {
|
if (type_[fid] == kDenseColumn) {
|
||||||
const size_t ibegin = boundary_[fid].index_begin;
|
const size_t ibegin = boundary_[fid].index_begin;
|
||||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
uint32_t* begin = &index_[ibegin];
|
||||||
const size_t block_offset = ibegin / packing_factor_;
|
uint32_t* end = begin + nrow;
|
||||||
const size_t elem_offset = ibegin % packing_factor_;
|
std::fill(begin, end, std::numeric_limits<uint32_t>::max());
|
||||||
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
// max() indicates missing values
|
||||||
DType* end = begin + nrow;
|
|
||||||
std::fill(begin, end, std::numeric_limits<DType>::max());
|
|
||||||
// max() indicates missing values
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -167,23 +143,15 @@ class ColumnMatrix {
|
|||||||
size_t fid = 0;
|
size_t fid = 0;
|
||||||
for (size_t i = ibegin; i < iend; ++i) {
|
for (size_t i = ibegin; i < iend; ++i) {
|
||||||
const uint32_t bin_id = gmat.index[i];
|
const uint32_t bin_id = gmat.index[i];
|
||||||
while (bin_id >= gmat.cut->row_ptr[fid + 1]) {
|
while (bin_id >= gmat.cut.row_ptr[fid + 1]) {
|
||||||
++fid;
|
++fid;
|
||||||
}
|
}
|
||||||
if (type_[fid] == kDenseColumn) {
|
if (type_[fid] == kDenseColumn) {
|
||||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
uint32_t* begin = &index_[boundary_[fid].index_begin];
|
||||||
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
begin[rid] = bin_id - index_base_[fid];
|
||||||
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
|
||||||
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
|
||||||
begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
|
|
||||||
});
|
|
||||||
} else {
|
} else {
|
||||||
XGBOOST_TYPE_SWITCH(this->dtype, {
|
uint32_t* begin = &index_[boundary_[fid].index_begin];
|
||||||
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
|
||||||
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
|
||||||
DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
|
|
||||||
begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
|
|
||||||
});
|
|
||||||
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
|
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
|
||||||
++num_nonzeros[fid];
|
++num_nonzeros[fid];
|
||||||
}
|
}
|
||||||
@ -193,29 +161,13 @@ class ColumnMatrix {
|
|||||||
|
|
||||||
/* Fetch an individual column. This code should be used with XGBOOST_TYPE_SWITCH
|
/* Fetch an individual column. This code should be used with XGBOOST_TYPE_SWITCH
|
||||||
to determine type of bin id's */
|
to determine type of bin id's */
|
||||||
template<typename T>
|
inline Column GetColumn(unsigned fid) const {
|
||||||
inline Column<T> GetColumn(unsigned fid) const {
|
Column c(type_[fid], &index_[boundary_[fid].index_begin], index_base_[fid],
|
||||||
const bool valid_type = std::is_same<T, uint32_t>::value
|
&row_ind_[boundary_[fid].row_ind_begin],
|
||||||
|| std::is_same<T, uint16_t>::value
|
boundary_[fid].index_end - boundary_[fid].index_begin);
|
||||||
|| std::is_same<T, uint8_t>::value;
|
|
||||||
CHECK(valid_type);
|
|
||||||
|
|
||||||
Column<T> c;
|
|
||||||
|
|
||||||
c.type = type_[fid];
|
|
||||||
const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
|
|
||||||
const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
|
|
||||||
c.index = reinterpret_cast<const T*>(&index_[block_offset]) + elem_offset;
|
|
||||||
c.index_base = index_base_[fid];
|
|
||||||
c.row_ind = &row_ind_[boundary_[fid].row_ind_begin];
|
|
||||||
c.len = boundary_[fid].index_end - boundary_[fid].index_begin;
|
|
||||||
|
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
|
||||||
DataType dtype;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct ColumnBoundary {
|
struct ColumnBoundary {
|
||||||
// indicate where each column's index and row_ind is stored.
|
// indicate where each column's index and row_ind is stored.
|
||||||
@ -233,8 +185,6 @@ class ColumnMatrix {
|
|||||||
std::vector<size_t> row_ind_;
|
std::vector<size_t> row_ind_;
|
||||||
std::vector<ColumnBoundary> boundary_;
|
std::vector<ColumnBoundary> boundary_;
|
||||||
|
|
||||||
size_t packing_factor_; // how many integers are stored in each slot of index_
|
|
||||||
|
|
||||||
// index_base_[fid]: least bin id for feature fid
|
// index_base_[fid]: least bin id for feature fid
|
||||||
std::vector<uint32_t> index_base_;
|
std::vector<uint32_t> index_base_;
|
||||||
};
|
};
|
||||||
|
|||||||
@ -114,12 +114,23 @@ void HistCutMatrix::Init
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
uint32_t HistCutMatrix::GetBinIdx(const Entry& e) {
|
||||||
CHECK(cut != nullptr); // NOLINT
|
unsigned fid = e.index;
|
||||||
|
auto cbegin = cut.begin() + row_ptr[fid];
|
||||||
|
auto cend = cut.begin() + row_ptr[fid + 1];
|
||||||
|
CHECK(cbegin != cend);
|
||||||
|
auto it = std::upper_bound(cbegin, cend, e.fvalue);
|
||||||
|
if (it == cend) it = cend - 1;
|
||||||
|
uint32_t idx = static_cast<uint32_t>(it - cut.begin());
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
|
||||||
|
cut.Init(p_fmat, max_num_bins);
|
||||||
auto iter = p_fmat->RowIterator();
|
auto iter = p_fmat->RowIterator();
|
||||||
|
|
||||||
const int nthread = omp_get_max_threads();
|
const int nthread = omp_get_max_threads();
|
||||||
const uint32_t nbins = cut->row_ptr.back();
|
const uint32_t nbins = cut.row_ptr.back();
|
||||||
hit_count.resize(nbins, 0);
|
hit_count.resize(nbins, 0);
|
||||||
hit_count_tloc_.resize(nthread * nbins, 0);
|
hit_count_tloc_.resize(nthread * nbins, 0);
|
||||||
|
|
||||||
@ -133,8 +144,8 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
|||||||
}
|
}
|
||||||
index.resize(row_ptr.back());
|
index.resize(row_ptr.back());
|
||||||
|
|
||||||
CHECK_GT(cut->cut.size(), 0U);
|
CHECK_GT(cut.cut.size(), 0U);
|
||||||
CHECK_EQ(cut->row_ptr.back(), cut->cut.size());
|
CHECK_EQ(cut.row_ptr.back(), cut.cut.size());
|
||||||
|
|
||||||
auto bsize = static_cast<omp_ulong>(batch.Size());
|
auto bsize = static_cast<omp_ulong>(batch.Size());
|
||||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||||
@ -145,13 +156,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
|||||||
SparsePage::Inst inst = batch[i];
|
SparsePage::Inst inst = batch[i];
|
||||||
CHECK_EQ(ibegin + inst.length, iend);
|
CHECK_EQ(ibegin + inst.length, iend);
|
||||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||||
unsigned fid = inst[j].index;
|
uint32_t idx = cut.GetBinIdx(inst[j]);
|
||||||
auto cbegin = cut->cut.begin() + cut->row_ptr[fid];
|
|
||||||
auto cend = cut->cut.begin() + cut->row_ptr[fid + 1];
|
|
||||||
CHECK(cbegin != cend);
|
|
||||||
auto it = std::upper_bound(cbegin, cend, inst[j].fvalue);
|
|
||||||
if (it == cend) it = cend - 1;
|
|
||||||
uint32_t idx = static_cast<uint32_t>(it - cut->cut.begin());
|
|
||||||
index[ibegin + j] = idx;
|
index[ibegin + j] = idx;
|
||||||
++hit_count_tloc_[tid * nbins + idx];
|
++hit_count_tloc_[tid * nbins + idx];
|
||||||
}
|
}
|
||||||
@ -167,14 +172,13 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static size_t GetConflictCount(const std::vector<bool>& mark,
|
static size_t GetConflictCount(const std::vector<bool>& mark,
|
||||||
const Column<T>& column,
|
const Column& column,
|
||||||
size_t max_cnt) {
|
size_t max_cnt) {
|
||||||
size_t ret = 0;
|
size_t ret = 0;
|
||||||
if (column.type == xgboost::common::kDenseColumn) {
|
if (column.GetType() == xgboost::common::kDenseColumn) {
|
||||||
for (size_t i = 0; i < column.len; ++i) {
|
for (size_t i = 0; i < column.Size(); ++i) {
|
||||||
if (column.index[i] != std::numeric_limits<T>::max() && mark[i]) {
|
if (column.GetFeatureBinIdx(i) != std::numeric_limits<uint32_t>::max() && mark[i]) {
|
||||||
++ret;
|
++ret;
|
||||||
if (ret > max_cnt) {
|
if (ret > max_cnt) {
|
||||||
return max_cnt + 1;
|
return max_cnt + 1;
|
||||||
@ -182,8 +186,8 @@ static size_t GetConflictCount(const std::vector<bool>& mark,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (size_t i = 0; i < column.len; ++i) {
|
for (size_t i = 0; i < column.Size(); ++i) {
|
||||||
if (mark[column.row_ind[i]]) {
|
if (mark[column.GetRowIdx(i)]) {
|
||||||
++ret;
|
++ret;
|
||||||
if (ret > max_cnt) {
|
if (ret > max_cnt) {
|
||||||
return max_cnt + 1;
|
return max_cnt + 1;
|
||||||
@ -194,30 +198,28 @@ static size_t GetConflictCount(const std::vector<bool>& mark,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
inline void
|
inline void
|
||||||
MarkUsed(std::vector<bool>* p_mark, const Column<T>& column) {
|
MarkUsed(std::vector<bool>* p_mark, const Column& column) {
|
||||||
std::vector<bool>& mark = *p_mark;
|
std::vector<bool>& mark = *p_mark;
|
||||||
if (column.type == xgboost::common::kDenseColumn) {
|
if (column.GetType() == xgboost::common::kDenseColumn) {
|
||||||
for (size_t i = 0; i < column.len; ++i) {
|
for (size_t i = 0; i < column.Size(); ++i) {
|
||||||
if (column.index[i] != std::numeric_limits<T>::max()) {
|
if (column.GetFeatureBinIdx(i) != std::numeric_limits<uint32_t>::max()) {
|
||||||
mark[i] = true;
|
mark[i] = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (size_t i = 0; i < column.len; ++i) {
|
for (size_t i = 0; i < column.Size(); ++i) {
|
||||||
mark[column.row_ind[i]] = true;
|
mark[column.GetRowIdx(i)] = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
inline std::vector<std::vector<unsigned>>
|
inline std::vector<std::vector<unsigned>>
|
||||||
FindGroups_(const std::vector<unsigned>& feature_list,
|
FindGroups(const std::vector<unsigned>& feature_list,
|
||||||
const std::vector<size_t>& feature_nnz,
|
const std::vector<size_t>& feature_nnz,
|
||||||
const ColumnMatrix& colmat,
|
const ColumnMatrix& colmat,
|
||||||
size_t nrow,
|
size_t nrow,
|
||||||
const FastHistParam& param) {
|
const FastHistParam& param) {
|
||||||
/* Goal: Bundle features together that have little or no "overlap", i.e.
|
/* Goal: Bundle features together that have little or no "overlap", i.e.
|
||||||
only a few data points should have nonzero values for
|
only a few data points should have nonzero values for
|
||||||
member features.
|
member features.
|
||||||
@ -231,7 +233,7 @@ FindGroups_(const std::vector<unsigned>& feature_list,
|
|||||||
= static_cast<size_t>(param.max_conflict_rate * nrow);
|
= static_cast<size_t>(param.max_conflict_rate * nrow);
|
||||||
|
|
||||||
for (auto fid : feature_list) {
|
for (auto fid : feature_list) {
|
||||||
const Column<T>& column = colmat.GetColumn<T>(fid);
|
const Column& column = colmat.GetColumn(fid);
|
||||||
|
|
||||||
const size_t cur_fid_nnz = feature_nnz[fid];
|
const size_t cur_fid_nnz = feature_nnz[fid];
|
||||||
bool need_new_group = true;
|
bool need_new_group = true;
|
||||||
@ -276,24 +278,12 @@ FindGroups_(const std::vector<unsigned>& feature_list,
|
|||||||
return groups;
|
return groups;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline std::vector<std::vector<unsigned>>
|
|
||||||
FindGroups(const std::vector<unsigned>& feature_list,
|
|
||||||
const std::vector<size_t>& feature_nnz,
|
|
||||||
const ColumnMatrix& colmat,
|
|
||||||
size_t nrow,
|
|
||||||
const FastHistParam& param) {
|
|
||||||
XGBOOST_TYPE_SWITCH(colmat.dtype, {
|
|
||||||
return FindGroups_<DType>(feature_list, feature_nnz, colmat, nrow, param);
|
|
||||||
});
|
|
||||||
return std::vector<std::vector<unsigned>>(); // to avoid warning message
|
|
||||||
}
|
|
||||||
|
|
||||||
inline std::vector<std::vector<unsigned>>
|
inline std::vector<std::vector<unsigned>>
|
||||||
FastFeatureGrouping(const GHistIndexMatrix& gmat,
|
FastFeatureGrouping(const GHistIndexMatrix& gmat,
|
||||||
const ColumnMatrix& colmat,
|
const ColumnMatrix& colmat,
|
||||||
const FastHistParam& param) {
|
const FastHistParam& param) {
|
||||||
const size_t nrow = gmat.row_ptr.size() - 1;
|
const size_t nrow = gmat.row_ptr.size() - 1;
|
||||||
const size_t nfeature = gmat.cut->row_ptr.size() - 1;
|
const size_t nfeature = gmat.cut.row_ptr.size() - 1;
|
||||||
|
|
||||||
std::vector<unsigned> feature_list(nfeature);
|
std::vector<unsigned> feature_list(nfeature);
|
||||||
std::iota(feature_list.begin(), feature_list.end(), 0);
|
std::iota(feature_list.begin(), feature_list.end(), 0);
|
||||||
@ -346,10 +336,10 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
|
|||||||
void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
|
void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
|
||||||
const ColumnMatrix& colmat,
|
const ColumnMatrix& colmat,
|
||||||
const FastHistParam& param) {
|
const FastHistParam& param) {
|
||||||
cut_ = gmat.cut;
|
cut_ = &gmat.cut;
|
||||||
|
|
||||||
const size_t nrow = gmat.row_ptr.size() - 1;
|
const size_t nrow = gmat.row_ptr.size() - 1;
|
||||||
const uint32_t nbins = gmat.cut->row_ptr.back();
|
const uint32_t nbins = gmat.cut.row_ptr.back();
|
||||||
|
|
||||||
/* step 1: form feature groups */
|
/* step 1: form feature groups */
|
||||||
auto groups = FastFeatureGrouping(gmat, colmat, param);
|
auto groups = FastFeatureGrouping(gmat, colmat, param);
|
||||||
@ -359,8 +349,8 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
|
|||||||
std::vector<uint32_t> bin2block(nbins); // lookup table [bin id] => [block id]
|
std::vector<uint32_t> bin2block(nbins); // lookup table [bin id] => [block id]
|
||||||
for (uint32_t group_id = 0; group_id < nblock; ++group_id) {
|
for (uint32_t group_id = 0; group_id < nblock; ++group_id) {
|
||||||
for (auto& fid : groups[group_id]) {
|
for (auto& fid : groups[group_id]) {
|
||||||
const uint32_t bin_begin = gmat.cut->row_ptr[fid];
|
const uint32_t bin_begin = gmat.cut.row_ptr[fid];
|
||||||
const uint32_t bin_end = gmat.cut->row_ptr[fid + 1];
|
const uint32_t bin_end = gmat.cut.row_ptr[fid + 1];
|
||||||
for (uint32_t bin_id = bin_begin; bin_id < bin_end; ++bin_id) {
|
for (uint32_t bin_id = bin_begin; bin_id < bin_end; ++bin_id) {
|
||||||
bin2block[bin_id] = group_id;
|
bin2block[bin_id] = group_id;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -75,6 +75,7 @@ struct HistCutMatrix {
|
|||||||
std::vector<bst_float> min_val;
|
std::vector<bst_float> min_val;
|
||||||
/*! \brief the cut field */
|
/*! \brief the cut field */
|
||||||
std::vector<bst_float> cut;
|
std::vector<bst_float> cut;
|
||||||
|
uint32_t GetBinIdx(const Entry &e);
|
||||||
/*! \brief Get histogram bound for fid */
|
/*! \brief Get histogram bound for fid */
|
||||||
inline HistCutUnit operator[](bst_uint fid) const {
|
inline HistCutUnit operator[](bst_uint fid) const {
|
||||||
return {dmlc::BeginPtr(cut) + row_ptr[fid],
|
return {dmlc::BeginPtr(cut) + row_ptr[fid],
|
||||||
@ -122,18 +123,18 @@ struct GHistIndexMatrix {
|
|||||||
/*! \brief hit count of each index */
|
/*! \brief hit count of each index */
|
||||||
std::vector<size_t> hit_count;
|
std::vector<size_t> hit_count;
|
||||||
/*! \brief The corresponding cuts */
|
/*! \brief The corresponding cuts */
|
||||||
const HistCutMatrix* cut;
|
HistCutMatrix cut;
|
||||||
// Create a global histogram matrix, given cut
|
// Create a global histogram matrix, given cut
|
||||||
void Init(DMatrix* p_fmat);
|
void Init(DMatrix* p_fmat, int max_num_bins);
|
||||||
// get i-th row
|
// get i-th row
|
||||||
inline GHistIndexRow operator[](size_t i) const {
|
inline GHistIndexRow operator[](size_t i) const {
|
||||||
return {&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]};
|
return {&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]};
|
||||||
}
|
}
|
||||||
inline void GetFeatureCounts(size_t* counts) const {
|
inline void GetFeatureCounts(size_t* counts) const {
|
||||||
auto nfeature = cut->row_ptr.size() - 1;
|
auto nfeature = cut.row_ptr.size() - 1;
|
||||||
for (unsigned fid = 0; fid < nfeature; ++fid) {
|
for (unsigned fid = 0; fid < nfeature; ++fid) {
|
||||||
auto ibegin = cut->row_ptr[fid];
|
auto ibegin = cut.row_ptr[fid];
|
||||||
auto iend = cut->row_ptr[fid + 1];
|
auto iend = cut.row_ptr[fid + 1];
|
||||||
for (auto i = ibegin; i < iend; ++i) {
|
for (auto i = ibegin; i < iend; ++i) {
|
||||||
counts[fid] += hit_count[i];
|
counts[fid] += hit_count[i];
|
||||||
}
|
}
|
||||||
|
|||||||
@ -12,8 +12,6 @@ namespace tree {
|
|||||||
|
|
||||||
/*! \brief training parameters for histogram-based training */
|
/*! \brief training parameters for histogram-based training */
|
||||||
struct FastHistParam : public dmlc::Parameter<FastHistParam> {
|
struct FastHistParam : public dmlc::Parameter<FastHistParam> {
|
||||||
// integral data type to be used with columnar data storage
|
|
||||||
enum class DataType { uint8 = 1, uint16 = 2, uint32 = 4 }; // NOLINT
|
|
||||||
int colmat_dtype;
|
int colmat_dtype;
|
||||||
// percentage threshold for treating a feature as sparse
|
// percentage threshold for treating a feature as sparse
|
||||||
// e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
|
// e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
|
||||||
@ -32,14 +30,6 @@ struct FastHistParam : public dmlc::Parameter<FastHistParam> {
|
|||||||
|
|
||||||
// declare the parameters
|
// declare the parameters
|
||||||
DMLC_DECLARE_PARAMETER(FastHistParam) {
|
DMLC_DECLARE_PARAMETER(FastHistParam) {
|
||||||
DMLC_DECLARE_FIELD(colmat_dtype)
|
|
||||||
.set_default(static_cast<int>(DataType::uint32))
|
|
||||||
.add_enum("uint8", static_cast<int>(DataType::uint8))
|
|
||||||
.add_enum("uint16", static_cast<int>(DataType::uint16))
|
|
||||||
.add_enum("uint32", static_cast<int>(DataType::uint32))
|
|
||||||
.describe("Integral data type to be used with columnar data storage."
|
|
||||||
"May carry marginal performance implications. Reserved for "
|
|
||||||
"advanced use");
|
|
||||||
DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
|
DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
|
||||||
.describe("percentage threshold for treating a feature as sparse");
|
.describe("percentage threshold for treating a feature as sparse");
|
||||||
DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
|
DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(0)
|
||||||
|
|||||||
@ -69,10 +69,8 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
GradStats::CheckInfo(dmat->Info());
|
GradStats::CheckInfo(dmat->Info());
|
||||||
if (is_gmat_initialized_ == false) {
|
if (is_gmat_initialized_ == false) {
|
||||||
double tstart = dmlc::GetTime();
|
double tstart = dmlc::GetTime();
|
||||||
hmat_.Init(dmat, static_cast<uint32_t>(param_.max_bin));
|
gmat_.Init(dmat, static_cast<uint32_t>(param_.max_bin));
|
||||||
gmat_.cut = &hmat_;
|
column_matrix_.Init(gmat_, fhparam_.sparse_threshold);
|
||||||
gmat_.Init(dmat);
|
|
||||||
column_matrix_.Init(gmat_, fhparam_);
|
|
||||||
if (fhparam_.enable_feature_grouping > 0) {
|
if (fhparam_.enable_feature_grouping > 0) {
|
||||||
gmatb_.Init(gmat_, column_matrix_, fhparam_);
|
gmatb_.Init(gmat_, column_matrix_, fhparam_);
|
||||||
}
|
}
|
||||||
@ -112,8 +110,6 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
// training parameter
|
// training parameter
|
||||||
TrainParam param_;
|
TrainParam param_;
|
||||||
FastHistParam fhparam_;
|
FastHistParam fhparam_;
|
||||||
// data sketch
|
|
||||||
HistCutMatrix hmat_;
|
|
||||||
// quantized data matrix
|
// quantized data matrix
|
||||||
GHistIndexMatrix gmat_;
|
GHistIndexMatrix gmat_;
|
||||||
// (optional) data matrix with feature grouping
|
// (optional) data matrix with feature grouping
|
||||||
@ -376,7 +372,7 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
// clear local prediction cache
|
// clear local prediction cache
|
||||||
leaf_value_cache_.clear();
|
leaf_value_cache_.clear();
|
||||||
// initialize histogram collection
|
// initialize histogram collection
|
||||||
uint32_t nbins = gmat.cut->row_ptr.back();
|
uint32_t nbins = gmat.cut.row_ptr.back();
|
||||||
hist_.Init(nbins);
|
hist_.Init(nbins);
|
||||||
|
|
||||||
// initialize histogram builder
|
// initialize histogram builder
|
||||||
@ -413,7 +409,7 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
const size_t ncol = info.num_col_;
|
const size_t ncol = info.num_col_;
|
||||||
const size_t nnz = info.num_nonzero_;
|
const size_t nnz = info.num_nonzero_;
|
||||||
// number of discrete bins for feature 0
|
// number of discrete bins for feature 0
|
||||||
const uint32_t nbins_f0 = gmat.cut->row_ptr[1] - gmat.cut->row_ptr[0];
|
const uint32_t nbins_f0 = gmat.cut.row_ptr[1] - gmat.cut.row_ptr[0];
|
||||||
if (nrow * ncol == nnz) {
|
if (nrow * ncol == nnz) {
|
||||||
// dense data with zero-based indexing
|
// dense data with zero-based indexing
|
||||||
data_layout_ = kDenseDataZeroBased;
|
data_layout_ = kDenseDataZeroBased;
|
||||||
@ -454,7 +450,7 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
choose the column that has the least positive number of discrete bins.
|
choose the column that has the least positive number of discrete bins.
|
||||||
For dense data (with no missing value),
|
For dense data (with no missing value),
|
||||||
the sum of gradient histogram is equal to snode[nid] */
|
the sum of gradient histogram is equal to snode[nid] */
|
||||||
const std::vector<uint32_t>& row_ptr = gmat.cut->row_ptr;
|
const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;
|
||||||
const auto nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
|
const auto nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
|
||||||
uint32_t min_nbins_per_feature = 0;
|
uint32_t min_nbins_per_feature = 0;
|
||||||
for (bst_uint i = 0; i < nfeature; ++i) {
|
for (bst_uint i = 0; i < nfeature; ++i) {
|
||||||
@ -516,19 +512,6 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
const HistCollection& hist,
|
const HistCollection& hist,
|
||||||
const DMatrix& fmat,
|
const DMatrix& fmat,
|
||||||
RegTree* p_tree) {
|
RegTree* p_tree) {
|
||||||
XGBOOST_TYPE_SWITCH(column_matrix.dtype, {
|
|
||||||
ApplySplitSpecialize<DType>(nid, gmat, column_matrix, hist, fmat,
|
|
||||||
p_tree);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
inline void ApplySplitSpecialize(int nid,
|
|
||||||
const GHistIndexMatrix& gmat,
|
|
||||||
const ColumnMatrix& column_matrix,
|
|
||||||
const HistCollection& hist,
|
|
||||||
const DMatrix& fmat,
|
|
||||||
RegTree* p_tree) {
|
|
||||||
// TODO(hcho3): support feature sampling by levels
|
// TODO(hcho3): support feature sampling by levels
|
||||||
|
|
||||||
/* 1. Create child nodes */
|
/* 1. Create child nodes */
|
||||||
@ -552,23 +535,23 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
const bool default_left = (*p_tree)[nid].DefaultLeft();
|
const bool default_left = (*p_tree)[nid].DefaultLeft();
|
||||||
const bst_uint fid = (*p_tree)[nid].SplitIndex();
|
const bst_uint fid = (*p_tree)[nid].SplitIndex();
|
||||||
const bst_float split_pt = (*p_tree)[nid].SplitCond();
|
const bst_float split_pt = (*p_tree)[nid].SplitCond();
|
||||||
const uint32_t lower_bound = gmat.cut->row_ptr[fid];
|
const uint32_t lower_bound = gmat.cut.row_ptr[fid];
|
||||||
const uint32_t upper_bound = gmat.cut->row_ptr[fid + 1];
|
const uint32_t upper_bound = gmat.cut.row_ptr[fid + 1];
|
||||||
int32_t split_cond = -1;
|
int32_t split_cond = -1;
|
||||||
// convert floating-point split_pt into corresponding bin_id
|
// convert floating-point split_pt into corresponding bin_id
|
||||||
// split_cond = -1 indicates that split_pt is less than all known cut points
|
// split_cond = -1 indicates that split_pt is less than all known cut points
|
||||||
CHECK_LT(upper_bound,
|
CHECK_LT(upper_bound,
|
||||||
static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
|
static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
|
||||||
for (uint32_t i = lower_bound; i < upper_bound; ++i) {
|
for (uint32_t i = lower_bound; i < upper_bound; ++i) {
|
||||||
if (split_pt == gmat.cut->cut[i]) {
|
if (split_pt == gmat.cut.cut[i]) {
|
||||||
split_cond = static_cast<int32_t>(i);
|
split_cond = static_cast<int32_t>(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto& rowset = row_set_collection_[nid];
|
const auto& rowset = row_set_collection_[nid];
|
||||||
|
|
||||||
Column<T> column = column_matrix.GetColumn<T>(fid);
|
Column column = column_matrix.GetColumn(fid);
|
||||||
if (column.type == xgboost::common::kDenseColumn) {
|
if (column.GetType() == xgboost::common::kDenseColumn) {
|
||||||
ApplySplitDenseData(rowset, gmat, &row_split_tloc_, column, split_cond,
|
ApplySplitDenseData(rowset, gmat, &row_split_tloc_, column, split_cond,
|
||||||
default_left);
|
default_left);
|
||||||
} else {
|
} else {
|
||||||
@ -580,11 +563,10 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
nid, row_split_tloc_, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild());
|
nid, row_split_tloc_, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild());
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
|
||||||
inline void ApplySplitDenseData(const RowSetCollection::Elem rowset,
|
inline void ApplySplitDenseData(const RowSetCollection::Elem rowset,
|
||||||
const GHistIndexMatrix& gmat,
|
const GHistIndexMatrix& gmat,
|
||||||
std::vector<RowSetCollection::Split>* p_row_split_tloc,
|
std::vector<RowSetCollection::Split>* p_row_split_tloc,
|
||||||
const Column<T>& column,
|
const Column& column,
|
||||||
bst_int split_cond,
|
bst_int split_cond,
|
||||||
bool default_left) {
|
bool default_left) {
|
||||||
std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
|
std::vector<RowSetCollection::Split>& row_split_tloc = *p_row_split_tloc;
|
||||||
@ -598,24 +580,22 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
auto& left = row_split_tloc[tid].left;
|
auto& left = row_split_tloc[tid].left;
|
||||||
auto& right = row_split_tloc[tid].right;
|
auto& right = row_split_tloc[tid].right;
|
||||||
size_t rid[kUnroll];
|
size_t rid[kUnroll];
|
||||||
T rbin[kUnroll];
|
uint32_t rbin[kUnroll];
|
||||||
for (int k = 0; k < kUnroll; ++k) {
|
for (int k = 0; k < kUnroll; ++k) {
|
||||||
rid[k] = rowset.begin[i + k];
|
rid[k] = rowset.begin[i + k];
|
||||||
}
|
}
|
||||||
for (int k = 0; k < kUnroll; ++k) {
|
for (int k = 0; k < kUnroll; ++k) {
|
||||||
rbin[k] = column.index[rid[k]];
|
rbin[k] = column.GetFeatureBinIdx(rid[k]);
|
||||||
}
|
}
|
||||||
for (int k = 0; k < kUnroll; ++k) { // NOLINT
|
for (int k = 0; k < kUnroll; ++k) { // NOLINT
|
||||||
if (rbin[k] == std::numeric_limits<T>::max()) { // missing value
|
if (rbin[k] == std::numeric_limits<uint32_t>::max()) { // missing value
|
||||||
if (default_left) {
|
if (default_left) {
|
||||||
left.push_back(rid[k]);
|
left.push_back(rid[k]);
|
||||||
} else {
|
} else {
|
||||||
right.push_back(rid[k]);
|
right.push_back(rid[k]);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
CHECK_LT(rbin[k] + column.index_base,
|
if (static_cast<int32_t>(rbin[k] + column.GetBaseIdx()) <= split_cond) {
|
||||||
static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
|
|
||||||
if (static_cast<int32_t>(rbin[k] + column.index_base) <= split_cond) {
|
|
||||||
left.push_back(rid[k]);
|
left.push_back(rid[k]);
|
||||||
} else {
|
} else {
|
||||||
right.push_back(rid[k]);
|
right.push_back(rid[k]);
|
||||||
@ -627,17 +607,15 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
auto& left = row_split_tloc[nthread_-1].left;
|
auto& left = row_split_tloc[nthread_-1].left;
|
||||||
auto& right = row_split_tloc[nthread_-1].right;
|
auto& right = row_split_tloc[nthread_-1].right;
|
||||||
const size_t rid = rowset.begin[i];
|
const size_t rid = rowset.begin[i];
|
||||||
const T rbin = column.index[rid];
|
const uint32_t rbin = column.GetFeatureBinIdx(rid);
|
||||||
if (rbin == std::numeric_limits<T>::max()) { // missing value
|
if (rbin == std::numeric_limits<uint32_t>::max()) { // missing value
|
||||||
if (default_left) {
|
if (default_left) {
|
||||||
left.push_back(rid);
|
left.push_back(rid);
|
||||||
} else {
|
} else {
|
||||||
right.push_back(rid);
|
right.push_back(rid);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
CHECK_LT(rbin + column.index_base,
|
if (static_cast<int32_t>(rbin + column.GetBaseIdx()) <= split_cond) {
|
||||||
static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
|
|
||||||
if (static_cast<int32_t>(rbin + column.index_base) <= split_cond) {
|
|
||||||
left.push_back(rid);
|
left.push_back(rid);
|
||||||
} else {
|
} else {
|
||||||
right.push_back(rid);
|
right.push_back(rid);
|
||||||
@ -646,11 +624,10 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
|
||||||
inline void ApplySplitSparseData(const RowSetCollection::Elem rowset,
|
inline void ApplySplitSparseData(const RowSetCollection::Elem rowset,
|
||||||
const GHistIndexMatrix& gmat,
|
const GHistIndexMatrix& gmat,
|
||||||
std::vector<RowSetCollection::Split>* p_row_split_tloc,
|
std::vector<RowSetCollection::Split>* p_row_split_tloc,
|
||||||
const Column<T>& column,
|
const Column& column,
|
||||||
bst_uint lower_bound,
|
bst_uint lower_bound,
|
||||||
bst_uint upper_bound,
|
bst_uint upper_bound,
|
||||||
bst_int split_cond,
|
bst_int split_cond,
|
||||||
@ -665,27 +642,25 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
const size_t iend = (tid + 1) * nrows / nthread_;
|
const size_t iend = (tid + 1) * nrows / nthread_;
|
||||||
if (ibegin < iend) { // ensure that [ibegin, iend) is nonempty range
|
if (ibegin < iend) { // ensure that [ibegin, iend) is nonempty range
|
||||||
// search first nonzero row with index >= rowset[ibegin]
|
// search first nonzero row with index >= rowset[ibegin]
|
||||||
const size_t* p = std::lower_bound(column.row_ind,
|
const size_t* p = std::lower_bound(column.GetRowData(),
|
||||||
column.row_ind + column.len,
|
column.GetRowData() + column.Size(),
|
||||||
rowset.begin[ibegin]);
|
rowset.begin[ibegin]);
|
||||||
|
|
||||||
auto& left = row_split_tloc[tid].left;
|
auto& left = row_split_tloc[tid].left;
|
||||||
auto& right = row_split_tloc[tid].right;
|
auto& right = row_split_tloc[tid].right;
|
||||||
if (p != column.row_ind + column.len && *p <= rowset.begin[iend - 1]) {
|
if (p != column.GetRowData() + column.Size() && *p <= rowset.begin[iend - 1]) {
|
||||||
size_t cursor = p - column.row_ind;
|
size_t cursor = p - column.GetRowData();
|
||||||
|
|
||||||
for (size_t i = ibegin; i < iend; ++i) {
|
for (size_t i = ibegin; i < iend; ++i) {
|
||||||
const size_t rid = rowset.begin[i];
|
const size_t rid = rowset.begin[i];
|
||||||
while (cursor < column.len
|
while (cursor < column.Size()
|
||||||
&& column.row_ind[cursor] < rid
|
&& column.GetRowIdx(cursor) < rid
|
||||||
&& column.row_ind[cursor] <= rowset.begin[iend - 1]) {
|
&& column.GetRowIdx(cursor) <= rowset.begin[iend - 1]) {
|
||||||
++cursor;
|
++cursor;
|
||||||
}
|
}
|
||||||
if (cursor < column.len && column.row_ind[cursor] == rid) {
|
if (cursor < column.Size() && column.GetRowIdx(cursor) == rid) {
|
||||||
const T rbin = column.index[cursor];
|
const uint32_t rbin = column.GetFeatureBinIdx(cursor);
|
||||||
CHECK_LT(rbin + column.index_base,
|
if (static_cast<int32_t>(rbin + column.GetBaseIdx()) <= split_cond) {
|
||||||
static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
|
|
||||||
if (static_cast<int32_t>(rbin + column.index_base) <= split_cond) {
|
|
||||||
left.push_back(rid);
|
left.push_back(rid);
|
||||||
} else {
|
} else {
|
||||||
right.push_back(rid);
|
right.push_back(rid);
|
||||||
@ -733,7 +708,7 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
For dense data (with no missing value),
|
For dense data (with no missing value),
|
||||||
the sum of gradient histogram is equal to snode[nid] */
|
the sum of gradient histogram is equal to snode[nid] */
|
||||||
GHistRow hist = hist_[nid];
|
GHistRow hist = hist_[nid];
|
||||||
const std::vector<uint32_t>& row_ptr = gmat.cut->row_ptr;
|
const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;
|
||||||
|
|
||||||
const uint32_t ibegin = row_ptr[fid_least_bins_];
|
const uint32_t ibegin = row_ptr[fid_least_bins_];
|
||||||
const uint32_t iend = row_ptr[fid_least_bins_ + 1];
|
const uint32_t iend = row_ptr[fid_least_bins_ + 1];
|
||||||
@ -771,8 +746,8 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
CHECK(d_step == +1 || d_step == -1);
|
CHECK(d_step == +1 || d_step == -1);
|
||||||
|
|
||||||
// aliases
|
// aliases
|
||||||
const std::vector<uint32_t>& cut_ptr = gmat.cut->row_ptr;
|
const std::vector<uint32_t>& cut_ptr = gmat.cut.row_ptr;
|
||||||
const std::vector<bst_float>& cut_val = gmat.cut->cut;
|
const std::vector<bst_float>& cut_val = gmat.cut.cut;
|
||||||
|
|
||||||
// statistics on both sides of split
|
// statistics on both sides of split
|
||||||
GradStats c(param_);
|
GradStats c(param_);
|
||||||
@ -821,7 +796,7 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
snode.root_gain);
|
snode.root_gain);
|
||||||
if (i == imin) {
|
if (i == imin) {
|
||||||
// for leftmost bin, left bound is the smallest feature value
|
// for leftmost bin, left bound is the smallest feature value
|
||||||
split_pt = gmat.cut->min_val[fid];
|
split_pt = gmat.cut.min_val[fid];
|
||||||
} else {
|
} else {
|
||||||
split_pt = cut_val[i - 1];
|
split_pt = cut_val[i - 1];
|
||||||
}
|
}
|
||||||
|
|||||||
51
tests/cpp/common/test_column_matrix.cc
Normal file
51
tests/cpp/common/test_column_matrix.cc
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
#include "../../../src/common/column_matrix.h"
|
||||||
|
#include "../helpers.h"
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
|
namespace xgboost {
|
||||||
|
namespace common {
|
||||||
|
TEST(DenseColumn, Test) {
|
||||||
|
auto dmat = CreateDMatrix(100, 10, 0.0);
|
||||||
|
GHistIndexMatrix gmat;
|
||||||
|
gmat.Init(dmat.get(), 256);
|
||||||
|
ColumnMatrix column_matrix;
|
||||||
|
column_matrix.Init(gmat, 0.2);
|
||||||
|
|
||||||
|
for (auto i = 0ull; i < dmat->Info().num_row_; i++) {
|
||||||
|
for (auto j = 0ull; j < dmat->Info().num_col_; j++) {
|
||||||
|
auto col = column_matrix.GetColumn(j);
|
||||||
|
EXPECT_EQ(gmat.index[i * dmat->Info().num_col_ + j],
|
||||||
|
col.GetGlobalBinIdx(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(SparseColumn, Test) {
|
||||||
|
auto dmat = CreateDMatrix(100, 1, 0.85);
|
||||||
|
GHistIndexMatrix gmat;
|
||||||
|
gmat.Init(dmat.get(), 256);
|
||||||
|
ColumnMatrix column_matrix;
|
||||||
|
column_matrix.Init(gmat, 0.5);
|
||||||
|
auto col = column_matrix.GetColumn(0);
|
||||||
|
ASSERT_EQ(col.Size(), gmat.index.size());
|
||||||
|
for (auto i = 0ull; i < col.Size(); i++) {
|
||||||
|
EXPECT_EQ(gmat.index[gmat.row_ptr[col.GetRowIdx(i)]],
|
||||||
|
col.GetGlobalBinIdx(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(DenseColumnWithMissing, Test) {
|
||||||
|
auto dmat = CreateDMatrix(100, 1, 0.5);
|
||||||
|
GHistIndexMatrix gmat;
|
||||||
|
gmat.Init(dmat.get(), 256);
|
||||||
|
ColumnMatrix column_matrix;
|
||||||
|
column_matrix.Init(gmat, 0.2);
|
||||||
|
auto col = column_matrix.GetColumn(0);
|
||||||
|
for (auto i = 0ull; i < col.Size(); i++) {
|
||||||
|
if (col.IsMissing(i)) continue;
|
||||||
|
EXPECT_EQ(gmat.index[gmat.row_ptr[col.GetRowIdx(i)]],
|
||||||
|
col.GetGlobalBinIdx(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // namespace common
|
||||||
|
} // namespace xgboost
|
||||||
@ -67,59 +67,4 @@ TEST(MetaInfo, SaveLoadBinary) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(MetaInfo, LoadQid) {
|
TEST(MetaInfo, LoadQid) {
|
||||||
std::string tmp_file = TempFileName();
|
|
||||||
{
|
|
||||||
std::unique_ptr<dmlc::Stream> fs(
|
|
||||||
dmlc::Stream::Create(tmp_file.c_str(), "w"));
|
|
||||||
dmlc::ostream os(fs.get());
|
|
||||||
os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
|
|
||||||
2 qid:1 1:0 2:0 3:1 4:0.1 5:1
|
|
||||||
1 qid:1 1:0 2:1 3:0 4:0.4 5:0
|
|
||||||
1 qid:1 1:0 2:0 3:1 4:0.3 5:0
|
|
||||||
1 qid:2 1:0 2:0 3:1 4:0.2 5:0
|
|
||||||
2 qid:2 1:1 2:0 3:1 4:0.4 5:0
|
|
||||||
1 qid:2 1:0 2:0 3:1 4:0.1 5:0
|
|
||||||
1 qid:2 1:0 2:0 3:1 4:0.2 5:0
|
|
||||||
2 qid:3 1:0 2:0 3:1 4:0.1 5:1
|
|
||||||
3 qid:3 1:1 2:1 3:0 4:0.3 5:0
|
|
||||||
4 qid:3 1:1 2:0 3:0 4:0.4 5:1
|
|
||||||
1 qid:3 1:0 2:1 3:1 4:0.5 5:0)qid";
|
|
||||||
os.set_stream(nullptr);
|
|
||||||
}
|
|
||||||
std::unique_ptr<xgboost::DMatrix> dmat(
|
|
||||||
xgboost::DMatrix::Load(tmp_file, true, false, "libsvm"));
|
|
||||||
std::remove(tmp_file.c_str());
|
|
||||||
|
|
||||||
const xgboost::MetaInfo& info = dmat->Info();
|
|
||||||
const std::vector<uint64_t> expected_qids{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
|
|
||||||
const std::vector<xgboost::bst_uint> expected_group_ptr{0, 4, 8, 12};
|
|
||||||
CHECK(info.qids_ == expected_qids);
|
|
||||||
CHECK(info.group_ptr_ == expected_group_ptr);
|
|
||||||
CHECK_GE(info.kVersion, info.kVersionQidAdded);
|
|
||||||
|
|
||||||
const std::vector<size_t> expected_offset{
|
|
||||||
0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60
|
|
||||||
};
|
|
||||||
const std::vector<xgboost::Entry> expected_data{
|
|
||||||
{1, 1}, {2, 1}, {3, 0}, {4, 0.2}, {5, 0},
|
|
||||||
{1, 0}, {2, 0}, {3, 1}, {4, 0.1}, {5, 1},
|
|
||||||
{1, 0}, {2, 1}, {3, 0}, {4, 0.4}, {5, 0},
|
|
||||||
{1, 0}, {2, 0}, {3, 1}, {4, 0.3}, {5, 0},
|
|
||||||
{1, 0}, {2, 0}, {3, 1}, {4, 0.2}, {5, 0},
|
|
||||||
{1, 1}, {2, 0}, {3, 1}, {4, 0.4}, {5, 0},
|
|
||||||
{1, 0}, {2, 0}, {3, 1}, {4, 0.1}, {5, 0},
|
|
||||||
{1, 0}, {2, 0}, {3, 1}, {4, 0.2}, {5, 0},
|
|
||||||
{1, 0}, {2, 0}, {3, 1}, {4, 0.1}, {5, 1},
|
|
||||||
{1, 1}, {2, 1}, {3, 0}, {4, 0.3}, {5, 0},
|
|
||||||
{1, 1}, {2, 0}, {3, 0}, {4, 0.4}, {5, 1},
|
|
||||||
{1, 0}, {2, 1}, {3, 1}, {4, 0.5}, {5, 0}
|
|
||||||
};
|
|
||||||
dmlc::DataIter<xgboost::SparsePage>* iter = dmat->RowIterator();
|
|
||||||
iter->BeforeFirst();
|
|
||||||
CHECK(iter->Next());
|
|
||||||
const xgboost::SparsePage& batch = iter->Value();
|
|
||||||
CHECK_EQ(batch.base_rowid, 0);
|
|
||||||
CHECK(batch.offset == expected_offset);
|
|
||||||
CHECK(batch.data == expected_data);
|
|
||||||
CHECK(!iter->Next());
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -18,11 +18,8 @@ TEST(gpu_hist_experimental, TestSparseShard) {
|
|||||||
int columns = 80;
|
int columns = 80;
|
||||||
int max_bins = 4;
|
int max_bins = 4;
|
||||||
auto dmat = CreateDMatrix(rows, columns, 0.9f);
|
auto dmat = CreateDMatrix(rows, columns, 0.9f);
|
||||||
common::HistCutMatrix hmat;
|
|
||||||
common::GHistIndexMatrix gmat;
|
common::GHistIndexMatrix gmat;
|
||||||
hmat.Init(dmat.get(), max_bins);
|
gmat.Init(dmat.get(),max_bins);
|
||||||
gmat.cut = &hmat;
|
|
||||||
gmat.Init(dmat.get());
|
|
||||||
TrainParam p;
|
TrainParam p;
|
||||||
p.max_depth = 6;
|
p.max_depth = 6;
|
||||||
|
|
||||||
@ -32,7 +29,7 @@ TEST(gpu_hist_experimental, TestSparseShard) {
|
|||||||
const SparsePage& batch = iter->Value();
|
const SparsePage& batch = iter->Value();
|
||||||
DeviceShard shard(0, 0, 0, rows, p);
|
DeviceShard shard(0, 0, 0, rows, p);
|
||||||
shard.InitRowPtrs(batch);
|
shard.InitRowPtrs(batch);
|
||||||
shard.InitCompressedData(hmat, batch);
|
shard.InitCompressedData(gmat.cut, batch);
|
||||||
CHECK(!iter->Next());
|
CHECK(!iter->Next());
|
||||||
|
|
||||||
ASSERT_LT(shard.row_stride, columns);
|
ASSERT_LT(shard.row_stride, columns);
|
||||||
@ -40,7 +37,7 @@ TEST(gpu_hist_experimental, TestSparseShard) {
|
|||||||
auto host_gidx_buffer = shard.gidx_buffer.AsVector();
|
auto host_gidx_buffer = shard.gidx_buffer.AsVector();
|
||||||
|
|
||||||
common::CompressedIterator<uint32_t> gidx(host_gidx_buffer.data(),
|
common::CompressedIterator<uint32_t> gidx(host_gidx_buffer.data(),
|
||||||
hmat.row_ptr.back() + 1);
|
gmat.cut.row_ptr.back() + 1);
|
||||||
|
|
||||||
for (int i = 0; i < rows; i++) {
|
for (int i = 0; i < rows; i++) {
|
||||||
int row_offset = 0;
|
int row_offset = 0;
|
||||||
@ -60,11 +57,8 @@ TEST(gpu_hist_experimental, TestDenseShard) {
|
|||||||
int columns = 80;
|
int columns = 80;
|
||||||
int max_bins = 4;
|
int max_bins = 4;
|
||||||
auto dmat = CreateDMatrix(rows, columns, 0);
|
auto dmat = CreateDMatrix(rows, columns, 0);
|
||||||
common::HistCutMatrix hmat;
|
|
||||||
common::GHistIndexMatrix gmat;
|
common::GHistIndexMatrix gmat;
|
||||||
hmat.Init(dmat.get(), max_bins);
|
gmat.Init(dmat.get(),max_bins);
|
||||||
gmat.cut = &hmat;
|
|
||||||
gmat.Init(dmat.get());
|
|
||||||
TrainParam p;
|
TrainParam p;
|
||||||
p.max_depth = 6;
|
p.max_depth = 6;
|
||||||
|
|
||||||
@ -75,7 +69,7 @@ TEST(gpu_hist_experimental, TestDenseShard) {
|
|||||||
|
|
||||||
DeviceShard shard(0, 0, 0, rows, p);
|
DeviceShard shard(0, 0, 0, rows, p);
|
||||||
shard.InitRowPtrs(batch);
|
shard.InitRowPtrs(batch);
|
||||||
shard.InitCompressedData(hmat, batch);
|
shard.InitCompressedData(gmat.cut, batch);
|
||||||
CHECK(!iter->Next());
|
CHECK(!iter->Next());
|
||||||
|
|
||||||
ASSERT_EQ(shard.row_stride, columns);
|
ASSERT_EQ(shard.row_stride, columns);
|
||||||
@ -83,7 +77,7 @@ TEST(gpu_hist_experimental, TestDenseShard) {
|
|||||||
auto host_gidx_buffer = shard.gidx_buffer.AsVector();
|
auto host_gidx_buffer = shard.gidx_buffer.AsVector();
|
||||||
|
|
||||||
common::CompressedIterator<uint32_t> gidx(host_gidx_buffer.data(),
|
common::CompressedIterator<uint32_t> gidx(host_gidx_buffer.data(),
|
||||||
hmat.row_ptr.back() + 1);
|
gmat.cut.row_ptr.back() + 1);
|
||||||
|
|
||||||
for (int i = 0; i < gmat.index.size(); i++) {
|
for (int i = 0; i < gmat.index.size(); i++) {
|
||||||
ASSERT_EQ(gidx[i], gmat.index[i]);
|
ASSERT_EQ(gidx[i], gmat.index[i]);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user