Additional improvements for gblinear (#3134)

* fix rebase conflict

* [core] additional gblinear improvements

* [R] callback for gblinear coefficients history

* force eta=1 for gblinear python tests

* add top_k to GreedyFeatureSelector

* set eta=1 in shotgun test

* [core] fix SparsePage processing in gblinear; col-wise multithreading in greedy updater

* set sorted flag within TryInitColData

* gblinear tests: use scale, add external memory test

* fix multiclass for greedy updater

* fix whitespace

* fix typo
This commit is contained in:
Vadim Khotilovich
2018-03-13 01:27:13 -05:00
committed by GitHub
parent a1b48afa41
commit 706be4e5d4
18 changed files with 750 additions and 260 deletions

View File

@@ -119,7 +119,7 @@ ColIterator(const std::vector<bst_uint>& fset) {
}
bool SparsePageDMatrix::TryInitColData() {
bool SparsePageDMatrix::TryInitColData(bool sorted) {
// load meta data.
std::vector<std::string> cache_shards = common::Split(cache_info_, ':');
{
@@ -140,6 +140,8 @@ bool SparsePageDMatrix::TryInitColData() {
files.push_back(std::move(fdata));
}
col_iter_.reset(new ColPageIter(std::move(files)));
// warning: no attempt to check here whether the cached data was sorted
col_iter_->sorted = sorted;
return true;
}
@@ -147,7 +149,7 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
float pkeep,
size_t max_row_perbatch, bool sorted) {
if (HaveColAccess(sorted)) return;
if (TryInitColData()) return;
if (TryInitColData(sorted)) return;
const MetaInfo& info = this->info();
if (max_row_perbatch == std::numeric_limits<size_t>::max()) {
max_row_perbatch = kMaxRowPerBatch;
@@ -291,8 +293,7 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
fo.reset(nullptr);
}
// initialize column data
CHECK(TryInitColData());
col_iter_->sorted = sorted;
CHECK(TryInitColData(sorted));
}
} // namespace data

View File

@@ -116,7 +116,7 @@ class SparsePageDMatrix : public DMatrix {
* \brief Try to initialize column data.
* \return true if the data already exists, false if it does not.
*/
bool TryInitColData();
bool TryInitColData(bool sorted);
// source data pointer.
std::unique_ptr<DataSource> source_;
// the cache prefix