Fix categorical data with external memory. (#10433)
parent a8ddbac163
commit b4cc350ec5
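For context, a minimal sketch of the scenario this commit fixes: building a DMatrix from a DataIter (XGBoost's external-memory path) over data that includes categorical features. The OneBatch class and the synthetic frame below are illustrative stand-ins, not code from this patch; only the DataIter / enable_categorical usage mirrors the demo and test touched here.

import os

import numpy as np
import pandas as pd
import xgboost

class OneBatch(xgboost.DataIter):
    """Hypothetical single-batch iterator; the real demo streams files."""

    def __init__(self, X: pd.DataFrame, y: np.ndarray) -> None:
        self._X, self._y = X, y
        self._it = 0
        # XGBoost writes its external-memory cache under this prefix.
        super().__init__(cache_prefix=os.path.join(".", "cache"))

    def next(self, input_data) -> int:
        if self._it == 1:
            return 0  # 0 tells XGBoost the iterator is exhausted
        input_data(data=self._X, label=self._y)
        self._it += 1
        return 1

    def reset(self) -> None:
        self._it = 0

rng = np.random.default_rng(2024)
X = pd.DataFrame(
    {
        "cat": pd.Categorical(rng.integers(0, 6, size=128)),
        "num": rng.random(128),
    }
)
y = rng.random(128)

# Before this fix, categorical metadata could be lost when the quantile
# cuts were written to the external-memory cache, so training through the
# iterator could disagree with the equivalent in-memory DMatrix.
Xy = xgboost.DMatrix(OneBatch(X, y), enable_categorical=True)
booster = xgboost.train({"tree_method": "hist"}, Xy, num_boost_round=3)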
@@ -43,7 +43,7 @@ def make_batches(
 class Iterator(xgboost.DataIter):
     """A custom iterator for loading files in batches."""

-    def __init__(self, file_paths: List[Tuple[str, str]]):
+    def __init__(self, file_paths: List[Tuple[str, str]]) -> None:
         self._file_paths = file_paths
         self._it = 0
         # XGBoost will generate some cache files under current directory with the prefix
@@ -1,5 +1,5 @@
 /**
- * Copyright 2017-2024 by XGBoost Contributors
+ * Copyright 2017-2024, XGBoost Contributors
  * \file hist_util.h
  * \brief Utility for fast histogram aggregation
  * \author Philip Cho, Tianqi Chen
@@ -11,7 +11,6 @@
 #include <cstdint> // for uint32_t
 #include <limits>
 #include <map>
-#include <memory>
 #include <utility>
 #include <vector>

@@ -4,7 +4,6 @@
  */
 #include "gradient_index.h"

-#include <algorithm>
 #include <limits>
 #include <memory>
 #include <utility> // for forward
@@ -126,8 +125,8 @@ INSTANTIATION_PUSH(data::ColumnarAdapterBatch)
 void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
   auto make_index = [this, n_index](auto t, common::BinTypeSize t_size) {
     // Must resize instead of allocating a new one. This function is called everytime a
-    // new batch is pushed, and we grow the size accordingly without loosing the data the
-    // previous batches.
+    // new batch is pushed, and we grow the size accordingly without loosing the data in
+    // the previous batches.
     using T = decltype(t);
     std::size_t n_bytes = sizeof(T) * n_index;
     CHECK_GE(n_bytes, this->data.size());
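The comment reworded above states the batching contract for the gradient index: when a new batch is pushed, the buffer must be resized in place and grown, never replaced, so data from earlier batches survives. A toy Python illustration of that contract (nothing here reflects the C++ internals):

# Grow a buffer across batches instead of allocating a fresh one.
buf = bytearray()
for batch in (b"aa", b"bb", b"cc"):
    start = len(buf)
    buf.extend(b"\x00" * len(batch))  # resize; earlier bytes are kept
    buf[start:] = batch
assert bytes(buf) == b"aabbcc"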
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021-2023, XGBoost contributors
+ * Copyright 2021-2024, XGBoost contributors
  */
 #ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
 #define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
@@ -23,6 +23,15 @@ inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResour
   if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) {
     return false;
   }
+  bool has_cat{false};
+  if (!fi->Read(&has_cat)) {
+    return false;
+  }
+  decltype(cuts->MaxCategory()) max_cat{0};
+  if (!fi->Read(&max_cat)) {
+    return false;
+  }
+  cuts->SetCategorical(has_cat, max_cat);
   return true;
 }

@@ -32,6 +41,8 @@ inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts,
   bytes += common::WriteVec(fo, cuts.Values());
   bytes += common::WriteVec(fo, cuts.Ptrs());
   bytes += common::WriteVec(fo, cuts.MinValues());
+  bytes += fo->Write(cuts.HasCategorical());
+  bytes += fo->Write(cuts.MaxCategory());
   return bytes;
 }
 } // namespace xgboost::data
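The two hunks above are the substance of the fix: the histogram cuts that back the external-memory cache now persist the categorical flag and the largest category, and ReadHistogramCuts restores them via cuts->SetCategorical(has_cat, max_cat). A rough Python mirror of the new record order, for orientation only; the field encodings below are assumptions, not XGBoost's actual on-disk format:

import struct
from typing import Sequence

def write_cuts_sketch(
    values: Sequence[float],
    ptrs: Sequence[int],
    min_values: Sequence[float],
    has_cat: bool,
    max_cat: float,
) -> bytes:
    """Fields in the same order as the patched WriteHistogramCuts."""

    def vec(fmt: str, xs: Sequence) -> bytes:
        # Length-prefixed vector, loosely like common::WriteVec.
        return struct.pack("<Q", len(xs)) + struct.pack(f"<{len(xs)}{fmt}", *xs)

    out = vec("f", values)       # cut values
    out += vec("Q", ptrs)        # per-feature offsets into the values
    out += vec("f", min_values)  # per-feature minimums
    out += struct.pack("<?", has_cat)  # new: whether any feature is categorical
    out += struct.pack("<f", max_cat)  # new: largest category seen (type assumed)
    return out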
@@ -52,6 +52,21 @@ def test_single_batch(tree_method: str = "approx") -> None:
     assert from_np.get_dump() == from_it.get_dump()


+def test_with_cat_single() -> None:
+    X, y = tm.make_categorical(
+        n_samples=128, n_features=3, n_categories=6, onehot=False
+    )
+    Xy = xgb.DMatrix(SingleBatch(data=X, label=y), enable_categorical=True)
+    from_it = xgb.train({}, Xy, num_boost_round=3)
+
+    Xy = xgb.DMatrix(X, y, enable_categorical=True)
+    from_Xy = xgb.train({}, Xy, num_boost_round=3)
+
+    jit = from_it.save_raw(raw_format="json")
+    jxy = from_Xy.save_raw(raw_format="json")
+    assert jit == jxy
+
+
 def run_data_iterator(
     n_samples_per_batch: int,
     n_features: int,