Handle missing values in dataframe with category dtype. (#7331)
* Replace -1 in pandas initializer. * Unify `IsValid` functor. * Mimic pandas data handling in cuDF glue code. * Check invalid categories. * Fix DDM sketching.
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2020 by XGBoost Contributors
|
||||
* Copyright 2020-2021 by XGBoost Contributors
|
||||
* \file categorical.h
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_CATEGORICAL_H_
|
||||
@@ -42,6 +42,11 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, bst_cat_t
|
||||
return !s_cats.Check(cat);
|
||||
}
|
||||
|
||||
/*!
 * \brief Validate a categorical value.
 *
 * Asserts through CHECK_GE that the category is greater than or equal to zero;
 * the attached message explains that categorical values should be non-negative.
 *
 * \param cat The categorical value to validate.
 */
inline void CheckCat(bst_cat_t cat) {
  // Negative categories are rejected; zero is the smallest accepted value.
  CHECK_GE(cat, 0)
      << "Invalid categorical value detected. Categorical value "
         "should be non-negative.";
}
|
||||
|
||||
struct IsCatOp {
|
||||
XGBOOST_DEVICE bool operator()(FeatureType ft) {
|
||||
return ft == FeatureType::kCategorical;
|
||||
|
||||
@@ -133,6 +133,7 @@ void RemoveDuplicatedCategories(
|
||||
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry> *p_sorted_entries,
|
||||
dh::caching_device_vector<size_t> *p_column_sizes_scan) {
|
||||
info.feature_types.SetDevice(device);
|
||||
auto d_feature_types = info.feature_types.ConstDeviceSpan();
|
||||
CHECK(!d_feature_types.empty());
|
||||
auto &column_sizes_scan = *p_column_sizes_scan;
|
||||
|
||||
@@ -124,6 +124,11 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,
|
||||
|
||||
void SortByWeight(dh::device_vector<float>* weights,
|
||||
dh::device_vector<Entry>* sorted_entries);
|
||||
|
||||
void RemoveDuplicatedCategories(
|
||||
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry> *p_sorted_entries,
|
||||
dh::caching_device_vector<size_t> *p_column_sizes_scan);
|
||||
} // namespace detail
|
||||
|
||||
// Compute sketch on DMatrix.
|
||||
@@ -132,9 +137,10 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
|
||||
size_t sketch_batch_num_elements = 0);
|
||||
|
||||
template <typename AdapterBatch>
|
||||
void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
|
||||
size_t begin, size_t end, float missing,
|
||||
SketchContainer* sketch_container, int num_cuts) {
|
||||
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
|
||||
int device, size_t columns, size_t begin, size_t end,
|
||||
float missing, SketchContainer *sketch_container,
|
||||
int num_cuts) {
|
||||
// Copy current subset of valid elements into temporary storage and sort
|
||||
dh::device_vector<Entry> sorted_entries;
|
||||
dh::caching_device_vector<size_t> column_sizes_scan;
|
||||
@@ -142,6 +148,7 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
|
||||
thrust::make_counting_iterator(0llu),
|
||||
[=] __device__(size_t idx) { return batch.GetElement(idx); });
|
||||
HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
|
||||
cuts_ptr.SetDevice(device);
|
||||
detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing,
|
||||
columns, num_cuts, device,
|
||||
&cuts_ptr,
|
||||
@@ -151,8 +158,14 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
|
||||
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
|
||||
sorted_entries.end(), detail::EntryCompareOp());
|
||||
|
||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||
if (sketch_container->HasCategorical()) {
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||
&sorted_entries, &column_sizes_scan);
|
||||
}
|
||||
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
auto const &h_cuts_ptr = cuts_ptr.HostVector();
|
||||
// Extract the cuts from all columns concurrently
|
||||
sketch_container->Push(dh::ToSpan(sorted_entries),
|
||||
dh::ToSpan(column_sizes_scan), d_cuts_ptr,
|
||||
@@ -222,6 +235,12 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
|
||||
|
||||
detail::SortByWeight(&temp_weights, &sorted_entries);
|
||||
|
||||
if (sketch_container->HasCategorical()) {
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||
&sorted_entries, &column_sizes_scan);
|
||||
}
|
||||
|
||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
|
||||
@@ -274,8 +293,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
|
||||
device, num_cuts_per_feature, false);
|
||||
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
|
||||
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
|
||||
ProcessSlidingWindow(batch, device, num_cols,
|
||||
begin, end, missing, sketch_container, num_cuts_per_feature);
|
||||
ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
|
||||
sketch_container, num_cuts_per_feature);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
|
||||
#include "array_interface.h"
|
||||
#include "../c_api/c_api_error.h"
|
||||
#include "../common/math.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
@@ -80,6 +81,24 @@ struct COOTuple {
|
||||
float value{0};
|
||||
};
|
||||
|
||||
struct IsValidFunctor {
|
||||
float missing;
|
||||
|
||||
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
|
||||
|
||||
XGBOOST_DEVICE bool operator()(float value) const {
|
||||
return !(common::CheckNAN(value) || value == missing);
|
||||
}
|
||||
|
||||
XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
|
||||
return !(common::CheckNAN(e.value) || e.value == missing);
|
||||
}
|
||||
|
||||
XGBOOST_DEVICE bool operator()(const Entry& e) const {
|
||||
return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
|
||||
}
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
/**
|
||||
|
||||
@@ -999,18 +999,19 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
|
||||
|
||||
// Second pass over batch, placing elements in correct position
|
||||
|
||||
auto is_valid = data::IsValidFunctor{missing};
|
||||
#pragma omp parallel num_threads(nthread)
|
||||
{
|
||||
exec.Run([&]() {
|
||||
int tid = omp_get_thread_num();
|
||||
size_t begin = tid*thread_size;
|
||||
size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
|
||||
size_t begin = tid * thread_size;
|
||||
size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
|
||||
for (size_t i = begin; i < end; ++i) {
|
||||
auto line = batch.GetLine(i);
|
||||
for (auto j = 0ull; j < line.Size(); j++) {
|
||||
auto element = line.GetElement(j);
|
||||
const size_t key = (element.row_idx - base_rowid);
|
||||
if (!common::CheckNAN(element.value) && element.value != missing) {
|
||||
if (is_valid(element)) {
|
||||
builder.Push(key, Entry(element.column_idx, element.value), tid);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,29 +15,6 @@
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
/*!
 * \brief Device-side predicate deciding whether a feature value should be kept.
 *
 * A value is accepted when it is not NaN (as reported by common::CheckNAN) and
 * does not compare equal to `missing`.
 */
struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
  /*! \brief Value that marks an element as missing. */
  float missing;

  XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}

  /*! \brief Check a raw floating point value. */
  __device__ bool operator()(float value) const {
    return !common::CheckNAN(value) && value != missing;
  }

  /*! \brief Check the `value` field of a COO tuple. */
  __device__ bool operator()(const data::COOTuple& e) const {
    return !(common::CheckNAN(e.value) || e.value == missing);
  }

  /*! \brief Check the `fvalue` field of an Entry. */
  __device__ bool operator()(const Entry& e) const {
    return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
  }
};
|
||||
|
||||
class CudfAdapterBatch : public detail::NoMetaInfo {
|
||||
friend class CudfAdapter;
|
||||
|
||||
|
||||
@@ -152,6 +152,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
|
||||
|
||||
if (batches == 1) {
|
||||
this->info_ = std::move(proxy->Info());
|
||||
this->info_.num_nonzero_ = nnz;
|
||||
CHECK_EQ(proxy->Info().labels_.Size(), 0);
|
||||
}
|
||||
|
||||
|
||||
@@ -580,6 +580,7 @@ struct GPUHistMakerDevice {
|
||||
CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
|
||||
<< "Categorical feature value too large.";
|
||||
auto cat = common::AsCat(candidate.split.fvalue);
|
||||
common::CheckCat(cat);
|
||||
std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
|
||||
LBitField32 cats_bits(split_cats);
|
||||
cats_bits.Set(cat);
|
||||
|
||||
Reference in New Issue
Block a user