Handle missing values in dataframe with category dtype. (#7331)

* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.
This commit is contained in:
Jiaming Yuan
2021-10-28 03:33:54 +08:00
committed by GitHub
parent 2eee87423c
commit ac9bfaa4f2
13 changed files with 301 additions and 103 deletions

View File

@@ -133,6 +133,7 @@ void RemoveDuplicatedCategories(
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry> *p_sorted_entries,
dh::caching_device_vector<size_t> *p_column_sizes_scan) {
info.feature_types.SetDevice(device);
auto d_feature_types = info.feature_types.ConstDeviceSpan();
CHECK(!d_feature_types.empty());
auto &column_sizes_scan = *p_column_sizes_scan;