Handle missing values in dataframe with category dtype. (#7331)

* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.
This commit is contained in:
Jiaming Yuan
2021-10-28 03:33:54 +08:00
committed by GitHub
parent 2eee87423c
commit ac9bfaa4f2
13 changed files with 301 additions and 103 deletions

View File

@@ -21,6 +21,7 @@
#include "array_interface.h"
#include "../c_api/c_api_error.h"
#include "../common/math.h"
namespace xgboost {
namespace data {
@@ -80,6 +81,24 @@ struct COOTuple {
float value{0};
};
struct IsValidFunctor {
float missing;
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
XGBOOST_DEVICE bool operator()(float value) const {
return !(common::CheckNAN(value) || value == missing);
}
XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
return !(common::CheckNAN(e.value) || e.value == missing);
}
XGBOOST_DEVICE bool operator()(const Entry& e) const {
return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
}
};
namespace detail {
/**