Handle missing values in dataframe with category dtype. (#7331)

* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.
This commit is contained in:
Jiaming Yuan
2021-10-28 03:33:54 +08:00
committed by GitHub
parent 2eee87423c
commit ac9bfaa4f2
13 changed files with 301 additions and 103 deletions

View File

@@ -21,6 +21,7 @@
#include "array_interface.h"
#include "../c_api/c_api_error.h"
#include "../common/math.h"
namespace xgboost {
namespace data {
@@ -80,6 +81,24 @@ struct COOTuple {
float value{0};
};
struct IsValidFunctor {
float missing;
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
XGBOOST_DEVICE bool operator()(float value) const {
return !(common::CheckNAN(value) || value == missing);
}
XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
return !(common::CheckNAN(e.value) || e.value == missing);
}
XGBOOST_DEVICE bool operator()(const Entry& e) const {
return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
}
};
namespace detail {
/**

View File

@@ -999,18 +999,19 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
// Second pass over batch, placing elements in correct position
auto is_valid = data::IsValidFunctor{missing};
#pragma omp parallel num_threads(nthread)
{
exec.Run([&]() {
int tid = omp_get_thread_num();
size_t begin = tid*thread_size;
size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
size_t begin = tid * thread_size;
size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
for (size_t i = begin; i < end; ++i) {
auto line = batch.GetLine(i);
for (auto j = 0ull; j < line.Size(); j++) {
auto element = line.GetElement(j);
const size_t key = (element.row_idx - base_rowid);
if (!common::CheckNAN(element.value) && element.value != missing) {
if (is_valid(element)) {
builder.Push(key, Entry(element.column_idx, element.value), tid);
}
}

View File

@@ -15,29 +15,6 @@
namespace xgboost {
namespace data {
struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
float missing;
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
__device__ bool operator()(float value) const {
return !(common::CheckNAN(value) || value == missing);
}
__device__ bool operator()(const data::COOTuple& e) const {
if (common::CheckNAN(e.value) || e.value == missing) {
return false;
}
return true;
}
__device__ bool operator()(const Entry& e) const {
if (common::CheckNAN(e.fvalue) || e.fvalue == missing) {
return false;
}
return true;
}
};
class CudfAdapterBatch : public detail::NoMetaInfo {
friend class CudfAdapter;

View File

@@ -152,6 +152,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
if (batches == 1) {
this->info_ = std::move(proxy->Info());
this->info_.num_nonzero_ = nnz;
CHECK_EQ(proxy->Info().labels_.Size(), 0);
}