Handle missing values in dataframe with category dtype. (#7331)
* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.
This commit is contained in:
@@ -21,6 +21,7 @@
|
||||
|
||||
#include "array_interface.h"
|
||||
#include "../c_api/c_api_error.h"
|
||||
#include "../common/math.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
@@ -80,6 +81,24 @@ struct COOTuple {
|
||||
float value{0};
|
||||
};
|
||||
|
||||
// Predicate that reports whether a feature value should be treated as present.
// Every overload returns false when common::CheckNAN() reports true for the
// value or when the value compares equal to `missing`; otherwise it returns true.
// The constructor and all call operators are annotated with XGBOOST_DEVICE.
struct IsValidFunctor {
|
||||
// Sentinel value; anything comparing equal (operator==) to it is rejected.
float missing;
|
||||
|
||||
// explicit: a bare float must not silently convert into a functor.
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
|
||||
|
||||
// Overload for a raw float value.
XGBOOST_DEVICE bool operator()(float value) const {
|
||||
return !(common::CheckNAN(value) || value == missing);
|
||||
}
|
||||
|
||||
// Overload for a COO element; only its `value` member is inspected.
XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
|
||||
return !(common::CheckNAN(e.value) || e.value == missing);
|
||||
}
|
||||
|
||||
// Overload for an Entry; only its `fvalue` member is inspected.
XGBOOST_DEVICE bool operator()(const Entry& e) const {
|
||||
return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
/**
|
||||
|
||||
@@ -999,18 +999,19 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
|
||||
|
||||
// Second pass over batch, placing elements in correct position
|
||||
|
||||
auto is_valid = data::IsValidFunctor{missing};
|
||||
#pragma omp parallel num_threads(nthread)
|
||||
{
|
||||
exec.Run([&]() {
|
||||
int tid = omp_get_thread_num();
|
||||
size_t begin = tid*thread_size;
|
||||
size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
|
||||
size_t begin = tid * thread_size;
|
||||
size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
|
||||
for (size_t i = begin; i < end; ++i) {
|
||||
auto line = batch.GetLine(i);
|
||||
for (auto j = 0ull; j < line.Size(); j++) {
|
||||
auto element = line.GetElement(j);
|
||||
const size_t key = (element.row_idx - base_rowid);
|
||||
if (!common::CheckNAN(element.value) && element.value != missing) {
|
||||
if (is_valid(element)) {
|
||||
builder.Push(key, Entry(element.column_idx, element.value), tid);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,29 +15,6 @@
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
// Predicate that reports whether a feature value should be treated as present.
// Derives from thrust::unary_function<Entry, bool>. The constructor is
// annotated XGBOOST_DEVICE, while the call operators are marked __device__ only.
// Every overload returns false when common::CheckNAN() reports true for the
// value or when the value compares equal to `missing`; otherwise it returns true.
struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
|
||||
// Sentinel value; anything comparing equal (operator==) to it is rejected.
float missing;
|
||||
|
||||
// explicit: a bare float must not silently convert into a functor.
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
|
||||
|
||||
// Overload for a raw float value.
__device__ bool operator()(float value) const {
|
||||
return !(common::CheckNAN(value) || value == missing);
|
||||
}
|
||||
|
||||
// Overload for a COO element; only its `value` member is inspected.
__device__ bool operator()(const data::COOTuple& e) const {
|
||||
if (common::CheckNAN(e.value) || e.value == missing) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
// Overload for an Entry; only its `fvalue` member is inspected.
__device__ bool operator()(const Entry& e) const {
|
||||
if (common::CheckNAN(e.fvalue) || e.fvalue == missing) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
class CudfAdapterBatch : public detail::NoMetaInfo {
|
||||
friend class CudfAdapter;
|
||||
|
||||
|
||||
@@ -152,6 +152,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
|
||||
|
||||
if (batches == 1) {
|
||||
this->info_ = std::move(proxy->Info());
|
||||
this->info_.num_nonzero_ = nnz;
|
||||
CHECK_EQ(proxy->Info().labels_.Size(), 0);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user