Remove column major specialization. (#5755)
Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -118,14 +118,18 @@ struct EllpackLoader {
|
||||
}
|
||||
};
|
||||
|
||||
struct CuPyAdapterLoader {
|
||||
data::CupyAdapterBatch batch;
|
||||
template <typename Batch>
|
||||
struct DeviceAdapterLoader {
|
||||
Batch batch;
|
||||
bst_feature_t columns;
|
||||
float* smem;
|
||||
bool use_shared;
|
||||
|
||||
DEV_INLINE CuPyAdapterLoader(data::CupyAdapterBatch const batch, bool use_shared,
|
||||
bst_feature_t num_features, bst_row_t num_rows, size_t entry_start) :
|
||||
using BatchT = Batch;
|
||||
|
||||
DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
|
||||
bst_feature_t num_features, bst_row_t num_rows,
|
||||
size_t entry_start) :
|
||||
batch{batch},
|
||||
columns{num_features},
|
||||
use_shared{use_shared} {
|
||||
@@ -155,39 +159,6 @@ struct CuPyAdapterLoader {
|
||||
}
|
||||
};
|
||||
|
||||
struct CuDFAdapterLoader {
|
||||
data::CudfAdapterBatch batch;
|
||||
bst_feature_t columns;
|
||||
float* smem;
|
||||
bool use_shared;
|
||||
|
||||
DEV_INLINE CuDFAdapterLoader(data::CudfAdapterBatch const batch, bool use_shared,
|
||||
bst_feature_t num_features,
|
||||
bst_row_t num_rows, size_t entry_start)
|
||||
: batch{batch}, columns{num_features}, use_shared{use_shared} {
|
||||
extern __shared__ float _smem[];
|
||||
smem = _smem;
|
||||
if (use_shared) {
|
||||
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
size_t shared_elements = blockDim.x * num_features;
|
||||
dh::BlockFill(smem, shared_elements, nanf(""));
|
||||
__syncthreads();
|
||||
if (global_idx < num_rows) {
|
||||
for (size_t i = 0; i < columns; ++i) {
|
||||
smem[threadIdx.x * columns + i] = batch.GetValue(global_idx, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
DEV_INLINE float GetFvalue(bst_row_t ridx, bst_feature_t fidx) const {
|
||||
if (use_shared) {
|
||||
return smem[threadIdx.x * columns + fidx];
|
||||
}
|
||||
return batch.GetValue(ridx, fidx);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Loader>
|
||||
__device__ float GetLeafWeight(bst_uint ridx, const RegTree::Node* tree,
|
||||
Loader* loader) {
|
||||
@@ -429,7 +400,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
out_preds->Size() == dmat->Info().num_row_);
|
||||
}
|
||||
|
||||
template <typename Adapter, typename Loader, typename Batch>
|
||||
template <typename Adapter, typename Loader>
|
||||
void DispatchedInplacePredict(dmlc::any const &x,
|
||||
const gbm::GBTreeModel &model, float missing,
|
||||
PredictionCacheEntry *out_preds,
|
||||
@@ -439,22 +410,22 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
DeviceModel d_model;
|
||||
d_model.Init(model, tree_begin, tree_end, this->generic_param_->gpu_id);
|
||||
|
||||
auto m = dmlc::get<Adapter>(x);
|
||||
CHECK_EQ(m.NumColumns(), model.learner_model_param->num_feature)
|
||||
auto m = dmlc::get<std::shared_ptr<Adapter>>(x);
|
||||
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
|
||||
<< "Number of columns in data must equal to trained model.";
|
||||
CHECK_EQ(this->generic_param_->gpu_id, m.DeviceIdx())
|
||||
CHECK_EQ(this->generic_param_->gpu_id, m->DeviceIdx())
|
||||
<< "XGBoost is running on device: " << this->generic_param_->gpu_id << ", "
|
||||
<< "but data is on: " << m.DeviceIdx();
|
||||
<< "but data is on: " << m->DeviceIdx();
|
||||
MetaInfo info;
|
||||
info.num_col_ = m.NumColumns();
|
||||
info.num_row_ = m.NumRows();
|
||||
info.num_col_ = m->NumColumns();
|
||||
info.num_row_ = m->NumRows();
|
||||
this->InitOutPredictions(info, &(out_preds->predictions), model);
|
||||
|
||||
const uint32_t BLOCK_THREADS = 128;
|
||||
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(info.num_row_, BLOCK_THREADS));
|
||||
|
||||
auto shared_memory_bytes =
|
||||
static_cast<size_t>(sizeof(float) * m.NumColumns() * BLOCK_THREADS);
|
||||
static_cast<size_t>(sizeof(float) * m->NumColumns() * BLOCK_THREADS);
|
||||
bool use_shared = true;
|
||||
if (shared_memory_bytes > max_shared_memory_bytes) {
|
||||
shared_memory_bytes = 0;
|
||||
@@ -463,22 +434,24 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
size_t entry_start = 0;
|
||||
|
||||
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} (
|
||||
PredictKernel<Loader, Batch>,
|
||||
m.Value(),
|
||||
PredictKernel<Loader, typename Loader::BatchT>,
|
||||
m->Value(),
|
||||
dh::ToSpan(d_model.nodes), out_preds->predictions.DeviceSpan(),
|
||||
dh::ToSpan(d_model.tree_segments), dh::ToSpan(d_model.tree_group),
|
||||
tree_begin, tree_end, m.NumColumns(), info.num_row_,
|
||||
tree_begin, tree_end, m->NumColumns(), info.num_row_,
|
||||
entry_start, use_shared, output_groups);
|
||||
}
|
||||
|
||||
void InplacePredict(dmlc::any const &x, const gbm::GBTreeModel &model,
|
||||
float missing, PredictionCacheEntry *out_preds,
|
||||
uint32_t tree_begin, unsigned tree_end) const override {
|
||||
if (x.type() == typeid(data::CupyAdapter)) {
|
||||
this->DispatchedInplacePredict<data::CupyAdapter, CuPyAdapterLoader, data::CupyAdapterBatch>(
|
||||
if (x.type() == typeid(std::shared_ptr<data::CupyAdapter>)) {
|
||||
this->DispatchedInplacePredict<
|
||||
data::CupyAdapter, DeviceAdapterLoader<data::CupyAdapterBatch>>(
|
||||
x, model, missing, out_preds, tree_begin, tree_end);
|
||||
} else if (x.type() == typeid(data::CudfAdapter)) {
|
||||
this->DispatchedInplacePredict<data::CudfAdapter, CuDFAdapterLoader, data::CudfAdapterBatch>(
|
||||
} else if (x.type() == typeid(std::shared_ptr<data::CudfAdapter>)) {
|
||||
this->DispatchedInplacePredict<
|
||||
data::CudfAdapter, DeviceAdapterLoader<data::CudfAdapterBatch>>(
|
||||
x, model, missing, out_preds, tree_begin, tree_end);
|
||||
} else {
|
||||
LOG(FATAL) << "Only CuPy and CuDF are supported by GPU Predictor.";
|
||||
|
||||
Reference in New Issue
Block a user