Remove column major specialization. (#5755)

commit cacff9232a (parent bd9d57f579)
Author: Jiaming Yuan
Date: 2020-06-05 16:19:14 +08:00
Committed by: GitHub
Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
10 changed files with 70 additions and 204 deletions

src/predictor/gpu_predictor.cu

@@ -118,14 +118,18 @@ struct EllpackLoader {
   }
 };
 
-struct CuPyAdapterLoader {
-  data::CupyAdapterBatch batch;
+template <typename Batch>
+struct DeviceAdapterLoader {
+  Batch batch;
   bst_feature_t columns;
   float* smem;
   bool use_shared;
 
-  DEV_INLINE CuPyAdapterLoader(data::CupyAdapterBatch const batch, bool use_shared,
-                               bst_feature_t num_features, bst_row_t num_rows, size_t entry_start) :
+  using BatchT = Batch;
+
+  DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
+                                 bst_feature_t num_features, bst_row_t num_rows,
+                                 size_t entry_start) :
       batch{batch},
      columns{num_features},
      use_shared{use_shared} {
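
The `using BatchT = Batch;` alias added above is what lets callers drop the third template parameter later in this diff: the loader re-exports its batch type, so code can write `typename Loader::BatchT` instead of passing the batch type alongside the loader. A minimal, self-contained C++ sketch of that pattern (all names here are hypothetical, not XGBoost's):

#include <iostream>

// A loader template that re-exports its batch type as a member alias.
template <typename Batch>
struct Loader {
  using BatchT = Batch;  // callers recover the batch type from the loader
  Batch batch;
};

struct DenseBatch {
  int rows;
};

// Needs only the loader type; the batch type is derived from it, so this
// function template has one parameter instead of two.
template <typename L>
void Launch(typename L::BatchT batch) {
  std::cout << "rows: " << batch.rows << '\n';
}

int main() {
  Launch<Loader<DenseBatch>>(DenseBatch{8});  // prints "rows: 8"
  return 0;
}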
@@ -155,39 +159,6 @@ struct CuPyAdapterLoader {
   }
 };
 
-struct CuDFAdapterLoader {
-  data::CudfAdapterBatch batch;
-  bst_feature_t columns;
-  float* smem;
-  bool use_shared;
-
-  DEV_INLINE CuDFAdapterLoader(data::CudfAdapterBatch const batch, bool use_shared,
-                               bst_feature_t num_features,
-                               bst_row_t num_rows, size_t entry_start)
-      : batch{batch}, columns{num_features}, use_shared{use_shared} {
-    extern __shared__ float _smem[];
-    smem = _smem;
-    if (use_shared) {
-      uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
-      size_t shared_elements = blockDim.x * num_features;
-      dh::BlockFill(smem, shared_elements, nanf(""));
-      __syncthreads();
-      if (global_idx < num_rows) {
-        for (size_t i = 0; i < columns; ++i) {
-          smem[threadIdx.x * columns + i] = batch.GetValue(global_idx, i);
-        }
-      }
-    }
-    __syncthreads();
-  }
-
-  DEV_INLINE float GetFvalue(bst_row_t ridx, bst_feature_t fidx) const {
-    if (use_shared) {
-      return smem[threadIdx.x * columns + fidx];
-    }
-    return batch.GetValue(ridx, fidx);
-  }
-};
-
 template <typename Loader>
 __device__ float GetLeafWeight(bst_uint ridx, const RegTree::Node* tree,
                                Loader* loader) {
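
The deleted CuDFAdapterLoader duplicated, line for line, the row-caching scheme the new templated loader keeps in one place: each thread stages its row into a blockDim.x-by-columns shared-memory tile, then reads features from the tile instead of global memory. A standalone CUDA sketch of that scheme (hypothetical kernel name, dense row-major input assumed):

// Each thread caches one row into shared memory, then sums its features
// from the tile. Launch with dynamic shared memory sized to hold
// blockDim.x * columns floats:
//   CacheRows<<<grid, block, block * columns * sizeof(float)>>>(...);
__global__ void CacheRows(const float* data, float* out, int columns,
                          int num_rows) {
  extern __shared__ float smem[];
  int ridx = blockDim.x * blockIdx.x + threadIdx.x;
  if (ridx < num_rows) {
    for (int i = 0; i < columns; ++i) {
      smem[threadIdx.x * columns + i] = data[ridx * columns + i];
    }
  }
  __syncthreads();  // mirrors the barrier the loader places after the fill
  if (ridx < num_rows) {
    float sum = 0.0f;
    for (int i = 0; i < columns; ++i) {
      sum += smem[threadIdx.x * columns + i];  // every read hits shared memory
    }
    out[ridx] = sum;
  }
}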
@@ -429,7 +400,7 @@ class GPUPredictor : public xgboost::Predictor {
               out_preds->Size() == dmat->Info().num_row_);
   }
 
-  template <typename Adapter, typename Loader, typename Batch>
+  template <typename Adapter, typename Loader>
   void DispatchedInplacePredict(dmlc::any const &x,
                                 const gbm::GBTreeModel &model, float missing,
                                 PredictionCacheEntry *out_preds,
@@ -439,22 +410,22 @@ class GPUPredictor : public xgboost::Predictor {
     DeviceModel d_model;
     d_model.Init(model, tree_begin, tree_end, this->generic_param_->gpu_id);
 
-    auto m = dmlc::get<Adapter>(x);
-    CHECK_EQ(m.NumColumns(), model.learner_model_param->num_feature)
+    auto m = dmlc::get<std::shared_ptr<Adapter>>(x);
+    CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
         << "Number of columns in data must equal to trained model.";
-    CHECK_EQ(this->generic_param_->gpu_id, m.DeviceIdx())
+    CHECK_EQ(this->generic_param_->gpu_id, m->DeviceIdx())
         << "XGBoost is running on device: " << this->generic_param_->gpu_id << ", "
-        << "but data is on: " << m.DeviceIdx();
+        << "but data is on: " << m->DeviceIdx();
     MetaInfo info;
-    info.num_col_ = m.NumColumns();
-    info.num_row_ = m.NumRows();
+    info.num_col_ = m->NumColumns();
+    info.num_row_ = m->NumRows();
     this->InitOutPredictions(info, &(out_preds->predictions), model);
 
     const uint32_t BLOCK_THREADS = 128;
     auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(info.num_row_, BLOCK_THREADS));
 
     auto shared_memory_bytes =
-        static_cast<size_t>(sizeof(float) * m.NumColumns() * BLOCK_THREADS);
+        static_cast<size_t>(sizeof(float) * m->NumColumns() * BLOCK_THREADS);
     bool use_shared = true;
     if (shared_memory_bytes > max_shared_memory_bytes) {
       shared_memory_bytes = 0;
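
The sizing logic above is worth making concrete: at 128 threads per block, the tile costs sizeof(float) * columns * 128 bytes, so anything past 96 features already overflows a typical 48 KiB static shared-memory limit and triggers the global-memory fallback. A small host-side check illustrating the arithmetic (the 48 KiB cap is an assumption for the sketch; the predictor queries the real limit from the device):

#include <cstddef>
#include <cstdio>

int main() {
  const unsigned kBlockThreads = 128;        // mirrors BLOCK_THREADS above
  const size_t kMaxSharedBytes = 48 * 1024;  // assumed per-block limit
  for (size_t columns : {64, 96, 100}) {
    size_t bytes = sizeof(float) * columns * kBlockThreads;
    bool use_shared = bytes <= kMaxSharedBytes;
    std::printf("%3zu columns -> %6zu bytes, use_shared=%d\n",
                columns, bytes, use_shared);
  }
  return 0;
}
//  64 columns ->  32768 bytes, use_shared=1
//  96 columns ->  49152 bytes, use_shared=1
// 100 columns ->  51200 bytes, use_shared=0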
@@ -463,22 +434,24 @@ class GPUPredictor : public xgboost::Predictor {
     size_t entry_start = 0;
 
     dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} (
-        PredictKernel<Loader, Batch>,
-        m.Value(),
+        PredictKernel<Loader, typename Loader::BatchT>,
+        m->Value(),
         dh::ToSpan(d_model.nodes), out_preds->predictions.DeviceSpan(),
         dh::ToSpan(d_model.tree_segments), dh::ToSpan(d_model.tree_group),
-        tree_begin, tree_end, m.NumColumns(), info.num_row_,
+        tree_begin, tree_end, m->NumColumns(), info.num_row_,
         entry_start, use_shared, output_groups);
   }
 
   void InplacePredict(dmlc::any const &x, const gbm::GBTreeModel &model,
                       float missing, PredictionCacheEntry *out_preds,
                       uint32_t tree_begin, unsigned tree_end) const override {
-    if (x.type() == typeid(data::CupyAdapter)) {
-      this->DispatchedInplacePredict<data::CupyAdapter, CuPyAdapterLoader, data::CupyAdapterBatch>(
+    if (x.type() == typeid(std::shared_ptr<data::CupyAdapter>)) {
+      this->DispatchedInplacePredict<
+          data::CupyAdapter, DeviceAdapterLoader<data::CupyAdapterBatch>>(
           x, model, missing, out_preds, tree_begin, tree_end);
-    } else if (x.type() == typeid(data::CudfAdapter)) {
-      this->DispatchedInplacePredict<data::CudfAdapter, CuDFAdapterLoader, data::CudfAdapterBatch>(
+    } else if (x.type() == typeid(std::shared_ptr<data::CudfAdapter>)) {
+      this->DispatchedInplacePredict<
+          data::CudfAdapter, DeviceAdapterLoader<data::CudfAdapterBatch>>(
           x, model, missing, out_preds, tree_begin, tree_end);
     } else {
       LOG(FATAL) << "Only CuPy and CuDF are supported by GPU Predictor.";
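
The dispatch change in this last hunk is easy to trip over: because the dmlc::any now stores a std::shared_ptr to the adapter rather than the adapter by value, both the typeid comparison and the get<>() must name the shared_ptr type. A self-contained sketch of the same pattern, using std::any as a stand-in for dmlc::any and hypothetical adapter types:

#include <any>
#include <iostream>
#include <memory>

struct CupyAdapter { int columns = 3; };
struct CudfAdapter { int columns = 5; };

template <typename Adapter>
void Predict(const std::any& x) {
  // The cast must name the stored type exactly: shared_ptr<Adapter>,
  // not Adapter, or it throws std::bad_any_cast.
  auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
  std::cout << "columns: " << m->columns << '\n';
}

void Dispatch(const std::any& x) {
  if (x.type() == typeid(std::shared_ptr<CupyAdapter>)) {
    Predict<CupyAdapter>(x);
  } else if (x.type() == typeid(std::shared_ptr<CudfAdapter>)) {
    Predict<CudfAdapter>(x);
  } else {
    std::cout << "unsupported input type\n";
  }
}

int main() {
  Dispatch(std::any{std::make_shared<CupyAdapter>()});  // columns: 3
  Dispatch(std::any{std::make_shared<CudfAdapter>()});  // columns: 5
  return 0;
}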