From 1ba67061675a81b7f0cd8527338c54d05b92ff79 Mon Sep 17 00:00:00 2001
From: sriramch <33358417+sriramch@users.noreply.github.com>
Date: Fri, 6 Mar 2020 18:31:35 -0800
Subject: [PATCH] - create a gpu metrics (internal) registry (#5387)

* - create a gpu metrics (internal) registry
  - the objective is to separate the cpu and gpu implementations such that they evolve
    independently. to that end, this approach will:
    - preserve the same metrics configuration (from the end user perspective)
    - internally delegate the responsibility to the gpu metrics builder when there is a
      valid device present
    - decouple the gpu metrics builder from the cpu ones to prevent misuse
    - move away from including the cuda file from within the cc file and segregate the
      code via ifdef's
---
 src/metric/metric.cc       | 50 +++++++++++++++-----
 src/metric/metric_common.h | 44 +++++++++++++++--
 src/metric/rank_metric.cc  | 96 +++++++++++++++++++++++++++++++-----
 3 files changed, 159 insertions(+), 31 deletions(-)

diff --git a/src/metric/metric.cc b/src/metric/metric.cc
index a501fded9..7d495adb4 100644
--- a/src/metric/metric.cc
+++ b/src/metric/metric.cc
@@ -7,12 +7,11 @@
 #include 
 #include 

-namespace dmlc {
-DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
-}
+#include "metric_common.h"

 namespace xgboost {
-Metric* Metric::Create(const std::string& name, GenericParameter const* tparam) {
+template <typename MetricRegistry>
+Metric* CreateMetricImpl(const std::string& name, GenericParameter const* tparam) {
   std::string buf = name;
   std::string prefix = name;
   const char* param;
@@ -26,29 +25,58 @@ Metric* Metric::Create(const std::string& name, GenericParameter const* tparam)
       prefix = buf;
       param = nullptr;
     }
-    auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
+    auto *e = ::dmlc::Registry<MetricRegistry>::Get()->Find(prefix.c_str());
     if (e == nullptr) {
-      LOG(FATAL) << "Unknown metric function " << name;
+      return nullptr;
     }
     auto p_metric = (e->body)(param);
-    p_metric->tparam_ = tparam;
     return p_metric;
   } else {
     std::string prefix = buf.substr(0, pos);
-    auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
+    auto *e = ::dmlc::Registry<MetricRegistry>::Get()->Find(prefix.c_str());
     if (e == nullptr) {
-      LOG(FATAL) << "Unknown metric function " << name;
+      return nullptr;
     }
     auto p_metric = (e->body)(buf.substr(pos + 1, buf.length()).c_str());
-    p_metric->tparam_ = tparam;
     return p_metric;
   }
 }
+
+Metric *
+Metric::Create(const std::string& name, GenericParameter const* tparam) {
+  auto metric = CreateMetricImpl<MetricReg>(name, tparam);
+  if (metric == nullptr) {
+    LOG(FATAL) << "Unknown metric function " << name;
+  }
+
+  metric->tparam_ = tparam;
+  return metric;
+}
+
+Metric *
+GPUMetric::CreateGPUMetric(const std::string& name, GenericParameter const* tparam) {
+  auto metric = CreateMetricImpl<MetricGPUReg>(name, tparam);
+  if (metric == nullptr) {
+    LOG(WARNING) << "Cannot find a GPU metric builder for metric " << name
+                 << ". Resorting to the CPU builder";
+    return metric;
+  }
+
+  // Narrowing reference only for the compiler to allow assignment to a base class member.
+  // As such, using this narrowed reference to refer to derived members will be an illegal op.
+  // This is moot, as this type is stateless.
+  static_cast<GPUMetric *>(metric)->tparam_ = tparam;
+  return metric;
+}
 }  // namespace xgboost

+namespace dmlc {
+DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
+DMLC_REGISTRY_ENABLE(::xgboost::MetricGPUReg);
+}
+
 namespace xgboost {
 namespace metric {
-
 // List of files that will be force linked in static links.
 DMLC_REGISTRY_LINK_TAG(elementwise_metric);
 DMLC_REGISTRY_LINK_TAG(multiclass_metric);
diff --git a/src/metric/metric_common.h b/src/metric/metric_common.h
index 303118f9e..1549edf1e 100644
--- a/src/metric/metric_common.h
+++ b/src/metric/metric_common.h
@@ -1,18 +1,54 @@
 /*!
- * Copyright 2018-2019 by Contributors
- * \file metric_param.cc
+ * Copyright 2018-2020 by Contributors
+ * \file metric_common.h
  */
 #ifndef XGBOOST_METRIC_METRIC_COMMON_H_
 #define XGBOOST_METRIC_METRIC_COMMON_H_

-#include 
-#include 
 #include 
 #include 

 #include "../common/common.h"

 namespace xgboost {
+
+// This creates a GPU metric instance dynamically and adds it to the GPU metric registry, if not
+// present already. An instance is created only when a device ordinal is present and xgboost
+// is compiled with CUDA support.
+struct GPUMetric : Metric {
+  static Metric *CreateGPUMetric(const std::string& name, GenericParameter const* tparam);
+};
+
+/*!
+ * \brief Internal registry entries for GPU Metric factory functions.
+ *  The additional parameter const char* param gives the value after @, can be null.
+ *  For example, metric map@3, then: param == "3".
+ */
+struct MetricGPUReg
+    : public dmlc::FunctionRegEntryBase<MetricGPUReg, std::function<Metric * (const char*)> > {
+};
+
+/*!
+ * \brief Macro to register metric computed on GPU.
+ *
+ * \code
+ * // example of registering a metric ndcg@k
+ * XGBOOST_REGISTER_GPU_METRIC(NDCG_GPU, "ndcg")
+ * .describe("NDCG metric computer on GPU.")
+ * .set_body([](const char* param) {
+ *   int at_k = atoi(param);
+ *   return new NDCG(at_k);
+ * });
+ * \endcode
+ */
+
+// Note: Metric names registered in the GPU registry should follow this convention:
+// - GPU metric types should be registered with the same name as the non-GPU metric types
+#define XGBOOST_REGISTER_GPU_METRIC(UniqueId, Name)                          \
+  ::xgboost::MetricGPUReg&  __make_ ## MetricGPUReg ## _ ## UniqueId ## __ = \
+      ::dmlc::Registry< ::xgboost::MetricGPUReg>::Get()->__REGISTER__(Name)
+
 namespace metric {

 // Ranking config to be used on device and host
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 5ffdc003f..0a9310fc7 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -1,9 +1,25 @@
 /*!
- * Copyright 2015 by Contributors
- * \file rank_metric.cc
- * \brief prediction rank based metrics.
- * \author Kailong Chen, Tianqi Chen
+ * Copyright 2020 XGBoost contributors
 */
+// When device ordinal is present, we would want to build the metrics on the GPU. It is *not*
+// possible for a valid device ordinal to be present for non GPU builds. However, it is possible
+// for an invalid device ordinal to be specified in GPU builds - to train/predict and/or compute
+// the metrics on CPU. To accommodate these scenarios, the following is done for the metrics
+// accelerated on the GPU.
+// - An internal GPU registry holds all the GPU metric types (defined in the .cu file)
+// - An instance of the appropriate GPU metric type is created when a device ordinal is present
+//   - If the creation is successful, the metric computation is done on the device
+//   - else, it falls back on the CPU
+// - The GPU metric types are *only* registered when xgboost is built for GPUs
+//
+// This is done for 2 reasons:
+// - Clear separation of CPU and GPU logic
+// - Sorting datasets containing a large number of rows is (much) faster when parallel sort
+//   semantics is used on the CPU. The __gnu_parallel/concurrency primitives needed to perform
+//   this cannot be used when the translation unit is compiled using the 'nvcc' compiler (as the
+//   corresponding headers that bring in those function declarations can't be included with CUDA).
+//   This precludes the CPU and GPU logic from coexisting inside a .cu file
+
 #include 
 #include 
 #include 
@@ -143,6 +159,10 @@ struct EvalAMS : public Metric {
 /*! \brief Area Under Curve, for both classification and rank computed on CPU */
 struct EvalAuc : public Metric {
  private:
+  // This is used to compute the AUC metrics on the GPU - for ranking tasks and
+  // for training jobs that run on the GPU.
+  std::unique_ptr<xgboost::Metric> auc_gpu_;
+
   template 
   bst_float Eval(const HostDeviceVector<bst_float> &preds,
                  const MetaInfo &info,
@@ -239,6 +259,18 @@ struct EvalAuc : public Metric {
     const bool is_ranking_task =
         !info.group_ptr_.empty() && info.weights_.Size() != info.num_row_;

+    // Check if we have a GPU assignment; else, fall back to the CPU
+    if (tparam_->gpu_id >= 0 && is_ranking_task) {
+      if (!auc_gpu_) {
+        // Check and see if we have the GPU metric registered in the internal registry
+        auc_gpu_.reset(GPUMetric::CreateGPUMetric(this->Name(), tparam_));
+      }
+
+      if (auc_gpu_) {
+        return auc_gpu_->Eval(preds, info, distributed);
+      }
+    }
+
     if (is_ranking_task) {
       return Eval(preds, info, distributed, gptr);
     } else {
@@ -251,6 +283,10 @@ struct EvalAuc : public Metric {

 /*! \brief Evaluate rank list */
 struct EvalRank : public Metric, public EvalRankConfig {
+ private:
+  // This is used to compute the ranking metrics on the GPU - for training jobs that run on the GPU.
+  std::unique_ptr<xgboost::Metric> rank_gpu_;
+
  public:
   bst_float Eval(const HostDeviceVector<bst_float> &preds,
                  const MetaInfo &info,
@@ -271,20 +307,32 @@ struct EvalRank : public Metric, public EvalRankConfig {
     // sum statistics
     double sum_metric = 0.0f;

-    const auto &labels = info.labels_.ConstHostVector();
-    const auto &h_preds = preds.ConstHostVector();
+    // Check and see if we have the GPU metric registered in the internal registry
+    if (tparam_->gpu_id >= 0) {
+      if (!rank_gpu_) {
+        rank_gpu_.reset(GPUMetric::CreateGPUMetric(this->Name(), tparam_));
+      }
+      if (rank_gpu_) {
+        sum_metric = rank_gpu_->Eval(preds, info, distributed);
+      }
+    }

-    #pragma omp parallel reduction(+:sum_metric)
-    {
-      // each thread takes a local rec
-      PredIndPairContainer rec;
-      #pragma omp for schedule(static)
-      for (bst_omp_uint k = 0; k < ngroups; ++k) {
-        rec.clear();
-        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
-          rec.emplace_back(h_preds[j], static_cast<bst_float>(labels[j]));
+    if (!rank_gpu_ || tparam_->gpu_id < 0) {
+      const auto &labels = info.labels_.ConstHostVector();
+      const auto &h_preds = preds.ConstHostVector();
+
+      #pragma omp parallel reduction(+:sum_metric)
+      {
+        // each thread takes a local rec
+        PredIndPairContainer rec;
+        #pragma omp for schedule(static)
+        for (bst_omp_uint k = 0; k < ngroups; ++k) {
+          rec.clear();
+          for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
+            rec.emplace_back(h_preds[j], static_cast<bst_float>(labels[j]));
+          }
+          sum_metric += this->EvalGroup(&rec);
         }
-        sum_metric += this->EvalGroup(&rec);
       }
     }

@@ -465,6 +513,10 @@ struct EvalAucPR : public Metric {
   // translated from PRROC R Package
   // see https://doi.org/10.1371/journal.pone.0092209
  private:
+  // This is used to compute the AUCPR metrics on the GPU - for ranking tasks and
+  // for training jobs that run on the GPU.
+  std::unique_ptr<xgboost::Metric> aucpr_gpu_;
+
   template 
   bst_float Eval(const HostDeviceVector<bst_float> &preds,
                  const MetaInfo &info,
@@ -580,6 +632,18 @@ struct EvalAucPR : public Metric {
     const bool is_ranking_task =
         !info.group_ptr_.empty() && info.weights_.Size() != info.num_row_;

+    // Check if we have a GPU assignment; else, fall back to the CPU
+    if (tparam_->gpu_id >= 0 && is_ranking_task) {
+      if (!aucpr_gpu_) {
+        // Check and see if we have the GPU metric registered in the internal registry
+        aucpr_gpu_.reset(GPUMetric::CreateGPUMetric(this->Name(), tparam_));
+      }
+
+      if (aucpr_gpu_) {
+        return aucpr_gpu_->Eval(preds, info, distributed);
+      }
+    }
+
     if (is_ranking_task) {
       return Eval(preds, info, distributed, gptr);
     } else {
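
Note appended for illustration (not part of the patch): the sketch below shows roughly how a device-side metric would plug into the new internal registry via XGBOOST_REGISTER_GPU_METRIC and be picked up by GPUMetric::CreateGPUMetric when gpu_id >= 0. It is a minimal sketch only, assuming a hypothetical EvalAucGpu class and placement in a CUDA-built .cu translation unit; the actual GPU metric implementations registered by this series are not shown in this excerpt.

  // Sketch only: hypothetical GPU metric registration (class name and body are illustrative).
  #include <xgboost/metric.h>
  #include "metric_common.h"

  namespace xgboost {
  namespace metric {

  struct EvalAucGpu : public Metric {
    bst_float Eval(const HostDeviceVector<bst_float>& preds,
                   const MetaInfo& info,
                   bool distributed) override {
      // Device-side AUC computation would go here; returning a dummy value in this sketch.
      return 0.0f;
    }
    const char* Name() const override { return "auc"; }
  };

  // Registered under the same name as the CPU metric, per the convention noted in
  // metric_common.h, so GPUMetric::CreateGPUMetric("auc", tparam) can find it.
  XGBOOST_REGISTER_GPU_METRIC(AucGpu, "auc")
  .describe("AUC metric for ranking computed on GPU.")
  .set_body([](const char*) { return new EvalAucGpu(); });

  }  // namespace metric
  }  // namespace xgboost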