From 1ba67061675a81b7f0cd8527338c54d05b92ff79 Mon Sep 17 00:00:00 2001
From: sriramch <33358417+sriramch@users.noreply.github.com>
Date: Fri, 6 Mar 2020 18:31:35 -0800
Subject: [PATCH] - create a gpu metrics (internal) registry (#5387)

* - create a gpu metrics (internal) registry
  - the objective is to separate the cpu and gpu implementations such that they evolve
    independently. to that end, this approach will:
    - preserve the same metrics configuration (from the end user perspective)
    - internally delegate the responsibility to the gpu metrics builder when there is a
      valid device present
    - decouple the gpu metrics builder from the cpu ones to prevent misuse
    - move away from including the cuda file from within the cc file and segregate the
      code via ifdef's
---
 src/metric/metric.cc       | 50 +++++++++++++++-----
 src/metric/metric_common.h | 44 +++++++++++++++--
 src/metric/rank_metric.cc  | 96 +++++++++++++++++++++++++++++++-----
 3 files changed, 159 insertions(+), 31 deletions(-)

diff --git a/src/metric/metric.cc b/src/metric/metric.cc
index a501fded9..7d495adb4 100644
--- a/src/metric/metric.cc
+++ b/src/metric/metric.cc
@@ -7,12 +7,11 @@
 #include 
 #include 

-namespace dmlc {
-DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
-}
+#include "metric_common.h"

 namespace xgboost {
-Metric* Metric::Create(const std::string& name, GenericParameter const* tparam) {
+template <typename MetricRegistry>
+Metric* CreateMetricImpl(const std::string& name, GenericParameter const* tparam) {
   std::string buf = name;
   std::string prefix = name;
   const char* param;
@@ -26,29 +25,58 @@ Metric* Metric::Create(const std::string& name, GenericParameter const* tparam)
       prefix = buf;
       param = nullptr;
     }
-    auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
+    auto *e = ::dmlc::Registry<MetricRegistry>::Get()->Find(prefix.c_str());
     if (e == nullptr) {
-      LOG(FATAL) << "Unknown metric function " << name;
+      return nullptr;
     }
     auto p_metric = (e->body)(param);
-    p_metric->tparam_ = tparam;
     return p_metric;
   } else {
     std::string prefix = buf.substr(0, pos);
-    auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
+    auto *e = ::dmlc::Registry<MetricRegistry>::Get()->Find(prefix.c_str());
     if (e == nullptr) {
-      LOG(FATAL) << "Unknown metric function " << name;
+      return nullptr;
     }
     auto p_metric = (e->body)(buf.substr(pos + 1, buf.length()).c_str());
-    p_metric->tparam_ = tparam;
     return p_metric;
   }
 }
+
+Metric *
+Metric::Create(const std::string& name, GenericParameter const* tparam) {
+  auto metric = CreateMetricImpl<MetricReg>(name, tparam);
+  if (metric == nullptr) {
+    LOG(FATAL) << "Unknown metric function " << name;
+  }
+
+  metric->tparam_ = tparam;
+  return metric;
+}
+
+Metric *
+GPUMetric::CreateGPUMetric(const std::string& name, GenericParameter const* tparam) {
+  auto metric = CreateMetricImpl<MetricGPUReg>(name, tparam);
+  if (metric == nullptr) {
+    LOG(WARNING) << "Cannot find a GPU metric builder for metric " << name
+                 << ". Resorting to the CPU builder";
+    return metric;
+  }
+
+  // Narrowing reference only for the compiler to allow assignment to a base class member.
+  // As such, using this narrowed reference to refer to derived members will be an illegal op.
+  // This is moot, as this type is stateless.
+  static_cast<GPUMetric *>(metric)->tparam_ = tparam;
+  return metric;
+}
 }  // namespace xgboost

+namespace dmlc {
+DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
+DMLC_REGISTRY_ENABLE(::xgboost::MetricGPUReg);
+}
+
 namespace xgboost {
 namespace metric {
-
 // List of files that will be force linked in static links.
 DMLC_REGISTRY_LINK_TAG(elementwise_metric);
 DMLC_REGISTRY_LINK_TAG(multiclass_metric);
diff --git a/src/metric/metric_common.h b/src/metric/metric_common.h
index 303118f9e..1549edf1e 100644
--- a/src/metric/metric_common.h
+++ b/src/metric/metric_common.h
@@ -1,18 +1,54 @@
 /*!
- * Copyright 2018-2019 by Contributors
- * \file metric_param.cc
+ * Copyright 2018-2020 by Contributors
+ * \file metric_common.h
  */
 #ifndef XGBOOST_METRIC_METRIC_COMMON_H_
 #define XGBOOST_METRIC_METRIC_COMMON_H_

-#include 
-#include 
 #include 
 #include 

 #include "../common/common.h"

 namespace xgboost {
+
+// This creates a GPU metric instance dynamically and adds it to the GPU metric registry, if not
+// present already. An instance is created only when a device ordinal is present and xgboost
+// is compiled with CUDA support.
+struct GPUMetric : Metric {
+  static Metric *CreateGPUMetric(const std::string& name, GenericParameter const* tparam);
+};
+
+/*!
+ * \brief Internal registry entries for GPU Metric factory functions.
+ *  The additional parameter const char* param gives the value after @, can be null.
+ *  For example, metric map@3, then: param == "3".
+ */
+struct MetricGPUReg
+    : public dmlc::FunctionRegEntryBase<MetricGPUReg, std::function<Metric * (const char*)> > {
+};
+
+/*!
+ * \brief Macro to register metric computed on GPU.
+ *
+ * \code
+ * // example of registering a metric ndcg@k
+ * XGBOOST_REGISTER_GPU_METRIC(NDCG_GPU, "ndcg")
+ * .describe("NDCG metric computer on GPU.")
+ * .set_body([](const char* param) {
+ *   int at_k = atoi(param);
+ *   return new NDCG(at_k);
+ * });
+ * \endcode
+ */
+
+// Note: Metric names registered in the GPU registry should follow this convention:
+// - GPU metric types should be registered with the same name as the non-GPU metric types
+#define XGBOOST_REGISTER_GPU_METRIC(UniqueId, Name)                          \
+  ::xgboost::MetricGPUReg&  __make_ ## MetricGPUReg ## _ ## UniqueId ## __ = \
+      ::dmlc::Registry< ::xgboost::MetricGPUReg>::Get()->__REGISTER__(Name)
+
 namespace metric {

 // Ranking config to be used on device and host
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 5ffdc003f..0a9310fc7 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -1,9 +1,25 @@
 /*!
- * Copyright 2015 by Contributors
- * \file rank_metric.cc
- * \brief prediction rank based metrics.
- * \author Kailong Chen, Tianqi Chen
+ * Copyright 2020 XGBoost contributors
 */
+// When device ordinal is present, we would want to build the metrics on the GPU. It is *not*
+// possible for a valid device ordinal to be present for non GPU builds. However, it is possible
+// for an invalid device ordinal to be specified in GPU builds - to train/predict and/or compute
+// the metrics on CPU. To accommodate these scenarios, the following is done for the metrics
+// accelerated on the GPU.
+// - An internal GPU registry holds all the GPU metric types (defined in the .cu file)
+// - An instance of the appropriate GPU metric type is created when a device ordinal is present
+//   - If the creation is successful, the metric computation is done on the device
+//   - else, it falls back on the CPU
+// - The GPU metric types are *only* registered when xgboost is built for GPUs
+//
+// This is done for 2 reasons:
+// - Clear separation of CPU and GPU logic
+// - Sorting datasets containing a large number of rows is (much) faster when parallel sort
+//   semantics is used on the CPU. The __gnu_parallel/concurrency primitives needed to perform
+//   this cannot be used when the translation unit is compiled using the 'nvcc' compiler (as the
+//   corresponding headers that bring in those function declarations can't be included with CUDA).
+//   This precludes the CPU and GPU logic from coexisting inside a .cu file
+
 #include 
 #include 
 #include 
@@ -143,6 +159,10 @@ struct EvalAMS : public Metric {
 /*! \brief Area Under Curve, for both classification and rank computed on CPU */
 struct EvalAuc : public Metric {
  private:
+  // This is used to compute the AUC metrics on the GPU - for ranking tasks and
+  // for training jobs that run on the GPU.
+  std::unique_ptr<xgboost::Metric> auc_gpu_;
+
   template 
   bst_float Eval(const HostDeviceVector<bst_float> &preds,
                  const MetaInfo &info,
@@ -239,6 +259,18 @@ struct EvalAuc : public Metric {
     const bool is_ranking_task =
         !info.group_ptr_.empty() && info.weights_.Size() != info.num_row_;

+    // Check if we have a GPU assignment; else, fall back to the CPU
+    if (tparam_->gpu_id >= 0 && is_ranking_task) {
+      if (!auc_gpu_) {
+        // Check and see if we have the GPU metric registered in the internal registry
+        auc_gpu_.reset(GPUMetric::CreateGPUMetric(this->Name(), tparam_));
+      }
+
+      if (auc_gpu_) {
+        return auc_gpu_->Eval(preds, info, distributed);
+      }
+    }
+
     if (is_ranking_task) {
       return Eval(preds, info, distributed, gptr);
     } else {
@@ -251,6 +283,10 @@ struct EvalAuc : public Metric {

 /*! \brief Evaluate rank list */
 struct EvalRank : public Metric, public EvalRankConfig {
+ private:
+  // This is used to compute the ranking metrics on the GPU - for training jobs that run on the GPU.
+  std::unique_ptr<xgboost::Metric> rank_gpu_;
+
  public:
   bst_float Eval(const HostDeviceVector<bst_float> &preds,
                  const MetaInfo &info,
@@ -271,20 +307,32 @@ struct EvalRank : public Metric, public EvalRankConfig {
     // sum statistics
     double sum_metric = 0.0f;

-    const auto &labels = info.labels_.ConstHostVector();
-    const auto &h_preds = preds.ConstHostVector();
+    // Check and see if we have the GPU metric registered in the internal registry
+    if (tparam_->gpu_id >= 0) {
+      if (!rank_gpu_) {
+        rank_gpu_.reset(GPUMetric::CreateGPUMetric(this->Name(), tparam_));
+      }
+      if (rank_gpu_) {
+        sum_metric = rank_gpu_->Eval(preds, info, distributed);
+      }
+    }

-    #pragma omp parallel reduction(+:sum_metric)
-    {
-      // each thread takes a local rec
-      PredIndPairContainer rec;
-      #pragma omp for schedule(static)
-      for (bst_omp_uint k = 0; k < ngroups; ++k) {
-        rec.clear();
-        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
-          rec.emplace_back(h_preds[j], static_cast<bst_float>(labels[j]));
+    if (!rank_gpu_ || tparam_->gpu_id < 0) {
+      const auto &labels = info.labels_.ConstHostVector();
+      const auto &h_preds = preds.ConstHostVector();
+
+      #pragma omp parallel reduction(+:sum_metric)
+      {
+        // each thread takes a local rec
+        PredIndPairContainer rec;
+        #pragma omp for schedule(static)
+        for (bst_omp_uint k = 0; k < ngroups; ++k) {
+          rec.clear();
+          for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
+            rec.emplace_back(h_preds[j], static_cast<bst_float>(labels[j]));
+          }
+          sum_metric += this->EvalGroup(&rec);
         }
-        sum_metric += this->EvalGroup(&rec);
       }
     }

@@ -465,6 +513,10 @@ struct EvalAucPR : public Metric {
   // translated from PRROC R Package
   // see https://doi.org/10.1371/journal.pone.0092209
  private:
+  // This is used to compute the AUCPR metrics on the GPU - for ranking tasks and
+  // for training jobs that run on the GPU.
+  std::unique_ptr<xgboost::Metric> aucpr_gpu_;
+
   template 
   bst_float Eval(const HostDeviceVector<bst_float> &preds,
                  const MetaInfo &info,
@@ -580,6 +632,18 @@ struct EvalAucPR : public Metric {
     const bool is_ranking_task =
         !info.group_ptr_.empty() && info.weights_.Size() != info.num_row_;

+    // Check if we have a GPU assignment; else, fall back to the CPU
+    if (tparam_->gpu_id >= 0 && is_ranking_task) {
+      if (!aucpr_gpu_) {
+        // Check and see if we have the GPU metric registered in the internal registry
+        aucpr_gpu_.reset(GPUMetric::CreateGPUMetric(this->Name(), tparam_));
+      }
+
+      if (aucpr_gpu_) {
+        return aucpr_gpu_->Eval(preds, info, distributed);
+      }
+    }
+
     if (is_ranking_task) {
       return Eval(preds, info, distributed, gptr);
     } else {
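
Note appended for illustration (not part of the patch): the sketch below shows roughly how a device-side metric would plug into the new internal registry via XGBOOST_REGISTER_GPU_METRIC and be picked up by GPUMetric::CreateGPUMetric when gpu_id >= 0. It is a minimal sketch only, assuming a hypothetical EvalAucGpu class and placement in a CUDA-built .cu translation unit; the actual GPU metric implementations registered by this series are not shown in this excerpt.

  // Sketch only: hypothetical GPU metric registration (class name and body are illustrative).
  #include <xgboost/metric.h>
  #include "metric_common.h"

  namespace xgboost {
  namespace metric {

  struct EvalAucGpu : public Metric {
    bst_float Eval(const HostDeviceVector<bst_float>& preds,
                   const MetaInfo& info,
                   bool distributed) override {
      // Device-side AUC computation would go here; returning a dummy value in this sketch.
      return 0.0f;
    }
    const char* Name() const override { return "auc"; }
  };

  // Registered under the same name as the CPU metric, per the convention noted in
  // metric_common.h, so GPUMetric::CreateGPUMetric("auc", tparam) can find it.
  XGBOOST_REGISTER_GPU_METRIC(AucGpu, "auc")
  .describe("AUC metric for ranking computed on GPU.")
  .set_body([](const char*) { return new EvalAucGpu(); });

  }  // namespace metric
  }  // namespace xgboost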