diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8f1252806..20e91a5d9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -63,6 +63,45 @@ jobs: cd build ctest --extra-verbose + gtest-cpu-sycl: + name: Test Google C++ unittest (CPU SYCL) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8"] + steps: + - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 + with: + submodules: 'true' + - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14 + with: + cache-downloads: true + cache-env: true + environment-name: linux_sycl_test + environment-file: tests/ci_build/conda_env/linux_sycl_test.yml + + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost + shell: bash -l {0} + run: | + mkdir build + cd build + cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX + make -j$(nproc) + - name: Run gtest binary for SYCL + run: | + cd build + ./testxgboost --gtest_filter=Sycl* + - name: Run gtest binary for non SYCL + run: | + cd build + ./testxgboost --gtest_filter=-Sycl* + c-api-demo: name: Test installing XGBoost lib + building the C API demo runs-on: ${{ matrix.os }} diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index e9704c75d..0fca76673 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -256,6 +256,47 @@ jobs: run: | pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark + python-sycl-tests-on-ubuntu: + name: Test XGBoost Python package with SYCL on ${{ matrix.config.os }} + runs-on: ${{ matrix.config.os }} + timeout-minutes: 90 + strategy: + matrix: + config: + - {os: ubuntu-latest, python-version: "3.8"} + + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + + - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14 + with: + cache-downloads: true + cache-env: true + environment-name: linux_sycl_test + environment-file: tests/ci_build/conda_env/linux_sycl_test.yml + + - name: Display Conda env + run: | + conda info + conda list + - name: Build XGBoost on Ubuntu + run: | + mkdir build + cd build + cmake .. -DPLUGIN_SYCL=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + make -j$(nproc) + - name: Install Python package + run: | + cd python-package + python --version + pip install -v . + - name: Test Python package + run: | + pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ + + python-system-installation-on-ubuntu: name: Test XGBoost Python package System Installation on ${{ matrix.os }} runs-on: ${{ matrix.os }} diff --git a/CMakeLists.txt b/CMakeLists.txt index a9c6f7410..dbfa1cdc2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,11 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) + +if(PLUGIN_SYCL) + set(CMAKE_CXX_COMPILER "g++") + set(CMAKE_C_COMPILER "gcc") + string(REPLACE " -isystem ${CONDA_PREFIX}/include" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +endif() + project(xgboost LANGUAGES CXX C VERSION 2.1.0) include(cmake/Utils.cmake) list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules") @@ -102,7 +109,7 @@ address, leak, undefined and thread.") option(PLUGIN_RMM "Build with RAPIDS Memory Manager (RMM)" OFF) option(PLUGIN_FEDERATED "Build with Federated Learning" OFF) ## TODO: 1. Add check if DPC++ compiler is used for building -option(PLUGIN_UPDATER_ONEAPI "DPC++ updater" OFF) +option(PLUGIN_SYCL "SYCL plugin" OFF) option(ADD_PKGCONFIG "Add xgboost.pc into system." ON) #-- Checks for building XGBoost @@ -313,6 +320,15 @@ if(PLUGIN_RMM) get_target_property(rmm_link_libs rmm::rmm INTERFACE_LINK_LIBRARIES) endif() +if(PLUGIN_SYCL) + set(CMAKE_CXX_LINK_EXECUTABLE + "icpx -qopenmp -o ") + set(CMAKE_CXX_CREATE_SHARED_LIBRARY + "icpx -qopenmp \ + , \ + -o ") +endif() + #-- library if(BUILD_STATIC_LIB) add_library(xgboost STATIC) diff --git a/include/xgboost/context.h b/include/xgboost/context.h index 6745bcb60..f32a07a03 100644 --- a/include/xgboost/context.h +++ b/include/xgboost/context.h @@ -250,9 +250,15 @@ struct Context : public XGBoostParameter { default: // Do not use the device name as this is likely an internal error, the name // wouldn't be valid. - LOG(FATAL) << "Unknown device type:" - << static_cast>(this->Device().device); - break; + if (this->Device().IsSycl()) { + LOG(WARNING) << "The requested feature doesn't have SYCL specific implementation yet. " + << "CPU implementation is used"; + return cpu_fn(); + } else { + LOG(FATAL) << "Unknown device type:" + << static_cast>(this->Device().device); + break; + } } return std::invoke_result_t(); } @@ -262,7 +268,6 @@ struct Context : public XGBoostParameter { */ template decltype(auto) DispatchDevice(CPUFn&& cpu_fn, CUDAFn&& cuda_fn, SYCLFn&& sycl_fn) const { - static_assert(std::is_same_v, std::invoke_result_t>); static_assert(std::is_same_v, std::invoke_result_t>); if (this->Device().IsSycl()) { return sycl_fn(); diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h index 25571213d..6a38d6496 100644 --- a/include/xgboost/predictor.h +++ b/include/xgboost/predictor.h @@ -92,8 +92,8 @@ class Predictor { * \param out_predt Prediction vector to be initialized. * \param model Tree model used for prediction. */ - void InitOutPredictions(const MetaInfo& info, HostDeviceVector* out_predt, - const gbm::GBTreeModel& model) const; + virtual void InitOutPredictions(const MetaInfo& info, HostDeviceVector* out_predt, + const gbm::GBTreeModel& model) const; /** * \brief Generate batch predictions for a given feature matrix. May use diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index 58b31053f..0fecb4fb2 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -1,27 +1,29 @@ -if(PLUGIN_UPDATER_ONEAPI) - add_library(oneapi_plugin OBJECT - ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/regression_obj_oneapi.cc - ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/predictor_oneapi.cc) - target_include_directories(oneapi_plugin +if(PLUGIN_SYCL) + set(CMAKE_CXX_COMPILER "icpx") + add_library(plugin_sycl OBJECT + ${xgboost_SOURCE_DIR}/plugin/sycl/device_manager.cc + ${xgboost_SOURCE_DIR}/plugin/sycl/predictor/predictor.cc) + target_include_directories(plugin_sycl PRIVATE ${xgboost_SOURCE_DIR}/include ${xgboost_SOURCE_DIR}/dmlc-core/include ${xgboost_SOURCE_DIR}/rabit/include) - target_compile_definitions(oneapi_plugin PUBLIC -DXGBOOST_USE_ONEAPI=1) - target_link_libraries(oneapi_plugin PUBLIC -fsycl) - set_target_properties(oneapi_plugin PROPERTIES + target_compile_definitions(plugin_sycl PUBLIC -DXGBOOST_USE_SYCL=1) + target_link_libraries(plugin_sycl PUBLIC -fsycl) + set_target_properties(plugin_sycl PROPERTIES COMPILE_FLAGS -fsycl CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON) if(USE_OPENMP) find_package(OpenMP REQUIRED) - target_link_libraries(oneapi_plugin PUBLIC OpenMP::OpenMP_CXX) + set_target_properties(plugin_sycl PROPERTIES + COMPILE_FLAGS "-fsycl -qopenmp") endif() - # Get compilation and link flags of oneapi_plugin and propagate to objxgboost - target_link_libraries(objxgboost PUBLIC oneapi_plugin) - # Add all objects of oneapi_plugin to objxgboost - target_sources(objxgboost INTERFACE $) + # Get compilation and link flags of plugin_sycl and propagate to objxgboost + target_link_libraries(objxgboost PUBLIC plugin_sycl) + # Add all objects of plugin_sycl to objxgboost + target_sources(objxgboost INTERFACE $) endif() # Add the Federate Learning plugin if enabled. diff --git a/plugin/sycl/README.md b/plugin/sycl/README.md new file mode 100755 index 000000000..b5dc07a1a --- /dev/null +++ b/plugin/sycl/README.md @@ -0,0 +1,40 @@ + + +# SYCL-based Algorithm for Tree Construction +This plugin adds support of SYCL programming model for prediction algorithms to XGBoost. + +## Usage +Specify the 'device' parameter as described in the table below to offload model training and inference on SYCL device. + +### Algorithms +| device | Description | +| --- | --- | +sycl | use default sycl device | +sycl:gpu | use default sycl gpu | +sycl:cpu | use default sycl cpu | +sycl:gpu:N | use sycl gpu number N | +sycl:cpu:N | use sycl cpu number N | + +Python example: +```python +param['device'] = 'sycl:gpu:0' +``` +Note: 'sycl:cpu' devices have full functional support but can't provide good enough performance. We recommend use 'sycl:cpu' devices only for test purposes. +Note: if device is specified to be 'sycl', device type will be automatically chosen. In case the system has both sycl GPU and sycl CPU, GPU will on use. + +## Dependencies +To build and use the plugin, install [IntelĀ® oneAPI DPC++/C++ Compiler](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compiler.html). +See also [IntelĀ® oneAPI Programming Guide](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/2024-0/overview.html). + +## Build +From the ``xgboost`` directory, run: + +```bash +$ mkdir build +$ cd build +$ cmake .. -DPLUGIN_SYCL=ON +$ make -j +``` \ No newline at end of file diff --git a/plugin/sycl/data.h b/plugin/sycl/data.h new file mode 100644 index 000000000..179c7cd1f --- /dev/null +++ b/plugin/sycl/data.h @@ -0,0 +1,256 @@ +/*! + * Copyright by Contributors 2017-2023 + */ +#ifndef PLUGIN_SYCL_DATA_H_ +#define PLUGIN_SYCL_DATA_H_ + +#include +#include +#include +#include +#include +#include + +#include "xgboost/base.h" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include "xgboost/data.h" +#pragma GCC diagnostic pop +#include "xgboost/logging.h" +#include "xgboost/host_device_vector.h" + +#include "../../src/common/threading_utils.h" + +#include "CL/sycl.hpp" + +namespace xgboost { +namespace sycl { +enum class MemoryType { shared, on_device}; + + +template +class USMDeleter { + public: + explicit USMDeleter(::sycl::queue qu) : qu_(qu) {} + + void operator()(T* data) const { + ::sycl::free(data, qu_); + } + + private: + ::sycl::queue qu_; +}; + +template +class USMVector { + static_assert(std::is_standard_layout::value, "USMVector admits only POD types"); + + std::shared_ptr allocate_memory_(::sycl::queue* qu, size_t size) { + if constexpr (memory_type == MemoryType::shared) { + return std::shared_ptr(::sycl::malloc_shared(size_, *qu), USMDeleter(*qu)); + } else { + return std::shared_ptr(::sycl::malloc_device(size_, *qu), USMDeleter(*qu)); + } + } + + void copy_vector_to_memory_(::sycl::queue* qu, const std::vector &vec) { + if constexpr (memory_type == MemoryType::shared) { + std::copy(vec.begin(), vec.end(), data_.get()); + } else { + qu->memcpy(data_.get(), vec.data(), size_ * sizeof(T)); + } + } + + + public: + USMVector() : size_(0), capacity_(0), data_(nullptr) {} + + USMVector(::sycl::queue& qu, size_t size) : size_(size), capacity_(size) { + data_ = allocate_memory_(qu, size_); + } + + USMVector(::sycl::queue& qu, size_t size, T v) : size_(size), capacity_(size) { + data_ = allocate_memory_(qu, size_); + qu.fill(data_.get(), v, size_).wait(); + } + + USMVector(::sycl::queue* qu, const std::vector &vec) { + size_ = vec.size(); + capacity_ = size_; + data_ = allocate_memory_(qu, size_); + copy_vector_to_memory_(qu, vec); + } + + ~USMVector() { + } + + USMVector& operator=(const USMVector& other) { + size_ = other.size_; + capacity_ = other.capacity_; + data_ = other.data_; + return *this; + } + + T* Data() { return data_.get(); } + const T* DataConst() const { return data_.get(); } + + size_t Size() const { return size_; } + + size_t Capacity() const { return capacity_; } + + T& operator[] (size_t i) { return data_.get()[i]; } + const T& operator[] (size_t i) const { return data_.get()[i]; } + + T* Begin () const { return data_.get(); } + T* End () const { return data_.get() + size_; } + + bool Empty() const { return (size_ == 0); } + + void Clear() { + data_.reset(); + size_ = 0; + capacity_ = 0; + } + + void Resize(::sycl::queue* qu, size_t size_new) { + if (size_new <= capacity_) { + size_ = size_new; + } else { + size_t size_old = size_; + auto data_old = data_; + size_ = size_new; + capacity_ = size_new; + data_ = allocate_memory_(qu, size_);; + if (size_old > 0) { + qu->memcpy(data_.get(), data_old.get(), sizeof(T) * size_old).wait(); + } + } + } + + void Resize(::sycl::queue* qu, size_t size_new, T v) { + if (size_new <= size_) { + size_ = size_new; + } else if (size_new <= capacity_) { + qu->fill(data_.get() + size_, v, size_new - size_).wait(); + size_ = size_new; + } else { + size_t size_old = size_; + auto data_old = data_; + size_ = size_new; + capacity_ = size_new; + data_ = allocate_memory_(qu, size_); + if (size_old > 0) { + qu->memcpy(data_.get(), data_old.get(), sizeof(T) * size_old).wait(); + } + qu->fill(data_.get() + size_old, v, size_new - size_old).wait(); + } + } + + ::sycl::event ResizeAsync(::sycl::queue* qu, size_t size_new, T v) { + if (size_new <= size_) { + size_ = size_new; + return ::sycl::event(); + } else if (size_new <= capacity_) { + auto event = qu->fill(data_.get() + size_, v, size_new - size_); + size_ = size_new; + return event; + } else { + size_t size_old = size_; + auto data_old = data_; + size_ = size_new; + capacity_ = size_new; + data_ = allocate_memory_(qu, size_); + ::sycl::event event; + if (size_old > 0) { + event = qu->memcpy(data_.get(), data_old.get(), sizeof(T) * size_old); + } + return qu->fill(data_.get() + size_old, v, size_new - size_old, event); + } + } + + ::sycl::event ResizeAndFill(::sycl::queue* qu, size_t size_new, int v) { + if (size_new <= size_) { + size_ = size_new; + return qu->memset(data_.get(), v, size_new * sizeof(T)); + } else if (size_new <= capacity_) { + size_ = size_new; + return qu->memset(data_.get(), v, size_new * sizeof(T)); + } else { + size_t size_old = size_; + auto data_old = data_; + size_ = size_new; + capacity_ = size_new; + data_ = allocate_memory_(qu, size_); + return qu->memset(data_.get(), v, size_new * sizeof(T)); + } + } + + ::sycl::event Fill(::sycl::queue* qu, T v) { + return qu->fill(data_.get(), v, size_); + } + + void Init(::sycl::queue* qu, const std::vector &vec) { + size_ = vec.size(); + capacity_ = size_; + data_ = allocate_memory_(qu, size_); + copy_vector_to_memory_(qu, vec); + } + + using value_type = T; // NOLINT + + private: + size_t size_; + size_t capacity_; + std::shared_ptr data_; +}; + +/* Wrapper for DMatrix which stores all batches in a single USM buffer */ +struct DeviceMatrix { + DMatrix* p_mat; // Pointer to the original matrix on the host + ::sycl::queue qu_; + USMVector row_ptr; + USMVector data; + size_t total_offset; + + DeviceMatrix(::sycl::queue qu, DMatrix* dmat) : p_mat(dmat), qu_(qu) { + size_t num_row = 0; + size_t num_nonzero = 0; + for (auto &batch : dmat->GetBatches()) { + const auto& data_vec = batch.data.HostVector(); + const auto& offset_vec = batch.offset.HostVector(); + num_nonzero += data_vec.size(); + num_row += batch.Size(); + } + + row_ptr.Resize(&qu_, num_row + 1); + data.Resize(&qu_, num_nonzero); + + size_t data_offset = 0; + for (auto &batch : dmat->GetBatches()) { + const auto& data_vec = batch.data.HostVector(); + const auto& offset_vec = batch.offset.HostVector(); + size_t batch_size = batch.Size(); + if (batch_size > 0) { + std::copy(offset_vec.data(), offset_vec.data() + batch_size, + row_ptr.Data() + batch.base_rowid); + if (batch.base_rowid > 0) { + for (size_t i = 0; i < batch_size; i++) + row_ptr[i + batch.base_rowid] += batch.base_rowid; + } + std::copy(data_vec.data(), data_vec.data() + offset_vec[batch_size], + data.Data() + data_offset); + data_offset += offset_vec[batch_size]; + } + } + row_ptr[num_row] = data_offset; + total_offset = data_offset; + } + + ~DeviceMatrix() { + } +}; +} // namespace sycl +} // namespace xgboost + +#endif // PLUGIN_SYCL_DATA_H_ diff --git a/plugin/sycl/device_manager.cc b/plugin/sycl/device_manager.cc new file mode 100644 index 000000000..0254cdd6a --- /dev/null +++ b/plugin/sycl/device_manager.cc @@ -0,0 +1,124 @@ +/*! + * Copyright 2017-2023 by Contributors + * \file device_manager.cc + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include +#pragma GCC diagnostic pop + +#include "../sycl/device_manager.h" + +namespace xgboost { +namespace sycl { + +::sycl::device DeviceManager::GetDevice(const DeviceOrd& device_spec) const { + if (!device_spec.IsSycl()) { + LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " + << device_spec.Name() << ". " + << "Default sycl device_selector will be used."; + } + + bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) || + (rabit::IsDistributed()); + if (not_use_default_selector) { + DeviceRegister& device_register = GetDevicesRegister(); + const int device_idx = rabit::IsDistributed() ? rabit::GetRank() : device_spec.ordinal; + if (device_spec.IsSyclDefault()) { + auto& devices = device_register.devices; + CHECK_LT(device_idx, devices.size()); + return devices[device_idx]; + } else if (device_spec.IsSyclCPU()) { + auto& cpu_devices = device_register.cpu_devices; + CHECK_LT(device_idx, cpu_devices.size()); + return cpu_devices[device_idx]; + } else { + auto& gpu_devices = device_register.gpu_devices; + CHECK_LT(device_idx, gpu_devices.size()); + return gpu_devices[device_idx]; + } + } else { + if (device_spec.IsSyclCPU()) { + return ::sycl::device(::sycl::cpu_selector_v); + } else if (device_spec.IsSyclGPU()) { + return ::sycl::device(::sycl::gpu_selector_v); + } else { + return ::sycl::device(::sycl::default_selector_v); + } + } +} + +::sycl::queue DeviceManager::GetQueue(const DeviceOrd& device_spec) const { + if (!device_spec.IsSycl()) { + LOG(WARNING) << "Sycl kernel is executed with non-sycl context: " + << device_spec.Name() << ". " + << "Default sycl device_selector will be used."; + } + + QueueRegister_t& queue_register = GetQueueRegister(); + if (queue_register.count(device_spec.Name()) > 0) { + return queue_register.at(device_spec.Name()); + } + + bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) || + (rabit::IsDistributed()); + std::lock_guard guard(queue_registering_mutex); + if (not_use_default_selector) { + DeviceRegister& device_register = GetDevicesRegister(); + const int device_idx = rabit::IsDistributed() ? rabit::GetRank() : device_spec.ordinal; + if (device_spec.IsSyclDefault()) { + auto& devices = device_register.devices; + CHECK_LT(device_idx, devices.size()); + queue_register[device_spec.Name()] = ::sycl::queue(devices[device_idx]); + } else if (device_spec.IsSyclCPU()) { + auto& cpu_devices = device_register.cpu_devices; + CHECK_LT(device_idx, cpu_devices.size()); + queue_register[device_spec.Name()] = ::sycl::queue(cpu_devices[device_idx]);; + } else if (device_spec.IsSyclGPU()) { + auto& gpu_devices = device_register.gpu_devices; + CHECK_LT(device_idx, gpu_devices.size()); + queue_register[device_spec.Name()] = ::sycl::queue(gpu_devices[device_idx]); + } + } else { + if (device_spec.IsSyclCPU()) { + queue_register[device_spec.Name()] = ::sycl::queue(::sycl::cpu_selector_v); + } else if (device_spec.IsSyclGPU()) { + queue_register[device_spec.Name()] = ::sycl::queue(::sycl::gpu_selector_v); + } else { + queue_register[device_spec.Name()] = ::sycl::queue(::sycl::default_selector_v); + } + } + return queue_register.at(device_spec.Name()); +} + +DeviceManager::DeviceRegister& DeviceManager::GetDevicesRegister() const { + static DeviceRegister device_register; + + if (device_register.devices.size() == 0) { + std::lock_guard guard(device_registering_mutex); + std::vector<::sycl::device> devices = ::sycl::device::get_devices(); + for (size_t i = 0; i < devices.size(); i++) { + LOG(INFO) << "device_index = " << i << ", name = " + << devices[i].get_info<::sycl::info::device::name>(); + } + + for (size_t i = 0; i < devices.size(); i++) { + device_register.devices.push_back(devices[i]); + if (devices[i].is_cpu()) { + device_register.cpu_devices.push_back(devices[i]); + } else if (devices[i].is_gpu()) { + device_register.gpu_devices.push_back(devices[i]); + } + } + } + return device_register; +} + +DeviceManager::QueueRegister_t& DeviceManager::GetQueueRegister() const { + static QueueRegister_t queue_register; + return queue_register; +} + +} // namespace sycl +} // namespace xgboost diff --git a/plugin/sycl/device_manager.h b/plugin/sycl/device_manager.h new file mode 100644 index 000000000..0ae2ee9fe --- /dev/null +++ b/plugin/sycl/device_manager.h @@ -0,0 +1,47 @@ +/*! + * Copyright 2017-2023 by Contributors + * \file device_manager.h + */ +#ifndef PLUGIN_SYCL_DEVICE_MANAGER_H_ +#define PLUGIN_SYCL_DEVICE_MANAGER_H_ + +#include +#include +#include +#include + +#include + +#include "xgboost/context.h" + +namespace xgboost { +namespace sycl { + +class DeviceManager { + public: + ::sycl::queue GetQueue(const DeviceOrd& device_spec) const; + + ::sycl::device GetDevice(const DeviceOrd& device_spec) const; + + private: + using QueueRegister_t = std::unordered_map; + constexpr static int kDefaultOrdinal = -1; + + struct DeviceRegister { + std::vector<::sycl::device> devices; + std::vector<::sycl::device> cpu_devices; + std::vector<::sycl::device> gpu_devices; + }; + + QueueRegister_t& GetQueueRegister() const; + + DeviceRegister& GetDevicesRegister() const; + + mutable std::mutex queue_registering_mutex; + mutable std::mutex device_registering_mutex; +}; + +} // namespace sycl +} // namespace xgboost + +#endif // PLUGIN_SYCL_DEVICE_MANAGER_H_ diff --git a/plugin/sycl/predictor/predictor.cc b/plugin/sycl/predictor/predictor.cc new file mode 100755 index 000000000..3ceb99f1e --- /dev/null +++ b/plugin/sycl/predictor/predictor.cc @@ -0,0 +1,342 @@ +/*! + * Copyright by Contributors 2017-2023 + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include +#pragma GCC diagnostic pop + +#include +#include +#include + +#include + +#include "../data.h" + +#include "dmlc/registry.h" + +#include "xgboost/tree_model.h" +#include "xgboost/predictor.h" +#include "xgboost/tree_updater.h" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#include "../../src/data/adapter.h" +#pragma GCC diagnostic pop +#include "../../src/common/math.h" +#include "../../src/gbm/gbtree_model.h" + +#include "../device_manager.h" + +namespace xgboost { +namespace sycl { +namespace predictor { + +DMLC_REGISTRY_FILE_TAG(predictor_sycl); + +/* Wrapper for descriptor of a tree node */ +struct DeviceNode { + DeviceNode() + : fidx(-1), left_child_idx(-1), right_child_idx(-1) {} + + union NodeValue { + float leaf_weight; + float fvalue; + }; + + int fidx; + int left_child_idx; + int right_child_idx; + NodeValue val; + + explicit DeviceNode(const RegTree::Node& n) { + this->left_child_idx = n.LeftChild(); + this->right_child_idx = n.RightChild(); + this->fidx = n.SplitIndex(); + if (n.DefaultLeft()) { + fidx |= (1U << 31); + } + + if (n.IsLeaf()) { + this->val.leaf_weight = n.LeafValue(); + } else { + this->val.fvalue = n.SplitCond(); + } + } + + bool IsLeaf() const { return left_child_idx == -1; } + + int GetFidx() const { return fidx & ((1U << 31) - 1U); } + + bool MissingLeft() const { return (fidx >> 31) != 0; } + + int MissingIdx() const { + if (MissingLeft()) { + return this->left_child_idx; + } else { + return this->right_child_idx; + } + } + + float GetFvalue() const { return val.fvalue; } + + float GetWeight() const { return val.leaf_weight; } +}; + +/* SYCL implementation of a device model, + * storing tree structure in USM buffers to provide access from device kernels + */ +class DeviceModel { + public: + ::sycl::queue qu_; + USMVector nodes_; + USMVector tree_segments_; + USMVector tree_group_; + size_t tree_beg_; + size_t tree_end_; + int num_group_; + + DeviceModel() {} + + ~DeviceModel() {} + + void Init(::sycl::queue qu, const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end) { + qu_ = qu; + + tree_segments_.Resize(&qu_, (tree_end - tree_begin) + 1); + int sum = 0; + tree_segments_[0] = sum; + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + if (model.trees[tree_idx]->HasCategoricalSplit()) { + LOG(FATAL) << "Categorical features are not yet supported by sycl"; + } + sum += model.trees[tree_idx]->GetNodes().size(); + tree_segments_[tree_idx - tree_begin + 1] = sum; + } + + nodes_.Resize(&qu_, sum); + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + auto& src_nodes = model.trees[tree_idx]->GetNodes(); + for (size_t node_idx = 0; node_idx < src_nodes.size(); node_idx++) + nodes_[node_idx + tree_segments_[tree_idx - tree_begin]] = + static_cast(src_nodes[node_idx]); + } + + tree_group_.Resize(&qu_, model.tree_info.size()); + for (size_t tree_idx = 0; tree_idx < model.tree_info.size(); tree_idx++) + tree_group_[tree_idx] = model.tree_info[tree_idx]; + + tree_beg_ = tree_begin; + tree_end_ = tree_end; + num_group_ = model.learner_model_param->num_output_group; + } +}; + +float GetFvalue(int ridx, int fidx, Entry* data, size_t* row_ptr, bool* is_missing) { + // Binary search + auto begin_ptr = data + row_ptr[ridx]; + auto end_ptr = data + row_ptr[ridx + 1]; + Entry* previous_middle = nullptr; + while (end_ptr != begin_ptr) { + auto middle = begin_ptr + (end_ptr - begin_ptr) / 2; + if (middle == previous_middle) { + break; + } else { + previous_middle = middle; + } + + if (middle->index == fidx) { + *is_missing = false; + return middle->fvalue; + } else if (middle->index < fidx) { + begin_ptr = middle; + } else { + end_ptr = middle; + } + } + *is_missing = true; + return 0.0; +} + +float GetLeafWeight(int ridx, const DeviceNode* tree, Entry* data, size_t* row_ptr) { + DeviceNode n = tree[0]; + int node_id = 0; + bool is_missing; + while (!n.IsLeaf()) { + float fvalue = GetFvalue(ridx, n.GetFidx(), data, row_ptr, &is_missing); + // Missing value + if (is_missing) { + n = tree[n.MissingIdx()]; + } else { + if (fvalue < n.GetFvalue()) { + node_id = n.left_child_idx; + n = tree[n.left_child_idx]; + } else { + node_id = n.right_child_idx; + n = tree[n.right_child_idx]; + } + } + } + return n.GetWeight(); +} + +void DevicePredictInternal(::sycl::queue qu, + sycl::DeviceMatrix* dmat, + HostDeviceVector* out_preds, + const gbm::GBTreeModel& model, + size_t tree_begin, + size_t tree_end) { + if (tree_end - tree_begin == 0) return; + if (out_preds->HostVector().size() == 0) return; + + DeviceModel device_model; + device_model.Init(qu, model, tree_begin, tree_end); + + auto& out_preds_vec = out_preds->HostVector(); + + DeviceNode* nodes = device_model.nodes_.Data(); + ::sycl::buffer out_preds_buf(out_preds_vec.data(), out_preds_vec.size()); + size_t* tree_segments = device_model.tree_segments_.Data(); + int* tree_group = device_model.tree_group_.Data(); + size_t* row_ptr = dmat->row_ptr.Data(); + Entry* data = dmat->data.Data(); + int num_features = dmat->p_mat->Info().num_col_; + int num_rows = dmat->row_ptr.Size() - 1; + int num_group = model.learner_model_param->num_output_group; + + qu.submit([&](::sycl::handler& cgh) { + auto out_predictions = out_preds_buf.template get_access<::sycl::access::mode::read_write>(cgh); + cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::id<1> pid) { + int global_idx = pid[0]; + if (global_idx >= num_rows) return; + if (num_group == 1) { + float sum = 0.0; + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + const DeviceNode* tree = nodes + tree_segments[tree_idx - tree_begin]; + sum += GetLeafWeight(global_idx, tree, data, row_ptr); + } + out_predictions[global_idx] += sum; + } else { + for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + const DeviceNode* tree = nodes + tree_segments[tree_idx - tree_begin]; + int out_prediction_idx = global_idx * num_group + tree_group[tree_idx]; + out_predictions[out_prediction_idx] += GetLeafWeight(global_idx, tree, data, row_ptr); + } + } + }); + }).wait(); +} + +class Predictor : public xgboost::Predictor { + protected: + void InitOutPredictions(const MetaInfo& info, + HostDeviceVector* out_preds, + const gbm::GBTreeModel& model) const override { + CHECK_NE(model.learner_model_param->num_output_group, 0); + size_t n = model.learner_model_param->num_output_group * info.num_row_; + const auto& base_margin = info.base_margin_.Data()->HostVector(); + out_preds->Resize(n); + std::vector& out_preds_h = out_preds->HostVector(); + if (base_margin.size() == n) { + CHECK_EQ(out_preds->Size(), n); + std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin()); + } else { + auto base_score = model.learner_model_param->BaseScore(ctx_)(0); + if (!base_margin.empty()) { + std::ostringstream oss; + oss << "Ignoring the base margin, since it has incorrect length. " + << "The base margin must be an array of length "; + if (model.learner_model_param->num_output_group > 1) { + oss << "[num_class] * [number of data points], i.e. " + << model.learner_model_param->num_output_group << " * " << info.num_row_ + << " = " << n << ". "; + } else { + oss << "[number of data points], i.e. " << info.num_row_ << ". "; + } + oss << "Instead, all data points will use " + << "base_score = " << base_score; + LOG(WARNING) << oss.str(); + } + std::fill(out_preds_h.begin(), out_preds_h.end(), base_score); + } + } + + public: + explicit Predictor(Context const* context) : + xgboost::Predictor::Predictor{context}, + cpu_predictor(xgboost::Predictor::Create("cpu_predictor", context)) {} + + void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts, + const gbm::GBTreeModel &model, uint32_t tree_begin, + uint32_t tree_end = 0) const override { + ::sycl::queue qu = device_manager.GetQueue(ctx_->Device()); + // TODO(razdoburdin): remove temporary workaround after cache fix + sycl::DeviceMatrix device_matrix(qu, dmat); + + auto* out_preds = &predts->predictions; + if (tree_end == 0) { + tree_end = model.trees.size(); + } + + if (tree_begin < tree_end) { + DevicePredictInternal(qu, &device_matrix, out_preds, model, tree_begin, tree_end); + } + } + + bool InplacePredict(std::shared_ptr p_m, + const gbm::GBTreeModel &model, float missing, + PredictionCacheEntry *out_preds, uint32_t tree_begin, + unsigned tree_end) const override { + LOG(WARNING) << "InplacePredict is not yet implemented for SYCL. CPU Predictor is used."; + return cpu_predictor->InplacePredict(p_m, model, missing, out_preds, tree_begin, tree_end); + } + + void PredictInstance(const SparsePage::Inst& inst, + std::vector* out_preds, + const gbm::GBTreeModel& model, unsigned ntree_limit, + bool is_column_split) const override { + LOG(WARNING) << "PredictInstance is not yet implemented for SYCL. CPU Predictor is used."; + cpu_predictor->PredictInstance(inst, out_preds, model, ntree_limit, is_column_split); + } + + void PredictLeaf(DMatrix* p_fmat, HostDeviceVector* out_preds, + const gbm::GBTreeModel& model, unsigned ntree_limit) const override { + LOG(WARNING) << "PredictLeaf is not yet implemented for SYCL. CPU Predictor is used."; + cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit); + } + + void PredictContribution(DMatrix* p_fmat, HostDeviceVector* out_contribs, + const gbm::GBTreeModel& model, uint32_t ntree_limit, + const std::vector* tree_weights, + bool approximate, int condition, + unsigned condition_feature) const override { + LOG(WARNING) << "PredictContribution is not yet implemented for SYCL. CPU Predictor is used."; + cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, tree_weights, + approximate, condition, condition_feature); + } + + void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector* out_contribs, + const gbm::GBTreeModel& model, unsigned ntree_limit, + const std::vector* tree_weights, + bool approximate) const override { + LOG(WARNING) << "PredictInteractionContributions is not yet implemented for SYCL. " + << "CPU Predictor is used."; + cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, ntree_limit, + tree_weights, approximate); + } + + private: + DeviceManager device_manager; + + std::unique_ptr cpu_predictor; +}; + +XGBOOST_REGISTER_PREDICTOR(Predictor, "sycl_predictor") +.describe("Make predictions using SYCL.") +.set_body([](Context const* ctx) { return new Predictor(ctx); }); + +} // namespace predictor +} // namespace sycl +} // namespace xgboost diff --git a/plugin/updater_oneapi/README.md b/plugin/updater_oneapi/README.md deleted file mode 100755 index c2faf6574..000000000 --- a/plugin/updater_oneapi/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# DPC++-based Algorithm for Tree Construction -This plugin adds support of OneAPI programming model for tree construction and prediction algorithms to XGBoost. - -## Usage -Specify the 'objective' parameter as one of the following options to offload computation of objective function on OneAPI device. - -### Algorithms -| objective | Description | -| --- | --- | -reg:squarederror_oneapi | regression with squared loss | -reg:squaredlogerror_oneapi | regression with root mean squared logarithmic loss | -reg:logistic_oneapi | logistic regression for probability regression task | -binary:logistic_oneapi | logistic regression for binary classification task | -binary:logitraw_oneapi | logistic regression for classification, output score before logistic transformation | - -Specify the 'predictor' parameter as one of the following options to offload prediction stage on OneAPI device. - -### Algorithms -| predictor | Description | -| --- | --- | -predictor_oneapi | prediction using OneAPI device | - -Please note that parameter names are not finalized and can be changed during further integration of OneAPI support. - -Python example: -```python -param['predictor'] = 'predictor_oneapi' -param['objective'] = 'reg:squarederror_oneapi' -``` - -## Dependencies -Building the plugin requires Data Parallel C++ Compiler (https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/dpc-compiler.html) - -## Build -From the command line on Linux starting from the xgboost directory: - -```bash -$ mkdir build -$ cd build -$ EXPORT CXX=dpcpp && cmake .. -DPLUGIN_UPDATER_ONEAPI=ON -$ make -j -``` diff --git a/plugin/updater_oneapi/predictor_oneapi.cc b/plugin/updater_oneapi/predictor_oneapi.cc deleted file mode 100755 index 25a14186c..000000000 --- a/plugin/updater_oneapi/predictor_oneapi.cc +++ /dev/null @@ -1,447 +0,0 @@ -/*! - * Copyright by Contributors 2017-2020 - */ -#include // for any -#include -#include -#include - -#include "../../src/common/math.h" -#include "../../src/data/adapter.h" -#include "../../src/gbm/gbtree_model.h" -#include "CL/sycl.hpp" -#include "xgboost/base.h" -#include "xgboost/data.h" -#include "xgboost/host_device_vector.h" -#include "xgboost/logging.h" -#include "xgboost/predictor.h" -#include "xgboost/tree_model.h" -#include "xgboost/tree_updater.h" - -namespace xgboost { -namespace predictor { - -DMLC_REGISTRY_FILE_TAG(predictor_oneapi); - -/*! \brief Element from a sparse vector */ -struct EntryOneAPI { - /*! \brief feature index */ - bst_feature_t index; - /*! \brief feature value */ - bst_float fvalue; - /*! \brief default constructor */ - EntryOneAPI() = default; - /*! - * \brief constructor with index and value - * \param index The feature or row index. - * \param fvalue The feature value. - */ - EntryOneAPI(bst_feature_t index, bst_float fvalue) : index(index), fvalue(fvalue) {} - - EntryOneAPI(const Entry& entry) : index(entry.index), fvalue(entry.fvalue) {} - - /*! \brief reversely compare feature values */ - inline static bool CmpValue(const EntryOneAPI& a, const EntryOneAPI& b) { - return a.fvalue < b.fvalue; - } - inline bool operator==(const EntryOneAPI& other) const { - return (this->index == other.index && this->fvalue == other.fvalue); - } -}; - -struct DeviceMatrixOneAPI { - DMatrix* p_mat; // Pointer to the original matrix on the host - cl::sycl::queue qu_; - size_t* row_ptr; - size_t row_ptr_size; - EntryOneAPI* data; - - DeviceMatrixOneAPI(DMatrix* dmat, cl::sycl::queue qu) : p_mat(dmat), qu_(qu) { - size_t num_row = 0; - size_t num_nonzero = 0; - for (auto &batch : dmat->GetBatches()) { - const auto& data_vec = batch.data.HostVector(); - const auto& offset_vec = batch.offset.HostVector(); - num_nonzero += data_vec.size(); - num_row += batch.Size(); - } - - row_ptr = cl::sycl::malloc_shared(num_row + 1, qu_); - data = cl::sycl::malloc_shared(num_nonzero, qu_); - - size_t data_offset = 0; - for (auto &batch : dmat->GetBatches()) { - const auto& data_vec = batch.data.HostVector(); - const auto& offset_vec = batch.offset.HostVector(); - size_t batch_size = batch.Size(); - if (batch_size > 0) { - std::copy(offset_vec.data(), offset_vec.data() + batch_size, - row_ptr + batch.base_rowid); - if (batch.base_rowid > 0) { - for(size_t i = 0; i < batch_size; i++) - row_ptr[i + batch.base_rowid] += batch.base_rowid; - } - std::copy(data_vec.data(), data_vec.data() + offset_vec[batch_size], - data + data_offset); - data_offset += offset_vec[batch_size]; - } - } - row_ptr[num_row] = data_offset; - row_ptr_size = num_row + 1; - } - - ~DeviceMatrixOneAPI() { - if (row_ptr) { - cl::sycl::free(row_ptr, qu_); - } - if (data) { - cl::sycl::free(data, qu_); - } - } -}; - -struct DeviceNodeOneAPI { - DeviceNodeOneAPI() - : fidx(-1), left_child_idx(-1), right_child_idx(-1) {} - - union NodeValue { - float leaf_weight; - float fvalue; - }; - - int fidx; - int left_child_idx; - int right_child_idx; - NodeValue val; - - DeviceNodeOneAPI(const RegTree::Node& n) { // NOLINT - this->left_child_idx = n.LeftChild(); - this->right_child_idx = n.RightChild(); - this->fidx = n.SplitIndex(); - if (n.DefaultLeft()) { - fidx |= (1U << 31); - } - - if (n.IsLeaf()) { - this->val.leaf_weight = n.LeafValue(); - } else { - this->val.fvalue = n.SplitCond(); - } - } - - bool IsLeaf() const { return left_child_idx == -1; } - - int GetFidx() const { return fidx & ((1U << 31) - 1U); } - - bool MissingLeft() const { return (fidx >> 31) != 0; } - - int MissingIdx() const { - if (MissingLeft()) { - return this->left_child_idx; - } else { - return this->right_child_idx; - } - } - - float GetFvalue() const { return val.fvalue; } - - float GetWeight() const { return val.leaf_weight; } -}; - -class DeviceModelOneAPI { - public: - cl::sycl::queue qu_; - DeviceNodeOneAPI* nodes; - size_t* tree_segments; - int* tree_group; - size_t tree_beg_; - size_t tree_end_; - int num_group; - - DeviceModelOneAPI() : nodes(nullptr), tree_segments(nullptr), tree_group(nullptr) {} - - ~DeviceModelOneAPI() { - Reset(); - } - - void Reset() { - if (nodes) - cl::sycl::free(nodes, qu_); - if (tree_segments) - cl::sycl::free(tree_segments, qu_); - if (tree_group) - cl::sycl::free(tree_group, qu_); - } - - void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, cl::sycl::queue qu) { - qu_ = qu; - CHECK_EQ(model.param.size_leaf_vector, 0); - Reset(); - - tree_segments = cl::sycl::malloc_shared((tree_end - tree_begin) + 1, qu_); - int sum = 0; - tree_segments[0] = sum; - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - sum += model.trees[tree_idx]->GetNodes().size(); - tree_segments[tree_idx - tree_begin + 1] = sum; - } - - nodes = cl::sycl::malloc_shared(sum, qu_); - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - auto& src_nodes = model.trees[tree_idx]->GetNodes(); - for (size_t node_idx = 0; node_idx < src_nodes.size(); node_idx++) - nodes[node_idx + tree_segments[tree_idx - tree_begin]] = src_nodes[node_idx]; - } - - tree_group = cl::sycl::malloc_shared(model.tree_info.size(), qu_); - for (size_t tree_idx = 0; tree_idx < model.tree_info.size(); tree_idx++) - tree_group[tree_idx] = model.tree_info[tree_idx]; - - tree_beg_ = tree_begin; - tree_end_ = tree_end; - num_group = model.learner_model_param->num_output_group; - } -}; - -float GetFvalue(int ridx, int fidx, EntryOneAPI* data, size_t* row_ptr, bool& is_missing) { - // Binary search - auto begin_ptr = data + row_ptr[ridx]; - auto end_ptr = data + row_ptr[ridx + 1]; - EntryOneAPI* previous_middle = nullptr; - while (end_ptr != begin_ptr) { - auto middle = begin_ptr + (end_ptr - begin_ptr) / 2; - if (middle == previous_middle) { - break; - } else { - previous_middle = middle; - } - - if (middle->index == fidx) { - is_missing = false; - return middle->fvalue; - } else if (middle->index < fidx) { - begin_ptr = middle; - } else { - end_ptr = middle; - } - } - is_missing = true; - return 0.0; -} - -float GetLeafWeight(int ridx, const DeviceNodeOneAPI* tree, EntryOneAPI* data, size_t* row_ptr) { - DeviceNodeOneAPI n = tree[0]; - int node_id = 0; - bool is_missing; - while (!n.IsLeaf()) { - float fvalue = GetFvalue(ridx, n.GetFidx(), data, row_ptr, is_missing); - // Missing value - if (is_missing) { - n = tree[n.MissingIdx()]; - } else { - if (fvalue < n.GetFvalue()) { - node_id = n.left_child_idx; - n = tree[n.left_child_idx]; - } else { - node_id = n.right_child_idx; - n = tree[n.right_child_idx]; - } - } - } - return n.GetWeight(); -} - -class PredictorOneAPI : public Predictor { - protected: - void InitOutPredictions(const MetaInfo& info, - HostDeviceVector* out_preds, - const gbm::GBTreeModel& model) const { - CHECK_NE(model.learner_model_param->num_output_group, 0); - size_t n = model.learner_model_param->num_output_group * info.num_row_; - const auto& base_margin = info.base_margin_.HostVector(); - out_preds->Resize(n); - std::vector& out_preds_h = out_preds->HostVector(); - if (base_margin.size() == n) { - CHECK_EQ(out_preds->Size(), n); - std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin()); - } else { - if (!base_margin.empty()) { - std::ostringstream oss; - oss << "Ignoring the base margin, since it has incorrect length. " - << "The base margin must be an array of length "; - if (model.learner_model_param->num_output_group > 1) { - oss << "[num_class] * [number of data points], i.e. " - << model.learner_model_param->num_output_group << " * " << info.num_row_ - << " = " << n << ". "; - } else { - oss << "[number of data points], i.e. " << info.num_row_ << ". "; - } - oss << "Instead, all data points will use " - << "base_score = " << model.learner_model_param->base_score; - LOG(WARNING) << oss.str(); - } - std::fill(out_preds_h.begin(), out_preds_h.end(), - model.learner_model_param->base_score); - } - } - - void DevicePredictInternal(DeviceMatrixOneAPI* dmat, HostDeviceVector* out_preds, - const gbm::GBTreeModel& model, size_t tree_begin, - size_t tree_end) { - if (tree_end - tree_begin == 0) { - return; - } - model_.Init(model, tree_begin, tree_end, qu_); - - auto& out_preds_vec = out_preds->HostVector(); - - DeviceNodeOneAPI* nodes = model_.nodes; - cl::sycl::buffer out_preds_buf(out_preds_vec.data(), out_preds_vec.size()); - size_t* tree_segments = model_.tree_segments; - int* tree_group = model_.tree_group; - size_t* row_ptr = dmat->row_ptr; - EntryOneAPI* data = dmat->data; - int num_features = dmat->p_mat->Info().num_col_; - int num_rows = dmat->row_ptr_size - 1; - int num_group = model.learner_model_param->num_output_group; - - qu_.submit([&](cl::sycl::handler& cgh) { - auto out_predictions = out_preds_buf.get_access(cgh); - cgh.parallel_for(cl::sycl::range<1>(num_rows), [=](cl::sycl::id<1> pid) { - int global_idx = pid[0]; - if (global_idx >= num_rows) return; - if (num_group == 1) { - float sum = 0.0; - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin]; - sum += GetLeafWeight(global_idx, tree, data, row_ptr); - } - out_predictions[global_idx] += sum; - } else { - for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin]; - int out_prediction_idx = global_idx * num_group + tree_group[tree_idx]; - out_predictions[out_prediction_idx] += GetLeafWeight(global_idx, tree, data, row_ptr); - } - } - }); - }).wait(); - } - - public: - explicit PredictorOneAPI(Context const* generic_param) : - Predictor::Predictor{generic_param}, cpu_predictor(Predictor::Create("cpu_predictor", generic_param)) { - cl::sycl::default_selector selector; - qu_ = cl::sycl::queue(selector); - } - - // ntree_limit is a very problematic parameter, as it's ambiguous in the context of - // multi-output and forest. Same problem exists for tree_begin - void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts, - const gbm::GBTreeModel& model, int tree_begin, - uint32_t const ntree_limit = 0) override { - if (this->device_matrix_cache_.find(dmat) == - this->device_matrix_cache_.end()) { - this->device_matrix_cache_.emplace( - dmat, std::unique_ptr( - new DeviceMatrixOneAPI(dmat, qu_))); - } - DeviceMatrixOneAPI* device_matrix = device_matrix_cache_.find(dmat)->second.get(); - - // tree_begin is not used, right now we just enforce it to be 0. - CHECK_EQ(tree_begin, 0); - auto* out_preds = &predts->predictions; - CHECK_GE(predts->version, tree_begin); - if (out_preds->Size() == 0 && dmat->Info().num_row_ != 0) { - CHECK_EQ(predts->version, 0); - } - if (predts->version == 0) { - // out_preds->Size() can be non-zero as it's initialized here before any tree is - // built at the 0^th iterator. - this->InitOutPredictions(dmat->Info(), out_preds, model); - } - - uint32_t const output_groups = model.learner_model_param->num_output_group; - CHECK_NE(output_groups, 0); - // Right now we just assume ntree_limit provided by users means number of tree layers - // in the context of multi-output model - uint32_t real_ntree_limit = ntree_limit * output_groups; - if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) { - real_ntree_limit = static_cast(model.trees.size()); - } - - uint32_t const end_version = (tree_begin + real_ntree_limit) / output_groups; - // When users have provided ntree_limit, end_version can be lesser, cache is violated - if (predts->version > end_version) { - CHECK_NE(ntree_limit, 0); - this->InitOutPredictions(dmat->Info(), out_preds, model); - predts->version = 0; - } - uint32_t const beg_version = predts->version; - CHECK_LE(beg_version, end_version); - - if (beg_version < end_version) { - DevicePredictInternal(device_matrix, out_preds, model, - beg_version * output_groups, - end_version * output_groups); - } - - // delta means {size of forest} * {number of newly accumulated layers} - uint32_t delta = end_version - beg_version; - CHECK_LE(delta, model.trees.size()); - predts->Update(delta); - - CHECK(out_preds->Size() == output_groups * dmat->Info().num_row_ || - out_preds->Size() == dmat->Info().num_row_); - } - - void InplacePredict(std::any const& x, const gbm::GBTreeModel& model, float missing, - PredictionCacheEntry* out_preds, uint32_t tree_begin, - unsigned tree_end) const override { - cpu_predictor->InplacePredict(x, model, missing, out_preds, tree_begin, tree_end); - } - - void PredictInstance(const SparsePage::Inst& inst, - std::vector* out_preds, - const gbm::GBTreeModel& model, unsigned ntree_limit) override { - cpu_predictor->PredictInstance(inst, out_preds, model, ntree_limit); - } - - void PredictLeaf(DMatrix* p_fmat, std::vector* out_preds, - const gbm::GBTreeModel& model, unsigned ntree_limit) override { - cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit); - } - - void PredictContribution(DMatrix* p_fmat, std::vector* out_contribs, - const gbm::GBTreeModel& model, uint32_t ntree_limit, - std::vector* tree_weights, - bool approximate, int condition, - unsigned condition_feature) override { - cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate, condition, condition_feature); - } - - void PredictInteractionContributions(DMatrix* p_fmat, std::vector* out_contribs, - const gbm::GBTreeModel& model, unsigned ntree_limit, - std::vector* tree_weights, - bool approximate) override { - cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate); - } - - private: - cl::sycl::queue qu_; - DeviceModelOneAPI model_; - - std::mutex lock_; - std::unique_ptr cpu_predictor; - - std::unordered_map> - device_matrix_cache_; -}; - -XGBOOST_REGISTER_PREDICTOR(PredictorOneAPI, "oneapi_predictor") -.describe("Make predictions using DPC++.") -.set_body([](Context const* generic_param) { - return new PredictorOneAPI(generic_param); - }); -} // namespace predictor -} // namespace xgboost diff --git a/plugin/updater_oneapi/regression_loss_oneapi.h b/plugin/updater_oneapi/regression_loss_oneapi.h deleted file mode 100755 index b0299ff7f..000000000 --- a/plugin/updater_oneapi/regression_loss_oneapi.h +++ /dev/null @@ -1,145 +0,0 @@ -/*! - * Copyright 2017-2020 XGBoost contributors - */ -#ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_ -#define XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_ - -#include -#include -#include - -#include "CL/sycl.hpp" - -namespace xgboost { -namespace obj { - -/*! - * \brief calculate the sigmoid of the input. - * \param x input parameter - * \return the transformed value. - */ -inline float SigmoidOneAPI(float x) { - return 1.0f / (1.0f + cl::sycl::exp(-x)); -} - -// common regressions -// linear regression -struct LinearSquareLossOneAPI { - static bst_float PredTransform(bst_float x) { return x; } - static bool CheckLabel(bst_float x) { return true; } - static bst_float FirstOrderGradient(bst_float predt, bst_float label) { - return predt - label; - } - static bst_float SecondOrderGradient(bst_float predt, bst_float label) { - return 1.0f; - } - static bst_float ProbToMargin(bst_float base_score) { return base_score; } - static const char* LabelErrorMsg() { return ""; } - static const char* DefaultEvalMetric() { return "rmse"; } - - static const char* Name() { return "reg:squarederror_oneapi"; } -}; - -// TODO: DPC++ does not fully support std math inside offloaded kernels -struct SquaredLogErrorOneAPI { - static bst_float PredTransform(bst_float x) { return x; } - static bool CheckLabel(bst_float label) { - return label > -1; - } - static bst_float FirstOrderGradient(bst_float predt, bst_float label) { - predt = std::max(predt, (bst_float)(-1 + 1e-6)); // ensure correct value for log1p - return (cl::sycl::log1p(predt) - cl::sycl::log1p(label)) / (predt + 1); - } - static bst_float SecondOrderGradient(bst_float predt, bst_float label) { - predt = std::max(predt, (bst_float)(-1 + 1e-6)); - float res = (-cl::sycl::log1p(predt) + cl::sycl::log1p(label) + 1) / - cl::sycl::pow(predt + 1, (bst_float)2); - res = std::max(res, (bst_float)1e-6f); - return res; - } - static bst_float ProbToMargin(bst_float base_score) { return base_score; } - static const char* LabelErrorMsg() { - return "label must be greater than -1 for rmsle so that log(label + 1) can be valid."; - } - static const char* DefaultEvalMetric() { return "rmsle"; } - - static const char* Name() { return "reg:squaredlogerror_oneapi"; } -}; - -// logistic loss for probability regression task -struct LogisticRegressionOneAPI { - // duplication is necessary, as __device__ specifier - // cannot be made conditional on template parameter - static bst_float PredTransform(bst_float x) { return SigmoidOneAPI(x); } - static bool CheckLabel(bst_float x) { return x >= 0.0f && x <= 1.0f; } - static bst_float FirstOrderGradient(bst_float predt, bst_float label) { - return predt - label; - } - static bst_float SecondOrderGradient(bst_float predt, bst_float label) { - const bst_float eps = 1e-16f; - return std::max(predt * (1.0f - predt), eps); - } - template - static T PredTransform(T x) { return SigmoidOneAPI(x); } - template - static T FirstOrderGradient(T predt, T label) { return predt - label; } - template - static T SecondOrderGradient(T predt, T label) { - const T eps = T(1e-16f); - return std::max(predt * (T(1.0f) - predt), eps); - } - static bst_float ProbToMargin(bst_float base_score) { - CHECK(base_score > 0.0f && base_score < 1.0f) - << "base_score must be in (0,1) for logistic loss, got: " << base_score; - return -logf(1.0f / base_score - 1.0f); - } - static const char* LabelErrorMsg() { - return "label must be in [0,1] for logistic regression"; - } - static const char* DefaultEvalMetric() { return "rmse"; } - - static const char* Name() { return "reg:logistic_oneapi"; } -}; - -// logistic loss for binary classification task -struct LogisticClassificationOneAPI : public LogisticRegressionOneAPI { - static const char* DefaultEvalMetric() { return "logloss"; } - static const char* Name() { return "binary:logistic_oneapi"; } -}; - -// logistic loss, but predict un-transformed margin -struct LogisticRawOneAPI : public LogisticRegressionOneAPI { - // duplication is necessary, as __device__ specifier - // cannot be made conditional on template parameter - static bst_float PredTransform(bst_float x) { return x; } - static bst_float FirstOrderGradient(bst_float predt, bst_float label) { - predt = SigmoidOneAPI(predt); - return predt - label; - } - static bst_float SecondOrderGradient(bst_float predt, bst_float label) { - const bst_float eps = 1e-16f; - predt = SigmoidOneAPI(predt); - return std::max(predt * (1.0f - predt), eps); - } - template - static T PredTransform(T x) { return x; } - template - static T FirstOrderGradient(T predt, T label) { - predt = SigmoidOneAPI(predt); - return predt - label; - } - template - static T SecondOrderGradient(T predt, T label) { - const T eps = T(1e-16f); - predt = SigmoidOneAPI(predt); - return std::max(predt * (T(1.0f) - predt), eps); - } - static const char* DefaultEvalMetric() { return "logloss"; } - - static const char* Name() { return "binary:logitraw_oneapi"; } -}; - -} // namespace obj -} // namespace xgboost - -#endif // XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_ diff --git a/plugin/updater_oneapi/regression_obj_oneapi.cc b/plugin/updater_oneapi/regression_obj_oneapi.cc deleted file mode 100755 index 3ee5741e7..000000000 --- a/plugin/updater_oneapi/regression_obj_oneapi.cc +++ /dev/null @@ -1,182 +0,0 @@ -#include -#include -#include -#include -#include - -#include "xgboost/host_device_vector.h" -#include "xgboost/json.h" -#include "xgboost/parameter.h" -#include "xgboost/span.h" - -#include "../../src/common/transform.h" -#include "../../src/common/common.h" -#include "./regression_loss_oneapi.h" - -#include "CL/sycl.hpp" - -namespace xgboost { -namespace obj { - -DMLC_REGISTRY_FILE_TAG(regression_obj_oneapi); - -struct RegLossParamOneAPI : public XGBoostParameter { - float scale_pos_weight; - // declare parameters - DMLC_DECLARE_PARAMETER(RegLossParamOneAPI) { - DMLC_DECLARE_FIELD(scale_pos_weight).set_default(1.0f).set_lower_bound(0.0f) - .describe("Scale the weight of positive examples by this factor"); - } -}; - -template -class RegLossObjOneAPI : public ObjFunction { - protected: - HostDeviceVector label_correct_; - - public: - RegLossObjOneAPI() = default; - - void Configure(const std::vector >& args) override { - param_.UpdateAllowUnknown(args); - - cl::sycl::default_selector selector; - qu_ = cl::sycl::queue(selector); - } - - void GetGradient(const HostDeviceVector& preds, - const MetaInfo &info, - int iter, - HostDeviceVector* out_gpair) override { - if (info.labels_.Size() == 0U) { - LOG(WARNING) << "Label set is empty."; - } - CHECK_EQ(preds.Size(), info.labels_.Size()) - << " " << "labels are not correctly provided" - << "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size() << ", " - << "Loss: " << Loss::Name(); - - size_t const ndata = preds.Size(); - out_gpair->Resize(ndata); - - // TODO: add label_correct check - label_correct_.Resize(1); - label_correct_.Fill(1); - - bool is_null_weight = info.weights_.Size() == 0; - - cl::sycl::buffer preds_buf(preds.HostPointer(), preds.Size()); - cl::sycl::buffer labels_buf(info.labels_.HostPointer(), info.labels_.Size()); - cl::sycl::buffer out_gpair_buf(out_gpair->HostPointer(), out_gpair->Size()); - cl::sycl::buffer weights_buf(is_null_weight ? NULL : info.weights_.HostPointer(), - is_null_weight ? 1 : info.weights_.Size()); - - cl::sycl::buffer additional_input_buf(1); - { - auto additional_input_acc = additional_input_buf.get_access(); - additional_input_acc[0] = 1; // Fill the label_correct flag - } - - auto scale_pos_weight = param_.scale_pos_weight; - if (!is_null_weight) { - CHECK_EQ(info.weights_.Size(), ndata) - << "Number of weights should be equal to number of data points."; - } - - qu_.submit([&](cl::sycl::handler& cgh) { - auto preds_acc = preds_buf.get_access(cgh); - auto labels_acc = labels_buf.get_access(cgh); - auto weights_acc = weights_buf.get_access(cgh); - auto out_gpair_acc = out_gpair_buf.get_access(cgh); - auto additional_input_acc = additional_input_buf.get_access(cgh); - cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) { - int idx = pid[0]; - bst_float p = Loss::PredTransform(preds_acc[idx]); - bst_float w = is_null_weight ? 1.0f : weights_acc[idx]; - bst_float label = labels_acc[idx]; - if (label == 1.0f) { - w *= scale_pos_weight; - } - if (!Loss::CheckLabel(label)) { - // If there is an incorrect label, the host code will know. - additional_input_acc[0] = 0; - } - out_gpair_acc[idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w, - Loss::SecondOrderGradient(p, label) * w); - }); - }).wait(); - - int flag = 1; - { - auto additional_input_acc = additional_input_buf.get_access(); - flag = additional_input_acc[0]; - } - - if (flag == 0) { - LOG(FATAL) << Loss::LabelErrorMsg(); - } - - } - - public: - const char* DefaultEvalMetric() const override { - return Loss::DefaultEvalMetric(); - } - - void PredTransform(HostDeviceVector *io_preds) override { - size_t const ndata = io_preds->Size(); - - cl::sycl::buffer io_preds_buf(io_preds->HostPointer(), io_preds->Size()); - - qu_.submit([&](cl::sycl::handler& cgh) { - auto io_preds_acc = io_preds_buf.get_access(cgh); - cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) { - int idx = pid[0]; - io_preds_acc[idx] = Loss::PredTransform(io_preds_acc[idx]); - }); - }).wait(); - } - - float ProbToMargin(float base_score) const override { - return Loss::ProbToMargin(base_score); - } - - void SaveConfig(Json* p_out) const override { - auto& out = *p_out; - out["name"] = String(Loss::Name()); - out["reg_loss_param"] = ToJson(param_); - } - - void LoadConfig(Json const& in) override { - FromJson(in["reg_loss_param"], ¶m_); - } - - protected: - RegLossParamOneAPI param_; - - cl::sycl::queue qu_; -}; - -// register the objective functions -DMLC_REGISTER_PARAMETER(RegLossParamOneAPI); - -// TODO: Find a better way to dispatch names of DPC++ kernels with various template parameters of loss function -XGBOOST_REGISTER_OBJECTIVE(SquaredLossRegressionOneAPI, LinearSquareLossOneAPI::Name()) -.describe("Regression with squared error with DPC++ backend.") -.set_body([]() { return new RegLossObjOneAPI(); }); -XGBOOST_REGISTER_OBJECTIVE(SquareLogErrorOneAPI, SquaredLogErrorOneAPI::Name()) -.describe("Regression with root mean squared logarithmic error with DPC++ backend.") -.set_body([]() { return new RegLossObjOneAPI(); }); -XGBOOST_REGISTER_OBJECTIVE(LogisticRegressionOneAPI, LogisticRegressionOneAPI::Name()) -.describe("Logistic regression for probability regression task with DPC++ backend.") -.set_body([]() { return new RegLossObjOneAPI(); }); -XGBOOST_REGISTER_OBJECTIVE(LogisticClassificationOneAPI, LogisticClassificationOneAPI::Name()) -.describe("Logistic regression for binary classification task with DPC++ backend.") -.set_body([]() { return new RegLossObjOneAPI(); }); -XGBOOST_REGISTER_OBJECTIVE(LogisticRawOneAPI, LogisticRawOneAPI::Name()) -.describe("Logistic regression for classification, output score " - "before logistic transformation with DPC++ backend.") -.set_body([]() { return new RegLossObjOneAPI(); }); - -} // namespace obj -} // namespace xgboost diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f0dfe061f..161889f9e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,6 +16,10 @@ if(USE_CUDA) target_sources(objxgboost PRIVATE ${CUDA_SOURCES}) endif() +if(PLUGIN_SYCL) + target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_SYCL=1) +endif() + target_include_directories(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/include diff --git a/src/common/common.h b/src/common/common.h index ed6ceceb8..4b20ce7c2 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -169,10 +169,10 @@ inline void AssertNCCLSupport() { #endif // !defined(XGBOOST_USE_NCCL) } -inline void AssertOneAPISupport() { -#ifndef XGBOOST_USE_ONEAPI - LOG(FATAL) << "XGBoost version not compiled with OneAPI support."; -#endif // XGBOOST_USE_ONEAPI +inline void AssertSYCLSupport() { +#ifndef XGBOOST_USE_SYCL + LOG(FATAL) << "XGBoost version not compiled with SYCL support."; +#endif // XGBOOST_USE_SYCL } void SetDevice(std::int32_t device); diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index b0327da15..9ff4abb4d 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -113,13 +113,13 @@ void GBTree::Configure(Args const& cfg) { } #endif // defined(XGBOOST_USE_CUDA) -#if defined(XGBOOST_USE_ONEAPI) - if (!oneapi_predictor_) { - oneapi_predictor_ = - std::unique_ptr(Predictor::Create("oneapi_predictor", this->ctx_)); +#if defined(XGBOOST_USE_SYCL) + if (!sycl_predictor_) { + sycl_predictor_ = + std::unique_ptr(Predictor::Create("sycl_predictor", this->ctx_)); } - oneapi_predictor_->Configure(cfg); -#endif // defined(XGBOOST_USE_ONEAPI) + sycl_predictor_->Configure(cfg); +#endif // defined(XGBOOST_USE_SYCL) // `updater` parameter was manually specified specified_updater_ = @@ -553,6 +553,11 @@ void GBTree::InplacePredict(std::shared_ptr p_m, float missing, }, [&, begin = tree_begin, end = tree_end] { return this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end); +#if defined(XGBOOST_USE_SYCL) + }, + [&, begin = tree_begin, end = tree_end] { + return this->sycl_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end); +#endif // defined(XGBOOST_USE_SYCL) }); if (!known_type) { auto proxy = std::dynamic_pointer_cast(p_m); @@ -568,10 +573,16 @@ void GBTree::InplacePredict(std::shared_ptr p_m, float missing, if (f_dmat && !f_dmat->SingleColBlock()) { if (ctx_->IsCPU()) { return cpu_predictor_; - } else { + } else if (ctx_->IsCUDA()) { common::AssertGPUSupport(); CHECK(gpu_predictor_); return gpu_predictor_; + } else { +#if defined(XGBOOST_USE_SYCL) + common::AssertSYCLSupport(); + CHECK(sycl_predictor_); + return sycl_predictor_; +#endif // defined(XGBOOST_USE_SYCL) } } @@ -606,10 +617,16 @@ void GBTree::InplacePredict(std::shared_ptr p_m, float missing, if (ctx_->IsCPU()) { return cpu_predictor_; - } else { + } else if (ctx_->IsCUDA()) { common::AssertGPUSupport(); CHECK(gpu_predictor_); return gpu_predictor_; + } else { +#if defined(XGBOOST_USE_SYCL) + common::AssertSYCLSupport(); + CHECK(sycl_predictor_); + return sycl_predictor_; +#endif // defined(XGBOOST_USE_SYCL) } return cpu_predictor_; @@ -814,6 +831,11 @@ class Dart : public GBTree { }, [&] { return gpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1); +#if defined(XGBOOST_USE_SYCL) + }, + [&] { + return sycl_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1); +#endif // defined(XGBOOST_USE_SYCL) }); CHECK(success) << msg; }; @@ -830,6 +852,12 @@ class Dart : public GBTree { [&] { this->gpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_); +#if defined(XGBOOST_USE_SYCL) + }, + [&] { + this->sycl_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, + model_); +#endif // defined(XGBOOST_USE_SYCL) }); } // Multiple the tree weight diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index 827d85217..a2d84d848 100644 --- a/src/gbm/gbtree.h +++ b/src/gbm/gbtree.h @@ -349,9 +349,9 @@ class GBTree : public GradientBooster { // Predictors std::unique_ptr cpu_predictor_; std::unique_ptr gpu_predictor_{nullptr}; -#if defined(XGBOOST_USE_ONEAPI) - std::unique_ptr oneapi_predictor_; -#endif // defined(XGBOOST_USE_ONEAPI) +#if defined(XGBOOST_USE_SYCL) + std::unique_ptr sycl_predictor_; +#endif // defined(XGBOOST_USE_SYCL) common::Monitor monitor_; }; diff --git a/tests/ci_build/conda_env/linux_sycl_test.yml b/tests/ci_build/conda_env/linux_sycl_test.yml new file mode 100644 index 000000000..bb14c1e77 --- /dev/null +++ b/tests/ci_build/conda_env/linux_sycl_test.yml @@ -0,0 +1,20 @@ +name: linux_sycl_test +channels: +- conda-forge +- intel +dependencies: +- python=3.8 +- cmake +- c-compiler +- cxx-compiler +- pip +- wheel +- numpy +- scipy +- scikit-learn +- pandas +- hypothesis>=6.46 +- pytest +- pytest-timeout +- pytest-cov +- dpcpp_linux-64 diff --git a/tests/ci_build/lint_cpp.py b/tests/ci_build/lint_cpp.py index 6ec2b4e7f..d4775d6b6 100644 --- a/tests/ci_build/lint_cpp.py +++ b/tests/ci_build/lint_cpp.py @@ -138,7 +138,7 @@ def main(): "path", nargs="*", help="Path to traverse", - default=["src", "include", os.path.join("R-package", "src"), "python-package"], + default=["src", "include", os.path.join("R-package", "src"), "python-package", "plugin/sycl"], ) parser.add_argument( "--exclude_path", diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index e0d16efd4..fdd643da0 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -33,6 +33,7 @@ class LintersPaths: "tests/python-gpu/test_gpu_pickling.py", "tests/python-gpu/test_gpu_eval_metrics.py", "tests/python-gpu/test_gpu_with_sklearn.py", + "tests/python-sycl/test_sycl_prediction.py", "tests/test_distributed/test_with_spark/", "tests/test_distributed/test_gpu_with_spark/", # demo diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index ab82b6494..08862feee 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -13,9 +13,9 @@ if(USE_CUDA) list(APPEND TEST_SOURCES ${CUDA_TEST_SOURCES}) endif() -file(GLOB_RECURSE ONEAPI_TEST_SOURCES "plugin/*_oneapi.cc") -if(NOT PLUGIN_UPDATER_ONEAPI) - list(REMOVE_ITEM TEST_SOURCES ${ONEAPI_TEST_SOURCES}) +file(GLOB_RECURSE SYCL_TEST_SOURCES "plugin/test_sycl_*.cc") +if(NOT PLUGIN_SYCL) + list(REMOVE_ITEM TEST_SOURCES ${SYCL_TEST_SOURCES}) endif() if(PLUGIN_FEDERATED) diff --git a/tests/cpp/plugin/test_predictor_oneapi.cc b/tests/cpp/plugin/test_predictor_oneapi.cc deleted file mode 100755 index 52edd4a12..000000000 --- a/tests/cpp/plugin/test_predictor_oneapi.cc +++ /dev/null @@ -1,168 +0,0 @@ -/*! - * Copyright 2017-2020 XGBoost contributors - */ -#include -#include - -#include "../../../src/data/adapter.h" -#include "../../../src/gbm/gbtree_model.h" -#include "../filesystem.h" // dmlc::TemporaryDirectory -#include "../helpers.h" -#include "../predictor/test_predictor.h" - -namespace xgboost { -TEST(Plugin, OneAPIPredictorBasic) { - auto lparam = MakeCUDACtx(0); - std::unique_ptr oneapi_predictor = - std::unique_ptr(Predictor::Create("oneapi_predictor", &lparam)); - - int kRows = 5; - int kCols = 5; - - LearnerModelParam param; - param.num_feature = kCols; - param.base_score = 0.0; - param.num_output_group = 1; - - gbm::GBTreeModel model = CreateTestModel(¶m); - - auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); - - // Test predict batch - PredictionCacheEntry out_predictions; - oneapi_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0); - ASSERT_EQ(model.trees.size(), out_predictions.version); - std::vector& out_predictions_h = out_predictions.predictions.HostVector(); - for (size_t i = 0; i < out_predictions.predictions.Size(); i++) { - ASSERT_EQ(out_predictions_h[i], 1.5); - } - - // Test predict instance - auto const &batch = *dmat->GetBatches().begin(); - for (size_t i = 0; i < batch.Size(); i++) { - std::vector instance_out_predictions; - oneapi_predictor->PredictInstance(batch[i], &instance_out_predictions, model); - ASSERT_EQ(instance_out_predictions[0], 1.5); - } - - // Test predict leaf - std::vector leaf_out_predictions; - oneapi_predictor->PredictLeaf(dmat.get(), &leaf_out_predictions, model); - for (auto v : leaf_out_predictions) { - ASSERT_EQ(v, 0); - } - - // Test predict contribution - std::vector out_contribution; - oneapi_predictor->PredictContribution(dmat.get(), &out_contribution, model); - ASSERT_EQ(out_contribution.size(), kRows * (kCols + 1)); - for (size_t i = 0; i < out_contribution.size(); ++i) { - auto const& contri = out_contribution[i]; - // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue(). - if ((i+1) % (kCols+1) == 0) { - ASSERT_EQ(out_contribution.back(), 1.5f); - } else { - ASSERT_EQ(contri, 0); - } - } - // Test predict contribution (approximate method) - oneapi_predictor->PredictContribution(dmat.get(), &out_contribution, model, 0, nullptr, true); - for (size_t i = 0; i < out_contribution.size(); ++i) { - auto const& contri = out_contribution[i]; - // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue(). - if ((i+1) % (kCols+1) == 0) { - ASSERT_EQ(out_contribution.back(), 1.5f); - } else { - ASSERT_EQ(contri, 0); - } - } -} - -TEST(Plugin, OneAPIPredictorExternalMemory) { - dmlc::TemporaryDirectory tmpdir; - std::string filename = tmpdir.path + "/big.libsvm"; - std::unique_ptr dmat = CreateSparsePageDMatrix(12, 64, filename); - auto lparam = MakeCUDACtx(0); - - std::unique_ptr oneapi_predictor = - std::unique_ptr(Predictor::Create("oneapi_predictor", &lparam)); - - LearnerModelParam param; - param.base_score = 0; - param.num_feature = dmat->Info().num_col_; - param.num_output_group = 1; - - gbm::GBTreeModel model = CreateTestModel(¶m); - - // Test predict batch - PredictionCacheEntry out_predictions; - oneapi_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0); - std::vector &out_predictions_h = out_predictions.predictions.HostVector(); - ASSERT_EQ(out_predictions.predictions.Size(), dmat->Info().num_row_); - for (const auto& v : out_predictions_h) { - ASSERT_EQ(v, 1.5); - } - - // Test predict leaf - std::vector leaf_out_predictions; - oneapi_predictor->PredictLeaf(dmat.get(), &leaf_out_predictions, model); - ASSERT_EQ(leaf_out_predictions.size(), dmat->Info().num_row_); - for (const auto& v : leaf_out_predictions) { - ASSERT_EQ(v, 0); - } - - // Test predict contribution - std::vector out_contribution; - oneapi_predictor->PredictContribution(dmat.get(), &out_contribution, model); - ASSERT_EQ(out_contribution.size(), dmat->Info().num_row_ * (dmat->Info().num_col_ + 1)); - for (size_t i = 0; i < out_contribution.size(); ++i) { - auto const& contri = out_contribution[i]; - // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue(). - if ((i + 1) % (dmat->Info().num_col_ + 1) == 0) { - ASSERT_EQ(out_contribution.back(), 1.5f); - } else { - ASSERT_EQ(contri, 0); - } - } - - // Test predict contribution (approximate method) - std::vector out_contribution_approximate; - oneapi_predictor->PredictContribution(dmat.get(), &out_contribution_approximate, model, 0, nullptr, true); - ASSERT_EQ(out_contribution_approximate.size(), - dmat->Info().num_row_ * (dmat->Info().num_col_ + 1)); - for (size_t i = 0; i < out_contribution.size(); ++i) { - auto const& contri = out_contribution[i]; - // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue(). - if ((i + 1) % (dmat->Info().num_col_ + 1) == 0) { - ASSERT_EQ(out_contribution.back(), 1.5f); - } else { - ASSERT_EQ(contri, 0); - } - } -} - -TEST(Plugin, OneAPIPredictorInplacePredict) { - bst_row_t constexpr kRows{128}; - bst_feature_t constexpr kCols{64}; - auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(-1); - { - HostDeviceVector data; - gen.GenerateDense(&data); - ASSERT_EQ(data.Size(), kRows * kCols); - std::shared_ptr x{ - new data::DenseAdapter(data.HostPointer(), kRows, kCols)}; - TestInplacePrediction(x, "oneapi_predictor", kRows, kCols, -1); - } - - { - HostDeviceVector data; - HostDeviceVector rptrs; - HostDeviceVector columns; - gen.GenerateCSR(&data, &rptrs, &columns); - std::shared_ptr x{new data::CSRAdapter( - rptrs.HostPointer(), columns.HostPointer(), data.HostPointer(), kRows, - data.Size(), kCols)}; - TestInplacePrediction(x, "oneapi_predictor", kRows, kCols, -1); - } -} -} // namespace xgboost diff --git a/tests/cpp/plugin/test_regression_obj_oneapi.cc b/tests/cpp/plugin/test_regression_obj_oneapi.cc deleted file mode 100755 index c01d9d951..000000000 --- a/tests/cpp/plugin/test_regression_obj_oneapi.cc +++ /dev/null @@ -1,176 +0,0 @@ -/*! - * Copyright 2017-2019 XGBoost contributors - */ -#include -#include -#include -#include -#include "../helpers.h" -namespace xgboost { - -TEST(Plugin, LinearRegressionGPairOneAPI) { - Context tparam = MakeCUDACtx(0); - std::vector> args; - - std::unique_ptr obj { - ObjFunction::Create("reg:squarederror_oneapi", &tparam) - }; - - obj->Configure(args); - CheckObjFunction(obj, - {0, 0.1f, 0.9f, 1, 0, 0.1f, 0.9f, 1}, - {0, 0, 0, 0, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1, 1}, - {0, 0.1f, 0.9f, 1.0f, -1.0f, -0.9f, -0.1f, 0}, - {1, 1, 1, 1, 1, 1, 1, 1}); - CheckObjFunction(obj, - {0, 0.1f, 0.9f, 1, 0, 0.1f, 0.9f, 1}, - {0, 0, 0, 0, 1, 1, 1, 1}, - {}, // empty weight - {0, 0.1f, 0.9f, 1.0f, -1.0f, -0.9f, -0.1f, 0}, - {1, 1, 1, 1, 1, 1, 1, 1}); - ASSERT_NO_THROW(obj->DefaultEvalMetric()); -} - -TEST(Plugin, SquaredLogOneAPI) { - Context tparam = MakeCUDACtx(0); - std::vector> args; - - std::unique_ptr obj { ObjFunction::Create("reg:squaredlogerror_oneapi", &tparam) }; - obj->Configure(args); - CheckConfigReload(obj, "reg:squaredlogerror_oneapi"); - - CheckObjFunction(obj, - {0.1f, 0.2f, 0.4f, 0.8f, 1.6f}, // pred - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, // labels - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, // weights - {-0.5435f, -0.4257f, -0.25475f, -0.05855f, 0.1009f}, - { 1.3205f, 1.0492f, 0.69215f, 0.34115f, 0.1091f}); - CheckObjFunction(obj, - {0.1f, 0.2f, 0.4f, 0.8f, 1.6f}, // pred - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, // labels - {}, // empty weights - {-0.5435f, -0.4257f, -0.25475f, -0.05855f, 0.1009f}, - { 1.3205f, 1.0492f, 0.69215f, 0.34115f, 0.1091f}); - ASSERT_EQ(obj->DefaultEvalMetric(), std::string{"rmsle"}); -} - -TEST(Plugin, LogisticRegressionGPairOneAPI) { - Context tparam = MakeCUDACtx(0); - std::vector> args; - std::unique_ptr obj { ObjFunction::Create("reg:logistic_oneapi", &tparam) }; - - obj->Configure(args); - CheckConfigReload(obj, "reg:logistic_oneapi"); - - CheckObjFunction(obj, - { 0, 0.1f, 0.9f, 1, 0, 0.1f, 0.9f, 1}, // preds - { 0, 0, 0, 0, 1, 1, 1, 1}, // labels - { 1, 1, 1, 1, 1, 1, 1, 1}, // weights - { 0.5f, 0.52f, 0.71f, 0.73f, -0.5f, -0.47f, -0.28f, -0.26f}, // out_grad - {0.25f, 0.24f, 0.20f, 0.19f, 0.25f, 0.24f, 0.20f, 0.19f}); // out_hess -} - -TEST(Plugin, LogisticRegressionBasicOneAPI) { - Context lparam = MakeCUDACtx(0); - std::vector> args; - std::unique_ptr obj { - ObjFunction::Create("reg:logistic_oneapi", &lparam) - }; - - obj->Configure(args); - CheckConfigReload(obj, "reg:logistic_oneapi"); - - // test label validation - EXPECT_ANY_THROW(CheckObjFunction(obj, {0}, {10}, {1}, {0}, {0})) - << "Expected error when label not in range [0,1f] for LogisticRegression"; - - // test ProbToMargin - EXPECT_NEAR(obj->ProbToMargin(0.1f), -2.197f, 0.01f); - EXPECT_NEAR(obj->ProbToMargin(0.5f), 0, 0.01f); - EXPECT_NEAR(obj->ProbToMargin(0.9f), 2.197f, 0.01f); - EXPECT_ANY_THROW(obj->ProbToMargin(10)) - << "Expected error when base_score not in range [0,1f] for LogisticRegression"; - - // test PredTransform - HostDeviceVector io_preds = {0, 0.1f, 0.5f, 0.9f, 1}; - std::vector out_preds = {0.5f, 0.524f, 0.622f, 0.710f, 0.731f}; - obj->PredTransform(&io_preds); - auto& preds = io_preds.HostVector(); - for (int i = 0; i < static_cast(io_preds.Size()); ++i) { - EXPECT_NEAR(preds[i], out_preds[i], 0.01f); - } -} - -TEST(Plugin, LogisticRawGPairOneAPI) { - Context lparam = MakeCUDACtx(0); - std::vector> args; - std::unique_ptr obj { - ObjFunction::Create("binary:logitraw_oneapi", &lparam) - }; - - obj->Configure(args); - - CheckObjFunction(obj, - { 0, 0.1f, 0.9f, 1, 0, 0.1f, 0.9f, 1}, - { 0, 0, 0, 0, 1, 1, 1, 1}, - { 1, 1, 1, 1, 1, 1, 1, 1}, - { 0.5f, 0.52f, 0.71f, 0.73f, -0.5f, -0.47f, -0.28f, -0.26f}, - {0.25f, 0.24f, 0.20f, 0.19f, 0.25f, 0.24f, 0.20f, 0.19f}); -} - -TEST(Plugin, CPUvsOneAPI) { - Context ctx = MakeCUDACtx(0); - - ObjFunction * obj_cpu = - ObjFunction::Create("reg:squarederror", &ctx); - ObjFunction * obj_oneapi = - ObjFunction::Create("reg:squarederror_oneapi", &ctx); - HostDeviceVector cpu_out_preds; - HostDeviceVector oneapi_out_preds; - - constexpr size_t kRows = 400; - constexpr size_t kCols = 100; - auto pdmat = RandomDataGenerator(kRows, kCols, 0).Seed(0).GenerateDMatrix(); - HostDeviceVector preds; - preds.Resize(kRows); - auto& h_preds = preds.HostVector(); - for (size_t i = 0; i < h_preds.size(); ++i) { - h_preds[i] = static_cast(i); - } - auto& info = pdmat->Info(); - - info.labels.Reshape(kRows, 1); - auto& h_labels = info.labels.Data()->HostVector(); - for (size_t i = 0; i < h_labels.size(); ++i) { - h_labels[i] = 1 / static_cast(i+1); - } - - { - // CPU - ctx = ctx.MakeCPU(); - obj_cpu->GetGradient(preds, info, 0, &cpu_out_preds); - } - { - // oneapi - ctx.gpu_id = 0; - obj_oneapi->GetGradient(preds, info, 0, &oneapi_out_preds); - } - - auto& h_cpu_out = cpu_out_preds.HostVector(); - auto& h_oneapi_out = oneapi_out_preds.HostVector(); - - float sgrad = 0; - float shess = 0; - for (size_t i = 0; i < kRows; ++i) { - sgrad += std::pow(h_cpu_out[i].GetGrad() - h_oneapi_out[i].GetGrad(), 2); - shess += std::pow(h_cpu_out[i].GetHess() - h_oneapi_out[i].GetHess(), 2); - } - ASSERT_NEAR(sgrad, 0.0f, kRtEps); - ASSERT_NEAR(shess, 0.0f, kRtEps); - - delete obj_cpu; - delete obj_oneapi; -} - -} // namespace xgboost diff --git a/tests/cpp/plugin/test_sycl_predictor.cc b/tests/cpp/plugin/test_sycl_predictor.cc new file mode 100755 index 000000000..f82a9f33d --- /dev/null +++ b/tests/cpp/plugin/test_sycl_predictor.cc @@ -0,0 +1,101 @@ +/*! + * Copyright 2017-2023 XGBoost contributors + */ +#include +#include + +#include "../../../src/data/adapter.h" +#include "../../../src/data/proxy_dmatrix.h" +#include "../../../src/gbm/gbtree.h" +#include "../../../src/gbm/gbtree_model.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory +#include "../helpers.h" +#include "../predictor/test_predictor.h" + +namespace xgboost { + +TEST(SyclPredictor, Basic) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + size_t constexpr kRows = 5; + size_t constexpr kCols = 5; + auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); + TestBasic(dmat.get(), &ctx); +} + +TEST(SyclPredictor, ExternalMemory) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + size_t constexpr kPageSize = 64, kEntriesPerCol = 3; + size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; + std::unique_ptr dmat = CreateSparsePageDMatrix(kEntries); + TestBasic(dmat.get(), &ctx); +} + +TEST(SyclPredictor, InplacePredict) { + bst_row_t constexpr kRows{128}; + bst_feature_t constexpr kCols{64}; + Context ctx; + auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.Device()); + { + HostDeviceVector data; + gen.GenerateDense(&data); + ASSERT_EQ(data.Size(), kRows * kCols); + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + std::shared_ptr x{new data::DMatrixProxy{}}; + auto array_interface = GetArrayInterface(&data, kRows, kCols); + std::string arr_str; + Json::Dump(array_interface, &arr_str); + x->SetArrayData(arr_str.data()); + TestInplacePrediction(&ctx, x, kRows, kCols); + } +} + +TEST(SyclPredictor, IterationRange) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + TestIterationRange(&ctx); +} + +TEST(SyclPredictor, GHistIndexTraining) { + size_t constexpr kRows{128}, kCols{16}, kBins{64}; + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateDMatrix(false); + HostDeviceVector storage(kRows * kCols); + auto columnar = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&storage); + auto adapter = data::ArrayAdapter(columnar.c_str()); + std::shared_ptr p_full{ + DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1)}; + TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist); +} + +TEST(SyclPredictor, CategoricalPredictLeaf) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + TestCategoricalPredictLeaf(&ctx, false); +} + +TEST(SyclPredictor, LesserFeatures) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + TestPredictionWithLesserFeatures(&ctx); +} + +TEST(SyclPredictor, Sparse) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + TestSparsePrediction(&ctx, 0.2); + TestSparsePrediction(&ctx, 0.8); +} + +TEST(SyclPredictor, Multi) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + TestVectorLeafPrediction(&ctx); +} + +} // namespace xgboost \ No newline at end of file diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 07f33d72e..8f3955c05 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -18,92 +18,17 @@ namespace xgboost { -namespace { -void TestBasic(DMatrix* dmat) { - Context ctx; - std::unique_ptr cpu_predictor = - std::unique_ptr(Predictor::Create("cpu_predictor", &ctx)); - - size_t const kRows = dmat->Info().num_row_; - size_t const kCols = dmat->Info().num_col_; - - LearnerModelParam mparam{MakeMP(kCols, .0, 1)}; - - ctx.UpdateAllowUnknown(Args{}); - gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); - - // Test predict batch - PredictionCacheEntry out_predictions; - cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model); - cpu_predictor->PredictBatch(dmat, &out_predictions, model, 0); - - std::vector& out_predictions_h = out_predictions.predictions.HostVector(); - for (size_t i = 0; i < out_predictions.predictions.Size(); i++) { - ASSERT_EQ(out_predictions_h[i], 1.5); - } - - // Test predict instance - auto const& batch = *dmat->GetBatches().begin(); - auto page = batch.GetView(); - for (size_t i = 0; i < batch.Size(); i++) { - std::vector instance_out_predictions; - cpu_predictor->PredictInstance(page[i], &instance_out_predictions, model, 0, - dmat->Info().IsColumnSplit()); - ASSERT_EQ(instance_out_predictions[0], 1.5); - } - - // Test predict leaf - HostDeviceVector leaf_out_predictions; - cpu_predictor->PredictLeaf(dmat, &leaf_out_predictions, model); - auto const& h_leaf_out_predictions = leaf_out_predictions.ConstHostVector(); - for (auto v : h_leaf_out_predictions) { - ASSERT_EQ(v, 0); - } - - if (dmat->Info().IsColumnSplit()) { - // Predict contribution is not supported for column split. - return; - } - - // Test predict contribution - HostDeviceVector out_contribution_hdv; - auto& out_contribution = out_contribution_hdv.HostVector(); - cpu_predictor->PredictContribution(dmat, &out_contribution_hdv, model); - ASSERT_EQ(out_contribution.size(), kRows * (kCols + 1)); - for (size_t i = 0; i < out_contribution.size(); ++i) { - auto const& contri = out_contribution[i]; - // shift 1 for bias, as test tree is a decision dump, only global bias is - // filled with LeafValue(). - if ((i + 1) % (kCols + 1) == 0) { - ASSERT_EQ(out_contribution.back(), 1.5f); - } else { - ASSERT_EQ(contri, 0); - } - } - // Test predict contribution (approximate method) - cpu_predictor->PredictContribution(dmat, &out_contribution_hdv, model, 0, nullptr, true); - for (size_t i = 0; i < out_contribution.size(); ++i) { - auto const& contri = out_contribution[i]; - // shift 1 for bias, as test tree is a decision dump, only global bias is - // filled with LeafValue(). - if ((i + 1) % (kCols + 1) == 0) { - ASSERT_EQ(out_contribution.back(), 1.5f); - } else { - ASSERT_EQ(contri, 0); - } - } -} -} // anonymous namespace - TEST(CpuPredictor, Basic) { + Context ctx; size_t constexpr kRows = 5; size_t constexpr kCols = 5; auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); - TestBasic(dmat.get()); + TestBasic(dmat.get(), &ctx); } namespace { void TestColumnSplit() { + Context ctx; size_t constexpr kRows = 5; size_t constexpr kCols = 5; auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); @@ -112,7 +37,7 @@ void TestColumnSplit() { auto const rank = collective::GetRank(); dmat = std::unique_ptr{dmat->SliceCol(world_size, rank)}; - TestBasic(dmat.get()); + TestBasic(dmat.get(), &ctx); } } // anonymous namespace @@ -132,10 +57,11 @@ TEST(CpuPredictor, IterationRangeColmnSplit) { } TEST(CpuPredictor, ExternalMemory) { + Context ctx; size_t constexpr kPageSize = 64, kEntriesPerCol = 3; size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; std::unique_ptr dmat = CreateSparsePageDMatrix(kEntries); - TestBasic(dmat.get()); + TestBasic(dmat.get(), &ctx); } TEST(CpuPredictor, InplacePredict) { @@ -235,12 +161,14 @@ TEST(CPUPredictor, CategoricalPredictionColumnSplit) { } TEST(CPUPredictor, CategoricalPredictLeaf) { - TestCategoricalPredictLeaf(false, false); + Context ctx; + TestCategoricalPredictLeaf(&ctx, false); } TEST(CPUPredictor, CategoricalPredictLeafColumnSplit) { auto constexpr kWorldSize = 2; - RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, false, true); + Context ctx; + RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, &ctx, true); } TEST(CpuPredictor, UpdatePredictionCache) { diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 883e6e01c..50e036b90 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -289,11 +289,13 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) { } TEST(GPUPredictor, CategoricalPredictLeaf) { - TestCategoricalPredictLeaf(true, false); + auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank()); + TestCategoricalPredictLeaf(&ctx, false); } TEST_F(MGPUPredictorTest, CategoricalPredictionLeafColumnSplit) { - RunWithInMemoryCommunicator(world_size_, TestCategoricalPredictLeaf, true, true); + auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank()); + RunWithInMemoryCommunicator(world_size_, TestCategoricalPredictLeaf, &ctx, true); } TEST(GPUPredictor, PredictLeafBasic) { diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index 21aa483e4..6ee34ae69 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -26,6 +26,79 @@ #include "xgboost/tree_model.h" // for RegTree namespace xgboost { + +void TestBasic(DMatrix* dmat, Context const *ctx) { + auto predictor = std::unique_ptr(CreatePredictorForTest(ctx)); + + size_t const kRows = dmat->Info().num_row_; + size_t const kCols = dmat->Info().num_col_; + + LearnerModelParam mparam{MakeMP(kCols, .0, 1)}; + + gbm::GBTreeModel model = CreateTestModel(&mparam, ctx); + + // Test predict batch + PredictionCacheEntry out_predictions; + predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model); + predictor->PredictBatch(dmat, &out_predictions, model, 0); + + std::vector& out_predictions_h = out_predictions.predictions.HostVector(); + for (size_t i = 0; i < out_predictions.predictions.Size(); i++) { + ASSERT_EQ(out_predictions_h[i], 1.5); + } + + // Test predict instance + auto const& batch = *dmat->GetBatches().begin(); + auto page = batch.GetView(); + for (size_t i = 0; i < batch.Size(); i++) { + std::vector instance_out_predictions; + predictor->PredictInstance(page[i], &instance_out_predictions, model, 0, + dmat->Info().IsColumnSplit()); + ASSERT_EQ(instance_out_predictions[0], 1.5); + } + + // Test predict leaf + HostDeviceVector leaf_out_predictions; + predictor->PredictLeaf(dmat, &leaf_out_predictions, model); + auto const& h_leaf_out_predictions = leaf_out_predictions.ConstHostVector(); + for (auto v : h_leaf_out_predictions) { + ASSERT_EQ(v, 0); + } + + if (dmat->Info().IsColumnSplit()) { + // Predict contribution is not supported for column split. + return; + } + + // Test predict contribution + HostDeviceVector out_contribution_hdv; + auto& out_contribution = out_contribution_hdv.HostVector(); + predictor->PredictContribution(dmat, &out_contribution_hdv, model); + ASSERT_EQ(out_contribution.size(), kRows * (kCols + 1)); + for (size_t i = 0; i < out_contribution.size(); ++i) { + auto const& contri = out_contribution[i]; + // shift 1 for bias, as test tree is a decision dump, only global bias is + // filled with LeafValue(). + if ((i + 1) % (kCols + 1) == 0) { + ASSERT_EQ(out_contribution.back(), 1.5f); + } else { + ASSERT_EQ(contri, 0); + } + } + // Test predict contribution (approximate method) + predictor->PredictContribution(dmat, &out_contribution_hdv, model, 0, nullptr, true); + for (size_t i = 0; i < out_contribution.size(); ++i) { + auto const& contri = out_contribution[i]; + // shift 1 for bias, as test tree is a decision dump, only global bias is + // filled with LeafValue(). + if ((i + 1) % (kCols + 1) == 0) { + ASSERT_EQ(out_contribution.back(), 1.5f); + } else { + ASSERT_EQ(contri, 0); + } + } +} + TEST(Predictor, PredictionCache) { size_t constexpr kRows = 16, kCols = 4; @@ -64,7 +137,7 @@ void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins, {"num_feature", std::to_string(kCols)}, {"num_class", std::to_string(kClasses)}, {"max_bin", std::to_string(bins)}, - {"device", ctx->DeviceName()}}); + {"device", ctx->IsSycl() ? "cpu" : ctx->DeviceName()}}); learner->Configure(); for (size_t i = 0; i < kIters; ++i) { @@ -151,7 +224,7 @@ std::unique_ptr LearnerForTest(Context const *ctx, std::shared_ptr learner{Learner::Create({dmat})}; learner->SetParams( - Args{{"num_parallel_tree", std::to_string(forest)}, {"device", ctx->DeviceName()}}); + Args{{"num_parallel_tree", std::to_string(forest)}, {"device", ctx->IsSycl() ? "cpu" : ctx->DeviceName()}}); for (size_t i = 0; i < iters; ++i) { learner->UpdateOneIter(i, dmat); } @@ -305,11 +378,7 @@ void TestCategoricalPrediction(bool use_gpu, bool is_column_split) { ASSERT_EQ(out_predictions.predictions.HostVector()[0], left_weight + score); } -void TestCategoricalPredictLeaf(bool use_gpu, bool is_column_split) { - Context ctx; - if (use_gpu) { - ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank()); - } +void TestCategoricalPredictLeaf(Context const *ctx, bool is_column_split) { size_t constexpr kCols = 10; PredictionCacheEntry out_predictions; @@ -320,10 +389,10 @@ void TestCategoricalPredictLeaf(bool use_gpu, bool is_column_split) { float left_weight = 1.3f; float right_weight = 1.7f; - gbm::GBTreeModel model(&mparam, &ctx); + gbm::GBTreeModel model(&mparam, ctx); GBTreeModelForTest(&model, split_ind, split_cat, left_weight, right_weight); - std::unique_ptr predictor{CreatePredictorForTest(&ctx)}; + std::unique_ptr predictor{CreatePredictorForTest(ctx)}; std::vector row(kCols); row[split_ind] = split_cat; @@ -363,7 +432,6 @@ void TestIterationRange(Context const* ctx) { HostDeviceVector out_predt_sliced; HostDeviceVector out_predt_ranged; - // margin { sliced->Predict(dmat, true, &out_predt_sliced, 0, 0, false, false, false, false, false); learner->Predict(dmat, true, &out_predt_ranged, 0, lend, false, false, false, false, false); @@ -519,6 +587,8 @@ void TestSparsePrediction(Context const *ctx, float sparsity) { learner.reset(Learner::Create({Xy})); learner->LoadModel(model); + learner->SetParam("device", ctx->DeviceName()); + learner->Configure(); if (ctx->IsCUDA()) { learner->SetParam("tree_method", "gpu_hist"); diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h index 9e0891d56..c2b28883a 100644 --- a/tests/cpp/predictor/test_predictor.h +++ b/tests/cpp/predictor/test_predictor.h @@ -34,6 +34,8 @@ inline gbm::GBTreeModel CreateTestModel(LearnerModelParam const* param, Context inline auto CreatePredictorForTest(Context const* ctx) { if (ctx->IsCPU()) { return Predictor::Create("cpu_predictor", ctx); + } else if (ctx->IsSycl()) { + return Predictor::Create("sycl_predictor", ctx); } else { return Predictor::Create("gpu_predictor", ctx); } @@ -83,6 +85,8 @@ void TestPredictionFromGradientIndex(Context const* ctx, size_t rows, size_t col } } +void TestBasic(DMatrix* dmat, Context const * ctx); + // p_full and p_hist should come from the same data set. void TestTrainingPrediction(Context const* ctx, size_t rows, size_t bins, std::shared_ptr p_full, std::shared_ptr p_hist); @@ -98,7 +102,7 @@ void TestCategoricalPrediction(bool use_gpu, bool is_column_split); void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu); -void TestCategoricalPredictLeaf(bool use_gpu, bool is_column_split); +void TestCategoricalPredictLeaf(Context const *ctx, bool is_column_split); void TestIterationRange(Context const* ctx); diff --git a/tests/python-sycl/test_sycl_prediction.py b/tests/python-sycl/test_sycl_prediction.py new file mode 100644 index 000000000..06167c6c0 --- /dev/null +++ b/tests/python-sycl/test_sycl_prediction.py @@ -0,0 +1,165 @@ +import sys +import unittest +import pytest + +import numpy as np +import xgboost as xgb +from hypothesis import given, strategies, assume, settings, note + +from xgboost import testing as tm + +rng = np.random.RandomState(1994) + +shap_parameter_strategy = strategies.fixed_dictionaries( + { + "max_depth": strategies.integers(1, 11), + "max_leaves": strategies.integers(0, 256), + "num_parallel_tree": strategies.sampled_from([1, 10]), + } +).filter(lambda x: x["max_depth"] > 0 or x["max_leaves"] > 0) + + +class TestSYCLPredict(unittest.TestCase): + def test_predict(self): + iterations = 10 + np.random.seed(1) + test_num_rows = [10, 1000, 5000] + test_num_cols = [10, 50, 500] + for num_rows in test_num_rows: + for num_cols in test_num_cols: + dtrain = xgb.DMatrix( + np.random.randn(num_rows, num_cols), + label=[0, 1] * int(num_rows / 2), + ) + dval = xgb.DMatrix( + np.random.randn(num_rows, num_cols), + label=[0, 1] * int(num_rows / 2), + ) + dtest = xgb.DMatrix( + np.random.randn(num_rows, num_cols), + label=[0, 1] * int(num_rows / 2), + ) + watchlist = [(dtrain, "train"), (dval, "validation")] + res = {} + param = { + "objective": "binary:logistic", + "eval_metric": "logloss", + "tree_method": "hist", + "device": "cpu", + "max_depth": 1, + "verbosity": 0, + } + bst = xgb.train( + param, dtrain, iterations, evals=watchlist, evals_result=res + ) + assert tm.non_increasing(res["train"]["logloss"]) + cpu_pred_train = bst.predict(dtrain, output_margin=True) + cpu_pred_test = bst.predict(dtest, output_margin=True) + cpu_pred_val = bst.predict(dval, output_margin=True) + + bst.set_param({"device": "sycl"}) + sycl_pred_train = bst.predict(dtrain, output_margin=True) + sycl_pred_test = bst.predict(dtest, output_margin=True) + sycl_pred_val = bst.predict(dval, output_margin=True) + + np.testing.assert_allclose(cpu_pred_train, sycl_pred_train, rtol=1e-6) + np.testing.assert_allclose(cpu_pred_val, sycl_pred_val, rtol=1e-6) + np.testing.assert_allclose(cpu_pred_test, sycl_pred_test, rtol=1e-6) + + @pytest.mark.skipif(**tm.no_sklearn()) + def test_multi_predict(self): + from sklearn.datasets import make_regression + from sklearn.model_selection import train_test_split + + n = 1000 + X, y = make_regression(n, random_state=rng) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123) + dtrain = xgb.DMatrix(X_train, label=y_train) + dtest = xgb.DMatrix(X_test) + + params = {} + params["tree_method"] = "hist" + params["device"] = "cpu" + + bst = xgb.train(params, dtrain) + cpu_predict = bst.predict(dtest) + + bst.set_param({"device": "sycl"}) + + predict0 = bst.predict(dtest) + predict1 = bst.predict(dtest) + + assert np.allclose(predict0, predict1) + assert np.allclose(predict0, cpu_predict) + + @pytest.mark.skipif(**tm.no_sklearn()) + def test_sklearn(self): + m, n = 15000, 14 + tr_size = 2500 + X = np.random.rand(m, n) + y = 200 * np.matmul(X, np.arange(-3, -3 + n)) + X_train, y_train = X[:tr_size, :], y[:tr_size] + X_test, y_test = X[tr_size:, :], y[tr_size:] + + # First with cpu_predictor + params = { + "tree_method": "hist", + "device": "cpu", + "n_jobs": -1, + "verbosity": 0, + "seed": 123, + } + m = xgb.XGBRegressor(**params).fit(X_train, y_train) + cpu_train_score = m.score(X_train, y_train) + cpu_test_score = m.score(X_test, y_test) + + # Now with sycl_predictor + params["device"] = "sycl" + m.set_params(**params) + + sycl_train_score = m.score(X_train, y_train) + sycl_test_score = m.score(X_test, y_test) + + assert np.allclose(cpu_train_score, sycl_train_score) + assert np.allclose(cpu_test_score, sycl_test_score) + + @given( + strategies.integers(1, 10), tm.make_dataset_strategy(), shap_parameter_strategy + ) + @settings(deadline=None) + def test_shap(self, num_rounds, dataset, param): + if dataset.name.endswith("-l1"): # not supported by the exact tree method + return + param.update({"tree_method": "hist", "device": "cpu"}) + param = dataset.set_params(param) + dmat = dataset.get_dmat() + bst = xgb.train(param, dmat, num_rounds) + test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin) + bst.set_param({"device": "sycl"}) + shap = bst.predict(test_dmat, pred_contribs=True) + margin = bst.predict(test_dmat, output_margin=True) + assume(len(dataset.y) > 0) + assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-3, 1e-3) + + @given( + strategies.integers(1, 10), tm.make_dataset_strategy(), shap_parameter_strategy + ) + @settings(deadline=None, max_examples=20) + def test_shap_interactions(self, num_rounds, dataset, param): + if dataset.name.endswith("-l1"): # not supported by the exact tree method + return + param.update({"tree_method": "hist", "device": "cpu"}) + param = dataset.set_params(param) + dmat = dataset.get_dmat() + bst = xgb.train(param, dmat, num_rounds) + test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin) + bst.set_param({"device": "sycl"}) + shap = bst.predict(test_dmat, pred_interactions=True) + margin = bst.predict(test_dmat, output_margin=True) + assume(len(dataset.y) > 0) + assert np.allclose( + np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)), + margin, + 1e-3, + 1e-3, + )